bio-batch-processing

Process multiple sequence files efficiently using Biopython.

Safety Notice

This listing is imported from the skills.sh public index metadata. Review the upstream SKILL.md and the repository scripts before running anything.

Copy the command below and send it to your AI assistant so it can learn this skill

Install skill "bio-batch-processing" with this command: npx skills add gptomics/bioskills/gptomics-bioskills-bio-batch-processing

Batch Processing

Process multiple sequence files efficiently using Biopython.

Required Imports

from pathlib import Path from Bio import SeqIO

Process Multiple Files

Iterate Over Files in Directory

from pathlib import Path

# Report how many records each FASTA file directly under data/ contains.
data_dir = Path('data/')
for fasta_file in data_dir.glob('*.fasta'):
    # Collect the parsed records in a list so they can be counted with len().
    records = [record for record in SeqIO.parse(fasta_file, 'fasta')]
    print(f'{fasta_file.name}: {len(records)} sequences')

Process All FASTQ Files

# Count the reads of every FASTQ file in the current directory by stepping
# through the parser instead of building a list of records.
current_dir = Path('.')
for fq_file in current_dir.glob('*.fastq'):
    count = 0
    for _ in SeqIO.parse(fq_file, 'fastq'):
        count += 1
    print(f'{fq_file.name}: {count} reads')

Recursive File Search

# Search data/ and all of its subdirectories for .gb files and list them.
search_root = Path('data/')
genbank_paths = search_root.rglob('*.gb')
for gb_file in genbank_paths:
    print(f'Found: {gb_file}')

Merge Files

Merge All FASTA Files

from pathlib import Path

def all_records(directory, pattern, format):
    """Yield every record from the files in ``directory`` matching ``pattern``.

    Args:
        directory: Directory whose files are searched.
        pattern: Glob pattern selecting the files, e.g. ``'*.fasta'``.
        format: Format name handed to ``SeqIO.parse`` for each file.

    Yields:
        The records of each matching file, one file after another.
    """
    # Directory entries are returned in arbitrary order; sorting the paths
    # makes the order of the merged records reproducible between runs.
    for filepath in sorted(Path(directory).glob(pattern)):
        yield from SeqIO.parse(filepath, format)

# Stream the records of data/*.fasta into one merged.fasta file and report
# how many were written.
records = all_records(directory='data/', pattern='*.fasta', format='fasta')
count = SeqIO.write(
    records,
    'merged.fasta',
    'fasta',
)
print(f'Merged {count} records')

Merge with Source Tracking

def records_with_source(directory, pattern, format):
    """Yield records from matching files, tagging each with its source file.

    Args:
        directory: Directory whose files are searched.
        pattern: Glob pattern selecting the files, e.g. ``'*.fasta'``.
        format: Format name handed to ``SeqIO.parse`` for each file.

    Yields:
        Each parsed record, after `` [source=<file name>]`` has been appended
        to its ``description``.
    """
    # Directory entries are returned in arbitrary order; sorting the paths
    # makes the order of the merged records reproducible between runs.
    for filepath in sorted(Path(directory).glob(pattern)):
        for record in SeqIO.parse(filepath, format):
            # Keep the existing description and add the originating file name.
            record.description = f'{record.description} [source={filepath.name}]'
            yield record

# Merge data/*.fasta into merged_tracked.fasta; each record carries a
# [source=...] tag added by records_with_source.
records = records_with_source(
    directory='data/',
    pattern='*.fasta',
    format='fasta',
)
SeqIO.write(records, 'merged_tracked.fasta', 'fasta')

Merge Specific Files

# Explicit list of inputs to merge, in the order they should appear.
files = [
    'sample1.fasta',
    'sample2.fasta',
    'sample3.fasta',
]

def merge_files(file_list, format):
    """Yield the records of each file in ``file_list``, in list order.

    Args:
        file_list: Iterable of file paths to read.
        format: Format name handed to ``SeqIO.parse`` for each file.

    Yields:
        Every record of the first file, then every record of the next, and
        so on.
    """
    for source_path in file_list:
        for record in SeqIO.parse(source_path, format):
            yield record

# Write the records of the listed files to combined.fasta.
combined_records = merge_files(files, 'fasta')
SeqIO.write(combined_records, 'combined.fasta', 'fasta')

Split Files

Split by Number of Records

from itertools import islice

def split_file(input_file, format, records_per_file, output_prefix):
    """Split ``input_file`` into numbered files of limited record count.

    Output files are named ``<output_prefix>_<n>.<format>`` with ``n``
    starting at 1, and a line is printed for each file written.

    Args:
        input_file: Path of the file to split.
        format: Format name used for both reading and writing; it is also
            used as the output file extension.
        records_per_file: Maximum number of records per output file.
        output_prefix: Prefix of every output file name.
    """
    record_stream = SeqIO.parse(input_file, format)
    # iter() with a sentinel keeps requesting chunks from the shared parser
    # until a chunk comes back empty, i.e. the input is exhausted.
    batches = iter(lambda: list(islice(record_stream, records_per_file)), [])
    for file_num, batch in enumerate(batches, start=1):
        output_file = f'{output_prefix}_{file_num}.{format}'
        SeqIO.write(batch, output_file, format)
        print(f'Wrote {len(batch)} records to {output_file}')

# Split large.fasta into split_1.fasta, split_2.fasta, ... of up to 1000
# records each.
split_file(
    input_file='large.fasta',
    format='fasta',
    records_per_file=1000,
    output_prefix='split',
)

Split by Sequence ID Prefix

from collections import defaultdict

# Group the records of input.fasta by the part of their ID that precedes the
# first underscore, then write one FASTA file per group.
records_by_prefix = defaultdict(list)
for record in SeqIO.parse('input.fasta', 'fasta'):
    # partition() returns the whole ID when it contains no underscore.
    prefix = record.id.partition('_')[0]
    records_by_prefix[prefix].append(record)

for prefix in records_by_prefix:
    SeqIO.write(records_by_prefix[prefix], f'{prefix}.fasta', 'fasta')

One Sequence Per File

# Write every record of multi.fasta to its own file, named after the record ID.
# NOTE(review): an ID containing a path separator would not map to a plain
# file name in the current directory - confirm the IDs are filename-safe.
multi_records = SeqIO.parse('multi.fasta', 'fasta')
for record in multi_records:
    SeqIO.write(record, f'{record.id}.fasta', 'fasta')

Batch Convert

Convert All Files in Directory

from pathlib import Path

# Convert every GenBank file in genbank/ to a FASTA file of the same base name
# in fasta/.
fasta_dir = Path('fasta/')
# The output file is opened for writing inside fasta/, which fails if the
# directory is missing, so create it up front.
fasta_dir.mkdir(parents=True, exist_ok=True)

for gb_file in Path('genbank/').glob('*.gb'):
    fasta_file = fasta_dir / gb_file.with_suffix('.fasta').name
    count = SeqIO.convert(str(gb_file), 'genbank', str(fasta_file), 'fasta')
    print(f'{gb_file.name} -> {fasta_file.name}: {count} records')

Batch Convert with Summary

from pathlib import Path

# Convert every GenBank file in input/ to FASTA in output/, recording the
# number of records converted per file.
output_dir = Path('output/')
# The output file is opened for writing inside output/, which fails if the
# directory is missing, so create it up front.
output_dir.mkdir(parents=True, exist_ok=True)

results = []
for input_file in Path('input/').glob('*.gb'):
    output_file = output_dir / input_file.with_suffix('.fasta').name
    count = SeqIO.convert(str(input_file), 'genbank', str(output_file), 'fasta')
    results.append({'file': input_file.name, 'records': count})

print(f'Converted {len(results)} files, {sum(r["records"] for r in results)} total records')

Parallel Processing

Using multiprocessing

from multiprocessing import Pool from pathlib import Path

def process_file(filepath):
    """Summarise one FASTA file.

    Args:
        filepath: Path object of the FASTA file; its ``name`` is reported.

    Returns:
        A dict with the file name under ``'file'``, the number of records
        under ``'count'`` and the summed sequence lengths under ``'total_bp'``.
    """
    n_records = 0
    bases = 0
    for record in SeqIO.parse(filepath, 'fasta'):
        n_records += 1
        bases += len(record.seq)
    return {'file': filepath.name, 'count': n_records, 'total_bp': bases}

# Process every FASTA file in data/ across four worker processes and print a
# per-file summary.
#
# The guard is required: with the "spawn" start method each worker re-imports
# this module, and an unguarded Pool at module level would then try to start
# new pools recursively.
if __name__ == '__main__':
    files = list(Path('data/').glob('*.fasta'))
    with Pool(4) as pool:
        results = pool.map(process_file, files)

    for r in results:
        print(f'{r["file"]}: {r["count"]} seqs, {r["total_bp"]} bp')

Using concurrent.futures

from concurrent.futures import ThreadPoolExecutor from pathlib import Path

def count_records(filepath):
    """Return ``(file name, number of records)`` for one FASTA file.

    Args:
        filepath: Path object of the FASTA file to count.
    """
    file_name = filepath.name
    n_records = 0
    for _ in SeqIO.parse(filepath, 'fasta'):
        n_records += 1
    return file_name, n_records

# Count the records of every FASTA file in data/ using four worker threads.
data_dir = Path('data/')
files = list(data_dir.glob('*.fasta'))
with ThreadPoolExecutor(max_workers=4) as executor:
    results = executor.map(count_records, files)

# Leaving the with-block waits for the workers, so the results can be read
# here; they come back in the same order as `files`.
for result in results:
    name, count = result
    print(f'{name}: {count}')

Summary Statistics

Aggregate Stats Across Files

from pathlib import Path

# Accumulate sequence and base-pair totals over all FASTA files in data/.
total_seqs = 0
total_bp = 0
file_count = 0

for fasta_file in Path('data/').glob('*.fasta'):
    for record in SeqIO.parse(fasta_file, 'fasta'):
        total_seqs += 1
        total_bp += len(record.seq)
    file_count += 1

# With no files (or only empty files) there is nothing to average; report 0
# instead of dividing by zero.
average_length = total_bp / total_seqs if total_seqs else 0

print(f'Files: {file_count}')
print(f'Sequences: {total_seqs}')
print(f'Total bp: {total_bp}')
print(f'Average length: {average_length:.0f}')

Per-File Summary Report

from pathlib import Path import csv

# Build one summary row per FASTA file in data/ and write them to summary.csv.

# Column order of the report. Declared explicitly so the header can still be
# written when no input files were found (summaries would be empty).
fieldnames = ['file', 'sequences', 'total_bp', 'min_len', 'max_len', 'avg_len']

summaries = []
for fasta_file in Path('data/').glob('*.fasta'):
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    lengths = [len(r.seq) for r in records]
    summaries.append({
        'file': fasta_file.name,
        'sequences': len(records),
        'total_bp': sum(lengths),
        # A file without records reports 0 for every length statistic.
        'min_len': min(lengths) if lengths else 0,
        'max_len': max(lengths) if lengths else 0,
        'avg_len': sum(lengths) / len(lengths) if lengths else 0,
    })

with open('summary.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(summaries)

File Organization

Organize by Criteria

from pathlib import Path from Bio.SeqUtils import gc_fraction

# Copy each FASTA file in input/ to high_gc/ or low_gc/ depending on the mean
# GC fraction of its sequences.
Path('high_gc').mkdir(exist_ok=True)
Path('low_gc').mkdir(exist_ok=True)

for fasta_file in Path('input/').glob('*.fasta'):
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    if not records:
        # No sequences means no GC average to classify by; skip the file
        # rather than dividing by zero.
        continue

    avg_gc = sum(gc_fraction(r.seq) for r in records) / len(records)

    # Classification and writing happen per file, inside the loop.
    if avg_gc >= 0.5:
        dest = Path('high_gc') / fasta_file.name
    else:
        dest = Path('low_gc') / fasta_file.name

    SeqIO.write(records, dest, 'fasta')

Common Patterns

| Task | Approach |
|------|----------|
| Merge files | Generator yielding from each file |
| Split file | `islice` with batch size |
| Convert all | Loop with `SeqIO.convert` |
| Parallel processing | `multiprocessing.Pool` or `ThreadPoolExecutor` |
| Summary stats | Accumulate while iterating |

Related Skills

  • read-sequences - Core parsing functions for each file

  • write-sequences - Write processed outputs

  • sequence-statistics - Generate per-file statistics

  • format-conversion - Batch format conversion

  • compressed-files - Handle compressed files in batch

  • database-access - Batch download sequences from NCBI

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

Coding

bio-clinical-databases-dbsnp-queries

No summary provided by upstream source.

Repository Source · Needs Review
Coding

bio-workflows-clip-pipeline

No summary provided by upstream source.

Repository Source · Needs Review
Coding

bio-clinical-databases-variant-prioritization

No summary provided by upstream source.

Repository Source · Needs Review
Coding

bio-clip-seq-clip-peak-calling

No summary provided by upstream source.

Repository Source · Needs Review