Biological Sciences Expert
Expert guidance for biology, biotechnology, genetics, bioinformatics, and computational biology applications.
Core Concepts
Molecular Biology
- DNA, RNA, and protein structure
- Central dogma (transcription, translation)
- Gene expression and regulation
- Genetic mutations and variations
- CRISPR and gene editing
- Protein folding and structure
Genomics & Bioinformatics
- DNA sequencing (Sanger, NGS, long-read)
- Genome assembly and annotation
- Sequence alignment (BLAST, BLAT)
- Variant calling and analysis
- RNA-seq analysis
- Phylogenetic analysis
Systems Biology
- Metabolic pathways
- Protein-protein interactions
- Gene regulatory networks
- Mathematical modeling
- Pathway analysis
- Network biology
DNA Sequence Analysis
from Bio import SeqIO, Seq from Bio.Seq import Seq from Bio.SeqUtils import gc_fraction, molecular_weight from typing import Dict, List
class DNAAnalyzer:
    """Analyze a single DNA sequence using Biopython.

    The sequence passed to the constructor is upper-cased and stored as a
    ``Bio.Seq.Seq`` in ``self.sequence``; every method reads from that
    attribute and none of them modifies it.
    """

    def __init__(self, sequence: str):
        """Store *sequence* (upper-cased) as a ``Seq`` object."""
        self.sequence = Seq(sequence.upper())

    def basic_stats(self) -> Dict:
        """Return basic sequence statistics.

        Returns:
            A dict with keys ``"length"``, ``"gc_content"`` (``gc_fraction``
            scaled to a percentage), ``"molecular_weight"`` (as computed by
            ``Bio.SeqUtils.molecular_weight`` for DNA) and
            ``"nucleotide_counts"`` (see ``_count_nucleotides``).
        """
        return {
            "length": len(self.sequence),
            "gc_content": gc_fraction(self.sequence) * 100,
            "molecular_weight": molecular_weight(self.sequence, "DNA"),
            "nucleotide_counts": self._count_nucleotides(),
        }

    def _count_nucleotides(self) -> Dict[str, int]:
        """Count occurrences of A, T, G and C.

        Any other symbol (for example ``N``) is not reported.
        """
        return {base: self.sequence.count(base) for base in "ATGC"}

    def transcribe(self) -> str:
        """Return the RNA transcript of the sequence as a string."""
        return str(self.sequence.transcribe())

    def translate(self, table: int = 1) -> str:
        """Translate the sequence to protein using codon table *table*."""
        return str(self.sequence.translate(table=table))

    def reverse_complement(self) -> str:
        """Return the reverse complement of the sequence as a string."""
        return str(self.sequence.reverse_complement())

    def find_orfs(self, min_length: int = 100, table: int = 1) -> List[Dict]:
        """Find open reading frames in all six reading frames.

        An ORF is reported for every ``M`` in a translated frame that is
        followed by an in-frame stop (``*``). Nested start codons that share
        the same stop therefore each produce their own entry.

        Args:
            min_length: Minimum ORF length in nucleotides, measured from the
                start codon up to (not including) the stop codon.
            table: NCBI codon table id used for translation (default 1, the
                standard table, which matches the previous behaviour).

        Returns:
            A list of dicts with keys ``"strand"`` (+1 or -1), ``"frame"``
            (0-2), ``"start"`` and ``"end"`` (0-based nucleotide offsets into
            the strand that was translated -- for strand -1 that is the
            reverse complement, not the original sequence; ``"end"`` is the
            first base of the stop codon), ``"length"`` and ``"protein"``
            (amino acids without the stop symbol).
        """
        orfs = []
        strands = ((+1, self.sequence), (-1, self.sequence.reverse_complement()))
        for strand, seq in strands:
            for frame in range(3):
                # Translate whole codons only: a trailing partial codon makes
                # Biopython emit a "Partial codon" warning.
                usable = max(len(seq) - frame, 0)
                usable -= usable % 3
                if usable == 0:
                    continue
                protein = str(
                    seq[frame:frame + usable].translate(table=table, to_stop=False)
                )

                start_aa = protein.find("M")
                while start_aa != -1:
                    stop_aa = protein.find("*", start_aa + 1)
                    if stop_aa == -1:
                        # No stop downstream, so no later start can close either.
                        break
                    orf_len = (stop_aa - start_aa) * 3
                    if orf_len >= min_length:
                        orfs.append({
                            "strand": strand,
                            "frame": frame,
                            "start": start_aa * 3 + frame,
                            "end": stop_aa * 3 + frame,
                            "length": orf_len,
                            "protein": protein[start_aa:stop_aa],
                        })
                    start_aa = protein.find("M", start_aa + 1)
        return orfs

    def find_motif(self, motif: str) -> List[int]:
        """Return the 0-based start positions of every occurrence of *motif*.

        Matching is exact and case-insensitive with respect to *motif* (it is
        upper-cased, as the stored sequence is). Overlapping occurrences are
        all reported. An empty motif yields an empty list.
        """
        motif = motif.upper()
        if not motif:
            return []

        # Convert once; str.find does the scanning instead of per-index slicing.
        haystack = str(self.sequence)
        positions = []
        hit = haystack.find(motif)
        while hit != -1:
            positions.append(hit)
            hit = haystack.find(motif, hit + 1)  # +1 keeps overlapping hits
        return positions
Sequence Alignment
from Bio import pairwise2 from Bio.pairwise2 import format_alignment import numpy as np
class SequenceAligner:
    """Pairwise sequence alignment helpers built on ``Bio.pairwise2``.

    NOTE(review): ``Bio.pairwise2`` is reported as deprecated in recent
    Biopython releases in favour of ``Bio.Align.PairwiseAligner`` -- confirm
    against the installed version before relying on it long term.
    """

    @staticmethod
    def global_alignment(seq1: str, seq2: str,
                         match: float = 2,
                         mismatch: float = -1,
                         gap_open: float = -0.5,
                         gap_extend: float = -0.1):
        """Perform a global alignment (Needleman-Wunsch) of two sequences.

        Args:
            seq1, seq2: Sequences to align.
            match, mismatch: Scores for matching / mismatching characters.
            gap_open, gap_extend: Gap opening / extension penalties.

        Returns:
            A dict with ``"aligned_seq1"``, ``"aligned_seq2"``, ``"score"``
            and ``"identity"`` (percent) for the first alignment returned.

        Raises:
            ValueError: If no alignment is produced.
        """
        alignments = pairwise2.align.globalms(
            seq1, seq2,
            match, mismatch,
            gap_open, gap_extend
        )
        return SequenceAligner._summarize(alignments, "global")

    @staticmethod
    def local_alignment(seq1: str, seq2: str,
                        match: float = 2,
                        mismatch: float = -1,
                        gap_open: float = -0.5,
                        gap_extend: float = -0.1):
        """Perform a local alignment (Smith-Waterman) of two sequences.

        Arguments, return value and exceptions are as for
        ``global_alignment``. Identity is computed over the locally aligned
        window only, not over the padded flanking columns.
        """
        alignments = pairwise2.align.localms(
            seq1, seq2,
            match, mismatch,
            gap_open, gap_extend
        )
        return SequenceAligner._summarize(alignments, "local")

    @staticmethod
    def _summarize(alignments, kind: str) -> dict:
        """Build the result dict from the first alignment in *alignments*.

        Each alignment is expected to expose ``seqA``, ``seqB``, ``score``,
        ``start`` and ``end`` (the pairwise2 ``Alignment`` named tuple).
        """
        if not alignments:
            raise ValueError(
                f"No {kind} alignment found; check that both sequences are non-empty"
            )
        best = alignments[0]
        # start/end delimit the aligned columns. For a global alignment that
        # is the whole string; for a local one it excludes the gap-padded
        # flanks, which would otherwise deflate the identity.
        window_a = best.seqA[best.start:best.end]
        window_b = best.seqB[best.start:best.end]
        return {
            "aligned_seq1": best.seqA,
            "aligned_seq2": best.seqB,
            "score": best.score,
            "identity": SequenceAligner._calculate_identity(window_a, window_b),
        }

    @staticmethod
    def _calculate_identity(seq1: str, seq2: str) -> float:
        """Return percent identity between two aligned sequences.

        A column counts as a match when both characters are equal and not a
        gap (``-``). The denominator is the shorter of the two lengths; when
        that is zero the identity is reported as 0.0 instead of dividing by
        zero.
        """
        compared = min(len(seq1), len(seq2))
        if compared == 0:
            return 0.0
        matches = sum(1 for a, b in zip(seq1, seq2) if a == b and a != '-')
        return (matches / compared) * 100
Genomic Variant Analysis
from dataclasses import dataclass from typing import Optional
@dataclass
class Variant:
    """A single called variant and its per-call metrics."""

    # Locus and alleles.
    chromosome: str
    position: int  # presumably 1-based as in VCF -- confirm with the producer
    reference: str
    alternate: str

    # Call-level metrics.
    quality: float
    genotype: str  # presumably a VCF-style genotype string such as "0/1" -- confirm
    depth: int

    # Optional population allele frequency; None when not supplied.
    allele_frequency: Optional[float] = None
class VariantAnnotator:
    """Annotate genetic variants with a type, a coarse effect and rarity."""

    def __init__(self):
        # Placeholder for gene annotations; not consulted by any method yet.
        self.gene_annotations = {}

    def annotate_variant(self, variant: "Variant") -> Dict:
        """Annotate *variant* with its type, predicted effect and metrics.

        Returns:
            A dict with keys ``"variant"`` (``chrom:posREF>ALT``), ``"type"``,
            ``"effect"``, ``"quality"`` and ``"depth"``. When the variant has
            an allele frequency (anything other than ``None``, including
            ``0.0``), ``"allele_frequency"`` and ``"rarity"`` are added.
        """
        annotation = {
            "variant": f"{variant.chromosome}:{variant.position}{variant.reference}>{variant.alternate}",
            "type": self._classify_variant_type(variant),
            "effect": self._predict_effect(variant),
            "quality": variant.quality,
            "depth": variant.depth,
        }
        # Compare against None explicitly: an allele frequency of 0.0 is a
        # valid value and must still be reported and classified.
        if variant.allele_frequency is not None:
            annotation["allele_frequency"] = variant.allele_frequency
            annotation["rarity"] = self._classify_rarity(variant.allele_frequency)
        return annotation

    def _classify_variant_type(self, variant: "Variant") -> str:
        """Classify the variant by comparing REF and ALT allele lengths.

        Returns ``"SNV"`` for a 1-base to 1-base change, ``"INSERTION"`` when
        ALT is longer, ``"DELETION"`` when REF is longer, and ``"INDEL"`` for
        equal-length alleles longer than one base.
        """
        ref_len = len(variant.reference)
        alt_len = len(variant.alternate)
        if ref_len == 1 and alt_len == 1:
            return "SNV"  # Single Nucleotide Variant
        elif ref_len < alt_len:
            return "INSERTION"
        elif ref_len > alt_len:
            return "DELETION"
        else:
            return "INDEL"

    def _predict_effect(self, variant: "Variant") -> str:
        """Return a simplified effect label.

        This is a placeholder: every SNV is labelled ``"MISSENSE"`` without
        checking coding context, and everything else is ``"UNKNOWN"``.
        """
        if self._classify_variant_type(variant) == "SNV":
            # A real implementation would check whether the variant is in a
            # coding region, introduces a stop codon, etc.
            return "MISSENSE"
        return "UNKNOWN"

    def _classify_rarity(self, af: float) -> str:
        """Bucket allele frequency *af*.

        ``> 0.05`` is ``"COMMON"``, ``> 0.01`` is ``"LOW_FREQUENCY"`` and
        anything else is ``"RARE"``.
        """
        if af > 0.05:
            return "COMMON"
        elif af > 0.01:
            return "LOW_FREQUENCY"
        else:
            return "RARE"
RNA-seq Analysis
import pandas as pd import numpy as np from scipy import stats
class RNASeqAnalyzer:
    """Analyze RNA-seq expression data held as a genes x samples count matrix."""

    def __init__(self, counts_matrix: pd.DataFrame):
        """
        counts_matrix: genes x samples matrix of raw counts
        """
        self.counts = counts_matrix
        self.normalized = None
        # Result of the most recent differential_expression() call, if any.
        self.de_results = None

    def normalize_counts(self, method: str = "tpm"):
        """Normalize the count matrix and cache the result in ``self.normalized``.

        Args:
            method: ``"cpm"`` or ``"tpm"`` scales each sample (column) to sum
                to one million; ``"log2"`` applies ``log2(count + 1)``.
                No gene lengths are available here, so ``"tpm"`` is really
                counts-per-million; the name is kept for backward
                compatibility.

        Returns:
            The normalized DataFrame.

        Raises:
            ValueError: If *method* is not one of the supported names.
        """
        if method in ("tpm", "cpm"):
            # Library-size scaling only (no gene-length correction).
            self.normalized = (self.counts / self.counts.sum(axis=0)) * 1e6
        elif method == "log2":
            self.normalized = np.log2(self.counts + 1)
        else:
            raise ValueError(f"Unsupported normalization method: {method!r}")
        return self.normalized

    def differential_expression(self, condition1: List[str],
                                condition2: List[str],
                                method: str = "ttest") -> pd.DataFrame:
        """Test every gene for differential expression between two conditions.

        The test is run on ``self.counts`` as stored (not on
        ``self.normalized``).

        Args:
            condition1: Sample (column) names of the reference condition.
            condition2: Sample (column) names of the comparison condition.
            method: Statistical test; only ``"ttest"`` (independent
                two-sample t-test per gene) is supported.

        Returns:
            One row per gene with columns ``gene``, ``mean_condition1``,
            ``mean_condition2``, ``fold_change`` (condition2 / condition1 with
            a pseudocount of 1 added to both means), ``log2_fold_change``,
            ``p_value``, ``adjusted_p_value`` (Benjamini-Hochberg) and
            ``significant`` (raw ``p_value < 0.05`` and ``|log2FC| > 1``).
            Filter on ``adjusted_p_value`` when false-discovery-rate control
            is required. The frame is also cached in ``self.de_results``.

        Raises:
            ValueError: If *method* is not supported.
        """
        if method != "ttest":
            raise ValueError(f"Unsupported differential expression method: {method!r}")

        group1 = self.counts.loc[:, condition1].to_numpy(dtype=float)
        group2 = self.counts.loc[:, condition2].to_numpy(dtype=float)

        if group1.shape[0]:
            mean1 = group1.mean(axis=1)
            mean2 = group2.mean(axis=1)
            # One vectorized call tests all genes (rows) at once.
            _, pvalues = stats.ttest_ind(group1, group2, axis=1)
            pvalues = np.asarray(pvalues, dtype=float)
        else:
            mean1 = np.array([], dtype=float)
            mean2 = np.array([], dtype=float)
            pvalues = np.array([], dtype=float)

        # Pseudocount on BOTH means: equal means give a fold change of exactly
        # 1, and a zero mean no longer produces log2(0) = -inf.
        fold_change = (mean2 + 1) / (mean1 + 1)
        log2fc = np.log2(fold_change)

        results = pd.DataFrame({
            "gene": self.counts.index.to_numpy(),
            "mean_condition1": mean1,
            "mean_condition2": mean2,
            "fold_change": fold_change,
            "log2_fold_change": log2fc,
            "p_value": pvalues,
            "adjusted_p_value": self._benjamini_hochberg(pvalues),
            "significant": (pvalues < 0.05) & (np.abs(log2fc) > 1),
        })
        self.de_results = results
        return results

    @staticmethod
    def _benjamini_hochberg(pvalues) -> np.ndarray:
        """Return Benjamini-Hochberg adjusted p-values; NaNs stay NaN."""
        pvalues = np.asarray(pvalues, dtype=float)
        adjusted = np.full(pvalues.shape, np.nan)
        valid = ~np.isnan(pvalues)
        n_tests = int(valid.sum())
        if n_tests == 0:
            return adjusted

        valid_p = pvalues[valid]
        order = np.argsort(valid_p)
        scaled = valid_p[order] * n_tests / np.arange(1, n_tests + 1)
        # Enforce monotonicity from the largest p-value downwards, cap at 1.
        monotone = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)
        restored = np.empty(n_tests)
        restored[order] = monotone
        adjusted[valid] = restored
        return adjusted

    def identify_marker_genes(self, threshold_fc: float = 2,
                              threshold_pval: float = 0.05) -> List[str]:
        """Return genes passing fold-change and p-value thresholds.

        Uses the cached result of the most recent
        ``differential_expression`` call. A gene is selected when
        ``|log2_fold_change| > log2(threshold_fc)`` and
        ``p_value < threshold_pval``; with the defaults this matches the
        ``significant`` column.

        Raises:
            RuntimeError: If ``differential_expression`` has not been run.
            ValueError: If *threshold_fc* is not positive.
        """
        results = getattr(self, "de_results", None)
        if results is None:
            raise RuntimeError(
                "Run differential_expression() before identify_marker_genes()"
            )
        if threshold_fc <= 0:
            raise ValueError("threshold_fc must be positive")

        passes_fc = results["log2_fold_change"].abs() > np.log2(threshold_fc)
        passes_p = results["p_value"] < threshold_pval
        return results.loc[passes_fc & passes_p, "gene"].tolist()
Best Practices
Data Analysis
- Use appropriate statistical tests
- Account for multiple testing correction
- Validate results with independent methods
- Document data preprocessing steps
- Use version control for analysis scripts
- Maintain reproducible workflows
Sequence Analysis
- Quality control of sequencing data
- Use appropriate reference genomes
- Validate variant calls
- Consider batch effects
- Use established bioinformatics tools
- Benchmark against known datasets
Computational Biology
- Use efficient data structures for large datasets
- Parallelize computationally intensive tasks
- Validate biological interpretations
- Consult domain experts
- Document assumptions clearly
- Use standardized file formats (FASTA, VCF, BAM)
Anti-Patterns
- ❌ No quality control of input data
- ❌ Ignoring batch effects
- ❌ No multiple testing correction
- ❌ Over-interpreting correlations
- ❌ Inadequate sample sizes
- ❌ Not validating computational predictions
- ❌ Ignoring biological context
Resources
- Biopython: https://biopython.org/
- NCBI Resources: https://www.ncbi.nlm.nih.gov/
- Ensembl: https://www.ensembl.org/
- Galaxy Project: https://galaxyproject.org/
- Bioconductor: https://www.bioconductor.org/