bio-clinical-databases-variant-prioritization

Variant Prioritization

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy the command below and send it to your AI assistant so it can learn this skill.

Install skill "bio-clinical-databases-variant-prioritization" with this command: npx skills add gptomics/bioskills/gptomics-bioskills-bio-clinical-databases-variant-prioritization

Variant Prioritization

Basic Filtering Pipeline

import pandas as pd

def prioritize_variants(df, gnomad_af_col='gnomad_af', clinvar_col='clinvar_sig',
                        max_af=0.01):
    '''Basic variant prioritization pipeline.

    Filters, applied in order:
    1. Rare in population: gnomAD AF < ``max_af``, or AF missing.
    2. ClinVar label is pathogenic / likely pathogenic, is
       ``Uncertain_significance``, or is missing altogether.

    Args:
        df: DataFrame with one row per variant.
        gnomad_af_col: Name of the column holding the gnomAD allele frequency.
        clinvar_col: Name of the column holding the ClinVar significance label.
        max_af: Exclusive upper bound on the allele frequency. The default of
            0.01 (1%) is the cut-off this pipeline uses for its PM2-style
            rarity filter.

    Returns:
        A new DataFrame with the rows that pass both filters, in input order.
        ``df`` itself is not modified.

    Raises:
        KeyError: If ``gnomad_af_col`` or ``clinvar_col`` is not a column of ``df``.
    '''
    # Coerce to numeric first: a column made up entirely of None has object
    # dtype and cannot be compared with a float. Values that cannot be parsed
    # become NaN and are therefore handled like a missing frequency.
    allele_freq = pd.to_numeric(df[gnomad_af_col], errors='coerce')
    rare = df[allele_freq.isna() | (allele_freq < max_af)]

    # Labels are matched exactly (underscore-separated spelling).
    pathogenic_terms = ['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic']
    clinvar = rare[clinvar_col]
    keep = (
        clinvar.isin(pathogenic_terms)
        | clinvar.isna()  # No ClinVar record = still needs review
        | (clinvar == 'Uncertain_significance')
    )
    return rare[keep]

ACMG-Style Filtering

def acmg_filter(df):
    '''Score variants with a small subset of ACMG-style evidence criteria.

    Only two criteria are evaluated by this function:
    - PVS1 (weight 2): the ``consequence`` value is one of ``frameshift``,
      ``stop_gained``, ``splice_donor`` or ``splice_acceptor``.
    - PM2 (weight 1): ``gnomad_af`` is missing or below 0.01.

    Other ACMG criteria (PS1 same amino acid change as an established
    pathogenic variant, PS3 functional studies, PM1 mutational hot spot,
    PM5 novel missense at a known pathogenic position) need data this
    function does not look at and are NOT scored here.

    Args:
        df: DataFrame with ``gnomad_af`` and ``consequence`` columns.

    Returns:
        A copy of ``df`` with boolean ``pm2`` and ``pvs1`` columns and an
        integer ``priority_score`` column, sorted by ``priority_score`` in
        descending order. Rows with equal scores keep their input order.
        ``df`` itself is not modified.

    Raises:
        KeyError: If ``gnomad_af`` or ``consequence`` is missing from ``df``.
    '''
    # Work on a copy so the caller's frame (possibly a filtered slice of a
    # larger one) does not silently gain columns.
    scored = df.copy()

    # PM2: rare in gnomAD. Coerce first so an all-missing (object dtype)
    # column can still be compared; missing frequency counts as rare.
    allele_freq = pd.to_numeric(scored['gnomad_af'], errors='coerce')
    scored['pm2'] = allele_freq.isna() | (allele_freq < 0.01)

    # PVS1: loss-of-function consequence.
    # NOTE(review): exact match only -- annotator spellings such as
    # 'frameshift_variant' or '&'-joined terms are not recognised; confirm
    # the vocabulary used in the 'consequence' column.
    lof_consequences = ['frameshift', 'stop_gained', 'splice_donor', 'splice_acceptor']
    scored['pvs1'] = scored['consequence'].isin(lof_consequences)

    # Score based on evidence
    scored['priority_score'] = scored['pm2'].astype(int) + scored['pvs1'].astype(int) * 2

    # mergesort is stable, which makes the order of tied rows deterministic.
    return scored.sort_values('priority_score', ascending=False, kind='mergesort')

Multi-Database Prioritization

import myvariant

def annotate_and_prioritize(variants):
    '''Annotate variants through MyVariant.info and apply prioritization.

    Args:
        variants: Variant identifiers, passed unchanged to
            ``MyVariantInfo.getvariants``.

    Returns:
        The DataFrame returned by ``prioritize_with_scores`` for a frame with
        the columns ``variant``, ``clinvar_sig``, ``clinvar_stars``,
        ``gnomad_af``, ``cadd_phred`` and ``revel_score`` (one row per result).
        ``clinvar_stars`` holds the raw ``review_status`` value, not a star
        count. Annotations absent from a result are stored as None.
    '''
    def _section(record, key):
        # Nested annotation as a dict; anything else (missing, list, scalar)
        # is treated as "no annotation" instead of raising AttributeError.
        value = record.get(key)
        return value if isinstance(value, dict) else {}

    mv = myvariant.MyVariantInfo()

    # Fetch annotations
    results = mv.getvariants(
        variants,
        fields=[
            'clinvar.clinical_significance',
            'clinvar.review_status',
            'gnomad_exome.af.af',
            'cadd.phred',
            'dbnsfp.revel.score'
        ]
    )

    columns = ['variant', 'clinvar_sig', 'clinvar_stars',
               'gnomad_af', 'cadd_phred', 'revel_score']
    records = []
    for r in results:
        clinvar = _section(r, 'clinvar')
        gnomad_af = _section(_section(r, 'gnomad_exome'), 'af')
        cadd = _section(r, 'cadd')
        revel = _section(_section(r, 'dbnsfp'), 'revel')

        revel_score = revel.get('score')
        if isinstance(revel_score, (list, tuple)):
            # NOTE(review): presumably one score per transcript -- confirm.
            # Keep the highest so the column stays numeric for the later
            # threshold comparison.
            numeric_scores = [s for s in revel_score if isinstance(s, (int, float))]
            revel_score = max(numeric_scores) if numeric_scores else None

        records.append({
            'variant': r.get('query'),
            'clinvar_sig': clinvar.get('clinical_significance'),
            'clinvar_stars': clinvar.get('review_status'),
            'gnomad_af': gnomad_af.get('af'),
            'cadd_phred': cadd.get('phred'),
            'revel_score': revel_score
        })

    # Explicit columns keep the schema intact when there are no results;
    # without them an empty frame has no columns and scoring fails.
    df = pd.DataFrame(records, columns=columns)
    return prioritize_with_scores(df)

def _is_clinvar_pathogenic(value):
    '''Return True if a ClinVar significance value names a (likely) pathogenic call.'''
    if isinstance(value, (list, tuple, set)):
        # Several labels for one variant: pathogenic if any of them is.
        return any(_is_clinvar_pathogenic(item) for item in value)
    if pd.isna(value):
        return False
    # Accept both 'Likely_pathogenic' and 'Likely pathogenic'. The match stays
    # case-sensitive on purpose so that '..._of_pathogenicity' labels
    # (conflicting interpretations) are not counted as pathogenic.
    label = str(value).replace(' ', '_')
    return any(term in label for term in ('Pathogenic', 'Likely_pathogenic'))


def prioritize_with_scores(df):
    '''Apply multi-evidence prioritization.

    Evidence flags and their weights in the ``priority`` score:
    - ``clinvar_pathogenic`` (10): ``clinvar_sig`` contains ``Pathogenic`` or
      ``Likely_pathogenic`` (space- or underscore-separated).
    - ``is_rare`` (3): ``gnomad_af`` is missing or below 0.01.
    - ``cadd_deleterious`` (2): ``cadd_phred`` > 20.
    - ``revel_pathogenic`` (2): ``revel_score`` > 0.5.

    Args:
        df: DataFrame with ``clinvar_sig``, ``gnomad_af``, ``cadd_phred`` and
            ``revel_score`` columns.

    Returns:
        A copy of ``df`` with the four boolean flag columns and an integer
        ``priority`` column, sorted by ``priority`` in descending order. Rows
        with equal priority keep their input order. ``df`` is not modified.

    Raises:
        KeyError: If one of the required columns is missing.
    '''
    scored = df.copy()

    # Computational predictions. Coerce to numeric so that all-missing
    # (object dtype) columns compare cleanly; a missing score compares as
    # False, i.e. it never counts as evidence.
    scored['cadd_deleterious'] = pd.to_numeric(scored['cadd_phred'], errors='coerce') > 20
    scored['revel_pathogenic'] = pd.to_numeric(scored['revel_score'], errors='coerce') > 0.5

    # Rare in population (missing frequency counts as rare)
    allele_freq = pd.to_numeric(scored['gnomad_af'], errors='coerce')
    scored['is_rare'] = allele_freq.isna() | (allele_freq < 0.01)

    # ClinVar pathogenic
    scored['clinvar_pathogenic'] = pd.Series(
        [_is_clinvar_pathogenic(value) for value in scored['clinvar_sig']],
        index=scored.index,
        dtype=bool,
    )

    # Priority score
    scored['priority'] = (
        scored['clinvar_pathogenic'].astype(int) * 10 +
        scored['is_rare'].astype(int) * 3 +
        scored['cadd_deleterious'].astype(int) * 2 +
        scored['revel_pathogenic'].astype(int) * 2
    )

    # mergesort is stable, which makes the order of tied rows deterministic.
    return scored.sort_values('priority', ascending=False, kind='mergesort')

Inheritance-Based Filtering

def filter_by_inheritance(df, inheritance='AD', max_af=0.0001):
    '''Filter variants by inheritance pattern.

    AD: Autosomal dominant - heterozygous variants whose ``gnomad_af`` is
        missing or below ``max_af``.
    AR: Autosomal recessive - homozygous variants, plus heterozygous variants
        in genes carrying two or more heterozygous variants (compound het
        candidates). No frequency filter is applied in this mode.
    XL: X-linked - documented as a mode but not implemented; like any other
        unrecognised value it returns ``df`` unchanged.

    Args:
        df: DataFrame with ``zygosity`` (``'HET'`` / ``'HOM'``), ``gene`` and
            ``gnomad_af`` columns.
        inheritance: ``'AD'`` or ``'AR'``.
        max_af: Exclusive allele-frequency bound for the AD filter.

    Returns:
        The filtered DataFrame. For AR, homozygous rows come first, followed
        by the compound-het candidates. For unrecognised modes, ``df`` itself.
    '''
    if inheritance == 'AD':
        # Dominant: heterozygous and rare. A variant with no gnomAD frequency
        # is kept: a NaN comparison is False and would otherwise drop it,
        # although missing frequency means "rare" elsewhere in this pipeline.
        allele_freq = pd.to_numeric(df['gnomad_af'], errors='coerce')
        is_rare = allele_freq.isna() | (allele_freq < max_af)
        return df[(df['zygosity'] == 'HET') & is_rare]

    if inheritance == 'AR':
        # Recessive: homozygous or two variants in same gene
        hom = df[df['zygosity'] == 'HOM']

        # Find genes with 2+ het variants (compound het candidates).
        # NOTE(review): phase is not checked, so both variants may sit on the
        # same haplotype; treat these as candidates only.
        het = df[df['zygosity'] == 'HET']
        het_per_gene = het['gene'].value_counts()
        compound_genes = het_per_gene[het_per_gene >= 2].index
        compound_het = het[het['gene'].isin(compound_genes)]

        return pd.concat([hom, compound_het])

    return df

Output Priority Tiers

def assign_tiers(df):
    '''Assign clinical interpretation tiers.

    Tier 1: Strong pathogenic evidence - ClinVar pathogenic and rare
    Tier 2: Potential pathogenic - rare and flagged by CADD or REVEL
    Tier 3: Uncertain significance - rare, no further evidence
    Tier 4: Likely benign - not rare

    Args:
        df: DataFrame with boolean ``clinvar_pathogenic``, ``is_rare``,
            ``cadd_deleterious`` and ``revel_pathogenic`` columns.

    Returns:
        The same ``df`` object, with an integer ``tier`` column added in place.

    Raises:
        KeyError: If one of the required columns is missing.
    '''
    # Plain boolean arrays: independent of the index (duplicates are fine)
    # and well defined for an empty frame, where a row-wise apply is not.
    rare = df['is_rare'].astype(bool).to_numpy()
    clinvar_pathogenic = df['clinvar_pathogenic'].astype(bool).to_numpy()
    predicted_damaging = (
        df['cadd_deleterious'].astype(bool).to_numpy()
        | df['revel_pathogenic'].astype(bool).to_numpy()
    )

    # Start at the weakest tier and overwrite with stronger ones, so the last
    # matching assignment wins (tier 1 takes precedence over 2, 2 over 3).
    tier = pd.Series(4, index=df.index, dtype='int64')
    tier.loc[rare] = 3
    tier.loc[rare & predicted_damaging] = 2
    tier.loc[rare & clinvar_pathogenic] = 1

    df['tier'] = tier.to_numpy()
    return df

Related Skills

  • clinvar-lookup - ClinVar pathogenicity queries

  • gnomad-frequencies - Population frequency filtering

  • variant-calling/clinical-interpretation - ACMG classification

  • variant-calling/filtering-best-practices - Quality filtering

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

Coding

bio-workflows-clip-pipeline

No summary provided by upstream source.

Repository Source · Needs Review
Coding

bio-clinical-databases-dbsnp-queries

No summary provided by upstream source.

Repository Source · Needs Review
Coding

bio-clip-seq-clip-peak-calling

No summary provided by upstream source.

Repository Source · Needs Review