bio-proteomics-proteomics-qc

Proteomics Quality Control

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "bio-proteomics-proteomics-qc" with this command: npx skills add gptomics/bioskills/gptomics-bioskills-bio-proteomics-proteomics-qc

Proteomics Quality Control

Sample Quality Metrics

import pandas as pd import numpy as np

def sample_qc_metrics(intensity_matrix): '''Calculate per-sample QC metrics''' metrics = pd.DataFrame(index=intensity_matrix.columns) metrics['n_proteins'] = intensity_matrix.notna().sum() metrics['median_intensity'] = intensity_matrix.median() metrics['mean_intensity'] = intensity_matrix.mean() metrics['cv'] = intensity_matrix.std() / intensity_matrix.mean() metrics['missing_pct'] = 100 * intensity_matrix.isna().sum() / len(intensity_matrix) return metrics

qc = sample_qc_metrics(log2_intensities) print(qc)

Replicate Correlation

import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import pearsonr

def replicate_correlation(intensity_matrix, sample_groups): '''Calculate within-group correlations''' corr_matrix = intensity_matrix.corr(method='pearson')

# Mask for within-group comparisons
results = []
for group in sample_groups.unique():
    group_samples = sample_groups[sample_groups == group].index
    for i, s1 in enumerate(group_samples):
        for s2 in group_samples[i+1:]:
            r = corr_matrix.loc[s1, s2]
            results.append({'group': group, 'sample1': s1, 'sample2': s2, 'correlation': r})

return pd.DataFrame(results)

Heatmap

sns.clustermap(intensity_matrix.corr(), cmap='RdBu_r', center=0, vmin=-1, vmax=1, figsize=(10, 10), annot=False) plt.savefig('correlation_heatmap.pdf')

Missing Value Patterns

import missingno as msno

def analyze_missing_patterns(intensity_matrix): '''Analyze missing value patterns''' # Missing value matrix visualization msno.matrix(intensity_matrix, figsize=(12, 8)) plt.savefig('missing_pattern.pdf')

# Missing by sample
missing_per_sample = intensity_matrix.isna().sum() / len(intensity_matrix) * 100

# Missing by protein
missing_per_protein = intensity_matrix.isna().sum(axis=1) / intensity_matrix.shape[1] * 100

# Check for systematic patterns
return {'per_sample': missing_per_sample, 'per_protein': missing_per_protein}

Batch Effect Detection with PCA

from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA

def detect_batch_effects(intensity_matrix, sample_info, batch_col='batch'): '''PCA to detect batch effects''' # Impute for PCA (temporary) imputed = intensity_matrix.fillna(intensity_matrix.median()) scaled = StandardScaler().fit_transform(imputed.T)

pca = PCA(n_components=5)
pcs = pca.fit_transform(scaled)
pc_df = pd.DataFrame(pcs, columns=[f'PC{i+1}' for i in range(5)], index=intensity_matrix.columns)
pc_df = pc_df.join(sample_info)

# Check batch association with PCs
from scipy.stats import f_oneway
for pc in ['PC1', 'PC2', 'PC3']:
    groups = [pc_df[pc_df[batch_col] == b][pc] for b in pc_df[batch_col].unique()]
    stat, pval = f_oneway(*groups)
    print(f'{pc} ~ {batch_col}: F={stat:.2f}, p={pval:.4f}')

return pc_df, pca.explained_variance_ratio_

R: QC with limma

library(limma) library(ggplot2)

Intensity distribution

plotDensities(protein_matrix, legend = FALSE, main = 'Intensity Distributions')

MA plots between samples

for (i in 2:ncol(protein_matrix)) { plotMA(protein_matrix[, c(1, i)], main = paste('MA:', colnames(protein_matrix)[i])) }

MDS plot (similar to PCA)

plotMDS(protein_matrix, col = as.numeric(sample_info$condition))

Coefficient of Variation

def calculate_cv(intensity_matrix, sample_groups): '''Calculate CV within groups''' cv_results = [] for group in sample_groups.unique(): group_samples = sample_groups[sample_groups == group].index group_data = intensity_matrix[group_samples]

    # CV per protein
    cv = group_data.std(axis=1) / group_data.mean(axis=1) * 100
    cv_results.append({'group': group, 'median_cv': cv.median(), 'mean_cv': cv.mean()})

return pd.DataFrame(cv_results)

Technical replicates should have CV < 20%

Biological replicates typically 20-40%

Digestion Efficiency

def check_digestion(evidence_df): '''Check digestion efficiency from MaxQuant evidence.txt''' # Missed cleavages distribution mc_dist = evidence_df['Missed cleavages'].value_counts(normalize=True) * 100 print('Missed cleavage distribution:') print(mc_dist)

# Good digestion: >80% with 0 missed cleavages
if mc_dist.get(0, 0) &#x3C; 80:
    print('Warning: Poor digestion efficiency (&#x3C;80% fully cleaved)')

return mc_dist

QC Report Summary

def generate_qc_report(intensity_matrix, sample_info): '''Generate comprehensive QC summary''' report = { 'n_samples': intensity_matrix.shape[1], 'n_proteins': intensity_matrix.shape[0], 'median_proteins_per_sample': intensity_matrix.notna().sum().median(), 'overall_missing_pct': 100 * intensity_matrix.isna().sum().sum() / intensity_matrix.size, 'median_correlation': intensity_matrix.corr().values[np.triu_indices_from(intensity_matrix.corr(), k=1)].mean(), }

# Flags
report['flags'] = []
if report['overall_missing_pct'] > 30:
    report['flags'].append('High missing values (>30%)')
if report['median_correlation'] &#x3C; 0.9:
    report['flags'].append('Low replicate correlation (&#x3C;0.9)')

return report

Related Skills

  • data-import - Load data before QC

  • quantification - Normalization after QC

  • differential-abundance - Analysis after QC passes

  • data-visualization/heatmaps-clustering - QC heatmaps

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

General

bioskills

No summary provided by upstream source.

Repository SourceNeeds Review
General

bio-metagenomics-kraken

No summary provided by upstream source.

Repository SourceNeeds Review
General

bio-epitranscriptomics-merip-preprocessing

No summary provided by upstream source.

Repository SourceNeeds Review