bio-expression-matrix-metadata-joins

Load sample metadata and join it to an expression count matrix, e.g. `metadata = pd.read_csv('sample_info.csv', index_col=0)`.

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "bio-expression-matrix-metadata-joins" with this command: npx skills add gptomics/bioskills/gptomics-bioskills-bio-expression-matrix-metadata-joins

Metadata Joins

Load Sample Metadata

import pandas as pd

# Load metadata
metadata = pd.read_csv('sample_info.csv', index_col=0)

# Metadata should have samples as rows, attributes as columns
# Index should match count matrix column names

Basic Join

import pandas as pd

# Count matrix: genes x samples
counts = pd.read_csv('counts.tsv', sep='\t', index_col=0)

# Metadata: samples x attributes
metadata = pd.read_csv('metadata.csv', index_col=0)

# Ensure sample order matches: keep only the samples present in both tables
common_samples = counts.columns.intersection(metadata.index)
if common_samples.empty:
    raise ValueError('No samples shared between counts and metadata')
counts = counts[common_samples]
metadata = metadata.loc[common_samples]

# Verify alignment (explicit check: `assert` is stripped under `python -O`)
if not counts.columns.equals(metadata.index):
    raise ValueError('Sample order differs between counts and metadata')

Handle Sample Name Mismatches

def harmonize_sample_names(counts, metadata):
    '''Match sample names between counts and metadata.

    Restricts both tables to the samples they share and puts them in the
    same (sorted) order. Samples found on only one side are reported via
    ``print`` and dropped.

    Args:
        counts: DataFrame whose columns are sample names.
        metadata: DataFrame whose index holds sample names.

    Returns:
        Tuple ``(counts, metadata)`` limited to the shared samples, with
        count columns and metadata rows in identical sorted order.
    '''
    count_samples = set(counts.columns)
    meta_samples = set(metadata.index)

    # Sort once so both tables are subset with the exact same ordering.
    common = sorted(count_samples & meta_samples)
    only_counts = count_samples - meta_samples
    only_meta = meta_samples - count_samples

    if only_counts:
        print(f'Samples in counts but not metadata: {only_counts}')
    if only_meta:
        print(f'Samples in metadata but not counts: {only_meta}')

    return counts[common], metadata.loc[common]

# Restrict both tables to their shared samples, ordered identically
counts, metadata = harmonize_sample_names(counts, metadata)

Flexible Sample Name Matching

def fuzzy_match_samples(counts, metadata):
    '''Try to match sample names with common transformations.

    First checks whether the count columns and metadata index contain the
    same names. Otherwise each transformation is applied to both sides in
    turn, and the first one that maps every metadata sample to exactly one
    distinct count column is used.

    Args:
        counts: DataFrame whose columns are sample names (strings).
        metadata: DataFrame whose index holds sample names (strings).

    Returns:
        Tuple ``(counts, metadata)``. The count columns are reordered to
        follow the metadata index; they keep their original names, and
        count columns without a metadata partner are dropped. ``metadata``
        is returned unchanged.

    Raises:
        ValueError: If no transformation yields an unambiguous match for
            every metadata sample.
    '''
    count_cols = counts.columns.tolist()
    meta_idx = metadata.index.tolist()

    # Try exact match first. The names agree but the order may not, so
    # reorder the count columns to follow the metadata rows.
    if set(count_cols) == set(meta_idx):
        return counts[meta_idx], metadata

    # Common transformations
    transformations = [
        lambda x: x.replace('_', '-'),
        lambda x: x.replace('-', '_'),
        lambda x: x.split('_')[0],
        lambda x: x.replace('.bam', ''),
        lambda x: x.upper(),
        lambda x: x.lower(),
    ]

    for transform in transformations:
        # Several count columns can collapse to the same key (e.g. with
        # split('_')[0]); keep all of them so ambiguity can be detected
        # instead of silently keeping whichever column came last.
        candidates = {}
        for col in count_cols:
            candidates.setdefault(transform(col), []).append(col)

        matched = []
        for sample in meta_idx:
            hits = candidates.get(transform(sample), [])
            if len(hits) != 1:
                break  # unmatched or ambiguous: try the next transformation
            matched.append(hits[0])
        else:
            # Each count column may be claimed by one metadata sample only.
            if len(set(matched)) == len(matched):
                print('Matched using transformation')
                return counts[matched], metadata

    raise ValueError('Could not match sample names')

Add Gene Annotations

import mygene

def add_gene_annotations(counts, fields=('symbol', 'name', 'type_of_gene')):
    '''Add gene annotation columns to count matrix.

    Gene IDs in ``counts.index`` have everything after the first ``.``
    removed (e.g. an Ensembl version suffix) and are then looked up through
    ``mygene`` with ``scopes='ensembl.gene'`` and ``species='human'``.

    Args:
        counts: DataFrame indexed by gene ID (strings), one column per sample.
        fields: Annotation fields to request and append as columns.

    Returns:
        A copy of ``counts`` indexed by the original gene IDs (index named
        ``gene_id``) with one extra column per requested field. Genes
        without a hit get missing values in those columns.
    '''
    fields = list(fields)
    mg = mygene.MyGeneInfo()

    clean_ids = [g.split('.')[0] for g in counts.index]
    results = mg.querymany(clean_ids, scopes='ensembl.gene',
                           fields=fields, species='human', as_dataframe=True)

    # Normalise the lookup table: one row per queried ID, exactly the
    # requested fields. reindex() adds any field that no hit returned as an
    # all-missing column instead of raising KeyError.
    results = results.reset_index().rename(columns={'query': 'clean_id'})
    results = results.reindex(columns=['clean_id'] + fields)
    # A query can return several hits; keep the first so the left merge
    # below cannot duplicate rows of the count matrix.
    results = results.drop_duplicates(subset='clean_id')

    # rename_axis also covers an unnamed index, which reset_index() would
    # otherwise emit as a column called 'index'.
    counts_reset = counts.rename_axis('gene_id').reset_index()
    counts_reset['clean_id'] = counts_reset['gene_id'].str.split('.').str[0]

    # Merge annotations
    annotated = counts_reset.merge(
        results, on='clean_id', how='left', suffixes=('', '_anno'))

    annotated = annotated.drop(columns='clean_id')
    annotated = annotated.set_index('gene_id')

    return annotated

R: Create DESeq2 Data

library(DESeq2)

# Load data
# check.names=FALSE keeps sample IDs exactly as written in the header; by
# default read.delim rewrites names such as "S-1" or "1A" to "S.1" / "X1A",
# which then no longer match the metadata row names.
counts <- read.delim('counts.tsv', row.names=1, check.names=FALSE)
metadata <- read.csv('metadata.csv', row.names=1)

# Ensure matching samples
common <- intersect(colnames(counts), rownames(metadata))
stopifnot(length(common) > 0)
counts <- counts[, common, drop=FALSE]
metadata <- metadata[common, , drop=FALSE]

# Create DESeqDataSet
dds <- DESeqDataSetFromMatrix(
    countData=as.matrix(counts),
    colData=metadata,
    design=~condition  # Adjust to your design
)

R: Create edgeR DGEList

library(edgeR)

# Load data
# check.names=FALSE keeps sample IDs exactly as written in the header; by
# default read.delim rewrites names such as "S-1" or "1A" to "S.1" / "X1A",
# which then no longer match the metadata row names.
counts <- read.delim('counts.tsv', row.names=1, check.names=FALSE)
metadata <- read.csv('metadata.csv', row.names=1)

# Match samples
common <- intersect(colnames(counts), rownames(metadata))
stopifnot(length(common) > 0)
counts <- counts[, common, drop=FALSE]
metadata <- metadata[common, , drop=FALSE]

# Create DGEList
y <- DGEList(counts=as.matrix(counts), group=metadata$condition)
y$samples <- cbind(y$samples, metadata)

Create AnnData with Metadata

import anndata as ad
import pandas as pd

def create_annotated_anndata(counts, sample_metadata, gene_metadata=None):
    '''Create AnnData object with full metadata.

    Args:
        counts: DataFrame of genes (rows) x samples (columns).
        sample_metadata: DataFrame indexed by sample name; rows are selected
            and ordered by ``counts.columns``.
        gene_metadata: Optional DataFrame indexed by gene ID; rows are
            selected and ordered by ``counts.index``.

    Returns:
        AnnData built from the transposed counts, with ``obs`` set from the
        sample metadata and ``var`` set from the gene metadata when given.
    '''
    # AnnData expects samples as rows
    adata = ad.AnnData(X=counts.T)

    # Add sample metadata (obs)
    adata.obs = sample_metadata.loc[counts.columns].copy()

    # Add gene metadata (var)
    if gene_metadata is not None:
        adata.var = gene_metadata.loc[counts.index].copy()
    else:
        adata.var_names = counts.index

    return adata

# Usage
adata = create_annotated_anndata(counts, metadata)
adata.write_h5ad('annotated_counts.h5ad')

Validate Metadata

def validate_metadata(counts, metadata, required_columns=('condition',)):
    '''Check metadata validity.

    Verifies that the count columns and the metadata index contain the same
    set of samples, and that every required column exists and has no
    missing values. Each problem found is printed as a ``WARNING`` line.

    Args:
        counts: DataFrame whose columns are sample names.
        metadata: DataFrame indexed by sample name.
        required_columns: Names of metadata columns that must be present
            and fully populated.

    Returns:
        True if no issue was found, False otherwise.
    '''
    issues = []

    # Check sample overlap
    count_samples = set(counts.columns)
    meta_samples = set(metadata.index)

    missing = count_samples - meta_samples
    extra = meta_samples - count_samples
    if missing:
        issues.append(f'Samples missing metadata: {missing}')
    if extra:
        issues.append(f'Extra metadata samples: {extra}')

    # Check required columns
    for col in required_columns:
        if col not in metadata.columns:
            issues.append(f'Missing required column: {col}')
            continue
        n_na = metadata[col].isna().sum()
        if n_na:
            issues.append(f'Column {col} has {n_na} missing values')

    if issues:
        for issue in issues:
            print(f'WARNING: {issue}')
        return False

    print('Metadata validation passed')
    return True

Merge Multiple Metadata Files

def merge_metadata_files(files, on='sample_id'):
    '''Merge multiple metadata files.

    Each file is read with ``pd.read_csv`` and outer-joined onto the running
    result, so a sample present in any file is kept; attributes it lacks are
    left missing.

    Args:
        files: Paths (or file-like objects) of CSV files that each contain
            the ``on`` column.
        on: Name of the shared sample-identifier column.

    Returns:
        DataFrame indexed by ``on`` holding the columns of all files.

    Raises:
        ValueError: If ``files`` is empty.
    '''
    files = list(files)
    if not files:
        raise ValueError('No metadata files provided')

    merged = pd.read_csv(files[0])
    for path in files[1:]:
        merged = merged.merge(pd.read_csv(path), on=on, how='outer')
    return merged.set_index(on)

Usage

# Outer-join the three CSV files on the default 'sample_id' column
metadata = merge_metadata_files(['clinical.csv', 'sequencing.csv', 'qc.csv'])

Related Skills

  • expression-matrix/counts-ingest - Load count data

  • expression-matrix/gene-id-mapping - Convert gene IDs

  • differential-expression/deseq2-basics - Downstream analysis

  • single-cell/preprocessing - Single-cell metadata handling

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

General

bioskills

No summary provided by upstream source.

Repository Source · Needs Review
General

bio-metagenomics-kraken

No summary provided by upstream source.

Repository Source · Needs Review
General

bio-epitranscriptomics-merip-preprocessing

No summary provided by upstream source.

Repository Source · Needs Review