Semantic Search Skill

Search through files and directories for content using keyword matching and basic semantic analysis.

When to Use

✅ USE this skill when:

Finding code that implements a feature
Searching documentation for topics
Locating files by their content
Finding similar code patterns
Researching codebase structure

When NOT to Use

❌ DON'T use this skill when:

Searching binary files → use file tools
Exact regex patterns → use grep
Searching very large repos (>100k files) → use indexed search

Installation

cd /job && npm install natural compromise

Features

Keyword Search: Simple text matching across files
Stemming: Matches regular word variations (run, runs, running); irregular forms such as "ran" are not reduced to the same stem
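Relevance Scoring: Ranks results by the fraction of a file's stemmed tokens that match the query (term-frequency based; no inverse-document-frequency weighting is applied)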
File Filtering: Filter by extension, path patterns
Context: Shows surrounding lines for each match

Usage

Basic Search

const { searchFiles } = require('./semantic-search');

// `await` is not allowed at the top level of a CommonJS script, so the
// example runs inside an async entry point.
async function main() {
  const results = await searchFiles('.', {
    query: 'authentication middleware',
    extensions: ['.js', '.ts'],
    maxResults: 20,
  });

  console.log(results);
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Advanced Search

// Options for a narrower, higher-precision search.
const advancedOptions = {
  query: 'error handling database',
  excludeDirs: ['node_modules', 'dist', '.git'],
  extensions: ['.js', '.ts', '.py'],
  contextLines: 3,
  maxResults: 50,
  minScore: 0.3,
};

const results = await searchFiles('/path/to/code', advancedOptions);

Node.js Implementation

const fs = require('fs'); const path = require('path'); const natural = require('natural');

/**
 * Searches text files under a directory and ranks them by the fraction of
 * their stemmed tokens that match the stemmed query tokens.
 *
 * Tokenizing and stemming are delegated to the `natural` package
 * (`WordTokenizer` and `PorterStemmer`).
 */
class SemanticSearcher {
  /**
   * @param {object} [options]
   * @param {number} [options.maxFileSize] Largest file (in bytes) that will be
   *   searched. Defaults to 1 MB.
   * @param {string[]} [options.excludeDirs] Directory names skipped while
   *   walking the tree.
   */
  constructor(options = {}) {
    this.stemmer = natural.PorterStemmer;
    this.tokenizer = new natural.WordTokenizer();
    this.maxFileSize = options.maxFileSize || 1024 * 1024; // 1 MB
    this.excludeDirs = options.excludeDirs || [
      'node_modules',
      'dist',
      'build',
      '.git',
      'vendor',
      '__pycache__',
      '.next',
      '.nuxt',
    ];
  }

  /**
   * Lower-cases, tokenizes and stems a piece of text.
   *
   * @param {string} text
   * @returns {string[]} Stemmed tokens in document order.
   */
  tokenize(text) {
    // Defensive: treat a tokenizer that yields nothing as "no tokens".
    const rawTokens = this.tokenizer.tokenize(text.toLowerCase()) || [];
    return rawTokens.map((token) => this.stemmer.stem(token));
  }

  /**
   * Computes term frequencies normalised by the most frequent term.
   *
   * @param {string[]} tokens
   * @returns {Object<string, number>} Token -> frequency in (0, 1]; empty
   *   object for an empty token list.
   */
  calculateTF(tokens) {
    // Count in a Map so tokens such as "constructor" or "__proto__" cannot
    // collide with properties inherited from Object.prototype.
    const counts = new Map();
    let maxFreq = 0;
    for (const token of tokens) {
      const count = (counts.get(token) || 0) + 1;
      counts.set(token, count);
      if (count > maxFreq) {
        maxFreq = count;
      }
    }

    return Object.fromEntries(
      Array.from(counts, ([token, count]) => [token, count / maxFreq]),
    );
  }

  /**
   * Scores a document as (number of tokens found in the query) / (number of
   * tokens in the document).
   *
   * @param {string[]} queryTokens
   * @param {string[]} docTokens
   * @returns {number} Value between 0 and 1; 0 for an empty document.
   */
  scoreDocument(queryTokens, docTokens) {
    const querySet = new Set(queryTokens);
    let hits = 0;
    for (const token of docTokens) {
      if (querySet.has(token)) {
        hits += 1;
      }
    }
    return hits / Math.max(docTokens.length, 1);
  }

  /**
   * Searches every eligible file under `rootDir` for the query.
   *
   * @param {string} rootDir Directory to search.
   * @param {string} query Free-text query.
   * @param {object} [options]
   * @param {number} [options.minScore=0.1] Files must score strictly above this.
   * @param {number} [options.contextLines=2] Lines of context on each side of a match.
   * @param {number} [options.maxResults=20] Maximum number of files returned.
   * @param {string[]} [options.extensions] Passed through to `walkDirectory`.
   * @param {string[]} [options.excludeDirs] Passed through to `walkDirectory`.
   * @returns {Promise<Array<{file: string, score: string, matches: object[], totalLines: number}>>}
   *   Results ordered by descending score; `score` is formatted with three decimals.
   */
  async searchFiles(rootDir, query, options = {}) {
    // Destructuring defaults only apply to `undefined`, so an explicit 0 for
    // minScore or contextLines is honoured.
    const { minScore = 0.1, contextLines = 2 } = options;
    const maxResults = options.maxResults || 20;

    const queryTokens = this.tokenize(query);
    const files = await this.walkDirectory(rootDir, options);
    const scored = [];

    for (const file of files) {
      try {
        const content = await fs.promises.readFile(file, 'utf-8');
        const score = this.scoreDocument(queryTokens, this.tokenize(content));

        if (score > minScore) {
          const lines = content.split('\n');
          scored.push({
            score,
            result: {
              file: path.relative(rootDir, file),
              score: score.toFixed(3),
              matches: this.findMatchingLines(lines, queryTokens, contextLines),
              totalLines: lines.length,
            },
          });
        }
      } catch (err) {
        // Best-effort search: a file that cannot be read or processed is
        // skipped so that one bad file does not abort the whole search.
      }
    }

    // Rank on the unrounded score so rounding to three decimals cannot
    // reorder results.
    return scored
      .sort((a, b) => b.score - a.score)
      .slice(0, maxResults)
      .map((entry) => entry.result);
  }

  /**
   * Recursively collects files under `dir`.
   *
   * @param {string} dir Root directory.
   * @param {object} [options]
   * @param {string[]} [options.extensions] If given, only file names ending
   *   with one of these suffixes are returned.
   * @param {string[]} [options.excludeDirs] Directory names to skip for this
   *   call; falls back to the instance's `excludeDirs`.
   * @returns {Promise<string[]>} Paths of files no larger than `maxFileSize`.
   */
  async walkDirectory(dir, options = {}) {
    const files = [];
    const extensions = options.extensions || null;
    const excludeDirs = options.excludeDirs || this.excludeDirs;

    // Arrow function: keeps `this` bound on recursive calls.
    const walk = async (currentDir) => {
      const entries = await fs.promises.readdir(currentDir, { withFileTypes: true });

      for (const entry of entries) {
        const entryPath = path.join(currentDir, entry.name);

        if (entry.isDirectory()) {
          if (!excludeDirs.includes(entry.name)) {
            await walk(entryPath);
          }
          continue;
        }

        if (!entry.isFile()) {
          continue;
        }
        if (extensions && !extensions.some((ext) => entry.name.endsWith(ext))) {
          continue;
        }

        const stats = await fs.promises.stat(entryPath);
        if (stats.size <= this.maxFileSize) {
          files.push(entryPath);
        }
      }
    };

    await walk(dir);
    return files;
  }

  /**
   * Finds lines containing at least one query token.
   *
   * @param {string[]} lines Lines of the file.
   * @param {string[]} queryTokens Stemmed query tokens.
   * @param {number} contextLines Lines of context to include on each side.
   * @returns {Array<{lineNumber: number, content: string, context: string, matchScore: number}>}
   *   At most the first 10 matching lines, in file order. `lineNumber` is
   *   1-based and `matchScore` is the number of line tokens found in the query.
   */
  findMatchingLines(lines, queryTokens, contextLines) {
    const maxMatches = 10;
    const querySet = new Set(queryTokens);
    const matches = [];

    // Stop as soon as the cap is reached; later lines would be discarded anyway.
    for (let index = 0; index < lines.length && matches.length < maxMatches; index += 1) {
      const line = lines[index];
      const matchCount = this.tokenize(line).filter((token) => querySet.has(token)).length;
      if (matchCount === 0) {
        continue;
      }

      const start = Math.max(0, index - contextLines);
      const end = Math.min(lines.length, index + contextLines + 1);

      matches.push({
        lineNumber: index + 1,
        content: line.trim(),
        context: lines.slice(start, end).join('\n'),
        matchScore: matchCount,
      });
    }

    return matches;
  }
}

// Usage
// Wrapped in an async function because top-level `await` is not available
// in a CommonJS script (this file loads its dependencies with `require`).
async function main() {
  const searcher = new SemanticSearcher();
  const results = await searcher.searchFiles('.', 'authentication', {
    extensions: ['.js', '.ts'],
    maxResults: 10,
  });

  console.log(JSON.stringify(results, null, 2));
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Command Line Usage

Search for authentication code

node index.js search "auth middleware" --ext .js,.ts --max 10

Search with context

node index.js search "error handling" --context 5

Search specific directory

node index.js search "database" --dir src/

Output Format

{ "query": "authentication middleware", "totalMatches": 5, "results": [ { "file": "src/middleware/auth.js", "score": "0.847", "matches": [ { "lineNumber": 42, "content": "function authenticateUser(token) {", "context": "...", "matchScore": 3 } ] } ] }

Quick Tips

Use specific terms: "JWT validation" not just "auth"
Filter by file type: pass extensions (for example ['.js']) so results come from one kind of file, since different file types often follow different patterns
Multiple words improve accuracy
Use camelCase terms for code search

Notes

Searches text files only
Case-insensitive matching
Stemming improves recall
Scores range from 0.0 to 1.0

semantic-search

Safety Notice

Copy this and send it to your AI assistant to learn

Search for authentication code

Search with context

Search specific directory

Source Transparency

Related Skills

vector-memory

model-router

rss-reader

video-frames