website-auditor

Audit any website across 8 quality signals to determine whether it is outdated, broken, or neglected. Returns a structured audit dict used by the lead-scorer skill.

Safety Notice

This listing is from the official public ClawHub registry. Review SKILL.md and referenced scripts before running.

Copy the command below and send it to your AI assistant to learn how to install this skill.

Install skill "website-auditor" with this command: npx skills add maverick-software/website-auditor

Website Auditor Skill

Runs 8 automated checks on any URL. Returns a structured dict consumed by lead-scorer.

Output Format

{
    "url": "https://example.com",
    "domain": "example.com",
    "status_code": 200,           # or "DEAD", "TIMEOUT", "SSL_ERROR"
    "is_live": True,
    "copyright_year": 2018,
    "years_outdated": 7,
    "last_modified": "2019-03-15", # or None
    "tech_stack": ["WordPress 4.9", "jQuery 1.11", "PHP"],
    "has_outdated_cms": True,
    "pagespeed_mobile": 32,        # 0–100 or None if API failed
    "pagespeed_seo": 61,
    "is_mobile_friendly": False,
    "has_ssl": True,
    "design_signals": ["table_layout", "no_open_graph", "no_favicon"],
    "raw_html": "...",             # used by contact-enrichment
    "audit_timestamp": "2025-03-02T17:00:00"
}

Signal 1: HTTP Status Check

import requests

# Browser-like User-Agent sent with every audit request so target sites
# serve their normal pages instead of blocking an obvious bot/script client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def check_status(url: str) -> dict:
    """Signal 1: fetch *url* and report liveness plus the raw response.

    Returns a dict with:
      status_code      -- int HTTP status, or "SSL_ERROR" / "TIMEOUT" / "DEAD"
      is_live          -- True when the request completed with status < 400
      final_url        -- URL after redirects (completed requests only)
      raw_html         -- body text for live pages, "" otherwise
      response_headers -- response headers as a plain dict (completed requests only)
      has_ssl          -- False; set only on an SSL failure
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=10, allow_redirects=True)
        return {
            "status_code": r.status_code,
            "is_live": r.status_code < 400,
            "final_url": r.url,
            "raw_html": r.text if r.status_code < 400 else "",
            "response_headers": dict(r.headers)
        }
    except requests.exceptions.SSLError:
        return {"status_code": "SSL_ERROR", "is_live": False, "raw_html": "", "has_ssl": False}
    # Timeout must be caught BEFORE ConnectionError: ConnectTimeout subclasses
    # both, so the reverse order mis-reports connect timeouts as "DEAD".
    except requests.exceptions.Timeout:
        return {"status_code": "TIMEOUT", "is_live": False, "raw_html": ""}
    except requests.exceptions.ConnectionError:
        return {"status_code": "DEAD", "is_live": False, "raw_html": ""}
    except requests.exceptions.RequestException:
        # Any other requests failure (malformed URL, too many redirects, ...):
        # treat as dead rather than letting the exception escape the auditor.
        return {"status_code": "DEAD", "is_live": False, "raw_html": ""}

Signal 2: Copyright Year

import re
from datetime import datetime

def get_copyright_year(html: str) -> dict:
    """Signal 2: extract the most recent copyright year from page HTML.

    Matches "© 2019", "&copy; 2019", "Copyright 2019", "(c) 2019" and
    ranges like "© 2010-2019" (the end year is captured).

    Returns {"copyright_year": int | None, "years_outdated": int | None}.
    """
    pattern = r'(?:©|&copy;|copyright|\(c\))\s*(?:\d{4}\s*[-–]\s*)?(\d{4})'
    matches = re.findall(pattern, html, re.IGNORECASE)

    current_year = datetime.now().year
    # Discard implausible matches (e.g. "(c) 1035 Main St" street numbers or
    # garbage 4-digit runs): only 1990..next-year can be a real notice.
    years = [int(y) for y in matches if 1990 <= int(y) <= current_year + 1]
    if not years:
        return {"copyright_year": None, "years_outdated": None}

    latest_year = max(years)
    return {
        "copyright_year": latest_year,
        # Clamp at 0 so a "next year" notice never reports a negative age.
        "years_outdated": max(0, current_year - latest_year)
    }

Signal 3: Last-Modified Header

from datetime import datetime
from email.utils import parsedate_to_datetime

def get_last_modified(headers: dict) -> str | None:
    lm = headers.get("Last-Modified") or headers.get("last-modified")
    if not lm:
        return None
    try:
        dt = parsedate_to_datetime(lm)
        return dt.strftime("%Y-%m-%d")
    except Exception:
        return lm  # return raw string if parsing fails

Signal 4: Technology Stack (Wappalyzer)

from Wappalyzer import Wappalyzer, WebPage

# Platform/CMS names that mark a site as running legacy technology.
# Substring-matched against Wappalyzer technology names in detect_tech_stack,
# so "Drupal 6" also matches e.g. "Drupal 6.x".
OUTDATED_TECH = {
    "Joomla", "Drupal 6", "Drupal 7", "osCommerce",
    "Magento 1", "vBulletin", "phpBB", "Flash",
    "Silverlight", "ASP.NET WebForms"
}

# JS library name prefixes considered end-of-life (prefix-matched).
OUTDATED_JS = {"jQuery 1.", "jQuery 2.", "MooTools", "Prototype"}

def detect_tech_stack(url: str, html: str, headers: dict) -> dict:
    """Signal 4: fingerprint the site's technologies via Wappalyzer.

    Returns {"tech_stack": [names...], "has_outdated_cms": bool}; on total
    detection failure adds a "tech_error" message and reports an empty stack.
    """
    try:
        wappalyzer = Wappalyzer.latest()
        webpage = WebPage(url, html=html, headers=headers)
        techs = wappalyzer.analyze_with_categories(webpage)

        tech_list = list(techs.keys())
        has_outdated = any(
            any(bad in t for bad in OUTDATED_TECH) or
            any(t.startswith(js) for js in OUTDATED_JS)
            for t in tech_list
        )

        # WordPress < 5.0 counts as outdated even though it's not in the
        # static sets above ("WordPress 4.9" -> ver 4.9).
        for t in tech_list:
            if t.startswith("WordPress"):
                try:
                    ver = float(t.split(" ")[1][:3])
                    if ver < 5.0:
                        has_outdated = True
                except (IndexError, ValueError):
                    # "WordPress" with no (parseable) version — can't judge age.
                    # Narrowed from a bare except, which also swallowed
                    # KeyboardInterrupt/SystemExit.
                    pass

        return {"tech_stack": tech_list, "has_outdated_cms": has_outdated}
    except Exception as e:
        return {"tech_stack": [], "has_outdated_cms": False, "tech_error": str(e)}

Signal 5: PageSpeed Score (Google PSI API — Free)

import requests, os

def get_pagespeed(url: str) -> dict:
    """Signal 5: mobile Lighthouse scores via the free Google PSI API.

    Requires the PAGESPEED_API_KEY environment variable; without it (or on
    any API failure) the scores are None, per the documented output format.
    Scores are returned as integers 0-100.
    """
    api_key = os.environ.get("PAGESPEED_API_KEY")
    if not api_key:
        return {"pagespeed_mobile": None, "pagespeed_seo": None}

    endpoint = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
    params = {"url": url, "key": api_key, "strategy": "mobile"}

    def score_of(category: str):
        # Lighthouse reports score as a 0..1 float, but it can be null (and a
        # failed run omits categories entirely); report None in those cases
        # instead of a misleading 0 or a None * 100 TypeError.
        raw = cats.get(category, {}).get("score")
        return round(raw * 100) if raw is not None else None

    try:
        r = requests.get(endpoint, params=params, timeout=30)
        data = r.json()
        cats = data.get("lighthouseResult", {}).get("categories", {})
        return {
            "pagespeed_mobile": score_of("performance"),
            "pagespeed_seo": score_of("seo"),
            "pagespeed_accessibility": score_of("accessibility"),
        }
    except Exception as e:
        return {"pagespeed_mobile": None, "pagespeed_seo": None, "pagespeed_error": str(e)}

Signal 6: Mobile Responsiveness

from bs4 import BeautifulSoup

def check_mobile_friendly(html: str) -> bool:
    """Signal 6: True when the page declares a viewport meta tag.

    A <meta name="viewport"> (any capitalisation of the name attribute) is
    the minimal marker of a mobile-responsive page.
    """
    doc = BeautifulSoup(html, "lxml")

    def _is_viewport(name_attr):
        return bool(name_attr) and name_attr.lower() == "viewport"

    return doc.find("meta", attrs={"name": _is_viewport}) is not None

Signal 7: SSL Certificate

import ssl, socket, tldextract

def check_ssl(url: str) -> bool:
    """Signal 7: True when the URL's host presents a valid SSL certificate.

    Performs a real TLS handshake (with hostname verification) against port
    443. Returns False for explicit http:// URLs, unparseable URLs, and any
    connection or certificate failure.
    """
    if url.startswith("http://"):
        return False
    from urllib.parse import urlparse
    # Verify the cert of the URL's actual host (e.g. "shop.example.com"),
    # not just the registered apex domain: the apex may have a valid cert
    # while the subdomain the lead actually uses does not.
    host = urlparse(url).hostname or urlparse(f"//{url}").hostname
    if not host:
        return False
    try:
        ctx = ssl.create_default_context()
        with socket.create_connection((host, 443), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=host):
                pass  # handshake succeeding is the whole check
        return True
    except Exception:
        return False

Signal 8: Design Age Signals

from bs4 import BeautifulSoup

def detect_design_signals(html: str, url: str) -> list[str]:
    """Signal 8: scan the HTML for markers of an aged or neglected design.

    Returns the subset of signal names that fired, in a fixed order:
    table_layout, flash_detected, heavy_inline_styles, no_open_graph,
    no_favicon, no_meta_description, uses_frames, font_tags, no_https_url.
    """
    doc = BeautifulSoup(html, "lxml")
    found: list[str] = []
    lowered = html.lower()

    # Table-based layout: several top-level tables (not nested data tables).
    top_level_tables = [
        tbl for tbl in doc.find_all("table")
        if tbl.find_parent(["table", "th", "td"]) is None
    ]
    if len(top_level_tables) > 3:
        found.append("table_layout")

    # Flash / Silverlight plugin content embedded via <object>/<embed>.
    has_plugin_tag = doc.find("object") is not None or doc.find("embed") is not None
    if has_plugin_tag and any(m in lowered for m in ("swf", "flash", "silverlight")):
        found.append("flash_detected")

    # Heavy inline style attributes suggest pre-stylesheet authoring.
    if len(doc.find_all(style=True)) > 15:
        found.append("heavy_inline_styles")

    # Missing Open Graph tags: no modern social/marketing integration.
    if doc.find("meta", property="og:title") is None:
        found.append("no_open_graph")

    # Favicon link whose rel contains "icon" (rel may be a string or a list).
    def _mentions_icon(rel_attr):
        if not rel_attr:
            return False
        text = rel_attr if isinstance(rel_attr, str) else " ".join(rel_attr)
        return "icon" in text.lower()

    if doc.find("link", rel=_mentions_icon) is None:
        found.append("no_favicon")

    # Basic SEO hygiene: meta description.
    if doc.find("meta", attrs={"name": "description"}) is None:
        found.append("no_meta_description")

    # Framesets predate even table layouts.
    if doc.find("frameset") is not None or doc.find("frame") is not None:
        found.append("uses_frames")

    # <font> tags: pre-CSS text styling.
    if len(doc.find_all("font")) > 3:
        found.append("font_tags")

    # Site served over plain HTTP.
    if url.startswith("http://"):
        found.append("no_https_url")

    return found

Full Audit Runner

import tldextract
from datetime import datetime
import time

def audit_website(url: str) -> dict:
    """Run all 8 audit signals against *url* and return the combined dict.

    Dead/unreachable sites short-circuit after Signal 1 with has_ssl=False;
    live sites accumulate every remaining signal into one flat dict (see the
    Output Format section of this skill for the full key list).
    """
    ext = tldextract.extract(url)
    audit = {
        "url": url,
        "audit_timestamp": datetime.now().isoformat(),
        "domain": f"{ext.domain}.{ext.suffix}",
    }

    # Signal 1: liveness + raw HTML/headers (everything else depends on it).
    audit.update(check_status(url))

    if not audit.get("is_live"):
        # Dead site — nothing further to inspect; score it and move on.
        audit["has_ssl"] = False
        return audit

    page_html = audit.get("raw_html", "")
    page_headers = audit.get("response_headers", {})

    audit.update(get_copyright_year(page_html))              # Signal 2
    audit["last_modified"] = get_last_modified(page_headers)  # Signal 3
    audit.update(detect_tech_stack(url, page_html, page_headers))  # Signal 4
    audit.update(get_pagespeed(url))                          # Signal 5 (slowest)
    audit["is_mobile_friendly"] = check_mobile_friendly(page_html)  # Signal 6
    audit["has_ssl"] = check_ssl(url)                         # Signal 7
    audit["design_signals"] = detect_design_signals(page_html, url)  # Signal 8

    return audit


def audit_batch(urls: list[str], delay: float = 1.5) -> list[dict]:
    """Audit each URL in sequence, sleeping *delay* seconds between requests.

    A failure auditing one URL is recorded as an error entry rather than
    aborting the batch; the returned list is parallel to *urls*.
    """
    total = len(urls)
    collected: list[dict] = []
    for position, target in enumerate(urls, start=1):
        print(f"[{position}/{total}] Auditing: {target}")
        try:
            collected.append(audit_website(target))
        except Exception as exc:
            collected.append({"url": target, "error": str(exc), "is_live": False})
        time.sleep(delay)
    return collected

Async Version (For Speed at Scale)

import asyncio, aiohttp

async def audit_website_async(session: aiohttp.ClientSession, url: str) -> dict:
    """Async variant: fetch the page, then run the network-free checks on it.

    Only the cheap HTML-based signals (copyright year, mobile meta, design
    signals, Last-Modified) are computed here; slow/network signals
    (PageSpeed, SSL, tech stack) are intentionally left to the caller.
    """
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            html = await resp.text()
            headers = dict(resp.headers)
            status = resp.status
    except asyncio.TimeoutError:
        # Mirror the sync check_status(): report timeouts distinctly from
        # dead hosts (the original lumped both under "DEAD").
        return {"url": url, "status_code": "TIMEOUT", "is_live": False, "raw_html": ""}
    except Exception:
        return {"url": url, "status_code": "DEAD", "is_live": False, "raw_html": ""}

    # Run the synchronous checks on the fetched payload.
    result = {"url": url, "status_code": status, "is_live": status < 400, "raw_html": html}
    result.update(get_copyright_year(html))
    result["is_mobile_friendly"] = check_mobile_friendly(html)
    result["design_signals"] = detect_design_signals(html, url)
    result["last_modified"] = get_last_modified(headers)
    return result

async def audit_batch_async(urls: list[str], concurrency: int = 5) -> list[dict]:
    """Audit many URLs concurrently over one shared HTTP session.

    *concurrency* caps simultaneous connections via the connector limit.
    Certificate verification is disabled for the fetch (ssl=False) — the
    dedicated SSL signal handles certificate validity separately.
    """
    ua = {"User-Agent": "Mozilla/5.0 (compatible; LeadScanner/1.0)"}
    connector = aiohttp.TCPConnector(limit=concurrency, ssl=False)
    async with aiohttp.ClientSession(connector=connector, headers=ua) as session:
        pending = [audit_website_async(session, target) for target in urls]
        return await asyncio.gather(*pending, return_exceptions=False)

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

Security

Skill Guard

Skill Security Scanner - Scan for risks before download/use. Check: code execution, file ops, network requests, command injection, vulnerabilities, permissio...

Registry SourceRecently Updated
Security

Session Password

Provides secure session authentication using bcrypt-hashed passwords, security questions, email recovery, and lockout protection with audit logging.

Registry SourceRecently Updated
114
Profile unavailable
Security

agent-bom registry

MCP server security registry and trust assessment — look up servers in the 427+ server security metadata registry, run pre-install marketplace checks, batch...

Registry SourceRecently Updated
0128
Profile unavailable