Website Auditor Skill
Runs 8 automated checks on any URL. Returns a structured dict consumed by lead-scorer.
Output Format
{
"url": "https://example.com",
"domain": "example.com",
"status_code": 200, # or "DEAD", "TIMEOUT", "SSL_ERROR"
"is_live": True,
"copyright_year": 2018,
"years_outdated": 7,
"last_modified": "2019-03-15", # or None
"tech_stack": ["WordPress 4.9", "jQuery 1.11", "PHP"],
"has_outdated_cms": True,
"pagespeed_mobile": 32, # 0–100 or None if API failed
"pagespeed_seo": 61,
"is_mobile_friendly": False,
"has_ssl": True,
"design_signals": ["table_layout", "no_open_graph", "no_favicon"],
"raw_html": "...", # used by contact-enrichment
"audit_timestamp": "2025-03-02T17:00:00"
}
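How lead-scorer might consume this dict, as a minimal illustrative sketch: the weights and cap below are assumptions for orientation only, not the actual lead-scorer logic.
def score_audit(audit: dict) -> int:
    """Hypothetical scorer: weights are illustrative, not lead-scorer's real ones."""
    if not audit.get("is_live"):
        return 100  # a dead site is the strongest signal there is
    score = 0
    score += min(audit.get("years_outdated") or 0, 5) * 8  # stale copyright
    if audit.get("has_outdated_cms"):
        score += 20
    if not audit.get("is_mobile_friendly"):
        score += 15
    if not audit.get("has_ssl"):
        score += 15
    if (audit.get("pagespeed_mobile") or 100) < 40:
        score += 10
    score += 3 * len(audit.get("design_signals", []))
    return min(score, 100)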
Signal 1: HTTP Status Check
import requests
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def check_status(url: str) -> dict:
try:
r = requests.get(url, headers=HEADERS, timeout=10, allow_redirects=True)
return {
"status_code": r.status_code,
"is_live": r.status_code < 400,
"final_url": r.url,
"raw_html": r.text if r.status_code < 400 else "",
"response_headers": dict(r.headers)
}
    except requests.exceptions.SSLError:
        return {"status_code": "SSL_ERROR", "is_live": False, "raw_html": "", "has_ssl": False}
    except requests.exceptions.Timeout:
        # Catch Timeout before ConnectionError: ConnectTimeout subclasses both,
        # and it should be classified as TIMEOUT rather than DEAD
        return {"status_code": "TIMEOUT", "is_live": False, "raw_html": ""}
    except requests.exceptions.ConnectionError:
        return {"status_code": "DEAD", "is_live": False, "raw_html": ""}
Signal 2: Copyright Year
import re
from datetime import datetime
def get_copyright_year(html: str) -> dict:
    # Match © 2019, Copyright 2019, (c) 2019, and the HTML entity &copy; 2019
    pattern = r'(?:©|&copy;|copyright|\(c\))\s*(?:\d{4}\s*[-–]\s*)?(\d{4})'
matches = re.findall(pattern, html, re.IGNORECASE)
if not matches:
return {"copyright_year": None, "years_outdated": None}
latest_year = max(int(y) for y in matches)
current_year = datetime.now().year
years_outdated = current_year - latest_year
return {
"copyright_year": latest_year,
"years_outdated": max(0, years_outdated)
}
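The optional (?:\d{4}\s*[-–]\s*)? group skips the first year of a range, so a footer like "© 2010–2018" yields 2018:
sample = '<footer>© 2010–2018 Acme Plumbing. All rights reserved.</footer>'
print(get_copyright_year(sample))
# {'copyright_year': 2018, 'years_outdated': 7} when run in 2025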
Signal 3: Last-Modified Header
from datetime import datetime
from email.utils import parsedate_to_datetime
def get_last_modified(headers: dict) -> str | None:
lm = headers.get("Last-Modified") or headers.get("last-modified")
if not lm:
return None
try:
dt = parsedate_to_datetime(lm)
return dt.strftime("%Y-%m-%d")
except Exception:
return lm # return raw string if parsing fails
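Example with a standard RFC 2822 date value:
print(get_last_modified({"Last-Modified": "Fri, 15 Mar 2019 08:30:00 GMT"}))
# "2019-03-15"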
Signal 4: Technology Stack (Wappalyzer)
from Wappalyzer import Wappalyzer, WebPage
OUTDATED_TECH = {
"Joomla", "Drupal 6", "Drupal 7", "osCommerce",
"Magento 1", "vBulletin", "phpBB", "Flash",
"Silverlight", "ASP.NET WebForms"
}
OUTDATED_JS = {"jQuery 1.", "jQuery 2.", "MooTools", "Prototype"}
def detect_tech_stack(url: str, html: str, headers: dict) -> dict:
    try:
        wappalyzer = Wappalyzer.latest()
        webpage = WebPage(url, html=html, headers=headers)
        # Use the versioned variant so entries look like "WordPress 4.9",
        # matching the output format; analyze_with_categories() returns bare names
        techs = wappalyzer.analyze_with_versions_and_categories(webpage)
        tech_list = []
        for name, info in techs.items():
            versions = info.get("versions") or []
            tech_list.append(f"{name} {versions[0]}" if versions else name)
        has_outdated = any(
            any(bad in t for bad in OUTDATED_TECH) or
            any(t.startswith(js) for js in OUTDATED_JS)
            for t in tech_list
        )
        # Flag pre-Gutenberg WordPress (major version < 5)
        for t in tech_list:
            if t.startswith("WordPress "):
                try:
                    major = int(t.split(" ")[1].split(".")[0])
                    if major < 5:
                        has_outdated = True
                except (IndexError, ValueError):
                    pass  # version string missing or unparseable
        return {"tech_stack": tech_list, "has_outdated_cms": has_outdated}
    except Exception as e:
        return {"tech_stack": [], "has_outdated_cms": False, "tech_error": str(e)}
Signal 5: PageSpeed Score (Google PSI API — Free)
import requests, os
def get_pagespeed(url: str) -> dict:
api_key = os.environ.get("PAGESPEED_API_KEY")
if not api_key:
return {"pagespeed_mobile": None, "pagespeed_seo": None}
endpoint = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
    # PSI returns only the performance category unless others are requested explicitly
    params = {
        "url": url, "key": api_key, "strategy": "mobile",
        "category": ["performance", "seo", "accessibility"],
    }
    try:
        r = requests.get(endpoint, params=params, timeout=30)
        r.raise_for_status()
        cats = r.json().get("lighthouseResult", {}).get("categories", {})

        def score(name: str) -> int | None:
            s = cats.get(name, {}).get("score")  # can be null in the JSON
            return round(s * 100) if s is not None else None

        return {
            "pagespeed_mobile": score("performance"),
            "pagespeed_seo": score("seo"),
            "pagespeed_accessibility": score("accessibility"),
        }
except Exception as e:
return {"pagespeed_mobile": None, "pagespeed_seo": None, "pagespeed_error": str(e)}
Signal 6: Mobile Responsiveness
from bs4 import BeautifulSoup
def check_mobile_friendly(html: str) -> bool:
    """Heuristic: a page that never declares a viewport meta tag was
    almost certainly not built responsive."""
    soup = BeautifulSoup(html, "lxml")
    viewport = soup.find("meta", attrs={"name": lambda x: x and x.lower() == "viewport"})
    return viewport is not None
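The viewport tag alone is coarse; some templates ship it without being responsive. A stricter variant (an illustrative assumption, not part of the shipped skill) also demands an @media query in inline styles, at the cost of false negatives for sites whose media queries live in external stylesheets:
def check_mobile_friendly_strict(html: str) -> bool:
    """Sketch: require a viewport tag AND an @media query in inline <style> blocks."""
    soup = BeautifulSoup(html, "lxml")
    has_viewport = soup.find(
        "meta", attrs={"name": lambda x: x and x.lower() == "viewport"}
    ) is not None
    has_media_query = any("@media" in s.get_text() for s in soup.find_all("style"))
    return has_viewport and has_media_query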
Signal 7: SSL Certificate
import ssl, socket
from urllib.parse import urlparse

def check_ssl(url: str) -> bool:
    """Returns True if valid SSL, False if no SSL or invalid cert."""
    if url.startswith("http://"):
        return False
    # Verify the exact hostname from the URL: a cert valid for example.com
    # may not cover www.example.com, and vice versa
    hostname = urlparse(url).hostname
    if not hostname:
        return False
    try:
        ctx = ssl.create_default_context()
        with socket.create_connection((hostname, 443), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=hostname):
                pass  # a completed handshake means the cert validated
        return True
    except Exception:
        return False
Signal 8: Design Age Signals
from bs4 import BeautifulSoup
def detect_design_signals(html: str, url: str) -> list[str]:
soup = BeautifulSoup(html, "lxml")
signals = []
    # 1. Table-based layout (old school): count only top-level tables,
    #    since nested tables would inflate the count
    tables = soup.find_all("table")
    top_level_tables = [t for t in tables if not t.find_parent(["table", "th", "td"])]
    if len(top_level_tables) > 3:
        signals.append("table_layout")
# 2. Flash or Silverlight
if soup.find("object") or soup.find("embed"):
if any(x in html.lower() for x in ["swf", "flash", "silverlight"]):
signals.append("flash_detected")
# 3. Heavy inline styles (pre-CSS era)
inline_count = len(soup.find_all(style=True))
if inline_count > 15:
signals.append("heavy_inline_styles")
# 4. No Open Graph tags (no social/modern marketing)
if not soup.find("meta", property="og:title"):
signals.append("no_open_graph")
# 5. No favicon
favicon = soup.find("link", rel=lambda x: x and "icon" in (x if isinstance(x, str) else " ".join(x)).lower())
if not favicon:
signals.append("no_favicon")
# 6. No meta description
if not soup.find("meta", attrs={"name": "description"}):
signals.append("no_meta_description")
# 7. Frames / framesets (ancient)
if soup.find("frameset") or soup.find("frame"):
signals.append("uses_frames")
# 8. Font tags (pre-CSS styling)
if len(soup.find_all("font")) > 3:
signals.append("font_tags")
# 9. No HTTPS in URL
if url.startswith("http://"):
signals.append("no_https_url")
return signals
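A 1990s-style page trips several signals at once:
old_page = """
<html><body>
<table><tr><td><font face="Arial">Welcome!</font></td></tr></table>
<table><tr><td>Menu</td></tr></table>
<table><tr><td>Content</td></tr></table>
<table><tr><td>Footer</td></tr></table>
</body></html>
"""
print(detect_design_signals(old_page, "http://old-site.example"))
# ['table_layout', 'no_open_graph', 'no_favicon', 'no_meta_description', 'no_https_url']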
Full Audit Runner
import tldextract
from datetime import datetime
import time
def audit_website(url: str) -> dict:
"""Run all 8 signals. Returns complete audit dict."""
result = {"url": url, "audit_timestamp": datetime.now().isoformat()}
ext = tldextract.extract(url)
result["domain"] = f"{ext.domain}.{ext.suffix}"
# Signal 1: Status + HTML
status = check_status(url)
result.update(status)
if not result.get("is_live"):
result["has_ssl"] = False
return result # Dead site — score it and move on
html = result.get("raw_html", "")
headers = result.get("response_headers", {})
# Signal 2: Copyright year
result.update(get_copyright_year(html))
# Signal 3: Last-Modified header
result["last_modified"] = get_last_modified(headers)
# Signal 4: Tech stack
result.update(detect_tech_stack(url, html, headers))
# Signal 5: PageSpeed (slowest — do last or async)
result.update(get_pagespeed(url))
# Signal 6: Mobile responsive
result["is_mobile_friendly"] = check_mobile_friendly(html)
    # Signal 7: SSL (check the post-redirect URL so http→https redirects count)
    result["has_ssl"] = check_ssl(result.get("final_url", url))
# Signal 8: Design signals
result["design_signals"] = detect_design_signals(html, url)
return result
def audit_batch(urls: list[str], delay: float = 1.5) -> list[dict]:
"""Audit multiple URLs with delay between requests."""
results = []
for i, url in enumerate(urls):
print(f"[{i+1}/{len(urls)}] Auditing: {url}")
try:
audit = audit_website(url)
results.append(audit)
except Exception as e:
results.append({"url": url, "error": str(e), "is_live": False})
time.sleep(delay)
return results
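Persisting a run so contact-enrichment and lead-scorer can pick it up later (the filename is arbitrary):
import json

audits = audit_batch(["https://example.com", "http://old-site.example"])
with open("audits.json", "w") as f:
    json.dump(audits, f, indent=2, default=str)  # default=str guards odd values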
Async Version (For Speed at Scale)
import asyncio, aiohttp
async def audit_website_async(session: aiohttp.ClientSession, url: str) -> dict:
"""Async version — fetch HTML only. Run other checks synchronously after."""
try:
async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
html = await resp.text()
headers = dict(resp.headers)
status = resp.status
    except asyncio.TimeoutError:
        return {"url": url, "status_code": "TIMEOUT", "is_live": False, "raw_html": ""}
    except Exception as e:
        return {"url": url, "status_code": "DEAD", "is_live": False, "raw_html": "", "error": str(e)}
    # Run the cheap synchronous checks on the fetched data
    result = {"url": url, "status_code": status, "is_live": status < 400,
              "raw_html": html if status < 400 else ""}
    if not result["is_live"]:
        return result  # don't mine error pages for signals
    result.update(get_copyright_year(html))
    result["is_mobile_friendly"] = check_mobile_friendly(html)
    result["design_signals"] = detect_design_signals(html, url)
    result["last_modified"] = get_last_modified(headers)
    return result
async def audit_batch_async(urls: list[str], concurrency: int = 5) -> list[dict]:
    # ssl=False skips cert verification so bad-cert sites still get fetched;
    # has_ssl is judged separately by check_ssl()
    connector = aiohttp.TCPConnector(limit=concurrency, ssl=False)
headers = {"User-Agent": "Mozilla/5.0 (compatible; LeadScanner/1.0)"}
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
tasks = [audit_website_async(session, url) for url in urls]
return await asyncio.gather(*tasks, return_exceptions=False)
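The async path deliberately skips Signals 4, 5, and 7. A driver sketch that backfills the cheaper misses afterward; passing empty headers to detect_tech_stack loses header-based detections, and PageSpeed is left out since it dominates runtime:
def run_async_batch(urls: list[str]) -> list[dict]:
    results = asyncio.run(audit_batch_async(urls, concurrency=5))
    for r in results:
        if r.get("is_live"):
            r["has_ssl"] = check_ssl(r["url"])  # Signal 7
            r.update(detect_tech_stack(r["url"], r.get("raw_html", ""), {}))  # Signal 4
    return results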