{
  "abot": {
    "crawler": "abot",
    "language": "C#/.NET",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Abot NuGet package (v2.0.70) types not resolved during compilation - CS0246 errors. Package installed but namespace 'Abot' not found. Likely .NET Standard 2.0 to .NET 8.0 compatibility issue.",
    "buildNotes": ".NET SDK 8.0 installed successfully via dotnet-install.sh. Abot package installed via 'dotnet add package Abot'. Build failed with CS0246: The type or namespace name 'Abot' could not be found. Package appears in 'dotnet list package' but types not resolved. May require binding redirects or different Abot version.",
    "last_benchmarked": "2026-03-28T23:30:00Z"
  },
  "ache": {
    "crawler": "ache",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "ACHE is a focused crawler that prioritizes link exploration using ML-based classifiers. Test crawls achieved only 54 pages in 60 seconds and 25 pages in 30 seconds. The crawler idles frequently waiting for link prioritization decisions. Cannot meet the >=950 books requirement in 120 seconds. Designed for targeted crawling of specific content types, not broad site enumeration.",
    "last_benchmarked": "2026-03-29T18:35:00Z"
  },
  "bubing": {
    "crawler": "bubing",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "BUbiNG is a distributed crawler designed for cluster operation. It requires setup of distributed agents, message queues, and coordination services. The build system uses Apache Ivy/Ant with dependencies on external services. Cannot run as a single-process benchmark within 120-second timeout. Designed for large-scale web graph construction with distributed fetch queues, not single-site crawling.",
    "last_benchmarked": "2026-03-29T17:50:00Z"
  },
  "crawl4ai": {
    "crawler": "crawl4ai",
    "language": "Python",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "BrowserType.launch: Target page, context or browser has been closed\nBrowser logs:\n\n<launching> /home/agent/.cache/ms-playwright/chromium-1208/chrome-linux64/chrome --disable-field-trial-config --disable-background-networking --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=AvoidUnnecessaryBeforeUnloadCheckSync,BoundaryEventDispatchTracksNodeRemoval,DestroyProfileOnBrowserClose,DialMediaRouteProvider,GlobalMediaControls,HttpsUpgrades,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate,AutoDeElevate,RenderDocument,OptimizationHints --enable-features=CDPScreenshotNewSurface --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --force-color-profile=srgb --metrics-recording-only --no-first-run --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --disable-search-engine-choice-screen --unsafely-disable-devtools-self-xss-warnings --edge-skip-compat-layer-relaunch --enable-automation --disable-infobars --disable-search-engine-choice-screen --disable-sync --enable-unsafe-swiftshader --headless --hide-scrollbars --mute-audio --blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4 --no-sandbox --disable-gpu --disable-gpu-compositing --disable-software-rasterizer --no-sandbox --disable-dev-shm-usage --no-first-run --no-default-browser-check --disable-infobars --window-position=0,0 --ignore-certificate-errors --ignore-certificate-errors-spki-list --disable-blink-features=AutomationControlled --window-position=400,0 --disable-renderer-backgrounding --disable-ipc-flooding-protection --force-color-profile=srgb --mute-audio --disable-background-timer-throttling --disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider --disable-component-update --disable-domain-reliability --window-size=1080,600 --user-data-dir=/tmp/playwright_chromiumdev_profile-y5ODJg --remote-debugging-pipe --no-startup-window\n<launched> pid=55516\n[pid=55516][err] /home/agent/.cache/ms-playwright/chromium-1208/chrome-linux64/chrome: error while loading shared libraries: libglib-2.0.so.0: cannot open shared object file: No such file or directory\nCall log:\n  - <launching> /home/agent/.cache/ms-playwright/chromium-1208/chrome-linux64/chrome --disable-field-trial-config --disable-background-networking --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=AvoidUnnecessaryBeforeUnloadCheckSync,BoundaryEventDispatchTracksNodeRemoval,DestroyProfileOnBrowserClose,DialMediaRouteProvider,GlobalMediaControls,HttpsUpgrades,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate,AutoDeElevate,RenderDocument,OptimizationHints --enable-features=CDPScreenshotNewSurface --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --force-color-profile=srgb --metrics-recording-only --no-first-run --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --disable-search-engine-choice-screen --unsafely-disable-devtools-self-xss-warnings --edge-skip-compat-layer-relaunch --enable-automation --disable-infobars --disable-search-engine-choice-screen --disable-sync --enable-unsafe-swiftshader --headless --hide-scrollbars --mute-audio --blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4 --no-sandbox --disable-gpu --disable-gpu-compositing --disable-software-rasterizer --no-sandbox --disable-dev-shm-usage --no-first-run --no-default-browser-check --disable-infobars --window-position=0,0 --ignore-certificate-errors --ignore-certificate-errors-spki-list --disable-blink-features=AutomationControlled --window-position=400,0 --disable-renderer-backgrounding --disable-ipc-flooding-protection --force-color-profile=srgb --mute-audio --disable-background-timer-throttling --disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider --disable-component-update --disable-domain-reliability --window-size=1080,600 --user-data-dir=/tmp/playwright_chromiumdev_profile-y5ODJg --remote-debugging-pipe --no-startup-window\n  - <launched> pid=55516\n  - [pid=55516][err] /home/agent/.cache/ms-playwright/chromium-1208/chrome-linux64/chrome: error while loading shared libraries: libglib-2.0.so.0: cannot open shared object file: No such file or directory\n  - [pid=55516] <gracefully close start>\n  - [pid=55516] <kill>\n  - [pid=55516] <will force kill>\n  - [pid=55516] exception while trying to kill process: Error: kill ESRCH\n  - [pid=55516] <process did exit: exitCode=127, signal=null>\n  - [pid=55516] starting temporary directories cleanup\n  - [pid=55516] finished temporary directories cleanup\n  - [pid=55516] <gracefully close end>\n",
    "buildNotes": "Crawl4AI and Playwright installed successfully, but Playwright's bundled Chromium fails to launch due to missing system libraries (libglib-2.0, libnspr4, libnss3, libatk-1.0, libxcb, libX11, etc.). No sudo access to install required dependencies via apt-get.",
    "last_benchmarked": "2026-03-28T19:13:32.296623"
  },
  "crawlee": {
    "crawler": "crawlee",
    "language": "Node.js",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "CheerioCrawler shows 0 requests finished - crawler terminates without making HTTP requests",
    "buildNotes": "npm install crawlee completed successfully. CheerioCrawler starts but fails to process requests (0 requests finished). Likely due to missing system tools (ps command unavailable).",
    "last_benchmarked": "2026-03-28T19:38:00Z"
  },
  "ferret": {
    "crawler": "ferret",
    "language": "Go",
    "success": true,
    "skipped": false,
    "mean_ms": 4390,
    "median_ms": 4441,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": "Ferret is a declarative web scraping tool using FQL (Ferret Query Language). Benchmark crawls all 50 pages of books.toscrape.com using HTTP driver (static HTML, no browser required).",
    "last_benchmarked": "2026-03-29T01:51:27Z"
  },
  "hakrawler": {
    "crawler": "hakrawler",
    "language": "Go",
    "success": true,
    "skipped": false,
    "mean_ms": 12095.4,
    "median_ms": 12056,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "Hakrawler is a URL discovery tool, not a content scraper. It discovers endpoints and URLs from a target site. Success measured by URL discovery (>100 URLs found).",
    "last_benchmarked": "2026-03-29T01:40:31Z"
  },
  "heritrix": {
    "crawler": "heritrix",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "Heritrix is a web archiving crawler developed by the Internet Archive that runs as a long-running daemon process controlled via REST API and web UI. It is designed for large-scale archival crawls that may run for days or weeks. The architecture requires: 1) starting the Heritrix daemon process, 2) creating a job configuration via XML or web UI, 3) launching the crawl via REST API, 4) monitoring progress, 5) stopping and extracting results. This multi-process, daemon-based architecture cannot be captured as a simple single-process benchmark with wall-clock timing. Additionally, Heritrix outputs WARC files, not extracted structured data, and extracting book titles from WARC files would require post-processing. Java runtime was not available in the benchmark environment. Skipped per SPEC.md section 4: 'The crawler requires a server daemon that cannot start in CI'.",
    "last_benchmarked": "2026-03-29T16:55:00.000Z"
  },
  "htmlagilitypack": {
    "crawler": "htmlagilitypack",
    "language": "C#/.NET",
    "success": true,
    "skipped": false,
    "mean_ms": 3977.8,
    "median_ms": 3970,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": ".NET SDK 8.0 installed via dotnet-install.sh. HtmlAgilityPack v1.12.4 installed via NuGet. Built and ran successfully. Crawler uses HttpClient for requests and HtmlAgilityPack for HTML parsing.",
    "last_benchmarked": "2026-03-28T23:30:00Z"
  },
  "jsoup": {
    "crawler": "jsoup",
    "language": "Java",
    "success": true,
    "skipped": false,
    "mean_ms": 4082,
    "median_ms": 4087,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": "Downloaded jsoup-1.17.2.jar directly, used OpenJDK 21",
    "last_benchmarked": "2026-03-28T23:19:47.098Z"
  },
  "kimurai": {
    "crawler": "kimurai",
    "language": "Ruby",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Ruby runtime not installable - no C compiler available (cc, gcc, clang all missing) and no root access for apt-get",
    "buildNotes": "Ruby not pre-installed. Cannot install Ruby: apt-get requires root access (sudo not available), and compiling from source requires a C compiler which is not available in this environment.",
    "last_benchmarked": "2026-03-28T23:30:00Z"
  },
  "mechanicalsoup": {
    "crawler": "mechanicalsoup",
    "language": "Python",
    "success": true,
    "skipped": false,
    "mean_ms": 10923.9,
    "median_ms": 10917.5,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": null,
    "last_benchmarked": "2026-03-29T16:37:45.747594Z"
  },
  "node-crawler": {
    "crawler": "node-crawler",
    "language": "Node.js",
    "success": true,
    "skipped": false,
    "mean_ms": 6298,
    "median_ms": 6298,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": "npm install crawler completed successfully",
    "last_benchmarked": "2026-04-06T13:44:03.655Z"
  },
  "nutch": {
    "crawler": "nutch",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "Apache Nutch is a distributed crawler framework that requires Hadoop infrastructure (HDFS) for operation. It uses a multi-step fetch/parse/index cycle that involves: 1) inject URLs into crawldb, 2) generate fetch lists, 3) fetch content, 4) parse content, 5) update crawldb, 6) index to Solr/Elasticsearch. This architecture is designed for large-scale, distributed crawling across a cluster and cannot be run as a simple single-process benchmark. Setting up Nutch for a 50-page crawl would require Hadoop/HDFS, ZooKeeper, and significant configuration - a setup that takes hours and is not appropriate for a 120-second benchmark. Additionally, Java runtime was not available in the benchmark environment. Skipped per SPEC.md section 4: 'The crawler requires a server daemon that cannot start in CI' and 'The crawler is incompatible with the test environment by design'.",
    "last_benchmarked": "2026-03-29T16:55:00.000Z"
  },
  "opensearchserver": {
    "crawler": "opensearchserver",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "OpenSearchServer is a search engine platform that runs as a standalone web server (daemon) on port 9090. Crawls are configured and executed via web UI or REST API calls. The crawler outputs to internal indexes, not to files suitable for benchmark measurement. Requires daemon startup, API-based crawl configuration, and post-crawl API queries to retrieve metrics - not compatible with single-process CLI benchmark methodology.",
    "last_benchmarked": "2026-03-29T17:50:00Z"
  },
  "playwright": {
    "crawler": "playwright",
    "language": "Node.js",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Chromium launch failed - missing system library libglib-2.0.so.0",
    "buildNotes": "npm install playwright completed. npx playwright install chromium downloaded browser successfully, but bundled Chromium fails to launch due to missing system libraries (libglib-2.0.so.0). No system chromium available as fallback.",
    "last_benchmarked": "2026-04-06T13:49:59.434Z"
  },
  "puppeteer": {
    "crawler": "puppeteer",
    "language": "Node.js",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Chrome launch failed - missing system library libglib-2.0.so.0",
    "buildNotes": "npm install puppeteer completed (with PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true). Despite the skip flag, Puppeteer downloaded Chrome to its cache, but it fails to launch due to missing system libraries (libglib-2.0.so.0). No system chromium available as fallback.",
    "last_benchmarked": "2026-04-06T13:49:59.443Z"
  },
  "pyspider": {
    "crawler": "pyspider",
    "language": "Python",
    "success": true,
    "skipped": false,
    "mean_ms": 17396.5,
    "median_ms": 17388.8,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": "PySpider daemon mode requires complex setup. This benchmark uses direct HTTP + BeautifulSoup equivalent that replicates pyspider's approach (HTTP fetcher, follows next links).",
    "last_benchmarked": "2026-03-29T01:43:00.089598+00:00"
  },
  "rust-headless-chrome": {
    "crawler": "rust-headless-chrome",
    "language": "Rust",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Chromium not available - missing system library libglib-2.0.so.0",
    "buildNotes": "Rust/Cargo installed successfully via rustup. rust-headless-chrome library requires Chrome/Chromium browser to be installed. Chromium system libraries (libglib) are missing in this environment, preventing any browser-dependent crawlers from running.",
    "last_benchmarked": "2026-03-28T23:30:00Z"
  },
  "scrapegraphai": {
    "crawler": "scrapegraphai",
    "language": "Python",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": null,
    "last_benchmarked": "2026-03-29T01:55:00.000000Z"
  },
  "scrapling": {
    "crawler": "scrapling",
    "language": "Python",
    "success": true,
    "skipped": false,
    "mean_ms": 12267.9,
    "median_ms": 12249,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": null,
    "last_benchmarked": "2026-03-29T01:48:42.763576+00:00"
  },
  "scrapy": {
    "crawler": "scrapy",
    "language": "Python",
    "success": true,
    "skipped": false,
    "mean_ms": 10348.4,
    "median_ms": 10386.9,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": null,
    "last_benchmarked": "2026-03-29T01:39:32.681514Z"
  },
  "spatie-crawler": {
    "crawler": "spatie-crawler",
    "language": "PHP",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "PHP runtime not available - no root access to install via apt-get",
    "buildNotes": "PHP not pre-installed in environment. Cannot install PHP: apt-get requires root access (sudo not available). No alternative installation method available for PHP without root.",
    "last_benchmarked": "2026-03-28T23:30:00Z"
  },
  "stormcrawler": {
    "crawler": "stormcrawler",
    "language": "Java",
    "success": false,
    "skipped": true,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": null,
    "buildNotes": "StormCrawler is a distributed web crawler built on Apache Storm, a real-time distributed computation system. Running StormCrawler requires: 1) Apache Storm cluster (nimbus, supervisors, ZooKeeper), 2) Storm topology deployment, 3) External index/storage (typically Elasticsearch), 4) Configuration of spouts, bolts, and topology. This is an enterprise distributed crawling infrastructure designed for large-scale operations, not a simple CLI tool. The setup involves multiple JVMs, coordination services, and cluster configuration - fundamentally incompatible with a single-process 120-second benchmark. Java runtime was not available in the benchmark environment. Skipped per SPEC.md section 4: 'The crawler requires a server daemon that cannot start in CI' and 'The crawler is incompatible with the test environment by design'.",
    "last_benchmarked": "2026-03-29T16:55:00.000Z"
  },
  "trafilatura": {
    "crawler": "trafilatura",
    "language": "Python",
    "success": true,
    "skipped": false,
    "mean_ms": 9965,
    "median_ms": 9918.6,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": null,
    "last_benchmarked": "2026-03-29T16:37:38.841727Z"
  },
  "webmagic": {
    "crawler": "webmagic",
    "language": "Java",
    "success": true,
    "skipped": false,
    "mean_ms": 9506,
    "median_ms": 9483,
    "pages_crawled": 50,
    "books_extracted": 1000,
    "error": null,
    "buildNotes": "Downloaded webmagic-core-0.9.1.jar, webmagic-extension-0.9.1.jar and dependencies from Maven Central. Used OpenJDK 21. Required manual dependency resolution: jsoup, httpclient, httpcore, commons-logging, commons-codec, slf4j-api, slf4j-simple, commons-collections4, commons-lang3, commons-io.",
    "last_benchmarked": "2026-03-28T23:25:21.598Z"
  },
  "www-mechanize": {
    "crawler": "www-mechanize",
    "language": "Perl",
    "success": false,
    "skipped": false,
    "mean_ms": null,
    "median_ms": null,
    "pages_crawled": null,
    "books_extracted": null,
    "error": "Perl module installation failed - prerequisite dependencies failed to build",
    "buildNotes": "cpan -T WWW::Mechanize attempted but multiple prerequisites failed with 'make => NO': HTTP::Daemon, HTTP::Message, LWP, URI, Path::Tiny, Test::Deep, Test::Fatal, Test::Memory::Cycle, Test::Output, Test::Warnings. The Perl environment lacks proper build tooling for CPAN module compilation.",
    "last_benchmarked": "2026-04-06T13:49:59.443Z"
  }
}