Scrapy Captcha Solver

Integrate FastCaptcha into your Scrapy spiders to solve image CAPTCHAs automatically. Works as a spider middleware, downloader middleware, or inline in your spider callbacks.

Scrapy Integration

Inline — solve in spider callback

Simplest approach — solve the captcha directly inside a parse method.

import scrapy
import requests

API_KEY = "YOUR_API_KEY"  # FastCaptcha API key; replace the placeholder


def solve_captcha(image_url, cookies=None) -> str:
    """Download a captcha image and return the text FastCaptcha reads from it.

    Args:
        image_url: Absolute URL of the captcha image.
        cookies: Optional cookies sent with the image download (useful when
            the site only serves the captcha to an existing session).

    Returns:
        The recognised captcha text (the ``text`` field of the API reply).

    Raises:
        ValueError: If ``image_url`` is empty, the API reply is not JSON, or
            the API reports ``success`` as falsy.
        requests.RequestException: If the image download fails or either
            request times out.
    """
    # Fail fast with a clear message instead of letting requests choke on None.
    if not image_url:
        raise ValueError("Captcha error: no image URL given")

    # A timeout is required here: requests waits forever by default.
    img_resp = requests.get(image_url, cookies=cookies, timeout=10)
    img_resp.raise_for_status()

    resp = requests.post(
        "https://fastcaptcha.org/api/v1/ocr/",
        headers={"X-API-Key": API_KEY},
        files={"image": ("captcha.png", img_resp.content, "image/png")},
        timeout=15,
    )
    try:
        data = resp.json()
    except ValueError as exc:
        # Error pages (HTML, empty body) are not JSON; report the HTTP status
        # rather than an opaque decoder message.
        raise ValueError(
            f"Captcha error: non-JSON response (HTTP {resp.status_code})"
        ) from exc

    if not data.get("success"):
        raise ValueError(f"Captcha error: {data.get('error')}")
    return data["text"]


class LoginSpider(scrapy.Spider):
    """Log in through a form, solving its image captcha inline when present."""

    name = "login_spider"
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        """Submit the login form found in ``response``.

        The ``captcha`` form field is only filled in when the page actually
        contains an ``img#captcha`` element; otherwise the form is submitted
        with the credentials alone.
        """
        formdata = {
            "username": "myuser",
            "password": "mypass",
        }

        captcha_src = response.css("img#captcha::attr(src)").get()
        if captcha_src:
            # The src attribute may be relative; resolve it against the page.
            captcha_url = response.urljoin(captcha_src)
            # NOTE(review): solve_captcha fetches the image with its own
            # `requests` call, outside Scrapy's cookie handling. If the site
            # ties the captcha to the session, pass the session cookies via
            # solve_captcha's `cookies` argument.
            answer = solve_captcha(captcha_url)
            self.logger.info(f"Solved captcha: {answer}")
            formdata["captcha"] = answer
        else:
            self.logger.info("No captcha image found; submitting without it")

        yield scrapy.FormRequest.from_response(
            response,
            formdata=formdata,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Report whether the login landed on the dashboard."""
        if "dashboard" in response.url:
            self.logger.info("Login successful!")
            # continue scraping...
        else:
            self.logger.warning(f"Login appears to have failed: {response.url}")

Downloader Middleware (reusable)

Create a middleware that auto-solves captchas across all spiders.

# middlewares.py
import requests as req_lib
from scrapy.http import HtmlResponse

API_KEY = "YOUR_API_KEY"  # fallback key when FASTCAPTCHA_API_KEY is not set


class CaptchaSolverMiddleware:
    """Downloader middleware — solve captchas before parse() runs.

    ``process_response`` is a downloader-middleware hook, so this class must
    be registered under ``DOWNLOADER_MIDDLEWARES``. When a captcha image is
    found in an HTML response, the solved text is stored in
    ``request.meta["captcha_answer"]``; spider callbacks can read it from
    ``response.meta["captcha_answer"]``.
    """

    def __init__(self, api_key=None):
        """Store the FastCaptcha key, falling back to module-level API_KEY."""
        self.api_key = api_key or API_KEY

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware using the FASTCAPTCHA_API_KEY setting."""
        return cls(api_key=crawler.settings.get("FASTCAPTCHA_API_KEY"))

    def process_response(self, request, response, spider):
        """Attach a solved captcha answer to the request meta, if any.

        Always returns ``response`` unchanged; solving is best-effort and a
        failure is only logged.
        """
        if not self._has_captcha(response):
            return response

        captcha_url = self._get_captcha_url(response)
        if not captcha_url:
            return response

        try:
            answer = self._solve(captcha_url)
        except Exception as e:
            # Deliberately broad: a solver failure must never break the crawl.
            spider.logger.warning(f"Captcha solve failed: {e}")
        else:
            spider.logger.info(f"Auto-solved captcha: {answer}")
            # Store the answer on the request: at this stage the response is
            # not yet tied to a request, so response.meta is not available.
            request.meta["captcha_answer"] = answer

        return response

    def _has_captcha(self, response):
        """Return True if the HTML response contains a captcha-like <img>."""
        # Non-HTML responses (images, PDFs, ...) do not support selectors.
        if not isinstance(response, HtmlResponse):
            return False
        return bool(response.css(
            "img[id*='captcha'], img[class*='captcha'], "
            "img[src*='captcha']"
        ))

    def _get_captcha_url(self, response):
        """Return the absolute URL of the first captcha-like image, or None."""
        src = response.css(
            "img[id*='captcha']::attr(src), "
            "img[class*='captcha']::attr(src), "
            "img[src*='captcha']::attr(src)"
        ).get()
        return response.urljoin(src) if src else None

    def _solve(self, image_url):
        """Download the image and return the text FastCaptcha reads from it.

        Raises ValueError when the API reports failure, and whatever
        ``requests`` raises on network or HTTP errors.
        """
        # NOTE(review): these are synchronous HTTP calls; inside Scrapy they
        # presumably hold up the crawl while in flight — confirm acceptable.
        img = req_lib.get(image_url, timeout=10)
        # Do not send an error page to the OCR endpoint as if it were an image.
        img.raise_for_status()
        resp = req_lib.post(
            "https://fastcaptcha.org/api/v1/ocr/",
            headers={"X-API-Key": self.api_key},
            files={"image": ("c.png", img.content, "image/png")},
            timeout=15,
        )
        data = resp.json()
        if not data.get("success"):
            raise ValueError(data.get("error"))
        return data["text"]


# settings.py — enable the middleware (it implements process_response,
# so it is a downloader middleware):
# DOWNLOADER_MIDDLEWARES = {
#   "myproject.middlewares.CaptchaSolverMiddleware": 543,
# }

settings.py configuration

# settings.py

FASTCAPTCHA_API_KEY = "YOUR_API_KEY"  # or load from env

# Enable the middleware. It implements process_response(), which Scrapy only
# calls on downloader middlewares, so it must be registered here rather than
# under SPIDER_MIDDLEWARES.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.CaptchaSolverMiddleware": 543,
}

# Recommended: add delay to avoid rate limits
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [429, 500, 502, 503, 504]
🕷️
Middleware pattern

One middleware solves captchas across all spiders

⚡
0.3s solve time

Minimal crawl delay compared to manual captchas

🔁
Works with RETRY_HTTP_CODES

Integrates with Scrapy's built-in retry middleware