Scrapy Captcha Solver
Integrate FastCaptcha into your Scrapy spiders to solve image CAPTCHAs automatically. Works as a spider middleware, downloader middleware, or inline in your spider callbacks.
Scrapy Integration
Inline — solve in spider callback
Simplest approach — solve the captcha directly inside a parse method.
import scrapy
import requests
# FastCaptcha API key, sent in the X-API-Key header; replace the placeholder.
API_KEY = "YOUR_API_KEY"
def solve_captcha(image_url, cookies=None) -> str:
    """Download a captcha image and return the text FastCaptcha reads from it.

    Args:
        image_url: Absolute URL of the captcha image.
        cookies: Optional cookies sent with the image download.

    Returns:
        The value of the ``text`` field of the FastCaptcha JSON response.

    Raises:
        requests.RequestException: If a request fails or times out, or the
            image download returns an error status.
        ValueError: If the API reply is not JSON or does not report success.
    """
    # A timeout keeps a stalled image host from hanging the caller forever.
    img_resp = requests.get(image_url, cookies=cookies, timeout=10)
    img_resp.raise_for_status()

    resp = requests.post(
        "https://fastcaptcha.org/api/v1/ocr/",
        headers={"X-API-Key": API_KEY},
        files={"image": ("captcha.png", img_resp.content, "image/png")},
        timeout=15,
    )
    try:
        data = resp.json()
    except ValueError as exc:
        # An HTML error page or empty body would otherwise surface as an
        # opaque decode error; report the HTTP status instead.
        raise ValueError(
            f"Captcha error: non-JSON response (HTTP {resp.status_code})"
        ) from exc

    if not data.get("success"):
        raise ValueError(f"Captcha error: {data.get('error')}")
    return data["text"]
class LoginSpider(scrapy.Spider):
    """Spider that submits a login form protected by an image captcha."""

    name = "login_spider"
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        """Solve the captcha on the login page and submit the login form.

        Yields a ``FormRequest`` built from the page's form, with the solved
        captcha text in the ``captcha`` field. Yields nothing when the page
        has no ``img#captcha`` element.
        """
        captcha_src = response.css("img#captcha::attr(src)").get()
        if not captcha_src:
            # Without an image there is no answer to submit; bail out
            # explicitly instead of reaching the form code with `answer`
            # unbound.
            self.logger.warning("No captcha image found on %s", response.url)
            return

        captcha_url = response.urljoin(captcha_src)
        # NOTE(review): solve_captcha() is called without `cookies`, so the
        # image is fetched outside this crawl's session. If the site ties the
        # captcha to the session, pass the session cookies here -- confirm.
        answer = solve_captcha(captcha_url)
        self.logger.info(f"Solved captcha: {answer}")

        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                "username": "myuser",
                "password": "mypass",
                "captcha": answer,
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        """Log a message when the post-login URL contains ``dashboard``."""
        if "dashboard" in response.url:
            self.logger.info("Login successful!")
            # continue scraping...
Spider Middleware (reusable)
Create a middleware that auto-solves captchas across all spiders.
# middlewares.py
import requests as req_lib
from scrapy.http import HtmlResponse
# FastCaptcha API key, sent in the X-API-Key header; replace the placeholder.
API_KEY = "YOUR_API_KEY"
class CaptchaSolverMiddleware:
    """Solve image captchas found in HTML responses before parse() runs.

    The solved text is stored under ``meta["captcha_answer"]`` so the spider
    callback can read it from ``response.meta``.

    The class implements both middleware hooks, so it can be listed in
    either ``SPIDER_MIDDLEWARES`` (``process_spider_input``) or
    ``DOWNLOADER_MIDDLEWARES`` (``process_response``). Register it in only
    one of them, otherwise each captcha is solved twice.

    NOTE(review): ``_solve`` makes synchronous HTTP calls, which presumably
    stall other requests while a captcha is being solved -- confirm this is
    acceptable for the crawl.
    """

    def __init__(self, api_key=None):
        """Store the API key, falling back to the module-level ``API_KEY``."""
        self.api_key = api_key or API_KEY

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware using ``FASTCAPTCHA_API_KEY`` from settings."""
        return cls(api_key=crawler.settings.get("FASTCAPTCHA_API_KEY"))

    def process_spider_input(self, response, spider):
        """Spider-middleware hook: annotate the response, then continue."""
        self._store_answer(response, response.meta, spider)
        return None

    def process_response(self, request, response, spider):
        """Downloader-middleware hook: annotate the request meta.

        ``request.meta`` is written rather than ``response.meta`` because the
        response may not be tied to its request yet at this stage.
        """
        self._store_answer(response, request.meta, spider)
        return response

    def _store_answer(self, response, meta, spider):
        """Solve the captcha in *response*, if any, and record it in *meta*."""
        if not self._has_captcha(response):
            return
        captcha_url = self._get_captcha_url(response)
        if not captcha_url:
            return
        try:
            answer = self._solve(captcha_url)
        except Exception as e:
            # Best effort: a failed solve must not break the crawl, so the
            # response is passed on without an answer.
            spider.logger.warning(f"Captcha solve failed: {e}")
        else:
            spider.logger.info(f"Auto-solved captcha: {answer}")
            # Store answer for the spider to use
            meta["captcha_answer"] = answer

    def _has_captcha(self, response):
        """Return True if *response* is HTML with a captcha-looking <img>."""
        # Only HTML responses are searched; others are passed through.
        if not isinstance(response, HtmlResponse):
            return False
        return bool(response.css(
            "img[id*='captcha'], img[class*='captcha'], "
            "img[src*='captcha']"
        ))

    def _get_captcha_url(self, response):
        """Return the absolute URL of the first captcha image, or None."""
        src = response.css(
            "img[id*='captcha']::attr(src), "
            "img[class*='captcha']::attr(src), "
            "img[src*='captcha']::attr(src)"
        ).get()
        return response.urljoin(src) if src else None

    def _solve(self, image_url):
        """Download the image and return the text reported by FastCaptcha.

        Raises:
            ValueError: If the API response does not report success.
        """
        img = req_lib.get(image_url, timeout=10)
        # Do not spend an API call on an error page.
        img.raise_for_status()
        resp = req_lib.post(
            "https://fastcaptcha.org/api/v1/ocr/",
            headers={"X-API-Key": self.api_key},
            files={"image": ("c.png", img.content, "image/png")},
            timeout=15,
        )
        data = resp.json()
        if not data.get("success"):
            raise ValueError(data.get("error"))
        return data["text"]
# settings.py — enable the middleware:
# SPIDER_MIDDLEWARES = {
# "myproject.middlewares.CaptchaSolverMiddleware": 543,
# }
settings.py configuration
# settings.py
import os

# Read the key from the environment so the real value is not committed with
# the project; the placeholder remains the fallback.
FASTCAPTCHA_API_KEY = os.environ.get("FASTCAPTCHA_API_KEY", "YOUR_API_KEY")

# Enable the middleware
SPIDER_MIDDLEWARES = {
    "myproject.middlewares.CaptchaSolverMiddleware": 543,
}

# Recommended: add delay to avoid rate limits
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [429, 500, 502, 503, 504]
Middleware pattern
One middleware solves captchas across all spiders
0.3s solve time
Minimal crawl delay compared to manual captchas
Works with RETRY_HTTP_CODES
Integrates with Scrapy's built-in retry middleware