Install
Terminal · npx$
npx skills add https://github.com/jamditis/claude-skills-journalism --skill web-scraping
Works with Paperclip
How Web Scraping fits into a Paperclip company.
Web Scraping drops into any Paperclip agent that handles this kind of work. Assign it to a specialist inside a pre-configured PaperclipOrg company and the skill becomes available on every heartbeat — no prompt engineering, no tool wiring.
S
SaaS Factory · Paired
Pre-configured AI company — 18 agents, 18 skills, one-time purchase.
$27$59
Explore pack
Source file
SKILL.md · 618 lines · Expand / Collapse
---
name: web-scraping
description: Web scraping with anti-bot bypass, content extraction, undocumented APIs and poison pill detection. Use when extracting content from websites, handling paywalls, implementing scraping cascades or processing social media. Covers requests, trafilatura, Playwright with stealth mode, yt-dlp and instaloader patterns.
---

# Web scraping methodology

Patterns for reliable, ethical web scraping with fallback strategies and anti-bot handling.

## Scraping cascade architecture

Implement multiple extraction strategies with automatic fallback:

```python
from abc import ABC, abstractmethod
from typing import Optional
import requests
from bs4 import BeautifulSoup
import trafilatura

# for .py files
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync

# for .ipynb files
import asyncio
from playwright.async_api import async_playwright


class ScrapingResult:
    """Extracted page content plus the name of the strategy that produced it."""

    def __init__(self, content: str, title: str, method: str):
        self.content = content
        self.title = title
        self.method = method  # Track which method succeeded


class Scraper(ABC):
    """Common interface for every extraction strategy in the cascade."""

    @abstractmethod
    def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Return a ScrapingResult for `url`, or None when extraction fails."""
        ...


class TrafilaturaScraper(Scraper):
    """Fast, lightweight extraction for standard articles."""

    def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Download `url` with trafilatura; return None on failure or thin content."""
        try:
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                return None

            content = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                favor_recall=True
            )

            # Anything under 100 characters is treated as a failed extraction
            if not content or len(content) < 100:
                return None

            # Extract title separately
            soup = BeautifulSoup(downloaded, 'html.parser')
            title = soup.find('title')
            title_text = title.get_text() if title else ''

            return ScrapingResult(content, title_text, 'trafilatura')
        except Exception:
            # Best-effort: any failure lets the cascade try the next scraper
            return None


class RequestsScraper(Scraper):
    """HTTP requests with rotating user agents."""

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ]

    def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Fetch `url` with a random user agent; return None on failure or thin content."""
        import random

        headers = {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script/style elements
            for element in soup(['script', 'style', 'nav', 'footer', 'aside']):
                element.decompose()

            # Find main content
            main = soup.find('main') or soup.find('article') or soup.find('body')
            content = main.get_text(separator='\n', strip=True) if main else ''

            title = soup.find('title')
            title_text = title.get_text() if title else ''

            if len(content) < 100:
                return None

            return ScrapingResult(content, title_text, 'requests')
        except Exception:
            # Best-effort: any failure lets the cascade try the next scraper
            return None


class PlaywrightScraper(Scraper):
    """Heavy JavaScript rendering with stealth mode for anti-bot bypass."""

    def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Render `url` in headless Chromium; return None on failure or thin content."""
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )

                page = context.new_page()

                # Apply stealth to avoid detection
                stealth_sync(page)

                page.goto(url, wait_until='networkidle', timeout=60000)

                # Wait for content to load
                page.wait_for_timeout(2000)

                # Extract content
                content = page.evaluate('''() => {
                    const article = document.querySelector('article, main, .content, #content');
                    return article ? article.innerText : document.body.innerText;
                }''')

                title = page.title()

                browser.close()

                if len(content) < 100:
                    return None

                return ScrapingResult(content, title, 'playwright')
        except Exception:
            # Best-effort: any failure ends the cascade with no result
            return None


class PlaywrightScraperAsync:
    """Async Playwright scraper for Jupyter notebooks (.ipynb files).

    Jupyter notebooks run their own event loop, so sync Playwright won't work.
    Use this async version with `await` in notebook cells.
    """

    async def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Render `url` in headless Chromium; return None on failure or thin content."""
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )

                page = await context.new_page()

                # Note: playwright-stealth async version
                # from playwright_stealth import stealth_async
                # await stealth_async(page)

                await page.goto(url, wait_until='networkidle', timeout=60000)

                # Wait for content to load
                await page.wait_for_timeout(2000)

                # Extract content
                content = await page.evaluate('''() => {
                    const article = document.querySelector('article, main, .content, #content');
                    return article ? article.innerText : document.body.innerText;
                }''')

                title = await page.title()

                await browser.close()

                if len(content) < 100:
                    return None

                return ScrapingResult(content, title, 'playwright_async')
        except Exception:
            return None


# Usage in Jupyter notebook cells:
# scraper = PlaywrightScraperAsync()
# result = await scraper.fetch('https://example.com')


class ScrapingCascade:
    """Try multiple scrapers in order until one succeeds."""

    def __init__(self):
        # Ordered cheapest first; the browser-based scraper is the last resort
        self.scrapers = [
            TrafilaturaScraper(),
            RequestsScraper(),
            PlaywrightScraper(),
        ]

    def fetch(self, url: str) -> Optional[ScrapingResult]:
        """Return the first truthy result from the scrapers, or None if all fail."""
        for scraper in self.scrapers:
            result = scraper.fetch(url)
            if result:
                return result
        return None
```

## Undocumented APIs

### Finding undocumented APIs

Use browser developer tools to discover APIs:

1. **Open developer tools** (right-click → Inspect, or F12)
2. **Go to the Network tab** to monitor all requests
3. **Filter by Fetch/XHR** to show only API calls
4. **Trigger the action** you want to capture (search, scroll, click)
5. **Analyze the response** — usually JSON with key-value pairs
6. **Copy as cURL** (right-click the request)
7. **Convert to code** using [curlconverter.com](https://curlconverter.com/)

### Stripping down API requests

When you copy a cURL from dev tools, it includes many parameters. Strip it down by:

1. **Remove unnecessary cookies** — test without them first
2. **Keep authentication tokens** if required
3. **Identify the input parameters** you can modify (like `prefix` for search terms)
4. **Test parameter values** — some expire, so periodically verify

### Example: Reverse-engineering an autocomplete API

```python
import requests
import time


def search_suggestions(keyword: str) -> dict:
    """
    Get autocompleted search suggestions from an undocumented API.
    Stripped down from browser dev tools capture.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    params = {
        'prefix': keyword,
        'suggestion-type': ['WIDGET', 'KEYWORD'],
        'alias': 'aps',
        'plain-mid': '1',
    }

    # Always set a timeout: requests waits indefinitely without one
    response = requests.get(
        'https://completion.amazon.com/api/2017/suggestions',
        params=params,
        headers=headers,
        timeout=30
    )
    return response.json()


# Collect suggestions for multiple keywords
keywords = ['a', 'b', 'cookie', 'sock']
data = []

for keyword in keywords:
    suggestions = search_suggestions(keyword)
    # Tag every suggestion with its seed keyword. Setting the key on the
    # response wrapper would be lost, because only the inner list is kept.
    # Assumes each suggestion is a JSON object (dict) - verify against the API.
    for suggestion in suggestions.get('suggestions', []):
        data.append({**suggestion, 'search_word': keyword})
    time.sleep(1)  # rate limit yourself
```

*Source: [Leon Yin, "Finding Undocumented APIs," Inspect Element](https://inspectelement.org/apis.html), 2023*

## Poison pill detection

Detect paywalls, anti-bot pages, and other failures:

```python
from dataclasses import dataclass
from enum import Enum
from urllib.parse import urlparse
import re


class PoisonPillType(Enum):
    PAYWALL = 'paywall'
    CAPTCHA = 'captcha'
    RATE_LIMIT = 'rate_limit'
    CLOUDFLARE = 'cloudflare'
    LOGIN_REQUIRED = 'login_required'
    NOT_FOUND = 'not_found'
    NONE = 'none'


@dataclass
class PoisonPillResult:
    detected: bool
    type: PoisonPillType
    confidence: float
    details: str


class PoisonPillDetector:
    """Classify a fetched page as a paywall, CAPTCHA, block page, or clean."""

    # Patterns are searched against lower-cased content, in this order
    PATTERNS = {
        PoisonPillType.PAYWALL: [
            r'subscribe to continue',
            r'subscription required',
            r'become a member',
            r'sign up to read',
            r'you\'ve reached your limit',
            r'article limit reached',
        ],
        PoisonPillType.CAPTCHA: [
            r'verify you are human',
            r'captcha',
            r'robot verification',
            r'prove you\'re not a robot',
        ],
        PoisonPillType.RATE_LIMIT: [
            r'too many requests',
            r'rate limit exceeded',
            r'slow down',
            r'429',
        ],
        PoisonPillType.CLOUDFLARE: [
            r'checking your browser',
            r'cloudflare',
            r'ddos protection',
            r'please wait while we verify',
        ],
        PoisonPillType.LOGIN_REQUIRED: [
            r'sign in to continue',
            r'log in required',
            r'create an account',
        ],
    }

    PAYWALL_DOMAINS = {
        'nytimes.com': PoisonPillType.PAYWALL,
        'wsj.com': PoisonPillType.PAYWALL,
        'washingtonpost.com': PoisonPillType.PAYWALL,
        'ft.com': PoisonPillType.PAYWALL,
        'bloomberg.com': PoisonPillType.PAYWALL,
    }

    def detect(self, url: str, content: str, status_code: int = 200) -> PoisonPillResult:
        """Check status code, then known paywall domains, then content patterns.

        Returns the first match found; a result with `detected=False` and
        `PoisonPillType.NONE` means nothing matched.
        """
        # Check status code
        if status_code == 429:
            return PoisonPillResult(True, PoisonPillType.RATE_LIMIT, 1.0, 'HTTP 429')
        if status_code == 403:
            return PoisonPillResult(True, PoisonPillType.CLOUDFLARE, 0.8, 'HTTP 403')
        if status_code == 404:
            return PoisonPillResult(True, PoisonPillType.NOT_FOUND, 1.0, 'HTTP 404')

        # Check known paywall domains
        # `hostname` is lower-cased and excludes any port or credentials
        domain = (urlparse(url).hostname or '').removeprefix('www.')
        for paywall_domain, pill_type in self.PAYWALL_DOMAINS.items():
            # Match the domain itself or a subdomain, never an arbitrary
            # substring (so 'notnytimes.com' is not treated as 'nytimes.com')
            if domain == paywall_domain or domain.endswith('.' + paywall_domain):
                # Check if content is suspiciously short (paywall truncation)
                if len(content) < 500:
                    return PoisonPillResult(True, pill_type, 0.9, f'Short content from {domain}')

        # Pattern matching
        content_lower = content.lower()
        for pill_type, patterns in self.PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, content_lower):
                    return PoisonPillResult(True, pill_type, 0.7, f'Pattern match: {pattern}')

        return PoisonPillResult(False, PoisonPillType.NONE, 0.0, '')
```

## Social media scraping

### YouTube with yt-dlp

```python
import yt_dlp
from pathlib import Path


def download_video_metadata(url: str) -> dict:
    """Extract metadata without downloading video."""
    ydl_opts = {
        'skip_download': True,
        'quiet': True,
        'no_warnings': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        return {
            'title': info.get('title'),
            'description': info.get('description'),
            'duration': info.get('duration'),
            'upload_date': info.get('upload_date'),
            'view_count': info.get('view_count'),
            'channel': info.get('channel'),
            'thumbnail': info.get('thumbnail'),
        }


def download_video(url: str, output_dir: Path, audio_only: bool = False) -> Path:
    """Download video or audio."""
    output_template = str(output_dir / '%(title)s.%(ext)s')

    ydl_opts = {
        'outtmpl': output_template,
        'quiet': True,
    }

    if audio_only:
        ydl_opts['format'] = 'bestaudio/best'
        ydl_opts['postprocessors'] = [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }]

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)

        if audio_only:
            # The postprocessor re-encodes to mp3, so swap the extension
            filename = filename.rsplit('.', 1)[0] + '.mp3'

        return Path(filename)


def get_transcript(url: str) -> list[dict]:
    """Extract auto-generated or manual subtitles."""
    ydl_opts = {
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # Check for subtitles
    subtitles = info.get('subtitles', {})
    auto_captions = info.get('automatic_captions', {})

    # Prefer manual subtitles over auto-generated
    subs = subtitles.get('en') or auto_captions.get('en')

    if not subs:
        return []

    # Get the vtt or json format
    for sub in subs:
        if sub['ext'] in ['vtt', 'json3']:
            # Download and parse subtitle file
            # ... implementation depends on format
            pass

    return []
```

### Instagram with instaloader

```python
import instaloader
from pathlib import Path
from typing import Optional


class InstagramScraper:
    """Thin wrapper around instaloader for profile listing and post download."""

    def __init__(self, username: Optional[str] = None, session_file: Optional[str] = None):
        self.loader = instaloader.Instaloader(
            download_videos=True,
            download_video_thumbnails=False,
            download_geotags=False,
            download_comments=False,
            save_metadata=True,
            compress_json=False,
        )

        # Reuse a saved login session only when the session file exists
        if session_file and Path(session_file).exists():
            self.loader.load_session_from_file(username, session_file)

    def get_profile_posts(self, username: str, limit: int = 50) -> list[dict]:
        """Get recent posts from a profile."""
        profile = instaloader.Profile.from_username(self.loader.context, username)

        posts = []
        for i, post in enumerate(profile.get_posts()):
            if i >= limit:
                break

            posts.append({
                'shortcode': post.shortcode,
                'url': f'https://instagram.com/p/{post.shortcode}/',
                'caption': post.caption,
                'timestamp': post.date_utc.isoformat(),
                'likes': post.likes,
                'comments': post.comments,
                'is_video': post.is_video,
                'video_url': post.video_url if post.is_video else None,
            })

        return posts

    def download_post(self, shortcode: str, output_dir: Path):
        """Download a single post's media."""
        post = instaloader.Post.from_shortcode(self.loader.context, shortcode)
        self.loader.download_post(post, target=str(output_dir))
```

### TikTok with yt-dlp

```python
import yt_dlp
from pathlib import Path


def scrape_tiktok_profile(username: str, output_dir: Path, limit: int = 50) -> list[dict]:
    """Scrape TikTok profile videos."""
    profile_url = f'https://tiktok.com/@{username}'

    ydl_opts = {
        'quiet': True,
        'extract_flat': True,  # Don't download, just get info
        'playlistend': limit,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(profile_url, download=False)

    videos = []
    for entry in info.get('entries', []):
        videos.append({
            'id': entry.get('id'),
            'title': entry.get('title'),
            'url': entry.get('url'),
            'timestamp': entry.get('timestamp'),
            'view_count': entry.get('view_count'),
        })

    return videos


def download_tiktok_video(url: str, output_dir: Path) -> Path:
    """Download a single TikTok video."""
    ydl_opts = {
        'outtmpl': str(output_dir / '%(id)s.%(ext)s'),
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        return Path(ydl.prepare_filename(info))
```

## Request patterns

### Rotating user agents and headers

```python
import random
import time

import requests
from fake_useragent import UserAgent


class RequestManager:
    """Session-based fetcher with randomized headers and retry with backoff."""

    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_headers(self) -> dict:
        """Build browser-like headers with a freshly randomized User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def fetch(self, url: str, retry_count: int = 3) -> requests.Response:
        """GET `url`, retrying up to `retry_count` times with exponential backoff.

        Raises:
            ValueError: if `retry_count` is less than 1.
            requests.RequestException: re-raised from the final failed attempt.
        """
        # Without this guard a non-positive count would fall through the loop
        # and return None despite the Response return annotation
        if retry_count < 1:
            raise ValueError('retry_count must be at least 1')

        for attempt in range(retry_count):
            try:
                response = self.session.get(
                    url,
                    headers=self.get_headers(),
                    timeout=30
                )
                response.raise_for_status()
                return response
            except requests.RequestException:
                if attempt == retry_count - 1:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff
```

### Respectful scraping with delays

```python
import time
import random
from urllib.parse import urlparse


class PoliteRequester:
    """Enforce a randomized minimum delay between requests to the same domain."""

    def __init__(self, min_delay: float = 1.0, max_delay: float = 3.0):
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.last_request_per_domain = {}

    def wait_for_domain(self, url: str):
        """Sleep until a random delay has elapsed since the last request to this domain."""
        domain = urlparse(url).netloc
        last_request = self.last_request_per_domain.get(domain, 0)
        elapsed = time.time() - last_request

        delay = random.uniform(self.min_delay, self.max_delay)
        if elapsed < delay:
            time.sleep(delay - elapsed)

        self.last_request_per_domain[domain] = time.time()
```

## Ethical considerations

- Always check `robots.txt` before scraping
- Respect rate limits and add delays between requests
- Don't scrape personal data without consent
- Cache responses to avoid redundant requests
- Identify yourself with a descriptive User-Agent when appropriate
- Stop if you receive explicit blocking signals