import requests
from bs4 import BeautifulSoup
import time

MXDROP_PREFIX = "https://mxdrop.to/e/"


def build_session():
    """Return a requests.Session that sends browser-like headers."""
    s = requests.Session()
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    })
    return s


def fetch_video_links(session, page_url, base_domain="https://striphub.cam"):
    """Collect /play/ links from one paginated list page."""
    try:
        r = session.get(page_url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"{page_url} failed: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    # keep only anchors that point at a /play/ detail page
    for a in soup.find_all("a", href=True):
        if a["href"].startswith("/play/"):
            links.append(base_domain + a["href"])
    print(f"{page_url}: {len(links)} /play/ links")
    return links


def extract_iframe_links(session, play_url):
    """Open a /play/ page and collect all <iframe src="https://mxdrop.to/e/...">."""
    try:
        r = session.get(play_url, timeout=12, headers={"Referer": play_url})
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ could not fetch {play_url}: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    # keep only iframes whose src points at an mxdrop embed
    embeds = [
        iframe["src"]
        for iframe in soup.find_all("iframe", src=True)
        if iframe["src"].startswith(MXDROP_PREFIX)
    ]
    if embeds:
        print(f"🔗 {play_url}: found {len(embeds)} mxdrop embeds")
    else:
        title = soup.title.string.strip() if soup.title and soup.title.string else "(no title)"
        print(f"🚫 {play_url}: no iframes, title={title}")
    return embeds


def crawl_all_pages(base_url, total_pages, base_domain="https://striphub.cam",
                    video_out="video_links.txt", embed_out="embedLinks.txt"):
    session = build_session()

    # Step 1: gather all /play/ links from the paginated list pages
    all_play = []
    for p in range(1, total_pages + 1):
        page_url = f"{base_url}/page/{p}"
        print(f"\n🌐 Crawling {page_url}")
        all_play.extend(fetch_video_links(session, page_url, base_domain))
        time.sleep(0.5)  # small delay between page requests

    all_play = sorted(set(all_play))
    with open(video_out, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in all_play)
    print(f"\n✅ saved {len(all_play)} /play/ links → {video_out}")

    # Step 2: visit each /play/ page and collect mxdrop iframes
    all_embeds = set()
    for i, link in enumerate(all_play, 1):
        embeds = extract_iframe_links(session, link)
        all_embeds.update(embeds)
        print(f"[{i}/{len(all_play)}] total embeds: {len(all_embeds)}")
        time.sleep(0.5)

    with open(embed_out, "w", encoding="utf-8") as f:
        f.writelines(e + "\n" for e in sorted(all_embeds))
    print(f"\n✅ saved {len(all_embeds)} mxdrop embeds → {embed_out}")
    return all_play, sorted(all_embeds)


# Example usage:
if __name__ == "__main__":
    crawl_all_pages("https://striphub.cam", total_pages=5)
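
# For reference, a run of the example above is expected to write two text files
# (using the default output names): video_links.txt with one
# https://striphub.cam/play/... URL per line, and embedLinks.txt with one
# https://mxdrop.to/e/... embed URL per line, both sorted and de-duplicated.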