You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
5.0 KiB
Python
138 lines
5.0 KiB
Python
|
3 days ago
|
# playwright_iframe_extract_debug.py
|
||
|
|
from playwright.sync_api import sync_playwright
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
import re, time, os, sys
|
||
|
|
|
||
|
|
TEST_URL = "https://striphub.cam/play/68f731ea62f66877cc80e54f"  # replace if needed

# Case-insensitive matcher for mxdrop embed URLs (https://mxdrop.to/e/<id>),
# terminating at whitespace, quotes or angle brackets.
MX_PATTERN = re.compile(r"https?://(?:www\.)?mxdrop\.to/e/[^\s\"'<>]+", re.I)


def extract_from_html(html: str):
    """Fallback regex scan over raw HTML (catches inline JS/JSON).

    Returns the set of distinct mxdrop embed URLs found in *html*.
    """
    return {match.group(0) for match in MX_PATTERN.finditer(html)}
def _click_reveal_controls(page):
    """Best-effort clicks on buttons/overlays that commonly gate the player.

    Missing selectors and click timeouts are ignored on purpose: these are
    opportunistic pokes, not required steps.
    """
    selectors = [
        'button:has-text("Play")',
        'button:has-text("I understand")',
        'button:has-text("Continue")',
        'button:has-text("Accept")',
        "#player, .video-player, .plyr__control",
    ]
    for sel in selectors:
        try:
            el = page.locator(sel)
            if el.count() > 0:
                el.first.click(timeout=2000)
        except Exception:
            pass  # deliberate best-effort: selector absent or not clickable


def _lazy_scroll(page):
    """Scroll the page in steps to trigger lazy-loaded iframes.

    FIX: the script is passed as an async arrow function. Playwright's
    evaluate() rejects a bare multi-statement string (it is evaluated as a
    single expression), and an un-awaited IIFE would return before the
    scrolling finished — evaluate() awaits the returned promise for us.
    """
    try:
        page.evaluate(
            """async () => {
                const delay = ms => new Promise(r => setTimeout(r, ms));
                for (let y = 0; y < document.body.scrollHeight; y += 800) {
                    window.scrollTo(0, y);
                    await delay(200);
                }
                window.scrollTo(0, document.body.scrollHeight);
            }"""
        )
        time.sleep(1.0)
    except Exception:
        pass  # scrolling is a nice-to-have; never fail the run over it


def run():
    """Open TEST_URL in Chromium and hunt for mxdrop embed links.

    Detection layers: network responses, DOM <iframe> src/data-src, a regex
    sweep of the raw HTML, and Playwright's frame tree. Side effects: writes
    debug_page.png, debug_page.html and (when links are found) embedLinks.txt.
    """
    found = set()

    with sync_playwright() as p:
        # Use headless=False while debugging to *see* what's happening.
        browser = p.chromium.launch(
            headless=False,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                           "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                locale="en-US",
                viewport={"width": 1280, "height": 900},
            )
            page = context.new_page()

            # 1) Capture ANY network response hitting mxdrop.
            def on_response(resp):
                url = resp.url
                if "mxdrop.to/e/" in url:
                    print("[NET] mxdrop response:", url)
                    found.add(url)

            page.on("response", on_response)

            print("Navigating to", TEST_URL)
            page.goto(TEST_URL, wait_until="domcontentloaded", timeout=60000)

            # 2) Give the page time to settle network/XHR.
            try:
                page.wait_for_load_state("networkidle", timeout=20000)
            except Exception:
                print("⚠️ networkidle timed out—continuing")

            # 3) Generic clicks that often reveal the player/iframe.
            _click_reveal_controls(page)

            # 4) Scroll to bottom to trigger lazy-load iframes.
            _lazy_scroll(page)

            # 5) Dump a screenshot & HTML so you can inspect what loaded.
            try:
                page.screenshot(path="debug_page.png", full_page=True)
                print("Saved screenshot -> debug_page.png")
            except Exception:
                pass  # screenshot failure must not abort the scan

            html = page.content()
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(html)
            print("Saved HTML -> debug_page.html")

            # 6) Parse DOM for iframes (src and data-src).
            soup = BeautifulSoup(html, "html.parser")
            iframes = soup.find_all("iframe")
            print(f"Found {len(iframes)} <iframe> tags in DOM")

            for iframe in iframes:
                for attr in ("src", "data-src"):
                    val = iframe.get(attr)
                    if val and "mxdrop.to/e/" in val:
                        print("[DOM] iframe", attr, "=", val)
                        found.add(val)

            # 7) Regex over the full HTML (catches script-injected strings).
            regex_hits = extract_from_html(html)
            for u in regex_hits:
                print("[HTML-REGEX]", u)
            found.update(regex_hits)

            # 8) Also list all frame URLs Playwright sees (nested frames).
            for fr in page.frames:
                if fr.url and "mxdrop.to/e/" in fr.url:
                    print("[FRAME] url:", fr.url)
                    found.add(fr.url)

            # 9) Print final result.
            found = sorted(found)
            print("\n==== MXDROP RESULTS ====")
            if found:
                for u in found:
                    print(u)
            else:
                title = soup.title.string.strip() if soup.title and soup.title.string else "(no title)"
                print("No mxdrop links detected.")
                print("Page title:", title)
                # Quick hint if you hit a challenge:
                snippet = html[:400].replace("\n", " ")
                if "Just a moment" in snippet or "Cloudflare" in snippet or "cf-chl" in snippet:
                    print("Looks like a Cloudflare challenge / interstitial (human step required).")

            # 10) Save results if any.
            if found:
                with open("embedLinks.txt", "w", encoding="utf-8") as f:
                    for u in found:
                        f.write(u + "\n")
                print("Saved -> embedLinks.txt")
        finally:
            # FIX: close the browser even when a step above raises, so no
            # Chromium process is leaked on failure.
            browser.close()


if __name__ == "__main__":
    run()