"""
Book Scraper & EPUB Generator (with Playwright)
================================================
This script scrapes a Vietnamese web novel from wikicv.net and
packages all chapters into a clean, readable EPUB ebook file.

Uses Playwright to render JavaScript-heavy pages.
Supports resumable scraping with JSON cache.
Includes book cover and metadata.

Requirements:
    pip install playwright beautifulsoup4 ebooklib pillow requests
    python -m playwright install chromium
"""

import json
import os
import re
import time
from datetime import datetime
from html import escape
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from ebooklib import epub
from playwright.sync_api import sync_playwright


# ─────────────────────────────────────────────
# CONFIGURATION — change these if needed
# ─────────────────────────────────────────────

# Intro page of the book; metadata and the chapter list are both read from it.
BOOK_INTRO_URL = "https://wikicv.net/truyen/the-gioi-huy-diet-sau-trong-sinh-YOSSf1S4CHja3jT1"

# ── Scraping behavior ──────────────────────
WAIT_TIME_PER_CHAPTER = 1.5      # seconds slept after each successfully fetched chapter
MAX_CHAPTERS_PER_DAY = 300         # stop after this many chapters fetched today
LIMIT_CHAPTERS_FOR_TESTING = None  # Set to an integer to keep only the first N chapters (e.g. 5); None = no limit
# Progress file name: the last path segment of BOOK_INTRO_URL plus ".json".
CACHE_FILE = BOOK_INTRO_URL.rstrip("/").split("/")[-1] + ".json" # where to save progress

# ── Playwright settings ────────────────────
BROWSER_HEADLESS = True          # Set to False to SHOW browser
BROWSER_TIMEOUT = 360000          # milliseconds (6 minutes); passed to page.goto()
SLOW_MO = 1000                    # milliseconds - slows down browser actions (only applied when the browser is shown)
EXTRA_WAIT_AFTER_PAGE_LOAD = 23     # seconds to wait after page load for JS content

# ─────────────────────────────────────────────
# CONTENT CLEANER — Remove ads
# ─────────────────────────────────────────────

class ContentCleaner:
    """Reduce a chapter container to plain ``<p>`` paragraphs.

    ``clean`` keeps only the text of the ``<p>`` elements found inside the
    container it is given; every other node (scripts, iframes, ad widgets,
    inline markup) is discarded as a side effect.

    NOTE(review): the ``AD_*``, ``REMOVE_TAGS`` and ``UNWANTED_STRINGS`` lists
    below are not consulted by ``clean``. They are kept as reference data;
    confirm whether they should be wired in or removed.
    """

    # Elements to remove by ID
    AD_IDS = [
        "tpads_article_middle_container",
        "ubvideoFrame",
        "adContainer",
        "advertisement",
        "ads",
        "google_ads_div",
        "unibots-video",
    ]

    # Elements to remove by class
    AD_CLASSES = [
        "tpm-unit",
        "ad",
        "advertisement",
        "advert",
        "ads",
        "ad-banner",
        "ad-container",
        "sidebar-ads",
        "google-ads",
        "sponsored",
        "sponsored-content",
        "gliaplayer-container",
        "center ankhinho",
        "ankhito center",
        "gliaplayer-container styles-module_container_xuywD",
        "InstreamDom_root_21jVv",
    ]

    # Elements to remove by attribute
    AD_ATTRIBUTES = [
        "data-ad-slot",
        "data-ad-client",
        "data-google-query-id",
        "data-readmore-toggle",
        "data-gc-boot-time",
        "data-gc-test-id",
    ]

    # Tags to completely remove
    REMOVE_TAGS = ["script", "style", "iframe"]

    # Watermark / filler strings seen in chapter text.
    # The backslashes are written as "\\" so the literals stay valid Python
    # (a bare "\ฅ" is an invalid escape sequence); the values are unchanged.
    UNWANTED_STRINGS = [
        "·",
        "dkạhsdsadjdá",
        "oiewơie",
        "✧⋄⋆⋅⋆⋄✧⋄⋆⋅⋆⋄✧ ฅ/ᐠ｡ꞈ｡ᐟ\\ฅ Convert by Haruko ฅ/ᐠ｡ꞈ｡ᐟ\\ฅ ✧⋄⋆⋅⋆⋄✧⋄⋆⋅⋆⋄✧",
        "☀Truyện được đăng bởi Reine☀",
    ]

    @classmethod
    def clean(cls, content_div) -> str:
        """Replace the children of *content_div* with text-only paragraphs.

        Args:
            content_div: A BeautifulSoup tag (or soup) holding the chapter
                body, or ``None``.

        Returns:
            The cleaned HTML: one ``<p>`` per non-empty source paragraph, with
            the text HTML-escaped. ``""`` when *content_div* is ``None`` or
            contains no non-empty ``<p>``.

        Side effects:
            *content_div* is emptied and refilled with the cleaned paragraphs.
        """
        if content_div is None:
            return ""

        # Collect the text before clearing the container below.
        paragraph_texts = [p.get_text(strip=True) for p in content_div.find_all("p")]

        # Escape the text: get_text() returns decoded characters, so a literal
        # "<" or "&" would otherwise be re-parsed as markup.
        clean_html = "".join(
            f"<p>{escape(text, quote=False)}</p>"
            for text in paragraph_texts
            if text
        )

        content_div.clear()
        cleaned_soup = BeautifulSoup(clean_html, "html.parser")
        # Iterate over a copy: append() moves each node out of cleaned_soup.
        for child in list(cleaned_soup.contents):
            content_div.append(child)

        return clean_html


# ─────────────────────────────────────────────
# CACHE MANAGEMENT — save/load progress
# ─────────────────────────────────────────────

class ScraperCache:
    """JSON-backed progress store that makes scraping resumable.

    ``self.data`` is a dict with the keys produced by ``_default_cache``:
    ``book_metadata``, ``chapters``, ``last_updated``,
    ``chapters_fetched_today`` and ``last_fetch_date``.
    """

    def __init__(self, cache_file: str = CACHE_FILE):
        self.cache_file = cache_file
        self.data = self._load()

    def _load(self) -> dict:
        """Load the cache from disk, falling back to an empty cache.

        Keys missing from the file (or stored as ``null``) are filled from
        ``_default_cache`` so the other methods can index ``self.data``
        without a ``KeyError``.
        """
        cache = self._default_cache()
        if not os.path.exists(self.cache_file):
            return cache

        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"  ⚠ Error loading cache: {e}")
            return cache

        if not isinstance(loaded, dict):
            print(f"  ⚠ Error loading cache: expected a JSON object, got {type(loaded).__name__}")
            return cache

        cache.update({key: value for key, value in loaded.items() if value is not None})
        return cache

    def _default_cache(self) -> dict:
        """Return empty cache structure"""
        return {
            "book_metadata": {},
            "chapters": [],
            "last_updated": None,
            "chapters_fetched_today": 0,
            "last_fetch_date": None,
        }

    def save(self) -> None:
        """Write the cache to disk, stamping ``last_updated``.

        The data is written to a temporary sibling file and then moved over
        the real file, so an interrupted or failed dump cannot leave a
        truncated cache behind. Failures are reported and otherwise ignored.
        """
        self.data["last_updated"] = datetime.now().isoformat()
        tmp_file = f"{self.cache_file}.tmp"
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=2, ensure_ascii=False)
            os.replace(tmp_file, self.cache_file)
            print(f"  ✓ Cache saved to {self.cache_file}")
        except (OSError, TypeError, ValueError) as e:
            print(f"  ✗ Error saving cache: {e}")
            try:
                os.remove(tmp_file)
            except OSError:
                # Nothing to clean up (or cleanup impossible); the real cache
                # file is untouched either way.
                pass

    def add_chapter(self, chapter: dict) -> None:
        """Append *chapter* unless its URL is already cached, then save.

        A newly added chapter also increments ``chapters_fetched_today``.
        """
        already_cached = any(
            cached.get("url") == chapter["url"] for cached in self.data["chapters"]
        )
        if already_cached:
            return
        self.data["chapters"].append(chapter)
        self.data["chapters_fetched_today"] += 1
        self.save()

    def get_chapters(self) -> list[dict]:
        """Get all cached chapters"""
        return self.data["chapters"]

    def get_metadata(self) -> dict:
        """Get cached metadata"""
        return self.data["book_metadata"]

    def set_metadata(self, metadata: dict) -> None:
        """Store the known metadata fields of *metadata* and save.

        Only ``title``, ``author``, ``cover_url``, ``cover_info`` and
        ``description`` are kept; a missing field is stored as ``None``.
        """
        clean_metadata = {
            "title": metadata.get("title"),
            "author": metadata.get("author"),
            "cover_url": metadata.get("cover_url"),
            "cover_info": metadata.get("cover_info"),
            "description": metadata.get("description"),
        }
        self.data["book_metadata"] = clean_metadata
        self.save()

    def reset_daily_count(self) -> None:
        """Reset daily counter if it's a new day"""
        today = datetime.now().date().isoformat()
        last_date = self.data.get("last_fetch_date")

        if last_date != today:
            self.data["chapters_fetched_today"] = 0
            self.data["last_fetch_date"] = today
            self.save()
            print(f"  ✓ Daily counter reset for {today}")

    def can_fetch_more(self) -> bool:
        """Check if we can fetch more chapters today"""
        return self.data["chapters_fetched_today"] < MAX_CHAPTERS_PER_DAY

    def get_remaining_today(self) -> int:
        """Return how many chapters may still be fetched today (never negative).

        The clamp matters when ``MAX_CHAPTERS_PER_DAY`` is lowered below the
        count already recorded for today: a negative value used as a slice
        bound would select from the end of a list instead of nothing.
        """
        return max(0, MAX_CHAPTERS_PER_DAY - self.data["chapters_fetched_today"])

    def print_status(self) -> None:
        """Print cache status"""
        print(f"\n  Cache Status:")
        print(f"    Chapters cached: {len(self.data['chapters'])}")
        print(f"    Fetched today: {self.data['chapters_fetched_today']}/{MAX_CHAPTERS_PER_DAY}")
        print(f"    Remaining today: {self.get_remaining_today()}")
        print(f"    Last updated: {self.data['last_updated']}")


# ─────────────────────────────────────────────
# STEP 1 — Fetch a page with Playwright and parse its HTML
# ─────────────────────────────────────────────

def fetch_page_with_playwright(url: str) -> BeautifulSoup:
    """Render *url* in Chromium via Playwright and return the parsed page.

    A new browser is launched for every call. After the ``load`` event the
    function waits up to 5 s for ``<body>`` (best effort), then sleeps
    ``EXTRA_WAIT_AFTER_PAGE_LOAD`` seconds so JavaScript can fill in content
    before the HTML is captured.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup document built with the ``html.parser`` backend.

    Raises:
        Exception: Any error from launching the browser or loading the page
            is printed and re-raised unchanged.
    """
    print(f"  Fetching: {url}")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=BROWSER_HEADLESS,
                slow_mo=SLOW_MO if not BROWSER_HEADLESS else 0
            )
            try:
                page = browser.new_page()
                page.set_viewport_size({"width": 1024, "height": 768})

                page.goto(url, wait_until="load", timeout=BROWSER_TIMEOUT)

                try:
                    page.wait_for_selector("body", timeout=5000)
                except Exception:
                    # Best effort only. Not a bare ``except`` so that Ctrl+C
                    # (KeyboardInterrupt) still stops the script.
                    print("⚠ Timeout waiting for body element")

                time.sleep(EXTRA_WAIT_AFTER_PAGE_LOAD)  # Extra wait to ensure all content is loaded

                html_content = page.content()
            finally:
                # Close the browser even when navigation fails.
                browser.close()

        return BeautifulSoup(html_content, "html.parser")

    except Exception as e:
        print(f"    ✗ Error fetching {url}: {e}")
        raise


def fetch_page(url: str) -> BeautifulSoup:
    """Fetch *url* and return it parsed.

    Single indirection point for page retrieval; currently delegates to the
    Playwright-based fetcher rather than ``requests``.
    """
    parsed_page = fetch_page_with_playwright(url)
    return parsed_page


# ─────────────────────────────────────────────
# STEP 2 — Extract metadata, cover URL, and chapter links
# ─────────────────────────────────────────────

def _extract_author(soup) -> str:
    """Return the author from the first ``<p>`` mentioning "tác giả"."""
    for p_tag in soup.find_all("p"):
        text = p_tag.get_text(strip=True)
        if "tác giả" not in text.lower():
            continue
        parts = text.split(":")
        if len(parts) > 1:
            # Prefer the linked author name when the paragraph contains one.
            author_link = p_tag.find("a")
            if author_link:
                return author_link.get_text(strip=True)
            return parts[1].strip()
        # Only the first matching paragraph is considered.
        break
    return "Không rõ"


def _extract_chapter_links(soup, base_url: str) -> list[dict]:
    """Return ``{"title", "url"}`` dicts for the links in ``div.volume-list``."""
    volume_list = soup.select_one("div.volume-list")
    if volume_list is None:
        raise RuntimeError(
            "Could not find 'div.volume-list' on the intro page. "
            "The site layout may have changed."
        )

    chapters = []
    seen_urls = set()
    for link_tag in volume_list.find_all("a", class_="truncate", href=True):
        title = link_tag.get_text(strip=True)
        raw_href = link_tag["href"].strip()
        if not title or not raw_href:
            continue

        # urljoin resolves "/path", "path" and "//host/path" alike.
        href = urljoin(base_url, raw_href)
        if href in seen_urls:
            # A link listed twice would otherwise be fetched twice.
            continue
        seen_urls.add(href)
        chapters.append({"title": title, "url": href})

    return chapters


def get_intro_page_data(intro_url: str) -> tuple[dict, list[dict]]:
    """Fetch the intro page once and extract metadata plus the chapter list.

    Args:
        intro_url: URL of the book's intro page. Relative links found on the
            page are resolved against it.

    Returns:
        ``(metadata, chapters)`` where *metadata* has the keys ``title``,
        ``author``, ``cover_url``, ``cover_info`` and ``description``, and
        *chapters* is a list of ``{"title": ..., "url": ...}`` dicts in page
        order (truncated to ``LIMIT_CHAPTERS_FOR_TESTING`` when that is set).

    Raises:
        RuntimeError: If the page has no ``div.volume-list`` element.
    """
    print("\n[1/3] Fetching intro page (metadata + chapter list) …")
    soup = fetch_page(intro_url)

    # ─── Extract Metadata ───────────────────────────────

    # Title
    # NOTE(review): the fallback is a fixed title, presumably left over from
    # another book — confirm whether a URL-derived fallback is preferable.
    title_tag = soup.find("h2", style=lambda x: x and "font-size: 1.7rem" in x)
    book_title = title_tag.get_text(strip=True) if title_tag else "Rơi Xuống Chỗ Trống"

    # Author
    author = _extract_author(soup)

    # ─── Extract Cover Image URL (don't download yet) ────
    cover_url = None

    cover_img_tag = soup.select_one("body > main > div > div.flexbox > div:nth-child(1) > div.book-info > div.cover-wrapper img")
    if cover_img_tag:
        cover_src = cover_img_tag.get("src")
        if cover_src:
            # Handles absolute, root-relative, relative and protocol-relative
            # ("//host/img.jpg") sources.
            cover_url = urljoin(intro_url, cover_src)
            print(f"  ✓ Cover URL found: {cover_url}")

    # ─── Extract Cover Info (book stats) ─────────────────
    cover_info_html = ""
    cover_info_div = soup.select_one("body > main > div > div.flexbox > div:nth-child(1) > div.book-info > div.cover-info")
    if cover_info_div:
        cover_info_html = str(cover_info_div)

    # ─── Extract Book Description ───────────────────────
    description_html = ""
    book_desc_div = soup.select_one("body > main > div > div.flexbox > div:nth-child(1) > div.book-desc")
    if book_desc_div:
        description_html = str(book_desc_div)

    metadata = {
        "title": book_title,
        "author": author,
        "cover_url": cover_url,  # Store only URL, not binary data
        "cover_info": cover_info_html,
        "description": description_html,
    }

    print(f"  ✓ Metadata extracted:")
    print(f"      Title : {metadata['title']}")
    print(f"      Author: {metadata['author']}")
    if cover_url:
        print(f"      Cover : {cover_url}")

    # ─── Extract Chapter Links ──────────────────────────
    chapters = _extract_chapter_links(soup, intro_url)
    print(f"  ✓ Found {len(chapters)} chapters on intro page.")

    # ``is not None`` matches the test-mode check performed in main().
    if LIMIT_CHAPTERS_FOR_TESTING is not None:
        print(f"  ⚠ Limiting to first {LIMIT_CHAPTERS_FOR_TESTING} chapters for testing.")
        chapters = chapters[:LIMIT_CHAPTERS_FOR_TESTING]

    return metadata, chapters


# ─────────────────────────────────────────────
# STEP 3 — Scrape chapter content
# ─────────────────────────────────────────────

def get_chapter_content(chapter: dict) -> str:
    """Fetch one chapter and return its heading plus cleaned body as HTML.

    Args:
        chapter: Dict with at least ``"url"`` and ``"title"``.

    Returns:
        ``<h2>title</h2>`` followed by the ``#bookContentBody`` element after
        ``ContentCleaner.clean`` has reduced it to text-only paragraphs, or a
        placeholder paragraph when the page has no ``#bookContentBody``.

    Raises:
        Exception: Whatever ``fetch_page`` raises for this URL.
    """
    soup = fetch_page(chapter["url"])

    content_div = soup.select_one("#bookContentBody")
    if content_div is None:
        print(f"    ⚠ No #bookContentBody found at {chapter['url']}")
        return "<p><em>(Nội dung không tìm thấy.)</em></p>"

    # Clean unwanted elements (mutates content_div in place).
    cleaned = ContentCleaner.clean(content_div)
    if not cleaned:
        # The cleaner keeps only <p> text, so a body without paragraphs ends
        # up empty; make that visible instead of storing it silently.
        print(f"    ⚠ No paragraph text found at {chapter['url']}")

    # Escape the title: it is plain text and may contain "&" or "<".
    title_html   = f"<h2>{escape(chapter['title'], quote=False)}</h2>"
    content_html = str(content_div)

    return f"{title_html}\n{content_html}"


# ─────────────────────────────────────────────
# STEP 4 — Download cover image
# ─────────────────────────────────────────────

def download_cover_image(cover_url: str) -> bytes:
    """Fetch the cover image at *cover_url*.

    Returns the raw image bytes on an HTTP 200 response. Returns ``None``
    (after printing a warning) when no URL is given, the status is not 200,
    the request times out, or any other error occurs.
    """
    if not cover_url:
        print("  ⚠ No cover URL provided")
        return None

    image_bytes = None
    try:
        print(f"  Downloading cover image: {cover_url}")
        reply = requests.get(cover_url, timeout=15)
        if reply.status_code != 200:
            print(f"  ⚠ Failed to download cover (status {reply.status_code})")
        else:
            print(f"  ✓ Cover image downloaded ({len(reply.content)} bytes)")
            image_bytes = reply.content
    except requests.exceptions.Timeout:
        print(f"  ⚠ Timeout downloading cover image")
    except Exception as err:
        print(f"  ⚠ Error downloading cover: {err}")

    return image_bytes


# ─────────────────────────────────────────────
# STEP 5 — Build EPUB with cover and metadata
# ─────────────────────────────────────────────

def _cover_file_name(image_data: bytes) -> str:
    """Pick a cover file name whose extension matches the image's magic bytes."""
    if image_data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "images/cover.png"
    if image_data.startswith((b"GIF87a", b"GIF89a")):
        return "images/cover.gif"
    # Anything else is stored as JPEG, matching the previous behavior.
    return "images/cover.jpg"


def build_epub(metadata: dict, chapters: list[dict]) -> None:
    """Write an EPUB built from *metadata* and *chapters* to the working dir.

    Args:
        metadata: Dict with ``title`` and ``author``; ``cover_url``,
            ``cover_info`` and ``description`` are optional.
        chapters: Chapter dicts with ``title`` and ``html``. Entries without
            ``html`` are skipped. Order is preserved.

    The cover is downloaded here (not during scraping). The output file is
    named after the book title via ``make_safe_filename``. Nothing is written
    when *chapters* is empty.
    """
    if not chapters:
        print("  ✗ No chapters to build EPUB!")
        return

    book = epub.EpubBook()

    # ── Basic metadata ──────────────────────────────────
    # Derive the identifier from the book URL so different books do not share
    # one hard-coded identifier.
    book.set_identifier(BOOK_INTRO_URL.rstrip("/").split("/")[-1])
    book.set_title(metadata["title"])
    book.set_language("vi")
    book.add_author(metadata["author"])

    # ── Download and add cover image ─────────────────────
    if metadata.get("cover_url"):
        cover_image = download_cover_image(metadata["cover_url"])

        if cover_image:
            try:
                # set_cover() takes a *file name* (not a uid) and registers
                # the image item itself, so no separate EpubItem is added.
                book.set_cover(_cover_file_name(cover_image), cover_image)
                print("  ✓ Cover image added to EPUB")
            except Exception as e:
                print(f"  ⚠ Error adding cover to EPUB: {e}")
        else:
            print("  ⚠ Cover image not available, continuing without cover")

    # ── Shared CSS ──────────────────────────────────────
    css_content = """
        body  { font-family: Georgia, serif; line-height: 1.8;
                margin: 2em; color: #222; }
        h1, h2 { font-size: 1.4em; margin-top: 2em; color: #444; }
        p     { margin: 0.6em 0; text-indent: 1.5em; }
        .bookContentBody { margin: 1em 0; }
        .cover-info { margin: 1em 0; padding: 1em; border: 1px solid #ddd; }
        .book-desc { margin: 1em 0; padding: 1em; }
        .book-stats { display: inline-block; margin-right: 1em; }
    """
    css_item = epub.EpubItem(
        uid        = "style_main",
        file_name  = "style/main.css",
        media_type = "text/css",
        content    = css_content,
    )
    book.add_item(css_item)

    # ── Create cover page with metadata ──────────────────
    cover_page = epub.EpubHtml(
        title     = "Bìa sách",
        file_name = "cover_page.xhtml",
        lang      = "vi",
    )

    # The title is plain text, so escape it. cover_info/description are HTML
    # fragments captured from the site; cached metadata may hold None for
    # them, which must not be rendered as the text "None".
    safe_title = escape(str(metadata["title"]), quote=False)
    cover_info_html = metadata.get("cover_info") or ""
    description_html = metadata.get("description") or ""

    cover_page_content = f"""
        <html>
        <head>
            <link rel="stylesheet" type="text/css" href="style/main.css"/>
        </head>
        <body>
            <div style="text-align: center; margin: 2em 0;">
                <h1>{safe_title}</h1>
                <p>
                    <a href="{BOOK_INTRO_URL}">{BOOK_INTRO_URL}</a>
                </p>
            </div>
            
            {cover_info_html}
            
            {description_html}
        </body>
        </html>
    """

    cover_page.content = cover_page_content
    cover_page.add_item(css_item)
    book.add_item(cover_page)

    # ── Build chapter items ──────────────────────────────
    epub_chapters = []
    spine         = ["nav", cover_page]

    for index, chapter in enumerate(chapters):
        if not chapter.get("html"):
            continue

        print(f"  [{index + 1}/{len(chapters)}] Adding: {chapter['title']}")

        epub_chapter = epub.EpubHtml(
            title     = chapter["title"],
            file_name = f"chap_{index + 1:04d}.xhtml",
            lang      = "vi",
        )
        epub_chapter.content = chapter["html"]
        epub_chapter.add_item(css_item)

        book.add_item(epub_chapter)
        epub_chapters.append(epub_chapter)
        spine.append(epub_chapter)

    # ── Navigation ──────────────────────────────────────
    book.toc = [cover_page] + epub_chapters
    book.spine = spine

    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # ── Write EPUB file ─────────────────────────────────
    output_path = make_safe_filename(metadata["title"]) + ".epub"
    epub.write_epub(output_path, book, {})
    print(f"\n✅  Saved ebook → {output_path}")



# ─────────────────────────────────────────────
# UTILITIES
# ─────────────────────────────────────────────
def make_safe_filename(text: str) -> str:
    """Return *text* with characters that are invalid in file names removed.

    Strips ``\\ / : * ? " < > |`` and ASCII control characters (such as
    newlines and tabs), then trims surrounding whitespace. When nothing is
    left, ``"untitled"`` is returned so callers never build a name that
    consists only of an extension.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', text).strip()
    return cleaned or "untitled"

def clean_content_for_cached_chapters(cache_file: str = CACHE_FILE, save: bool = True) -> list[dict]:
    """Re-run ``ContentCleaner`` over the HTML of every cached chapter.

    Only the ``#bookContentBody`` element of each chapter is cleaned, so the
    ``<h2>`` heading stored in front of it survives. Chapters whose HTML has
    no such element are cleaned as a whole, with their first ``<h2>`` (if
    any) re-attached afterwards.

    Args:
        cache_file: Path of the JSON cache to process.
        save: When true, write the cleaned chapters back to the cache file.

    Returns:
        The cache's chapter list (cleaned in place); ``[]`` when the cache
        holds no chapters.
    """
    print("\n🧹 Cleaning cached chapters...")

    cache = ScraperCache(cache_file)
    chapters = cache.get_chapters()

    if not chapters:
        print("  ⚠ No chapters found in cache")
        return []

    cleaned_count = 0

    for i, chapter in enumerate(chapters):
        raw_html = chapter.get("html")
        if not raw_html:
            continue

        try:
            soup = BeautifulSoup(raw_html, "html.parser")
            content_div = soup.select_one("#bookContentBody")

            if content_div is not None:
                # Cleaning just the body keeps the heading and the wrapper.
                ContentCleaner.clean(content_div)
                chapter["html"] = str(soup)
            else:
                # ContentCleaner.clean() keeps only <p> elements, so capture
                # the heading before the whole document is cleaned.
                heading = soup.find("h2")
                heading_html = f"{heading}\n" if heading is not None else ""
                ContentCleaner.clean(soup)
                chapter["html"] = heading_html + str(soup)

            cleaned_count += 1

            print(f"  ✓ Cleaned [{i+1}/{len(chapters)}]: {chapter.get('title', 'No title')}")

        except Exception as e:
            # Best effort: one malformed chapter must not abort the rest.
            print(f"  ✗ Error cleaning chapter {chapter.get('title')}: {e}")
            continue

    if save:
        cache.save()
        print(f"\n  💾 Saved cleaned chapters back to cache")

    print(f"\n✅ Done. Cleaned {cleaned_count}/{len(chapters)} chapters.")
    return chapters


# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────

def main():
    """Run the scrape → cache → EPUB pipeline.

    1. Load the cache and reset the daily counter when the date changed.
    2. Fetch the intro page (always, for the chapter list; metadata is taken
       from the cache when present).
    3. Fetch chapters that are not cached yet, up to today's remaining quota.
    4. Build the EPUB from all cached chapters that have content, ordered as
       on the site.

    Any exception is printed with a traceback instead of propagating.
    """
    print("=" * 70)
    print("  Web Novel → EPUB Converter (with Cover & Metadata)")
    print("=" * 70)
    if LIMIT_CHAPTERS_FOR_TESTING is not None:
        print(f"⚠ Running in TEST MODE: Limiting to first {LIMIT_CHAPTERS_FOR_TESTING} chapters")
        print("  To scrape all chapters, set LIMIT_CHAPTERS_FOR_TESTING = None in the script.")
        print("=" * 70)

    cache = ScraperCache(CACHE_FILE)
    cache.reset_daily_count()
    cache.print_status()

    try:
        # 1. Fetch intro page
        metadata = cache.get_metadata()
        if not metadata:
            metadata, all_chapters = get_intro_page_data(BOOK_INTRO_URL)
            cache.set_metadata(metadata)
        else:
            print("\n[1/3] Using cached metadata")
            print(f"      Title : {metadata['title']}")
            print(f"      Author: {metadata['author']}")

            _, all_chapters = get_intro_page_data(BOOK_INTRO_URL)

        # 2. Compare with cache
        cached_chapters = cache.get_chapters()
        cached_urls = {c["url"] for c in cached_chapters}
        new_chapters = [c for c in all_chapters if c["url"] not in cached_urls]

        print(f"      Total on site: {len(all_chapters)}")
        print(f"      Already cached: {len(cached_chapters)}")
        print(f"      Need to fetch: {len(new_chapters)}")

        # 3. Fetch new chapters
        print("\n[2/3] Downloading chapter content …")
        # Clamp at zero: a negative slice bound would select from the end of
        # the list instead of selecting nothing.
        remaining_quota = max(0, cache.get_remaining_today())
        chapters_to_fetch = new_chapters[:remaining_quota]

        if len(chapters_to_fetch) < len(new_chapters):
            print(f"  ⚠ Daily limit reached! Only fetching {len(chapters_to_fetch)}/{len(new_chapters)} new chapters")

        for i, chapter in enumerate(chapters_to_fetch):
            if not cache.can_fetch_more():
                print(f"\n  ⛔ Daily limit of {MAX_CHAPTERS_PER_DAY} chapters reached!")
                print(f"  Run again tomorrow to continue scraping.")
                break

            remaining = cache.get_remaining_today()
            print(f"  [{i + 1}/{len(chapters_to_fetch)}] (Remaining: {remaining}) {chapter['title']}")

            try:
                chapter["html"] = get_chapter_content(chapter)
                cache.add_chapter(chapter)
                time.sleep(WAIT_TIME_PER_CHAPTER)
            except Exception as e:
                # A failed chapter is not cached, so a later run retries it.
                print(f"    ✗ Error: {e}")
                continue

        # 4. Build EPUB
        print("\n[3/3] Building EPUB …")
        all_cached = cache.get_chapters()

        # The cache is in fetch order: a chapter that failed once and was
        # fetched on a later run sits at the end. Re-order by position on the
        # site; chapters no longer listed there keep their cache order and go
        # last (sorted() is stable).
        site_position = {}
        for position, listed in enumerate(all_chapters):
            site_position.setdefault(listed["url"], position)
        ordered_chapters = sorted(
            all_cached,
            key=lambda c: site_position.get(c["url"], len(site_position)),
        )
        chapters_with_content = [c for c in ordered_chapters if c.get("html")]

        if chapters_with_content:
            build_epub(metadata, chapters_with_content)
        else:
            print("  ⚠ No chapters with content to build EPUB")

        cache.print_status()

        print("\n" + "=" * 70)
        if chapters_with_content:
            print("✅  Done! Open the .epub file with any ebook reader")
        else:
            print("⚠  Scraping in progress. Run again to continue.")
        print("=" * 70)

    except Exception as e:
        print(f"\n❌  Error: {e}")
        import traceback
        traceback.print_exc()

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # Alternative maintenance entry point: re-clean the chapters already
    # stored in the cache instead of scraping.
    # clean_content_for_cached_chapters()
