"""
Book Scraper & EPUB Generator (with Playwright)
================================================
This script scrapes a Vietnamese web novel from wikicv.net and packages all
chapters into a clean, readable EPUB ebook file.

Uses Playwright to render JavaScript-heavy pages.
Supports resumable scraping with JSON cache.
Includes book cover and metadata.

Requirements:
    pip install playwright beautifulsoup4 ebooklib pillow requests
    python -m playwright install chromium
"""

import json
import os
import re
import time
import traceback
from datetime import datetime
from html import escape as html_escape
from io import BytesIO
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from ebooklib import epub
from playwright.sync_api import sync_playwright

# ─────────────────────────────────────────────
# CONFIGURATION — change these if needed
# ─────────────────────────────────────────────
BOOK_INTRO_URL = "https://wikicv.net/truyen/the-gioi-huy-diet-sau-trong-sinh-YOSSf1S4CHja3jT1"

# Last path segment of the intro URL; used for the cache file name and the
# EPUB identifier so that each book gets its own.
BOOK_SLUG = BOOK_INTRO_URL.rstrip("/").split("/")[-1]

# ── Scraping behavior ──────────────────────
WAIT_TIME_PER_CHAPTER = 1.5        # seconds between chapter fetches
MAX_CHAPTERS_PER_DAY = 300         # stop after this many chapters fetched today
LIMIT_CHAPTERS_FOR_TESTING = None  # Set to an integer to limit chapters during testing (e.g. 5)
CACHE_FILE = BOOK_SLUG + ".json"   # where to save progress

# ── Playwright settings ────────────────────
BROWSER_HEADLESS = True            # Set to False to SHOW browser
BROWSER_TIMEOUT = 360000           # milliseconds (6 minutes)
SLOW_MO = 1000                     # milliseconds - slows down browser actions
EXTRA_WAIT_AFTER_PAGE_LOAD = 23    # seconds to wait after page load for JS content

# File extensions accepted for the embedded cover image.
_COVER_SUFFIXES = {".jpg", ".jpeg", ".png", ".gif"}


# ─────────────────────────────────────────────
# CONTENT CLEANER — Remove ads
# ─────────────────────────────────────────────
class ContentCleaner:
    """Removes ads and unwanted elements from chapter content.

    NOTE: ``clean`` rebuilds the chapter body from the text of its ``<p>``
    elements only, so every non-paragraph element is dropped. The selector
    lists below are kept as configuration but are not consulted by ``clean``.
    """

    # Elements to remove by ID
    AD_IDS = [
        "tpads_article_middle_container",
        "ubvideoFrame",
        "adContainer",
        "advertisement",
        "ads",
        "google_ads_div",
        "unibots-video",
    ]

    # Elements to remove by class
    AD_CLASSES = [
        "tpm-unit",
        "ad",
        "advertisement",
        "advert",
        "ads",
        "ad-banner",
        "ad-container",
        "sidebar-ads",
        "google-ads",
        "sponsored",
        "sponsored-content",
        "gliaplayer-container",
        "tpm-unit",
        "center ankhinho",
        "ankhito center",
        "gliaplayer-container styles-module_container_xuywD",
        "InstreamDom_root_21jVv",
    ]

    # Elements to remove by attribute
    AD_ATTRIBUTES = [
        "data-ad-slot",
        "data-ad-client",
        "data-google-query-id",
        "data-readmore-toggle",
        "data-gc-boot-time",
        "data-gc-test-id",
    ]

    # Tags to completely remove
    REMOVE_TAGS = ["script", "style", "iframe"]

    # Backslashes are escaped explicitly; the runtime value is unchanged but
    # the literal no longer relies on an invalid escape sequence.
    UNWANTED_STRINGS = [
        "·",
        "dkạhsdsadjdá",
        "oiewơie",
        "✧⋄⋆⋅⋆⋄✧⋄⋆⋅⋆⋄✧ ฅ/ᐠ｡ꞈ｡ᐟ\\ฅ Convert by Haruko ฅ/ᐠ｡ꞈ｡ᐟ\\ฅ ✧⋄⋆⋅⋆⋄✧⋄⋆⋅⋆⋄✧",
        "☀Truyện được đăng bởi Reine☀",
    ]

    @classmethod
    def clean(cls, content_div) -> str:
        """Replace the children of *content_div* with plain ``<p>`` paragraphs.

        Collects the stripped text of every non-empty ``<p>`` descendant,
        rebuilds them as bare ``<p>`` elements, and swaps them in for the
        element's previous content (mutating *content_div* in place).

        Returns:
            The rebuilt paragraph markup as a string, or ``""`` when
            *content_div* is ``None``.
        """
        if content_div is None:
            return ""

        texts = (p.get_text(strip=True) for p in content_div.find_all("p"))
        # Escape the text: it is re-parsed as HTML below and ends up in XHTML,
        # so a literal "<" or "&" in the story must not be read as markup.
        clean_html = "".join(
            f"<p>{html_escape(text, quote=False)}</p>" for text in texts if text
        )

        content_div.clear()
        cleaned_soup = BeautifulSoup(clean_html, "html.parser")
        for child in list(cleaned_soup.contents):
            content_div.append(child)
        return clean_html


# ─────────────────────────────────────────────
# CACHE MANAGEMENT — save/load progress
# ─────────────────────────────────────────────
class ScraperCache:
    """Manages JSON cache for resumable scraping."""

    def __init__(self, cache_file: str = CACHE_FILE):
        self.cache_file = cache_file
        self.data = self._load()

    def _load(self) -> dict:
        """Load cache from the JSON file, falling back to an empty cache.

        Keys missing from the file are filled in from the default structure
        so the other methods can index ``self.data`` without a ``KeyError``.
        """
        defaults = self._default_cache()
        if not os.path.exists(self.cache_file):
            return defaults
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            print(f" ⚠ Error loading cache: {e}")
            return defaults
        if not isinstance(loaded, dict):
            print(" ⚠ Error loading cache: unexpected JSON structure")
            return defaults
        return {**defaults, **loaded}

    def _default_cache(self) -> dict:
        """Return empty cache structure."""
        return {
            "book_metadata": {},
            "chapters": [],
            "last_updated": None,
            "chapters_fetched_today": 0,
            "last_fetch_date": None,
        }

    def save(self) -> None:
        """Save cache to the JSON file.

        The data is written to a temporary sibling file and then moved into
        place, so an interrupted run cannot leave a truncated cache behind.
        Failures are reported on stdout and not raised.
        """
        self.data["last_updated"] = datetime.now().isoformat()
        tmp_path = f"{self.cache_file}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, self.cache_file)
            print(f" ✓ Cache saved to {self.cache_file}")
        except (OSError, TypeError, ValueError) as e:
            print(f" ✗ Error saving cache: {e}")

    def add_chapter(self, chapter: dict) -> None:
        """Add a chapter to the cache unless its URL is already present.

        A newly added chapter also increments today's fetch counter and
        triggers a save.
        """
        already_cached = any(
            c["url"] == chapter["url"] for c in self.data["chapters"]
        )
        if not already_cached:
            self.data["chapters"].append(chapter)
            self.data["chapters_fetched_today"] += 1
            self.save()

    def get_chapters(self) -> list[dict]:
        """Get all cached chapters."""
        return self.data["chapters"]

    def get_metadata(self) -> dict:
        """Get cached metadata."""
        return self.data["book_metadata"]

    def set_metadata(self, metadata: dict) -> None:
        """Set book metadata (only stores serializable data) and save."""
        # Copy only the known, JSON-serializable fields.
        clean_metadata = {
            "title": metadata.get("title"),
            "author": metadata.get("author"),
            "cover_url": metadata.get("cover_url"),
            "cover_info": metadata.get("cover_info"),
            "description": metadata.get("description"),
        }
        self.data["book_metadata"] = clean_metadata
        self.save()

    def reset_daily_count(self) -> None:
        """Reset the daily counter if the last fetch date is not today."""
        today = datetime.now().date().isoformat()
        last_date = self.data.get("last_fetch_date")
        if last_date != today:
            self.data["chapters_fetched_today"] = 0
            self.data["last_fetch_date"] = today
            self.save()
            print(f" ✓ Daily counter reset for {today}")

    def can_fetch_more(self) -> bool:
        """Check if we can fetch more chapters today."""
        return self.data["chapters_fetched_today"] < MAX_CHAPTERS_PER_DAY

    def get_remaining_today(self) -> int:
        """Get how many chapters we can still fetch today (never negative)."""
        # Clamp at zero: callers use this as a slice bound, and a negative
        # bound would select chapters from the wrong end of the list.
        return max(0, MAX_CHAPTERS_PER_DAY - self.data["chapters_fetched_today"])

    def print_status(self) -> None:
        """Print cache status."""
        print(f"\n Cache Status:")
        print(f" Chapters cached: {len(self.data['chapters'])}")
        print(f" Fetched today: {self.data['chapters_fetched_today']}/{MAX_CHAPTERS_PER_DAY}")
        print(f" Remaining today: {self.get_remaining_today()}")
        print(f" Last updated: {self.data['last_updated']}")


# ─────────────────────────────────────────────
# STEP 1 — Fetch a page with Playwright and parse its HTML
# ─────────────────────────────────────────────
def fetch_page_with_playwright(url: str) -> BeautifulSoup:
    """
    Download a URL using Playwright (renders JavaScript) and return a
    BeautifulSoup object for parsing.

    Any error is printed and then re-raised to the caller.
    """
    print(f" Fetching: {url}")
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=BROWSER_HEADLESS,
                slow_mo=SLOW_MO if not BROWSER_HEADLESS else 0,
            )
            try:
                page = browser.new_page()
                page.set_viewport_size({"width": 1024, "height": 768})
                page.goto(url, wait_until="load", timeout=BROWSER_TIMEOUT)
                try:
                    page.wait_for_selector("body", timeout=5000)
                except Exception:
                    # Best effort: carry on and read whatever has rendered.
                    print("⚠ Timeout waiting for body element")
                time.sleep(EXTRA_WAIT_AFTER_PAGE_LOAD)  # Extra wait to ensure all content is loaded
                html_content = page.content()
            finally:
                # Close the browser even when navigation fails.
                browser.close()
        return BeautifulSoup(html_content, "html.parser")
    except Exception as e:
        print(f" ✗ Error fetching {url}: {e}")
        raise


def fetch_page(url: str) -> BeautifulSoup:
    """Wrapper function — uses Playwright instead of requests."""
    return fetch_page_with_playwright(url)


# ─────────────────────────────────────────────
# STEP 2 — Extract metadata, cover URL, and chapter links
# ─────────────────────────────────────────────
def get_intro_page_data(intro_url: str) -> tuple[dict, list[dict]]:
    """
    Fetch the intro page ONCE and extract:
      1. Book metadata (title, author, cover URL, description, info)
      2. All chapter links

    Returns: (metadata_dict, chapters_list)

    Raises:
        RuntimeError: if the page has no ``div.volume-list`` element.
    """
    print("\n[1/3] Fetching intro page (metadata + chapter list) …")
    soup = fetch_page(intro_url)

    # Scheme + host of the intro page; relative links are resolved against it.
    site_root = urljoin(intro_url, "/")

    # ─── Extract Metadata ───────────────────────────────
    # Title
    title_tag = soup.find("h2", style=lambda x: x and "font-size: 1.7rem" in x)
    book_title = title_tag.get_text(strip=True) if title_tag else "Rơi Xuống Chỗ Trống"

    # Author: first paragraph mentioning "tác giả"; a link inside it wins
    # over the text after the colon.
    author = "Không rõ"
    for p_tag in soup.find_all("p"):
        text = p_tag.get_text(strip=True)
        if "tác giả" not in text.lower():
            continue
        parts = text.split(":", 1)
        if len(parts) > 1 and parts[1].strip():
            author = parts[1].strip()
        author_link = p_tag.find("a")
        if author_link:
            author = author_link.get_text(strip=True)
        break

    # ─── Extract Cover Image URL (don't download yet) ────
    cover_url = None
    cover_img_tag = soup.select_one(
        "body > main > div > div.flexbox > div:nth-child(1) > div.book-info > div.cover-wrapper img"
    )
    if cover_img_tag:
        cover_url = cover_img_tag.get("src")
        if cover_url:
            # Handles absolute, root-relative, protocol-relative and bare
            # relative URLs alike.
            cover_url = urljoin(site_root, cover_url)
            print(f" ✓ Cover URL found: {cover_url}")

    # ─── Extract Cover Info (book stats) ─────────────────
    cover_info_html = ""
    cover_info_div = soup.select_one(
        "body > main > div > div.flexbox > div:nth-child(1) > div.book-info > div.cover-info"
    )
    if cover_info_div:
        cover_info_html = str(cover_info_div)

    # ─── Extract Book Description ───────────────────────
    description_html = ""
    book_desc_div = soup.select_one(
        "body > main > div > div.flexbox > div:nth-child(1) > div.book-desc"
    )
    if book_desc_div:
        description_html = str(book_desc_div)

    metadata = {
        "title": book_title,
        "author": author,
        "cover_url": cover_url,  # Store only URL, not binary data
        "cover_info": cover_info_html,
        "description": description_html,
    }

    print(f" ✓ Metadata extracted:")
    print(f" Title : {metadata['title']}")
    print(f" Author: {metadata['author']}")
    if cover_url:
        print(f" Cover : {cover_url}")

    # ─── Extract Chapter Links ──────────────────────────
    volume_list = soup.select_one("div.volume-list")
    if not volume_list:
        raise RuntimeError(
            "Could not find 'div.volume-list' on the intro page. "
            "The site layout may have changed."
        )

    chapters = []
    for link_tag in volume_list.find_all("a", class_="truncate", href=True):
        title = link_tag.get_text(strip=True)
        raw_href = link_tag["href"].strip()
        if not (title and raw_href):
            continue
        chapters.append({"title": title, "url": urljoin(site_root, raw_href)})

    print(f" ✓ Found {len(chapters)} chapters on intro page.")

    if LIMIT_CHAPTERS_FOR_TESTING:
        print(f" ⚠ Limiting to first {LIMIT_CHAPTERS_FOR_TESTING} chapters for testing.")
        chapters = chapters[:LIMIT_CHAPTERS_FOR_TESTING]

    return metadata, chapters


# ─────────────────────────────────────────────
# STEP 3 — Scrape chapter content
# ─────────────────────────────────────────────
def get_chapter_content(chapter: dict) -> str:
    """
    Visit a chapter URL and return its body text as an HTML string.
    Removes ads and unwanted elements using ContentCleaner.

    When the page has no ``#bookContentBody`` element, a placeholder
    paragraph is returned instead.
    """
    soup = fetch_page(chapter["url"])
    content_div = soup.select_one("#bookContentBody")
    if not content_div:
        print(f" ⚠ No #bookContentBody found at {chapter['url']}")
        return "<p>(Nội dung không tìm thấy.)</p>"

    # Clean unwanted elements
    ContentCleaner.clean(content_div)

    # Escape the title so characters such as "&" or "<" stay valid XHTML.
    title_html = f"<h2>{html_escape(chapter['title'], quote=False)}</h2>"
    content_html = str(content_div)
    return f"{title_html}\n{content_html}"


# ─────────────────────────────────────────────
# STEP 4 — Download cover image
# ─────────────────────────────────────────────
def download_cover_image(cover_url: str) -> Optional[bytes]:
    """
    Download cover image from URL.
    Returns bytes if successful, None if failed.
    """
    if not cover_url:
        print(" ⚠ No cover URL provided")
        return None

    try:
        print(f" Downloading cover image: {cover_url}")
        response = requests.get(cover_url, timeout=15)
    except requests.exceptions.Timeout:
        print(f" ⚠ Timeout downloading cover image")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ⚠ Error downloading cover: {e}")
        return None

    if response.status_code != 200:
        print(f" ⚠ Failed to download cover (status {response.status_code})")
        return None

    print(f" ✓ Cover image downloaded ({len(response.content)} bytes)")
    return response.content


def _cover_file_name(cover_url: str) -> str:
    """Return the in-EPUB file name for the cover, keeping a known image suffix."""
    suffix = Path(urlsplit(cover_url).path).suffix.lower()
    if suffix not in _COVER_SUFFIXES:
        suffix = ".jpg"
    return f"images/cover{suffix}"


# ─────────────────────────────────────────────
# STEP 5 — Build EPUB with cover and metadata
# ─────────────────────────────────────────────
def build_epub(metadata: dict, chapters: list[dict]) -> None:
    """
    Create an EPUB ebook from the scraped chapter data.
    Includes cover image (downloaded at build time), cover info, and
    book description.

    Chapters without an ``"html"`` value are skipped. The file is written to
    the current directory, named after the sanitized book title.
    """
    if not chapters:
        print(" ✗ No chapters to build EPUB!")
        return

    book = epub.EpubBook()

    # ── Basic metadata ──────────────────────────────────
    # Identifier is derived from the book's URL slug so different books do
    # not share one identifier.
    book.set_identifier(BOOK_SLUG)
    book.set_title(metadata["title"])
    book.set_language("vi")
    book.add_author(metadata["author"])

    # ── Download and add cover image ─────────────────────
    cover_image = None
    if metadata.get("cover_url"):
        cover_image = download_cover_image(metadata["cover_url"])

    if cover_image:
        try:
            # set_cover registers the image item itself; its first argument
            # is the file name inside the EPUB, not an item uid.
            book.set_cover(_cover_file_name(metadata["cover_url"]), cover_image)
            print(" ✓ Cover image added to EPUB")
        except Exception as e:
            print(f" ⚠ Error adding cover to EPUB: {e}")
    else:
        print(" ⚠ Cover image not available, continuing without cover")

    # ── Shared CSS ──────────────────────────────────────
    css_content = """
    body { font-family: Georgia, serif; line-height: 1.8; margin: 2em; color: #222; }
    h1, h2 { font-size: 1.4em; margin-top: 2em; color: #444; }
    p { margin: 0.6em 0; text-indent: 1.5em; }
    .bookContentBody { margin: 1em 0; }
    .cover-info { margin: 1em 0; padding: 1em; border: 1px solid #ddd; }
    .book-desc { margin: 1em 0; padding: 1em; }
    .book-stats { display: inline-block; margin-right: 1em; }
    """
    css_item = epub.EpubItem(
        uid="style_main",
        file_name="style/main.css",
        media_type="text/css",
        content=css_content,
    )
    book.add_item(css_item)

    # ── Create cover page with metadata ──────────────────
    cover_page = epub.EpubHtml(
        title="Bìa sách",
        file_name="cover_page.xhtml",
        lang="vi",
    )
    # cover_info / description are HTML fragments captured from the intro
    # page and are inserted as-is; cached values may be None.
    cover_page_content = f"""
    <h1>{html_escape(metadata['title'], quote=False)}</h1>
    <p>{html_escape(BOOK_INTRO_URL, quote=False)}</p>
    {metadata.get('cover_info') or ''}
    {metadata.get('description') or ''}
    """
    cover_page.content = cover_page_content
    cover_page.add_item(css_item)
    book.add_item(cover_page)

    # ── Build chapter items ──────────────────────────────
    epub_chapters = []
    spine = ["nav", cover_page]

    for index, chapter in enumerate(chapters):
        if not chapter.get("html"):
            continue

        print(f" [{index + 1}/{len(chapters)}] Adding: {chapter['title']}")
        epub_chapter = epub.EpubHtml(
            title=chapter["title"],
            file_name=f"chap_{index + 1:04d}.xhtml",
            lang="vi",
        )
        epub_chapter.content = chapter["html"]
        epub_chapter.add_item(css_item)
        book.add_item(epub_chapter)
        epub_chapters.append(epub_chapter)
        spine.append(epub_chapter)

    # ── Navigation ──────────────────────────────────────
    book.toc = [cover_page] + epub_chapters
    book.spine = spine
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # ── Write EPUB file ─────────────────────────────────
    # Fall back to the URL slug when the title sanitizes to nothing.
    output_path = (make_safe_filename(metadata["title"]) or BOOK_SLUG) + ".epub"
    epub.write_epub(output_path, book, {})
    print(f"\n✅ Saved ebook → {output_path}")


# ─────────────────────────────────────────────
# UTILITIES
# ─────────────────────────────────────────────
def make_safe_filename(text: str) -> str:
    """Remove characters not allowed in filenames."""
    return re.sub(r'[\\/:*?"<>|]', '', text).strip()


def clean_content_for_cached_chapters(cache_file: str = CACHE_FILE, save: bool = True) -> list[dict]:
    """
    Load chapters from JSON cache, re-clean their HTML content,
    and optionally save the cleaned result back to cache.

    Returns: list of cleaned chapters
    """
    print("\n🧹 Cleaning cached chapters...")
    cache = ScraperCache(cache_file)
    chapters = cache.get_chapters()

    if not chapters:
        print(" ⚠ No chapters found in cache")
        return []

    cleaned_count = 0
    for i, chapter in enumerate(chapters):
        cached_html = chapter.get("html")
        if not cached_html:
            continue

        try:
            soup = BeautifulSoup(cached_html, "html.parser")
            # Clean only the chapter body when it is present: cleaning the
            # whole document keeps nothing but <p> elements and would drop
            # the chapter's title heading.
            body = soup.select_one("#bookContentBody")
            ContentCleaner.clean(body if body is not None else soup)

            # Save cleaned HTML back
            chapter["html"] = str(soup)
            cleaned_count += 1
            print(f" ✓ Cleaned [{i+1}/{len(chapters)}]: {chapter.get('title', 'No title')}")
        except Exception as e:
            print(f" ✗ Error cleaning chapter {chapter.get('title')}: {e}")
            continue

    if save:
        cache.save()
        print(f"\n 💾 Saved cleaned chapters back to cache")

    print(f"\n✅ Done. Cleaned {cleaned_count}/{len(chapters)} chapters.")
    return chapters


# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def main():
    """Run the scrape → cache → EPUB pipeline for ``BOOK_INTRO_URL``.

    Fetches the intro page, downloads chapters that are not cached yet (up to
    the daily limit), then builds an EPUB from every cached chapter that has
    content. Errors are printed with a traceback rather than raised.
    """
    print("=" * 70)
    print(" Web Novel → EPUB Converter (with Cover & Metadata)")
    print("=" * 70)

    if LIMIT_CHAPTERS_FOR_TESTING is not None:
        print(f"⚠ Running in TEST MODE: Limiting to first {LIMIT_CHAPTERS_FOR_TESTING} chapters")
        print(" To scrape all chapters, set LIMIT_CHAPTERS_FOR_TESTING = None in the script.")
        print("=" * 70)

    cache = ScraperCache(CACHE_FILE)
    cache.reset_daily_count()
    cache.print_status()

    try:
        # 1. Fetch intro page
        metadata = cache.get_metadata()
        if not metadata:
            metadata, all_chapters = get_intro_page_data(BOOK_INTRO_URL)
            cache.set_metadata(metadata)
        else:
            print("\n[1/3] Using cached metadata")
            print(f" Title : {metadata['title']}")
            print(f" Author: {metadata['author']}")
            # The chapter list is still read from the site so that newly
            # published chapters are picked up.
            _, all_chapters = get_intro_page_data(BOOK_INTRO_URL)

        # 2. Compare with cache
        cached_chapters = cache.get_chapters()
        cached_urls = {c["url"] for c in cached_chapters}
        new_chapters = [c for c in all_chapters if c["url"] not in cached_urls]

        print(f" Total on site: {len(all_chapters)}")
        print(f" Already cached: {len(cached_chapters)}")
        print(f" Need to fetch: {len(new_chapters)}")

        # 3. Fetch new chapters
        print("\n[2/3] Downloading chapter content …")
        chapters_to_fetch = new_chapters[:cache.get_remaining_today()]

        if len(chapters_to_fetch) < len(new_chapters):
            print(f" ⚠ Daily limit reached! Only fetching {len(chapters_to_fetch)}/{len(new_chapters)} new chapters")

        for i, chapter in enumerate(chapters_to_fetch):
            if not cache.can_fetch_more():
                print(f"\n ⛔ Daily limit of {MAX_CHAPTERS_PER_DAY} chapters reached!")
                print(f" Run again tomorrow to continue scraping.")
                break

            remaining = cache.get_remaining_today()
            print(f" [{i + 1}/{len(chapters_to_fetch)}] (Remaining: {remaining}) {chapter['title']}")

            try:
                chapter["html"] = get_chapter_content(chapter)
                cache.add_chapter(chapter)
                time.sleep(WAIT_TIME_PER_CHAPTER)
            except Exception as e:
                # A failed chapter is not cached, so a later run retries it.
                print(f" ✗ Error: {e}")
                continue

        # 4. Build EPUB
        print("\n[3/3] Building EPUB …")
        all_cached = cache.get_chapters()
        chapters_with_content = [c for c in all_cached if c.get("html")]

        if chapters_with_content:
            build_epub(metadata, chapters_with_content)
        else:
            print(" ⚠ No chapters with content to build EPUB")

        cache.print_status()

        print("\n" + "=" * 70)
        if chapters_with_content:
            print("✅ Done! Open the .epub file with any ebook reader")
        else:
            print("⚠ Scraping in progress. Run again to continue.")
        print("=" * 70)

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
    # clean_content_for_cached_chapters()