# iBook/toc_parse.py
"""
toc_parse.py (OOP版)
-------------------
功能:
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
- 支持通过selectedtext在html文件中定位章节标题。
- 兼容多种EPUB格式支持批量测试。
依赖config.py 统一管理路径和配置项。
主要接口TOCParser
- parse_navpoints(navpoints)递归解析navPoint节点返回章节树结构。
- find_label_path(node, ref, filepos, path)查找指定ref和filepos的章节label路径。
- find_section_by_selectedtext(html_path, selectedtext):通过选中文本定位章节标题。
- parse_html_title(html_path)解析html文件标题。
依赖BeautifulSoup4, pprint, os, typing
"""
import config
from bs4 import BeautifulSoup
import os
class TOCParser:
def __init__(self):
pass
@staticmethod
def find_section_by_selectedtext(html_path, selectedtext):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
for elem in soup.find_all(string=True):
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
parent = elem.parent
while parent:
prev = parent.previous_sibling
while prev:
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
return prev.get_text(strip=True)
prev = prev.previous_sibling
parent = parent.parent
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass
return None
@staticmethod
def parse_html_title(html_path):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
if soup.title and soup.title.string:
return soup.title.string.strip()
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass
return None
@staticmethod
def parse_navpoints(navpoints):
result = {}
for navpoint in navpoints:
label = navpoint.navLabel.text.strip().strip('"“”')
src = navpoint.content["src"]
if "#" in src:
ref, filepos = src.split("#", 1)
else:
ref, filepos = src, None
entry = {
"label": label,
"ref": ref,
"filepos": filepos,
"children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
return result
@staticmethod
def find_label_path(node, ref, filepos=None, path=None):
if path is None:
path = []
if isinstance(node, dict):
nodes = node.values() if "label" not in node else [node]
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
title = " / ".join(new_path)
return title
title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
if title:
return title
if filepos is not None:
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
title = " / ".join(new_path)
return title
title = TOCParser.find_label_path(v["children"], ref, None, new_path)
if title:
return title
if path == [] and ref and ref.endswith('.html'):
caller_dir = os.path.dirname(os.path.abspath(__file__))
search_dirs = [caller_dir, os.getcwd()]
for d in search_dirs:
html_path = os.path.join(d, ref)
if os.path.isfile(html_path):
title = TOCParser.parse_html_title(html_path)
if title:
return title
for d in search_dirs:
for root, _, files in os.walk(d):
if ref in files:
html_path = os.path.join(root, ref)
title = TOCParser.parse_html_title(html_path)
if title:
return title
return None
if __name__ == "__main__":
# ==== 批量测试指定toc/html/filepos列表 ====
test_cases = [
[config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
[config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
[config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
[config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
[config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
]
for epub_dir, html_file, filepos in test_cases:
toc_path = None
for root, _, files in os.walk(epub_dir):
for f in files:
if f.lower() == "toc.ncx":
toc_path = os.path.join(root, f)
break
if toc_path:
break
print(f"\n==== 测试 epub: {epub_dir} html: {html_file} filepos: {filepos} ====")
if not toc_path:
print(f"未找到toc.ncx: {epub_dir}")
continue
try:
with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path):
title = TOCParser.parse_html_title(html_path)
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
else:
print(f"未找到html文件: {html_path}")
except Exception as e:
print(f"测试失败: {e}")
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
note_idref = 'text/part0002_split_003.html'
note_filepos = None
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题")
else:
print(f"未找到toc.ncx: {bian_song_toc}")