"""
toc_parse.py
------------
功能:
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
- 支持通过selectedtext在html文件中定位章节标题。
- 兼容多种EPUB格式支持批量测试。
依赖config.py 统一管理路径和配置项。
主要接口:
parse_navpoints(navpoints) # 递归解析navPoint节点返回章节树结构。
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
parse_html_title(html_path) # 解析html文件标题。
依赖BeautifulSoup4, pprint, os, typing
"""
import os
import pprint
from typing import Dict, Optional, List, Any

from bs4 import BeautifulSoup

import config
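
# For reference, a minimal sketch of the <navMap> structure inside toc.ncx that
# this module consumes (the NCX element names are standard; the id, src, and
# label values below are invented for illustration):
#
#   <navMap>
#     <navPoint id="navPoint-1">
#       <navLabel><text>Chapter 1</text></navLabel>
#       <content src="part0001.html#filepos123"/>
#       <navPoint id="navPoint-2">...</navPoint>   <!-- nested child chapters -->
#     </navPoint>
#   </navMap>
#
# parse_navpoints() below turns this into a nested dict keyed by navPoint id.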
# ==== Helper: infer the enclosing chapter from where selectedtext appears in an HTML file ====
def find_section_by_selectedtext(html_path, selectedtext):
    """
    Find where selectedtext appears in the HTML file, walk back to the nearest
    preceding h1-h6 heading, and return that heading's text.
    Returns None if no heading is found.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Search all text nodes for selectedtext
        for elem in soup.find_all(string=True):
            if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
                # Walk up through the parents, scanning earlier siblings for the nearest h1-h6
                parent = elem.parent
                while parent:
                    prev = parent.previous_sibling
                    # Scan preceding siblings at this level for an h1-h6
                    while prev:
                        if prev.name and prev.name.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                            return prev.get_text(strip=True)
                        prev = prev.previous_sibling
                    parent = parent.parent
        # Not found: fall back to the document's first h1-h6
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            h = soup.find(tag)
            if h and h.get_text(strip=True):
                return h.get_text(strip=True)
    except Exception:
        pass
    return None
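
# A usage sketch (the path and snippet here are hypothetical, for illustration):
#   heading = find_section_by_selectedtext("examples/book/ch1.html", "some quoted passage")
#   # -> text of the nearest h1-h6 preceding the passage, or None if nothing matches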
def parse_html_title(html_path):
    """
    Parse an HTML file: prefer <title>, otherwise the body's first
    h1/h2/h3/h4/h5/h6, otherwise None.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Prefer <title>
        if soup.title and soup.title.string:
            return soup.title.string.strip()
        # Otherwise the first h1-h6 in the body
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            h = soup.find(tag)
            if h and h.get_text(strip=True):
                return h.get_text(strip=True)
    except Exception:
        pass
    return None
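
# A usage sketch (hypothetical path): parse_html_title("examples/book/ch1.html")
# returns the <title> text, else the first heading's text, or None on any failure.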
def parse_navpoints(navpoints) -> Dict[str, dict]:
    """
    Recursively parse navPoint nodes into a nested dict structure.
    :param navpoints: list of navPoint nodes found by BeautifulSoup
    :return: chapter tree structure
    """
    result = {}
    for navpoint in navpoints:
        label = navpoint.navLabel.text.strip().strip('"“”')
        src = navpoint.content["src"]
        if "#" in src:
            ref, filepos = src.split("#", 1)
        else:
            ref, filepos = src, None
        entry = {
            "label": label,
            "ref": ref,
            "filepos": filepos,
            "children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
        }
        result[navpoint.get("id")] = entry
    # pprint.pprint(result)  # pretty-print result for debugging
    return result
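
# A sketch of the shape parse_navpoints() returns for the <navMap> example near
# the top of this file (ids, labels, and srcs are invented for illustration):
#   {
#       "navPoint-1": {
#           "label": "Chapter 1",
#           "ref": "part0001.html",
#           "filepos": "filepos123",
#           "children": {"navPoint-2": {...}},
#       },
#   }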
def find_label_path(
    node: Any,
    ref: str,
    filepos: Optional[str] = None,
    path: Optional[List[str]] = None
) -> Optional[str]:
    """
    Find the label path for a given ref and filepos in the nested dict structure.
    :param node: current node, a dict or a collection of dicts
    :param ref: HTML file name
    :param filepos: file position, may be None
    :param path: accumulated label path
    :return: full label path joined by " / ", or None if not found
    """
    if path is None:
        path = []
    if isinstance(node, dict):
        nodes = node.values() if "label" not in node else [node]
        # 1. Prefer an exact match on both ref and filepos
        for v in nodes:
            if "label" in v:
                new_path = path + [v["label"]]
                if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
                    title = " / ".join(new_path)
                    # print(f'title ref={ref} filepos={filepos} -> {title}')  # DBG
                    return title
                title = find_label_path(v["children"], ref, filepos, new_path)
                if title:
                    # print(f'title1 ref={ref} filepos={filepos} -> {title}')  # DBG
                    return title
        # 2. If the filepos lookup failed, fall back to the first chapter under the
        #    same ref, i.e. return as soon as the ref alone matches
        if filepos is not None:
            for v in nodes:
                if "label" in v:
                    new_path = path + [v["label"]]
                    # print(f"comparing {v['ref']} == {ref}")
                    if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
                        title = " / ".join(new_path)
                        # print(f'title3 ref={ref} filepos={filepos} -> {title}')  # DBG
                        return title
                    title = find_label_path(v["children"], ref, None, new_path)
                    if title:
                        # print(f'title4 ref={ref} filepos={filepos} -> {title}')  # DBG
                        return title
    # 3. If nothing matched at all, try parsing the title of the HTML file that
    #    ref points to. Only runs at the top-level call.
    if path == [] and ref and ref.endswith('.html'):
        # Search the common locations: this script's directory and the working directory
        caller_dir = os.path.dirname(os.path.abspath(__file__))
        search_dirs = [caller_dir, os.getcwd()]
        for d in search_dirs:
            html_path = os.path.join(d, ref)
            # print(f"checking {html_path}")
            if os.path.isfile(html_path):
                title = parse_html_title(html_path)
                if title:
                    return title
        # Recursive search rooted at each of those directories
        for d in search_dirs:
            for root, _, files in os.walk(d):
                if ref in files:
                    html_path = os.path.join(root, ref)
                    # print(f"2 checking {html_path}")
                    title = parse_html_title(html_path)
                    if title:
                        return title
    return None
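
# A usage sketch (tree comes from parse_navpoints; the ref and filepos values
# here are hypothetical):
#   find_label_path(tree, "part0001.html", "filepos123")
#   # -> e.g. "Part I / Chapter 1", or None when nothing matches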
if __name__ == "__main__":
    # ==== Batch-test a list of toc/html/filepos cases ====
    test_cases = [
        [config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
        [config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
        [config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
        [config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
        [config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
        [config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
        [config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
        [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
    ]
    for epub_dir, html_file, filepos in test_cases:
        # Automatically locate toc.ncx under the epub directory
        toc_path = None
        for root, _, files in os.walk(epub_dir):
            for f in files:
                if f.lower() == "toc.ncx":
                    toc_path = os.path.join(root, f)
                    break
            if toc_path:
                break
        print(f"\n==== Testing epub: {epub_dir} html: {html_file} filepos: {filepos} ====")
        if not toc_path:
            print(f"toc.ncx not found: {epub_dir}")
            continue
        try:
            with open(toc_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "xml")
            nav_map = soup.find("navMap")
            toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
            label_path = find_label_path(toc_tree, html_file, filepos)
            print(f"find_label_path: {label_path if label_path else 'chapter/heading not found'}")
            # Also test parse_html_title directly, for the case where the html is absent from the toc
            html_path = os.path.join(epub_dir, html_file.split('#')[0])
            if os.path.exists(html_path):
                title = parse_html_title(html_path)
                print(f"Parsed html title: {html_path} => {title if title else 'no title found'}")
                # New: locate the chapter heading from selectedtext
                selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
                section = find_section_by_selectedtext(html_path, selectedtext)
                print(f"Heading located via selectedtext: {section if section else 'no related heading found'}")
            else:
                print(f"html file not found: {html_path}")
        except Exception as e:
            print(f"Test failed: {e}")
    # ==== Additional test: chapter lookup and html title parsing for 变宋 notes ====
    print("\n==== Test: 变宋 note chapter lookup and html title parsing ====")
    # Assume the note data looks like this
    note_idref = 'text/part0002_split_003.html'
    note_filepos = None
    # Path to the 变宋 toc.ncx
    bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
    if os.path.exists(bian_song_toc):
        with open(bian_song_toc, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "xml")
        nav_map = soup.find("navMap")
        toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
        # First try find_label_path to locate the chapter
        label_path = find_label_path(toc_tree, note_idref, note_filepos)
        print(f"Lookup {note_idref}: ", label_path if label_path else "chapter not found; try parsing the html title instead")
    else:
        print(f"toc.ncx not found: {bian_song_toc}")