douboer
2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

toc_parse.py

"""
toc_parse.py
------------
toc_parse.py (OOP版)
-------------------
功能:
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
@@ -8,166 +9,120 @@ toc_parse.py
- 兼容多种EPUB格式支持批量测试。
依赖config.py 统一管理路径和配置项。
主要接口:
parse_navpoints(navpoints) # 递归解析navPoint节点返回章节树结构。
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
parse_html_title(html_path) # 解析html文件标题。
主要接口:TOCParser
- parse_navpoints(navpoints)递归解析navPoint节点返回章节树结构。
- find_label_path(node, ref, filepos, path)查找指定ref和filepos的章节label路径。
- find_section_by_selectedtext(html_path, selectedtext)通过选中文本定位章节标题。
- parse_html_title(html_path)解析html文件标题。
依赖BeautifulSoup4, pprint, os, typing
"""
import config
from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any
import pprint
import os
class TOCParser:
    def __init__(self):
        pass

    @staticmethod
    def find_section_by_selectedtext(html_path, selectedtext):
        """
        Find where selectedtext occurs in the HTML file, walk back to the
        nearest preceding h1-h6 heading, and return that heading's text.
        Returns None if no heading is found.
        """
        try:
            with open(html_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
            # Scan all text nodes for selectedtext
            for elem in soup.find_all(string=True):
                if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
                    # Walk up the ancestors, checking each level's preceding
                    # siblings for the nearest h1-h6
                    parent = elem.parent
                    while parent:
                        prev = parent.previous_sibling
                        while prev:
                            if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
                                return prev.get_text(strip=True)
                            prev = prev.previous_sibling
                        parent = parent.parent
            # Fall back to the first h1-h6 anywhere in the document
            for tag in ['h1','h2','h3','h4','h5','h6']:
                h = soup.find(tag)
                if h and h.get_text(strip=True):
                    return h.get_text(strip=True)
        except Exception:
            pass
        return None
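    # For example, given markup like (illustrative):
    #   <h2>Section 3</h2>
    #   <div><p>...the selected text...</p></div>
    # the matching text node's parent is the <p>; its <div> ancestor's
    # preceding sibling is the <h2>, so "Section 3" is returned.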
    @staticmethod
    def parse_html_title(html_path):
        """
        Parse an HTML file: prefer <title>, otherwise return the first
        non-empty h1-h6 in the body, otherwise None.
        """
        try:
            with open(html_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
            # Prefer <title>
            if soup.title and soup.title.string:
                return soup.title.string.strip()
            # Otherwise the first non-empty h1-h6 in the body
            for tag in ['h1','h2','h3','h4','h5','h6']:
                h = soup.find(tag)
                if h and h.get_text(strip=True):
                    return h.get_text(strip=True)
        except Exception:
            pass
        return None
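    # Example (illustrative): for "<html><head><title>Foreword</title></head>...",
    # parse_html_title returns "Foreword"; with no <title>, the first
    # non-empty h1-h6 text is used instead.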
    @staticmethod
    def parse_navpoints(navpoints):
        """
        Recursively parse navPoint nodes into a nested dict chapter tree.
        :param navpoints: navPoint nodes found by BeautifulSoup
        :return: chapter tree keyed by navPoint id
        """
        result = {}
        for navpoint in navpoints:
            label = navpoint.navLabel.text.strip().strip('"“”')
            src = navpoint.content["src"]
            # src may carry a fragment: "file.html#filepos"
            if "#" in src:
                ref, filepos = src.split("#", 1)
            else:
                ref, filepos = src, None
            entry = {
                "label": label,
                "ref": ref,
                "filepos": filepos,
                "children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
            }
            result[navpoint.get("id")] = entry
        return result
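    # The returned tree is shaped roughly like this (ids and labels are
    # illustrative):
    #   {"np-1": {"label": "Chapter 1",
    #             "ref": "ch1.xhtml",
    #             "filepos": "_idParaDest-4",
    #             "children": {"np-2": {...}}}}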
    @staticmethod
    def find_label_path(node, ref, filepos=None, path=None):
        """
        Search the nested dict tree for the label path matching ref and filepos.
        :param node: current node (a chapter dict or a dict of chapters)
        :param ref: HTML file name
        :param filepos: file position, may be None
        :param path: accumulated label path
        :return: full label path joined with " / ", or None if not found
        """
        if path is None:
            path = []
        if isinstance(node, dict):
            nodes = node.values() if "label" not in node else [node]
            # 1. Prefer an exact match on ref and filepos
            for v in nodes:
                if "label" in v:
                    new_path = path + [v["label"]]
                    if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
                        return " / ".join(new_path)
                    title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
                    if title:
                        return title
            # 2. If the filepos lookup failed, fall back to the first chapter
            #    whose ref matches (ignoring any fragment)
            if filepos is not None:
                for v in nodes:
                    if "label" in v:
                        new_path = path + [v["label"]]
                        if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
                            return " / ".join(new_path)
                        title = TOCParser.find_label_path(v["children"], ref, None, new_path)
                        if title:
                            return title
        # 3. As a last resort (top-level call only), parse the title of the
        #    HTML file that ref points to
        if path == [] and ref and ref.endswith('.html'):
            # Look for the HTML file in the usual places, starting from this
            # module's directory
            caller_dir = os.path.dirname(os.path.abspath(__file__))
            search_dirs = [caller_dir, os.getcwd()]
            for d in search_dirs:
                html_path = os.path.join(d, ref)
                if os.path.isfile(html_path):
                    title = TOCParser.parse_html_title(html_path)
                    if title:
                        return title
            # Recursive search rooted at each candidate directory
            for d in search_dirs:
                for root, _, files in os.walk(d):
                    if ref in files:
                        html_path = os.path.join(root, ref)
                        title = TOCParser.parse_html_title(html_path)
                        if title:
                            return title
        return None
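# Minimal usage sketch (the toc.ncx path is illustrative):
#   with open("examples/book/toc.ncx", encoding="utf-8") as f:
#       soup = BeautifulSoup(f, "xml")
#   nav_map = soup.find("navMap")
#   tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
#   print(TOCParser.find_label_path(tree, "ch1.xhtml", "_idParaDest-4"))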
if __name__ == "__main__":
    # ==== Batch-test the given toc/html/filepos cases ====
@@ -182,8 +137,6 @@ if __name__ == "__main__":
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
]
for epub_dir, html_file, filepos in test_cases:
        # Automatically locate toc.ncx under the epub directory
toc_path = None
for root, _, files in os.walk(epub_dir):
for f in files:
@@ -200,39 +153,32 @@ if __name__ == "__main__":
with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
            toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
            label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
            print(f"find_label_path: {label_path if label_path else 'chapter/title not found'}")
            # If the html is not in the toc, test parse_html_title directly
html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path):
                title = TOCParser.parse_html_title(html_path)
                print(f"Parsed html title: {html_path} => {title if title else 'title not found'}")
                # Also locate the section heading from selectedtext
                selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
                section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
                print(f"Section heading located via selectedtext: {section if section else 'no matching heading found'}")
            else:
                print(f"html file not found: {html_path}")
        except Exception as e:
            print(f"Test failed: {e}")
    # ==== Additional test: chapter lookup and html title parsing for 变宋 notes ====
    print("\n==== Test: chapter lookup and html title parsing for 变宋 notes ====")
    # Assume the note data looks like this
    note_idref = 'text/part0002_split_003.html'
    note_filepos = None
    # Path to the 变宋 toc.ncx
    bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
        toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
        # Try find_label_path first
        label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
        print(f"Lookup {note_idref}: ", label_path if label_path else "chapter not found; try parsing the html title")
else:
print(f"未找到toc.ncx: {bian_song_toc}")