优化效率

2025-08-12 15:05:29 +08:00
parent 3eaaf8ad2b
commit e55178e316
67 changed files with 19174 additions and 46 deletions
--- a/toc_parse.py
+++ b/toc_parse.py
@@ -1,10 +1,21 @@
+# toc_parse.py
+# -----------------------------
+# 用于解析EPUB电子书的toc.ncx目录文件，递归构建章节树结构，支持通过ref和filepos查找完整label路径。
+# 支持多种EPUB格式的toc.ncx，包含批量测试用例。
+# 依赖：BeautifulSoup4
+# -----------------------------
+
+
+from bs4 import BeautifulSoup
+from typing import Dict, Optional, List, Any
+import pprint
+
 # ==== 辅助函数：根据selectedtext在html文件中的位置推断所在章节 ====
 def find_section_by_selectedtext(html_path, selectedtext):
    """
    在html文件中查找selectedtext出现的位置，向上回溯最近的h1-h6标题，返回该标题文本。
    若未找到标题，则返回None。
    """
-    from bs4 import BeautifulSoup
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
@@ -34,7 +45,6 @@ def parse_html_title(html_path):
    """
    解析html文件，优先返回<title>，否则返回body第一个h1/h2/h3/h4/h5/h6或None。
    """
-    from bs4 import BeautifulSoup
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
@@ -49,16 +59,6 @@ def parse_html_title(html_path):
    except Exception:
        pass
    return None
-# parsetoc.py
-# -----------------------------
-# 用于解析EPUB电子书的toc.ncx目录文件，递归构建章节树结构，支持通过ref和filepos查找完整label路径。
-# 支持多种EPUB格式的toc.ncx，包含批量测试用例。
-# 依赖：BeautifulSoup4
-# -----------------------------
-
-from bs4 import BeautifulSoup
-from typing import Dict, Optional, List, Any
-import pprint

 def parse_navpoints(navpoints) -> Dict[str, dict]:
    """
@@ -81,6 +81,9 @@ def parse_navpoints(navpoints) -> Dict[str, dict]:
            "children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
        }
        result[navpoint.get("id")] = entry
+
+    #pprint.pprint(result)  # 格式化打印result
+
    return result

 def find_label_path(
@@ -155,9 +158,9 @@ if __name__ == "__main__":
        ["examples/epub_format_3", "Text/011.xhtml", ""],
        ["examples/epub_format_4", "xhtml/p-006.xhtml", ""],
        ["examples/变宋", "text/part0005.html", ""],
+        ["examples/变宋", "text/part0002_split_003.html", ""],
        ["examples/规训与惩罚", "index_split_006.html", ""],
        ["examples/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
-        ["examples/变宋", "text/part0002_split_003.html", ""],
    ]
    for epub_dir, html_file, filepos in test_cases:
        # 自动查找epub目录下的toc.ncx