优化效率

This commit is contained in:
douboer
2025-08-12 15:05:29 +08:00
parent 3eaaf8ad2b
commit e55178e316
67 changed files with 19174 additions and 46 deletions

View File

@@ -1,10 +1,21 @@
# parsetoc.py
# -----------------------------
# 用于解析EPUB电子书的toc.ncx目录文件，递归构建章节树结构，支持通过ref和filepos查找完整label路径。
# 支持多种EPUB格式的toc.ncx，包含批量测试用例。
# 依赖BeautifulSoup4
# -----------------------------
from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any
import pprint
# ==== 辅助函数：根据selectedtext在html文件中的位置推断所在章节 ====
def find_section_by_selectedtext(html_path, selectedtext):
"""
在html文件中查找selectedtext出现的位置向上回溯最近的h1-h6标题返回该标题文本。
若未找到标题则返回None。
"""
from bs4 import BeautifulSoup
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
@@ -34,7 +45,6 @@ def parse_html_title(html_path):
"""
解析html文件优先返回<title>否则返回body第一个h1/h2/h3/h4/h5/h6或None。
"""
from bs4 import BeautifulSoup
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
@@ -49,16 +59,6 @@ def parse_html_title(html_path):
except Exception:
pass
return None
# parsetoc.py
# -----------------------------
# 用于解析EPUB电子书的toc.ncx目录文件，递归构建章节树结构，支持通过ref和filepos查找完整label路径。
# 支持多种EPUB格式的toc.ncx，包含批量测试用例。
# 依赖BeautifulSoup4
# -----------------------------
from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any
import pprint
def parse_navpoints(navpoints) -> Dict[str, dict]:
"""
@@ -81,6 +81,9 @@ def parse_navpoints(navpoints) -> Dict[str, dict]:
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
#pprint.pprint(result) # 格式化打印result
return result
def find_label_path(
@@ -155,9 +158,9 @@ if __name__ == "__main__":
["examples/epub_format_3", "Text/011.xhtml", ""],
["examples/epub_format_4", "xhtml/p-006.xhtml", ""],
["examples/变宋", "text/part0005.html", ""],
["examples/变宋", "text/part0002_split_003.html", ""],
["examples/规训与惩罚", "index_split_006.html", ""],
["examples/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
["examples/变宋", "text/part0002_split_003.html", ""],
]
for epub_dir, html_file, filepos in test_cases:
# 自动查找epub目录下的toc.ncx