This commit is contained in:
douboer
2025-08-15 13:49:02 +08:00
parent 8873c70a60
commit 0bc6844209
76 changed files with 726 additions and 11197 deletions

View File

@@ -1,9 +1,21 @@
# parsetoc.py
# -----------------------------
# 用于解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构,支持通过ref和filepos查找完整label路径。
# 支持多种EPUB格式的toc.ncx,包含批量测试用例。
# 依赖:BeautifulSoup4
# -----------------------------
"""
toc_parse.py
------------
功能:
- 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
- 支持通过selectedtext在html文件中定位章节标题。
- 兼容多种EPUB格式,支持批量测试。
依赖:config.py 统一管理路径和配置项。
主要接口:
parse_navpoints(navpoints) # 递归解析navPoint节点,返回章节树结构。
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
parse_html_title(html_path) # 解析html文件标题。
依赖:BeautifulSoup4, pprint, os, typing
"""
import config
from bs4 import BeautifulSoup
@@ -160,14 +172,14 @@ def find_label_path(
if __name__ == "__main__":
# ==== 批量测试指定toc/html/filepos列表 ====
test_cases = [
["examples/epub_format_1", "index_split_015.html", "filepos684970"],
["examples/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
["examples/epub_format_3", "Text/011.xhtml", ""],
["examples/epub_format_4", "xhtml/p-006.xhtml", ""],
["examples/变宋", "text/part0005.html", ""],
["examples/变宋", "text/part0002_split_003.html", ""],
["examples/规训与惩罚", "index_split_006.html", ""],
["examples/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
[config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
[config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
[config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
[config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
[config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
]
for epub_dir, html_file, filepos in test_cases:
# 自动查找epub目录下的toc.ncx
@@ -212,7 +224,7 @@ if __name__ == "__main__":
note_idref = 'text/part0002_split_003.html'
note_filepos = None
# 变宋toc.ncx路径
bian_song_toc = "examples/变宋/toc.ncx"
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
import os
if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f: