'update'
This commit is contained in:
42
toc_parse.py
42
toc_parse.py
@@ -1,9 +1,21 @@
|
||||
# parsetoc.py
|
||||
# -----------------------------
|
||||
# 用于解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构,支持通过ref和filepos查找完整label路径。
|
||||
# 支持多种EPUB格式的toc.ncx,包含批量测试用例。
|
||||
# 依赖:BeautifulSoup4
|
||||
# -----------------------------
|
||||
"""
|
||||
toc_parse.py
|
||||
------------
|
||||
功能:
|
||||
- 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。
|
||||
- 支持通过ref和filepos查找完整label路径。
|
||||
- 支持通过selectedtext在html文件中定位章节标题。
|
||||
- 兼容多种EPUB格式,支持批量测试。
|
||||
|
||||
配置:通过 config.py 统一管理路径和配置项。
|
||||
主要接口:
|
||||
parse_navpoints(navpoints) # 递归解析navPoint节点,返回章节树结构。
|
||||
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
|
||||
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
|
||||
parse_html_title(html_path) # 解析html文件标题。
|
||||
依赖:BeautifulSoup4(第三方库);pprint、os、typing(标准库)
|
||||
"""
|
||||
import config
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -160,14 +172,14 @@ def find_label_path(
|
||||
if __name__ == "__main__":
|
||||
# ==== 批量测试指定toc/html/filepos列表 ====
|
||||
test_cases = [
|
||||
["examples/epub_format_1", "index_split_015.html", "filepos684970"],
|
||||
["examples/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
|
||||
["examples/epub_format_3", "Text/011.xhtml", ""],
|
||||
["examples/epub_format_4", "xhtml/p-006.xhtml", ""],
|
||||
["examples/变宋", "text/part0005.html", ""],
|
||||
["examples/变宋", "text/part0002_split_003.html", ""],
|
||||
["examples/规训与惩罚", "index_split_006.html", ""],
|
||||
["examples/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
|
||||
[config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
|
||||
[config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
|
||||
[config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
|
||||
[config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
|
||||
[config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
|
||||
[config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
|
||||
[config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
|
||||
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
|
||||
]
|
||||
for epub_dir, html_file, filepos in test_cases:
|
||||
# 自动查找epub目录下的toc.ncx
|
||||
@@ -212,7 +224,7 @@ if __name__ == "__main__":
|
||||
note_idref = 'text/part0002_split_003.html'
|
||||
note_filepos = None
|
||||
# 变宋toc.ncx路径
|
||||
bian_song_toc = "examples/变宋/toc.ncx"
|
||||
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
|
||||
import os
|
||||
if os.path.exists(bian_song_toc):
|
||||
with open(bian_song_toc, "r", encoding="utf-8") as f:
|
||||
|
||||
Reference in New Issue
Block a user