优化效率
This commit is contained in:
29
toc_parse.py
29
toc_parse.py
@@ -1,10 +1,21 @@
|
||||
# toc_parse.py
|
||||
# -----------------------------
|
||||
# 用于解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构,支持通过ref和filepos查找完整label路径。
|
||||
# 支持多种EPUB格式的toc.ncx,包含批量测试用例。
|
||||
# 依赖:BeautifulSoup4
|
||||
# -----------------------------
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List, Any
|
||||
import pprint
|
||||
|
||||
# ==== 辅助函数:根据selectedtext在html文件中的位置推断所在章节 ====
|
||||
def find_section_by_selectedtext(html_path, selectedtext):
|
||||
"""
|
||||
在html文件中查找selectedtext出现的位置,向上回溯最近的h1-h6标题,返回该标题文本。
|
||||
若未找到标题,则返回None。
|
||||
"""
|
||||
from bs4 import BeautifulSoup
|
||||
try:
|
||||
with open(html_path, 'r', encoding='utf-8') as f:
|
||||
soup = BeautifulSoup(f, 'html.parser')
|
||||
@@ -34,7 +45,6 @@ def parse_html_title(html_path):
|
||||
"""
|
||||
解析html文件,优先返回<title>,否则返回body第一个h1/h2/h3/h4/h5/h6或None。
|
||||
"""
|
||||
from bs4 import BeautifulSoup
|
||||
try:
|
||||
with open(html_path, 'r', encoding='utf-8') as f:
|
||||
soup = BeautifulSoup(f, 'html.parser')
|
||||
@@ -49,16 +59,6 @@ def parse_html_title(html_path):
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
# parsetoc.py
|
||||
# -----------------------------
|
||||
# 用于解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构,支持通过ref和filepos查找完整label路径。
|
||||
# 支持多种EPUB格式的toc.ncx,包含批量测试用例。
|
||||
# 依赖:BeautifulSoup4
|
||||
# -----------------------------
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List, Any
|
||||
import pprint
|
||||
|
||||
def parse_navpoints(navpoints) -> Dict[str, dict]:
|
||||
"""
|
||||
@@ -81,6 +81,9 @@ def parse_navpoints(navpoints) -> Dict[str, dict]:
|
||||
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
|
||||
}
|
||||
result[navpoint.get("id")] = entry
|
||||
|
||||
#pprint.pprint(result) # 格式化打印result
|
||||
|
||||
return result
|
||||
|
||||
def find_label_path(
|
||||
@@ -155,9 +158,9 @@ if __name__ == "__main__":
|
||||
["examples/epub_format_3", "Text/011.xhtml", ""],
|
||||
["examples/epub_format_4", "xhtml/p-006.xhtml", ""],
|
||||
["examples/变宋", "text/part0005.html", ""],
|
||||
["examples/变宋", "text/part0002_split_003.html", ""],
|
||||
["examples/规训与惩罚", "index_split_006.html", ""],
|
||||
["examples/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
|
||||
["examples/变宋", "text/part0002_split_003.html", ""],
|
||||
]
|
||||
for epub_dir, html_file, filepos in test_cases:
|
||||
# 自动查找epub目录下的toc.ncx
|
||||
|
||||
Reference in New Issue
Block a user