"""
toc_parse.py
------------
功能:
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
- 支持通过selectedtext在html文件中定位章节标题。
- 兼容多种EPUB格式支持批量测试。
依赖config.py 统一管理路径和配置项。
主要接口:
parse_navpoints(navpoints) # 递归解析navPoint节点返回章节树结构。
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
parse_html_title(html_path) # 解析html文件标题。
依赖BeautifulSoup4, pprint, os, typing
"""
import os
import pprint
from typing import Dict, Optional, List, Any

from bs4 import BeautifulSoup

import config
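
# For reference, a minimal sketch of the <navMap> structure inside toc.ncx that
# this module consumes (the NCX element names are standard; the id, src, and
# label values below are invented for illustration):
#
#   <navMap>
#     <navPoint id="navPoint-1">
#       <navLabel><text>Chapter 1</text></navLabel>
#       <content src="part0001.html#filepos123"/>
#       <navPoint id="navPoint-2">...</navPoint>   <!-- nested child chapters -->
#     </navPoint>
#   </navMap>
#
# parse_navpoints() below turns this into a nested dict keyed by navPoint id.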
# ==== Helper: infer the enclosing chapter from where selectedtext appears in an HTML file ====
def find_section_by_selectedtext(html_path, selectedtext):
    """
    Find where selectedtext appears in the HTML file, walk back to the nearest
    preceding h1-h6 heading, and return that heading's text.
    Returns None if no heading is found.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Search all text nodes for selectedtext
        for elem in soup.find_all(string=True):
            if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
                # Walk up through the parents, scanning earlier siblings for the nearest h1-h6
                parent = elem.parent
                while parent:
                    prev = parent.previous_sibling
                    # Scan preceding siblings at this level for an h1-h6
                    while prev:
                        if prev.name and prev.name.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                            return prev.get_text(strip=True)
                        prev = prev.previous_sibling
                    parent = parent.parent
        # Not found: fall back to the document's first h1-h6
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            h = soup.find(tag)
            if h and h.get_text(strip=True):
                return h.get_text(strip=True)
    except Exception:
        pass
    return None
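
# A usage sketch (the path and snippet here are hypothetical, for illustration):
#   heading = find_section_by_selectedtext("examples/book/ch1.html", "some quoted passage")
#   # -> text of the nearest h1-h6 preceding the passage, or None if nothing matches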
def parse_html_title(html_path):
    """
    Parse an HTML file: prefer <title>, otherwise the body's first
    h1/h2/h3/h4/h5/h6, otherwise None.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Prefer <title>
        if soup.title and soup.title.string:
            return soup.title.string.strip()
        # Otherwise the first h1-h6 in the body
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            h = soup.find(tag)
            if h and h.get_text(strip=True):
                return h.get_text(strip=True)
    except Exception:
        pass
    return None
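
# A usage sketch (hypothetical path): parse_html_title("examples/book/ch1.html")
# returns the <title> text, else the first heading's text, or None on any failure.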
def parse_navpoints(navpoints) -> Dict[str, dict]:
    """
    Recursively parse navPoint nodes into a nested dict structure.
    :param navpoints: list of navPoint nodes found by BeautifulSoup
    :return: chapter tree structure
    """
    result = {}
    for navpoint in navpoints:
        label = navpoint.navLabel.text.strip().strip('"“”')
        src = navpoint.content["src"]
        if "#" in src:
            ref, filepos = src.split("#", 1)
        else:
            ref, filepos = src, None
        entry = {
            "label": label,
            "ref": ref,
            "filepos": filepos,
            "children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
        }
        result[navpoint.get("id")] = entry
    # pprint.pprint(result)  # pretty-print result for debugging
    return result
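
# A sketch of the shape parse_navpoints() returns for the <navMap> example near
# the top of this file (ids, labels, and srcs are invented for illustration):
#   {
#       "navPoint-1": {
#           "label": "Chapter 1",
#           "ref": "part0001.html",
#           "filepos": "filepos123",
#           "children": {"navPoint-2": {...}},
#       },
#   }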
def find_label_path(
    node: Any,
    ref: str,
    filepos: Optional[str] = None,
    path: Optional[List[str]] = None
) -> Optional[str]:
    """
    Find the label path for a given ref and filepos in the nested dict structure.
    :param node: current node, a dict or a collection of dicts
    :param ref: HTML file name
    :param filepos: file position, may be None
    :param path: accumulated label path
    :return: full label path joined by " / ", or None if not found
    """
    if path is None:
        path = []
    if isinstance(node, dict):
        nodes = node.values() if "label" not in node else [node]
        # 1. Prefer an exact match on both ref and filepos
        for v in nodes:
            if "label" in v:
                new_path = path + [v["label"]]
                if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
                    title = " / ".join(new_path)
                    # print(f'title ref={ref} filepos={filepos} -> {title}')  # DBG
                    return title
                title = find_label_path(v["children"], ref, filepos, new_path)
                if title:
                    # print(f'title1 ref={ref} filepos={filepos} -> {title}')  # DBG
                    return title
        # 2. If the filepos lookup failed, fall back to the first chapter under the
        #    same ref, i.e. return as soon as the ref alone matches
        if filepos is not None:
            for v in nodes:
                if "label" in v:
                    new_path = path + [v["label"]]
                    # print(f"comparing {v['ref']} == {ref}")
                    if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
                        title = " / ".join(new_path)
                        # print(f'title3 ref={ref} filepos={filepos} -> {title}')  # DBG
                        return title
                    title = find_label_path(v["children"], ref, None, new_path)
                    if title:
                        # print(f'title4 ref={ref} filepos={filepos} -> {title}')  # DBG
                        return title
    # 3. If nothing matched at all, try parsing the title of the HTML file that
    #    ref points to. Only runs at the top-level call.
    if path == [] and ref and ref.endswith('.html'):
        # Search the common locations: this script's directory and the working directory
        caller_dir = os.path.dirname(os.path.abspath(__file__))
        search_dirs = [caller_dir, os.getcwd()]
        for d in search_dirs:
            html_path = os.path.join(d, ref)
            # print(f"checking {html_path}")
            if os.path.isfile(html_path):
                title = parse_html_title(html_path)
                if title:
                    return title
        # Recursive search rooted at each of those directories
        for d in search_dirs:
            for root, _, files in os.walk(d):
                if ref in files:
                    html_path = os.path.join(root, ref)
                    # print(f"2 checking {html_path}")
                    title = parse_html_title(html_path)
                    if title:
                        return title
    return None
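
# A usage sketch (tree comes from parse_navpoints; the ref and filepos values
# here are hypothetical):
#   find_label_path(tree, "part0001.html", "filepos123")
#   # -> e.g. "Part I / Chapter 1", or None when nothing matches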
if __name__ == "__main__":
    # ==== Batch-test a list of toc/html/filepos cases ====
    test_cases = [
        [config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
        [config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
        [config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
        [config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
        [config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
        [config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
        [config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
        [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
    ]
    for epub_dir, html_file, filepos in test_cases:
        # Automatically locate toc.ncx under the epub directory
        toc_path = None
        for root, _, files in os.walk(epub_dir):
            for f in files:
                if f.lower() == "toc.ncx":
                    toc_path = os.path.join(root, f)
                    break
            if toc_path:
                break
        print(f"\n==== Testing epub: {epub_dir} html: {html_file} filepos: {filepos} ====")
        if not toc_path:
            print(f"toc.ncx not found: {epub_dir}")
            continue
        try:
            with open(toc_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "xml")
            nav_map = soup.find("navMap")
            toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
            label_path = find_label_path(toc_tree, html_file, filepos)
            print(f"find_label_path: {label_path if label_path else 'chapter/heading not found'}")
            # Also test parse_html_title directly, for the case where the html is absent from the toc
            html_path = os.path.join(epub_dir, html_file.split('#')[0])
            if os.path.exists(html_path):
                title = parse_html_title(html_path)
                print(f"Parsed html title: {html_path} => {title if title else 'no title found'}")
                # New: locate the chapter heading from selectedtext
                selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
                section = find_section_by_selectedtext(html_path, selectedtext)
                print(f"Heading located via selectedtext: {section if section else 'no related heading found'}")
            else:
                print(f"html file not found: {html_path}")
        except Exception as e:
            print(f"Test failed: {e}")
    # ==== Additional test: chapter lookup and html title parsing for 变宋 notes ====
    print("\n==== Test: 变宋 note chapter lookup and html title parsing ====")
    # Assume the note data looks like this
    note_idref = 'text/part0002_split_003.html'
    note_filepos = None
    # Path to the 变宋 toc.ncx
    bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
    if os.path.exists(bian_song_toc):
        with open(bian_song_toc, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "xml")
        nav_map = soup.find("navMap")
        toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
        # First try find_label_path to locate the chapter
        label_path = find_label_path(toc_tree, note_idref, note_filepos)
        print(f"Lookup {note_idref}: ", label_path if label_path else "chapter not found; try parsing the html title instead")
    else:
        print(f"toc.ncx not found: {bian_song_toc}")