# iBook/toc_parse.py
"""
toc_parse.py (OOP版)
-------------------
功能:
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构。
- 支持通过ref和filepos查找完整label路径。
- 支持通过selectedtext在html文件中定位章节标题。
- 兼容多种EPUB格式支持批量测试。
依赖config.py 统一管理路径和配置项。
主要接口TOCParser
- parse_navpoints(navpoints)递归解析navPoint节点返回章节树结构。
- find_label_path(node, ref, filepos, path)查找指定ref和filepos的章节label路径。
- find_section_by_selectedtext(html_path, selectedtext):通过选中文本定位章节标题。
- parse_html_title(html_path)解析html文件标题。
依赖BeautifulSoup4, pprint, os, typing
"""
import config
from bs4 import BeautifulSoup
import os
class TOCParser:
def __init__(self):
pass
@staticmethod
def find_section_by_selectedtext(html_path, selectedtext):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
for elem in soup.find_all(string=True):
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
parent = elem.parent
while parent:
prev = parent.previous_sibling
while prev:
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
return prev.get_text(strip=True)
prev = prev.previous_sibling
parent = parent.parent
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass
return None
@staticmethod
def parse_html_title(html_path):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
if soup.title and soup.title.string:
return soup.title.string.strip()
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass
return None
@staticmethod
def parse_navpoints(navpoints):
result = {}
for navpoint in navpoints:
label = navpoint.navLabel.text.strip().strip('"“”')
src = navpoint.content["src"]
if "#" in src:
ref, filepos = src.split("#", 1)
else:
ref, filepos = src, None
entry = {
"label": label,
"ref": ref,
"filepos": filepos,
"children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
return result
@staticmethod
def find_label_path(node, ref, filepos=None, path=None):
if path is None:
path = []
if isinstance(node, dict):
nodes = node.values() if "label" not in node else [node]
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
title = " / ".join(new_path)
return title
title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
if title:
return title
if filepos is not None:
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
title = " / ".join(new_path)
return title
title = TOCParser.find_label_path(v["children"], ref, None, new_path)
if title:
return title
if path == [] and ref and ref.endswith('.html'):
caller_dir = os.path.dirname(os.path.abspath(__file__))
search_dirs = [caller_dir, os.getcwd()]
for d in search_dirs:
html_path = os.path.join(d, ref)
if os.path.isfile(html_path):
title = TOCParser.parse_html_title(html_path)
if title:
return title
for d in search_dirs:
for root, _, files in os.walk(d):
if ref in files:
html_path = os.path.join(root, ref)
title = TOCParser.parse_html_title(html_path)
if title:
return title
return None
if __name__ == "__main__":
# ==== 批量测试指定toc/html/filepos列表 ====
test_cases = [
[config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"],
[config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"],
[config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""],
[config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""],
[config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""],
[config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""],
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
]
for epub_dir, html_file, filepos in test_cases:
toc_path = None
for root, _, files in os.walk(epub_dir):
for f in files:
if f.lower() == "toc.ncx":
toc_path = os.path.join(root, f)
break
if toc_path:
break
print(f"\n==== 测试 epub: {epub_dir} html: {html_file} filepos: {filepos} ====")
if not toc_path:
print(f"未找到toc.ncx: {epub_dir}")
continue
try:
with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path):
title = TOCParser.parse_html_title(html_path)
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
else:
print(f"未找到html文件: {html_path}")
except Exception as e:
print(f"测试失败: {e}")
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
note_idref = 'text/part0002_split_003.html'
note_filepos = None
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题")
else:
print(f"未找到toc.ncx: {bian_song_toc}")