""" toc_parse.py (OOP版) ------------------- 功能: - 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。 - 支持通过ref和filepos查找完整label路径。 - 支持通过selectedtext在html文件中定位章节标题。 - 兼容多种EPUB格式,支持批量测试。 依赖:config.py 统一管理路径和配置项。 主要接口:TOCParser - parse_navpoints(navpoints):递归解析navPoint节点,返回章节树结构。 - find_label_path(node, ref, filepos, path):查找指定ref和filepos的章节label路径。 - find_section_by_selectedtext(html_path, selectedtext):通过选中文本定位章节标题。 - parse_html_title(html_path):解析html文件标题。 依赖:BeautifulSoup4, pprint, os, typing """ import config from bs4 import BeautifulSoup import os class TOCParser: def __init__(self): pass @staticmethod def find_section_by_selectedtext(html_path, selectedtext): try: with open(html_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') for elem in soup.find_all(string=True): if selectedtext and selectedtext.strip() and selectedtext.strip() in elem: parent = elem.parent while parent: prev = parent.previous_sibling while prev: if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']: return prev.get_text(strip=True) prev = prev.previous_sibling parent = parent.parent for tag in ['h1','h2','h3','h4','h5','h6']: h = soup.find(tag) if h and h.get_text(strip=True): return h.get_text(strip=True) except Exception: pass return None @staticmethod def parse_html_title(html_path): try: with open(html_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') if soup.title and soup.title.string: return soup.title.string.strip() for tag in ['h1','h2','h3','h4','h5','h6']: h = soup.find(tag) if h and h.get_text(strip=True): return h.get_text(strip=True) except Exception: pass return None @staticmethod def parse_navpoints(navpoints): result = {} for navpoint in navpoints: label = navpoint.navLabel.text.strip().strip('"“”') src = navpoint.content["src"] if "#" in src: ref, filepos = src.split("#", 1) else: ref, filepos = src, None entry = { "label": label, "ref": ref, "filepos": filepos, "children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False)) } result[navpoint.get("id")] = entry return result @staticmethod def find_label_path(node, ref, filepos=None, path=None): if path is None: path = [] if isinstance(node, dict): nodes = node.values() if "label" not in node else [node] for v in nodes: if "label" in v: new_path = path + [v["label"]] if v["ref"] == ref and (filepos is None or v["filepos"] == filepos): title = " / ".join(new_path) return title title = TOCParser.find_label_path(v["children"], ref, filepos, new_path) if title: return title if filepos is not None: for v in nodes: if "label" in v: new_path = path + [v["label"]] if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]: title = " / ".join(new_path) return title title = TOCParser.find_label_path(v["children"], ref, None, new_path) if title: return title if path == [] and ref and ref.endswith('.html'): caller_dir = os.path.dirname(os.path.abspath(__file__)) search_dirs = [caller_dir, os.getcwd()] for d in search_dirs: html_path = os.path.join(d, ref) if os.path.isfile(html_path): title = TOCParser.parse_html_title(html_path) if title: return title for d in search_dirs: for root, _, files in os.walk(d): if ref in files: html_path = os.path.join(root, ref) title = TOCParser.parse_html_title(html_path) if title: return title return None if __name__ == "__main__": # ==== 批量测试指定toc/html/filepos列表 ==== test_cases = [ [config.EXAMPLES_DIR + "/epub_format_1", "index_split_015.html", "filepos684970"], [config.EXAMPLES_DIR + "/epub_format_2", "Text/8c7276f38ead4738ee19249418898c18_split_006.html", "sigil_toc_id_12"], [config.EXAMPLES_DIR + "/epub_format_3", "Text/011.xhtml", ""], [config.EXAMPLES_DIR + "/epub_format_4", "xhtml/p-006.xhtml", ""], [config.EXAMPLES_DIR + "/变宋", "text/part0005.html", ""], [config.EXAMPLES_DIR + "/变宋", "text/part0002_split_003.html", ""], [config.EXAMPLES_DIR + "/规训与惩罚", "index_split_006.html", ""], [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""], ] for epub_dir, html_file, filepos in test_cases: toc_path = None for root, _, files in os.walk(epub_dir): for f in files: if f.lower() == "toc.ncx": toc_path = os.path.join(root, f) break if toc_path: break print(f"\n==== 测试 epub: {epub_dir} html: {html_file} filepos: {filepos} ====") if not toc_path: print(f"未找到toc.ncx: {epub_dir}") continue try: with open(toc_path, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "xml") nav_map = soup.find("navMap") toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False)) label_path = TOCParser.find_label_path(toc_tree, html_file, filepos) print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}") html_path = os.path.join(epub_dir, html_file.split('#')[0]) if os.path.exists(html_path): title = TOCParser.parse_html_title(html_path) print(f"解析html标题: {html_path} => {title if title else '未找到标题'}") selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以' section = TOCParser.find_section_by_selectedtext(html_path, selectedtext) print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}") else: print(f"未找到html文件: {html_path}") except Exception as e: print(f"测试失败: {e}") print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====") note_idref = 'text/part0002_split_003.html' note_filepos = None bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx" if os.path.exists(bian_song_toc): with open(bian_song_toc, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "xml") nav_map = soup.find("navMap") toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False)) label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos) print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节,尝试解析html标题") else: print(f"未找到toc.ncx: {bian_song_toc}")