update

toc_parse.py
@@ -1,6 +1,7 @@
 """
-toc_parse.py
-------------
+toc_parse.py (OOP version)
+-------------------
 Features:
 - Parse an EPUB ebook's toc.ncx table of contents and recursively build the chapter tree structure.
 - Support looking up the full label path by ref and filepos.
@@ -8,166 +9,120 @@ toc_parse.py
 - Compatible with multiple EPUB formats; supports batch testing.
 
-Dependency: config.py centrally manages paths and configuration items.
-Main interfaces:
-    parse_navpoints(navpoints) # Recursively parse navPoint nodes and return the chapter tree structure.
-    find_label_path(node, ref, filepos, path) # Find the chapter label path for the given ref and filepos.
-    find_section_by_selectedtext(html_path, selectedtext) # Locate the chapter heading via the selected text.
-    parse_html_title(html_path) # Parse the title of an html file.
+Main interface: TOCParser
+- parse_navpoints(navpoints): recursively parse navPoint nodes and return the chapter tree structure.
+- find_label_path(node, ref, filepos, path): find the chapter label path for the given ref and filepos.
+- find_section_by_selectedtext(html_path, selectedtext): locate the chapter heading via the selected text.
+- parse_html_title(html_path): parse the title of an html file.
+Dependencies: BeautifulSoup4, pprint, os, typing
 """
 import config
 
 
 from bs4 import BeautifulSoup
 from typing import Dict, Optional, List, Any
 import pprint
 import os
 
-# ==== Helper: infer the containing section from where selectedtext appears in the html file ====
-def find_section_by_selectedtext(html_path, selectedtext):
-    """
-    Find where selectedtext occurs in the html file, backtrack to the nearest h1-h6 heading, and return that heading's text.
-    Returns None if no heading is found.
-    """
-    try:
-        with open(html_path, 'r', encoding='utf-8') as f:
-            soup = BeautifulSoup(f, 'html.parser')
-        # Search all text nodes for selectedtext
-        for elem in soup.find_all(string=True):
-            if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
-                # Backtrack through the parents, looking for the nearest h1-h6
-                parent = elem.parent
-                while parent:
-                    prev = parent.previous_sibling
-                    # Scan the preceding siblings for an h1-h6
-                    while prev:
-                        if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
-                            return prev.get_text(strip=True)
-                        prev = prev.previous_sibling
-                    parent = parent.parent
-        # If none was found, fall back to the document's first h1-h6
-        for tag in ['h1','h2','h3','h4','h5','h6']:
-            h = soup.find(tag)
-            if h and h.get_text(strip=True):
-                return h.get_text(strip=True)
-    except Exception:
-        pass
-    return None
-
-def parse_html_title(html_path):
-    """
-    Parse the html file; prefer <title>, otherwise return the body's first h1/h2/h3/h4/h5/h6, else None.
-    """
-    try:
-        with open(html_path, 'r', encoding='utf-8') as f:
-            soup = BeautifulSoup(f, 'html.parser')
-        # Prefer <title>
-        if soup.title and soup.title.string:
-            return soup.title.string.strip()
-        # Otherwise the first h1-h6 in the body
-        for tag in ['h1','h2','h3','h4','h5','h6']:
-            h = soup.find(tag)
-            if h and h.get_text(strip=True):
-                return h.get_text(strip=True)
-    except Exception:
-        pass
-    return None
-
-def parse_navpoints(navpoints) -> Dict[str, dict]:
-    """
-    Recursively parse navPoint nodes and return a nested dict structure.
-    :param navpoints: list of navPoint nodes found by BeautifulSoup
-    :return: chapter tree structure
-    """
-    result = {}
-    for navpoint in navpoints:
-        label = navpoint.navLabel.text.strip().strip('"“”')
-        src = navpoint.content["src"]
-        if "#" in src:
-            ref, filepos = src.split("#", 1)
-        else:
-            ref, filepos = src, None
-        entry = {
-            "label": label,
-            "ref": ref,
-            "filepos": filepos,
-            "children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
-        }
-        result[navpoint.get("id")] = entry
-    #pprint.pprint(result) # pretty-print result
-    return result
-
-def find_label_path(
-    node: Any,
-    ref: str,
-    filepos: Optional[str] = None,
-    path: Optional[List[str]] = None
-) -> Optional[str]:
-    """
-    Find the label path for the given ref and filepos in the nested dict structure.
-    :param node: current node (a dict or a collection of dicts)
-    :param ref: html file name
-    :param filepos: file position, may be None
-    :param path: accumulated label path
-    :return: full label path joined by " / ", or None if not found
-    """
-    if path is None:
-        path = []
-    if isinstance(node, dict):
-        nodes = node.values() if "label" not in node else [node]
-        # 1. Prefer an exact match on ref and filepos
-        for v in nodes:
-            if "label" in v:
-                new_path = path + [v["label"]]
-                if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
-                    title = " / ".join(new_path)
-                    #print(f'title ref={ref} filepos={filepos} -> {title}') #DBG
-                    return title
-                title = find_label_path(v["children"], ref, filepos, new_path)
-                if title:
-                    #print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG
-                    return title
-
-        # 2. If the filepos lookup failed, fall back to the first chapter with the same ref (return on any ref match)
-        if filepos is not None:
-            for v in nodes:
-                if "label" in v:
-                    new_path = path + [v["label"]]
-                    # print(f"compare {v['ref']} == {ref}")
-                    if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
-                        title = " / ".join(new_path)
-                        #print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG
-                        return title
-                    title = find_label_path(v["children"], ref, None, new_path)
-                    if title:
-                        #print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG
-                        return title
-
-    # 3. If nothing was found at all, try parsing the title of the html file the idref points to for the chapter label
-    # Only run this logic in the top-level call
-    if path == [] and ref and ref.endswith('.html'):
-        import os
-        # Automatically search the common directories for the html file (relative to the toc file's directory)
-        caller_dir = os.path.dirname(os.path.abspath(__file__))
-        search_dirs = [caller_dir, os.getcwd()]
-        for d in search_dirs:
-            html_path = os.path.join(d, ref)
-            #print(f"searching {html_path}")
-            if os.path.isfile(html_path):
-                title = parse_html_title(html_path)
-                if title:
-                    return title
-        # Recursive search (rooted at the toc file's directory)
-        for d in search_dirs:
-            for root, _, files in os.walk(d):
-                if ref in files:
-                    html_path = os.path.join(root, ref)
-                    #print(f"2 searching {html_path}")
-                    title = parse_html_title(html_path)
-                    if title:
-                        return title
-    return None
+class TOCParser:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def find_section_by_selectedtext(html_path, selectedtext):
+        try:
+            with open(html_path, 'r', encoding='utf-8') as f:
+                soup = BeautifulSoup(f, 'html.parser')
+            for elem in soup.find_all(string=True):
+                if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
+                    parent = elem.parent
+                    while parent:
+                        prev = parent.previous_sibling
+                        while prev:
+                            if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
+                                return prev.get_text(strip=True)
+                            prev = prev.previous_sibling
+                        parent = parent.parent
+            for tag in ['h1','h2','h3','h4','h5','h6']:
+                h = soup.find(tag)
+                if h and h.get_text(strip=True):
+                    return h.get_text(strip=True)
+        except Exception:
+            pass
+        return None
+
+    @staticmethod
+    def parse_html_title(html_path):
+        try:
+            with open(html_path, 'r', encoding='utf-8') as f:
+                soup = BeautifulSoup(f, 'html.parser')
+            if soup.title and soup.title.string:
+                return soup.title.string.strip()
+            for tag in ['h1','h2','h3','h4','h5','h6']:
+                h = soup.find(tag)
+                if h and h.get_text(strip=True):
+                    return h.get_text(strip=True)
+        except Exception:
+            pass
+        return None
+
+    @staticmethod
+    def parse_navpoints(navpoints):
+        result = {}
+        for navpoint in navpoints:
+            label = navpoint.navLabel.text.strip().strip('"“”')
+            src = navpoint.content["src"]
+            if "#" in src:
+                ref, filepos = src.split("#", 1)
+            else:
+                ref, filepos = src, None
+            entry = {
+                "label": label,
+                "ref": ref,
+                "filepos": filepos,
+                "children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
+            }
+            result[navpoint.get("id")] = entry
+        return result
+
+    @staticmethod
+    def find_label_path(node, ref, filepos=None, path=None):
+        if path is None:
+            path = []
+        if isinstance(node, dict):
+            nodes = node.values() if "label" not in node else [node]
+            for v in nodes:
+                if "label" in v:
+                    new_path = path + [v["label"]]
+                    if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
+                        title = " / ".join(new_path)
+                        return title
+                    title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
+                    if title:
+                        return title
+            if filepos is not None:
+                for v in nodes:
+                    if "label" in v:
+                        new_path = path + [v["label"]]
+                        if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
+                            title = " / ".join(new_path)
+                            return title
+                        title = TOCParser.find_label_path(v["children"], ref, None, new_path)
+                        if title:
+                            return title
+        if path == [] and ref and ref.endswith('.html'):
+            caller_dir = os.path.dirname(os.path.abspath(__file__))
+            search_dirs = [caller_dir, os.getcwd()]
+            for d in search_dirs:
+                html_path = os.path.join(d, ref)
+                if os.path.isfile(html_path):
+                    title = TOCParser.parse_html_title(html_path)
+                    if title:
+                        return title
+            for d in search_dirs:
+                for root, _, files in os.walk(d):
+                    if ref in files:
+                        html_path = os.path.join(root, ref)
+                        title = TOCParser.parse_html_title(html_path)
+                        if title:
+                            return title
+        return None
 
 if __name__ == "__main__":
     # ==== Batch-test the given toc/html/filepos list ====
@@ -182,8 +137,6 @@ if __name__ == "__main__":
         [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
     ]
     for epub_dir, html_file, filepos in test_cases:
         # Automatically locate toc.ncx under the epub directory
-        import os
         toc_path = None
         for root, _, files in os.walk(epub_dir):
             for f in files:
@@ -200,39 +153,32 @@ if __name__ == "__main__":
             with open(toc_path, "r", encoding="utf-8") as f:
                 soup = BeautifulSoup(f, "xml")
             nav_map = soup.find("navMap")
-            toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
-            label_path = find_label_path(toc_tree, html_file, filepos)
+            toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
+            label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
             print(f"find_label_path: {label_path if label_path else 'chapter/title not found'}")
 
             # tocb has no html file; test parse_html_title directly
             html_path = os.path.join(epub_dir, html_file.split('#')[0])
             if os.path.exists(html_path):
-                title = parse_html_title(html_path)
+                title = TOCParser.parse_html_title(html_path)
                 print(f"Parsed html title: {html_path} => {title if title else 'no title found'}")
                 # New: locate the chapter heading from selectedtext
                 selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
-                section = find_section_by_selectedtext(html_path, selectedtext)
+                section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
                 print(f"Chapter heading located from selectedtext: {section if section else 'no matching heading found'}")
             else:
                 print(f"html file not found: {html_path}")
         except Exception as e:
             print(f"Test failed: {e}")
 
     # ==== New: test chapter lookup and html title parsing for the 变宋 notes ====
     print("\n==== Test: chapter lookup and html title parsing for the 变宋 notes ====")
     # Assume the note data is as follows
     note_idref = 'text/part0002_split_003.html'
     note_filepos = None
     # Path to the 变宋 toc.ncx
     bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
-    import os
     if os.path.exists(bian_song_toc):
         with open(bian_song_toc, "r", encoding="utf-8") as f:
             soup = BeautifulSoup(f, "xml")
         nav_map = soup.find("navMap")
-        toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
-        # First try find_label_path to locate the chapter
-        label_path = find_label_path(toc_tree, note_idref, note_filepos)
+        toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
+        label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
         print(f"Looking up {note_idref}: ", label_path if label_path else "chapter not found; trying html title parsing")
     else:
         print(f"toc.ncx not found: {bian_song_toc}")