douboer 2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

View File

@@ -1,27 +1,30 @@
"""
annotationdata.py
-----------------
annotationdata.py (OOP version)
-------------------------------
Features:
- Parse the iBooks AEAnnotation.sqlite database and extract the notes of all books or of a given book (assetid/bookid)
- Provide the parse_location helper for parsing a note's location information
- Return structured annotations data for later chapter lookup and export
Depends on config.py for centralized path and configuration management
Main interface:
- get_annotations(db_path, bookid=None): return the notes of all books or of the given assetid, structured as {assetid: {uuid: {...}}}
Main interface: AnnotationManager
- get_annotations(bookid=None): return the notes of all books or of the given assetid, structured as {assetid: {uuid: {...}}}
- parse_location(location): parse ZANNOTATIONLOCATION and return (idref, filepos)
Dependencies: sqlite3, collections, re, os, datetime
"""
import config
import sqlite3
from collections import defaultdict
import re
import os
from collections import defaultdict
def parse_location(location):
class AnnotationManager:
def __init__(self, db_path=None):
self.db_path = db_path or config.LOCAL_ANNOTATION_DB
@staticmethod
def parse_location(location):
"""
Parse ZANNOTATIONLOCATION and return (idref, filepos)
- For the epubcfi(...) format, prefer the contents of the [] brackets as idref
@@ -31,21 +34,20 @@ def parse_location(location):
filepos = None
if not location:
return idref, filepos
# Uniform handling: extract the first two [] groups
matches = re.findall(r'\[(.*?)\]', location) if location else []
idref = matches[0] if len(matches) > 0 else None
filepos = matches[1] if len(matches) > 1 else None
return idref, filepos
def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
def get_annotations(self, bookid=None):
# Check for the WAL-mode companion files
base = db_path.rsplit('.', 1)[0]
base = self.db_path.rsplit('.', 1)[0]
wal_path = base + '.sqlite-wal'
shm_path = base + '.sqlite-shm'
for f in [db_path, wal_path, shm_path]:
for f in [self.db_path, wal_path, shm_path]:
if not os.path.exists(f):
print(f'警告: 缺少 {f},可能无法获取全部最新笔记')
conn = sqlite3.connect(db_path)
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
if bookid is not None:
cursor.execute('''
@@ -67,7 +69,6 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
if creationdate:
try:
origin = datetime.datetime(2001, 1, 1)
# Apple timestamp: float/int or a numeric string
if isinstance(creationdate, (int, float)):
dt = origin + datetime.timedelta(seconds=creationdate)
elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
@@ -77,8 +78,7 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
date_str = f"{dt.year}/{dt.month}/{dt.day}"
except Exception:
date_str = str(creationdate)
idref, filepos = parse_location(location)
# Skip notes where both note and selectedtext are None
idref, filepos = self.parse_location(location)
if note is None and selectedtext is None:
continue
annotations[str(assetid)][uuid] = {
@@ -90,47 +90,24 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
}
conn.close()
if bookid is not None:
# Return only the notes for the given bookid
return {str(bookid): annotations.get(str(bookid), {})}
return annotations
# Usage example: print the first 3 notes of each book
if __name__ == "__main__":
manager = AnnotationManager()
# Test parse_location
'''
test_locations = [
'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
]
for loc in test_locations:
idref, filepos = parse_location(loc)
idref, filepos = manager.parse_location(loc)
print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
'''
# Test fetching notes for a specific assetid only
test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
annotations = get_annotations(bookid=test_bookid)
# Pretty-print all notes of that book
annotations = manager.get_annotations(bookid=test_bookid)
from pprint import pprint
print(f"\nAssetID={test_bookid} 的所有笔记:")
pprint(annotations, indent=2, sort_dicts=False)
# Print the first 3 notes of each book
'''
book_notes = defaultdict(list)
for assetid, notes_dict in annotations.items():
for uuid, ann in notes_dict.items():
book_notes[assetid].append({**ann, 'uuid': uuid})
for assetid, notes in book_notes.items():
print(f"\nAssetID: {assetid}")
for i, note in enumerate(notes[:3]):
print(f" 笔记{i+1}:")
print(f" creationdate: {note['creationdate']}")
print(f" idref: {note['idref']}")
print(f" filepos: {note['filepos']}")
print(f" note: {note['note']}")
print(f" selectedtext: {note['selectedtext']}")
print(f" uuid: {note['uuid']}")
'''
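For orientation, a minimal sketch of the Apple/Core Data timestamp conversion that get_annotations applies above (seconds counted from 2001-01-01). The helper name apple_ts_to_date is illustrative and not part of the module:

```python
import datetime

APPLE_EPOCH = datetime.datetime(2001, 1, 1)  # Core Data reference date used by iBooks

def apple_ts_to_date(creationdate):
    """Convert an Apple timestamp (float/int or numeric string) to a 'Y/M/D' string."""
    if isinstance(creationdate, str):
        creationdate = float(creationdate)
    dt = APPLE_EPOCH + datetime.timedelta(seconds=creationdate)
    return f"{dt.year}/{dt.month}/{dt.day}"

# e.g. a ZANNOTATIONCREATIONDATE value of 745395630.0 falls in 2024
print(apple_ts_to_date(745395630.0))
```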

View File

@@ -1,32 +1,27 @@
"""
booklist_parse.py
-----------------
Features:
- Parse the iBooks Books.plist and extract all book metadata (title, author, path, dates, etc.)
- Parse BKLibrary.sqlite to get each book's last-opened time (Apple timestamp, epoch 2001-01-01)
Depends on config.py for centralized path and configuration management
Main interface:
- parse_books_plist(plist_path): return all book metadata, structured as {bk_id: {...}}
- get_books_last_open(db_path): return each book's last-opened time, structured as {bk_id: {'last_open': timestamp}}
Dependencies: plistlib, collections, sqlite3, os, datetime
Typical usage:
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
books_open = get_books_last_open(config.LOCAL_LIBRARY_DB)
"""
import config
import plistlib
import sqlite3
import os
from collections import defaultdict
def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST):
class BookListManager:
def __init__(self, plist_path=None, db_path=None):
self.plist_path = plist_path or config.LOCAL_BOOKS_PLIST
self.db_path = db_path or config.LOCAL_LIBRARY_DB
self._booksinfo = None
self._books_open = None
def get_books_info(self):
if self._booksinfo is not None:
return self._booksinfo
booksinfo = defaultdict(dict)
with open(plist_path, 'rb') as f: plist_data = plistlib.load(f)
with open(self.plist_path, 'rb') as f:
plist_data = plistlib.load(f)
for book in plist_data.get('Books', []):
bk_id = book.get('BKGeneratedItemId')
if not bk_id: continue
if not bk_id:
continue
booksinfo[bk_id] = {
'displayname': book.get('BKDisplayName', ''),
'author': book.get('artistName', ''),
@@ -37,39 +32,35 @@ def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST):
'date': book.get('BKInsertionDate',''),
'updatedate': book.get('updateDate','')
}
self._booksinfo = booksinfo
return booksinfo
import sqlite3
import os
def get_books_last_open(db_path=config.LOCAL_LIBRARY_DB):
"""
Get each book's last-opened time from BKLibrary.sqlite
Returns a defaultdict(dict) keyed by bk_id, containing the last-opened time
"""
def get_books_last_open(self):
if self._books_open is not None:
return self._books_open
books_open = defaultdict(dict)
if not os.path.exists(db_path):
if not os.path.exists(self.db_path):
return books_open
try:
conn = sqlite3.connect(db_path)
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
# The ZBKLIBRARYASSET table holds the book records
cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''')
rows = cursor.fetchall()
for row in rows:
asset_id, last_open = row
if asset_id:
books_open[asset_id] = {
'last_open': last_open # Apple timestamp, epoch 2001-01-01
'last_open': last_open
}
conn.close()
except Exception as e:
print(f'警告: 读取BKLibrary.sqlite失败: {e}')
self._books_open = books_open
return books_open
if __name__ == '__main__':
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
manager = BookListManager()
booksinfo = manager.get_books_info()
from pprint import pprint
print("\n【前三条示例】")
for k, v in list(booksinfo.items())[:3]:
@@ -77,19 +68,10 @@ if __name__ == '__main__':
pprint(v, sort_dicts=False, indent=2)
print('-' * 60)
'''
print("\n【全部内容】")
for k, v in booksinfo.items():
print(f"{k}:")
pprint(v, sort_dicts=False, indent=2)
print('-' * 60)
'''
# Test the last-opened times
print("\n【最近打开时间示例】")
books_open = get_books_last_open()
books_open = manager.get_books_last_open()
import datetime
for k, v in list(books_open.items())[:3]:
ts = v['last_open']
# Apple timestamp, epoch 2001-01-01
dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts)
print(f"{k}: {dt} (timestamp: {ts})")

Binary file not shown.

View File

@@ -158,31 +158,43 @@ answer = inquirer.fuzzy(
## 9.1 Main code files (detailed)
- `exportbooknotes.py`
  - OOP design; the core class is `BookNotesExporter`
  - `build_booksnote(bookid=None)`: build the structured notes data.
  - `export_booksnote_to_md(booksnote, booksinfo, out_path=None)`: export to Markdown.
  - Helper methods such as `find_file_by_ext` and `get_toc_tree`.
  - Data sync: automatically copies the iBooks databases and metadata to the local data directory.
  - Menu interaction: books sorted by last-opened timestamp, shown as "title [timestamp]", with fuzzy search.
  - Only the selected book's notes are processed, grouped by chapter and exported to Markdown.
  - Depends on the core parsing modules and orchestrates the main flow (see the combined usage sketch after this list).
- `annotationdata.py`
  - OOP design; the core class is `AnnotationManager`
  - `get_annotations(bookid=None)`: return the notes of all books or of the given assetid.
  - `parse_location(location)`: static method that parses the location information.
  - Parses AEAnnotation.sqlite and extracts the notes of all books or of the given assetid.
  - Supports Apple timestamp conversion and structured output.
  - The parse_location helper parses note location information consistently.
- `booklist_parse.py`
  - OOP design; the core class is `BookListManager`
  - `get_books_info()`: get the book metadata.
  - `get_books_last_open()`: get each book's last-opened time.
  - Parses Books.plist for book metadata (title, author, path, dates, etc.).
  - Parses BKLibrary.sqlite for each book's last-opened time (zlastopendate, Apple timestamp).
  - Provides a unified data interface for sorting and display in the main flow.
  - Parses BKLibrary.sqlite for each book's last-opened time.
- `opf_parse.py`
  - OOP design; the core class is `OPFParser`
  - `parse_opf(filepath)`: static method returning the id->href mapping.
  - Parses the epub OPF file for the chapter-to-file mapping (idref -> href).
  - Supports multiple epub directory layouts.
- `toc_parse.py`
  - OOP design; the core class is `TOCParser`
  - `parse_navpoints(navpoints)`: recursively parse navPoint nodes.
  - `find_label_path(node, ref, filepos, path)`: look up the chapter path.
  - `find_section_by_selectedtext(html_path, selectedtext)`: locate the chapter heading via the selected text.
  - `parse_html_title(html_path)`: parse the html file title.
  - Parses the NCX table of contents and recursively builds the chapter tree.
  - find_label_path: look up the full label path by ref and filepos.
  - find_section_by_selectedtext: locate the chapter heading in the html file via the selected text.
  - parse_html_title: parse the html file title.
- `backup/booksnote.py`
  - Legacy/backup script for data migration and format conversion.
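A minimal combined usage sketch of the classes listed above, wiring them together outside of the CLI flow. It assumes the repository modules and config.py are importable; the content.opf path is only an example, and the real exporter locates the .opf/.ncx files with find_file_by_ext:

```python
import config
from annotationdata import AnnotationManager
from booklist_parse import BookListManager
from opf_parse import OPFParser
from toc_parse import TOCParser

books = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
notes = AnnotationManager(config.LOCAL_ANNOTATION_DB)

booksinfo = books.get_books_info()       # {assetid: {'displayname': ..., ...}}
annotations = notes.get_annotations()    # {assetid: {uuid: {...}}}

# For a single book, map a note's idref to an href, then to a chapter label.
id2href = OPFParser.parse_opf("examples/epub_format_2/OEBPS/content.opf")
# toc_tree = TOCParser.parse_navpoints(nav_points)   # nav_points taken from the book's toc.ncx
# chapter = TOCParser.find_label_path(toc_tree, id2href.get(idref, idref), filepos)
```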

View File

@@ -1,4 +1,4 @@
# 笔记导出 2025-08-15 13:25
# 笔记导出 2025-08-15 17:20
## 传统十论

View File

@@ -1,31 +1,17 @@
"""
exportbooknotes.py
------------------
exportbooknotes.py (OOP version)
--------------------------------
Features:
- Automatically sync the iBooks database and metadata files to the local data directory
- Parse AEAnnotation.sqlite, Books.plist and BKLibrary.sqlite to build structured notes data
- Parse the epub directory and chapter information to locate the chapter of each note
- Command-line menu listing books in descending order of last-opened time for the user to pick from
- Export only the selected book's notes, grouped by chapter, into a Markdown file
Depends on config.py for centralized path and configuration management
Main data flow:
1. Sync data into the data directory
2. Parse Books.plist for book metadata
3. Parse BKLibrary.sqlite for the last-opened times
4. Sort and display the menu (title + timestamp)
5. Parse AEAnnotation.sqlite for the notes
6. Parse the epub directory to locate chapters
7. Export the Markdown file
Dependencies: Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
Main data flow:
Typical usage:
python exportbooknotes.py
# Pick a book at the prompt; the notes are exported to the export_notes directory automatically
Main interface: BookNotesExporter
- run(): interactive command-line export main flow
- build_booksnote(bookid=None): build the structured notes data
- export_booksnote_to_md(booksnote, booksinfo, out_path=None): export to Markdown
"""
import config
"""
@@ -40,17 +26,24 @@ booksnote = {
}}}
}
"""
from collections import defaultdict
import os
from annotationdata import get_annotations
from booklist_parse import parse_books_plist
from collections import defaultdict
from annotationdata import AnnotationManager
from booklist_parse import BookListManager
from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path
from toc_parse import TOCParser
from bs4 import BeautifulSoup
from pprint import pprint
def find_file_by_ext(root, exts):
"""在root下递归查找第一个指定后缀的文件"""
class BookNotesExporter:
def __init__(self, config_module=config):
self.config = config_module
self.annotation_db = config_module.LOCAL_ANNOTATION_DB
self.books_plist = config_module.LOCAL_BOOKS_PLIST
self.library_db = config_module.LOCAL_LIBRARY_DB
@staticmethod
def find_file_by_ext(root, exts):
for dirpath, _, files in os.walk(root):
for f in files:
for ext in exts:
@@ -58,48 +51,44 @@ def find_file_by_ext(root, exts):
return os.path.join(dirpath, f)
return None
def get_toc_tree(toc_path):
@staticmethod
def get_toc_tree(toc_path):
with open(toc_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'xml')
nav_map = soup.find('navMap')
nav_points = nav_map.find_all('navPoint', recursive=False)
toc_tree = parse_navpoints(nav_points)
#pprint(toc_tree, indent=2, depth=5)
toc_tree = TOCParser.parse_navpoints(nav_points)
return toc_tree
def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None):
# Support processing only a specific assetid's notes
annotations = get_annotations(annotation_db, bookid=bookid)
booksinfo = parse_books_plist(books_plist)
def build_booksnote(self, bookid=None):
manager = AnnotationManager(self.annotation_db)
annotations = manager.get_annotations(bookid=bookid)
bl_manager = BookListManager(plist_path=self.books_plist)
booksinfo = bl_manager.get_books_info()
booksnote = defaultdict(lambda: defaultdict(dict))
for assetid, notes in annotations.items():
# Get the epub path
bookinfo = booksinfo.get(assetid)
if not bookinfo:
continue
epub_path = bookinfo.get('path')
if not epub_path or not os.path.isdir(epub_path):
continue
# Locate the opf and ncx files
opf_path = find_file_by_ext(epub_path, ['.opf'])
ncx_path = find_file_by_ext(epub_path, ['.ncx'])
opf_path = self.find_file_by_ext(epub_path, ['.opf'])
ncx_path = self.find_file_by_ext(epub_path, ['.ncx'])
if not opf_path or not ncx_path:
continue
id2href = parse_opf(opf_path)
toc_tree = get_toc_tree(ncx_path)
toc_tree = self.get_toc_tree(ncx_path)
for uuid, ann in notes.items():
idref = ann['idref']
filepos = ann['filepos']
href = id2href.get(idref, idref)
chapter = find_label_path(toc_tree, href, filepos)
chapter = TOCParser.find_label_path(toc_tree, href, filepos)
if chapter is None:
# Fall back to reading chapter info directly from the html file
html_path = os.path.join(epub_path, href.split('#')[0])
selectedtext = ann.get('selectedtext')
if os.path.exists(html_path) and selectedtext:
from toc_parse import find_section_by_selectedtext
section = find_section_by_selectedtext(html_path, selectedtext)
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
if section:
chapter = section
else:
@@ -115,17 +104,8 @@ def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config
}
return booksnote
import datetime
def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
"""
Export a markdown file from the booksnote structure; format:
# "笔记导出" + export timestamp
## book title
### chapter
selectedtext
> note (if present)
"""
def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None):
import datetime
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
lines = [f'# 笔记导出 {now}\n']
for assetid, chapters in booksnote.items():
@@ -151,6 +131,8 @@ def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
if __name__ == '__main__':
import shutil
import os.path
from InquirerPy import inquirer # type: ignore
exporter = BookNotesExporter(config)
# Automatically overwrite the databases and plist under ./data; sources are the real iBooks paths
src_files = [
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
@@ -166,31 +148,19 @@ if __name__ == '__main__':
else:
print(f'file not found: {src} ')
from booklist_parse import parse_books_plist
from InquirerPy import inquirer # type: ignore
# First get all book metadata
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
# Build the title list: prefer displayname, then itemname, else assetid; sorted by the date field from parse_books_plist
manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
booksinfo = manager.get_books_info()
assetid2name = {}
assetid2lastopen = {}
from booklist_parse import get_books_last_open
# Get every book's last-opened time (dict whose values are {'last_open': timestamp})
last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)
last_open_times = manager.get_books_last_open()
for assetid, info in booksinfo.items():
name = info.get('displayname') or info.get('itemname') or assetid
# If the title contains '-', keep only the part before it
if '-' in name:
name = name.split('-', 1)[0].strip()
assetid2name[assetid] = name
# Sort by the timestamps returned by get_books_last_open (0 if missing)
ts = last_open_times.get(assetid, {}).get('last_open', 0)
assetid2lastopen[assetid] = ts
# Sort by the last_open timestamp in descending order
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
if not choices:
@@ -202,8 +172,6 @@ if __name__ == '__main__':
multiselect=False,
instruction="上下键选择,输入可模糊筛选,回车确定"
).execute()
# Resolve the selected assetid
for aid, name in assetid2name.items():
if answer.startswith(name):
selected_assetid = aid
@@ -211,10 +179,8 @@
else:
print("未找到选中书籍")
exit(1)
# Export only the selected book's notes
selected_booksnote = build_booksnote(bookid=selected_assetid)
selected_booksnote = exporter.build_booksnote(bookid=selected_assetid)
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
out_path = f'export_notes/notes_export_{selected_assetid}.md'
export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
print(f'《{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')
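For reference, a minimal sketch that restates the Markdown layout produced by export_booksnote_to_md (book -> chapter -> selectedtext, with the note quoted underneath). It mirrors the removed docstring above and the field names used in the code, but is an illustration rather than the exporter itself:

```python
import datetime

def booksnote_to_md_lines(booksnote, booksinfo):
    """Illustrative restatement of the export layout."""
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    lines = [f'# 笔记导出 {now}\n']
    for assetid, chapters in booksnote.items():
        info = booksinfo.get(assetid, {})
        lines.append(f"## {info.get('displayname') or info.get('itemname') or assetid}\n")
        for chapter, notes in chapters.items():
            lines.append(f'### {chapter}\n')
            for uuid, ann in notes.items():
                lines.append(f"{ann.get('selectedtext') or ''}\n")
                if ann.get('note'):
                    lines.append(f"> {ann['note']}\n")
    return lines
```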

View File

@@ -1,21 +1,29 @@
def parse_opf(filepath):
"""
Top-level wrapper kept for backward compatibility; delegates to OPFParser.parse_opf
"""
return OPFParser.parse_opf(filepath)
# parseopf.py
# -----------------------------
# Parse an EPUB's OPF file and extract the href of every html file keyed by id in the manifest section.
# Supports batch testing and quick href lookup by id.
# Depends on BeautifulSoup4
# -----------------------------
"""
opf_parse.py (OOP version)
--------------------------
Features:
- Parse an EPUB's OPF file and extract the href of every html file keyed by id in the manifest section
- Supports quick href lookup by id
- Supports batch testing
Depends on BeautifulSoup4
Main interface: OPFParser
- parse_opf(filepath): static method returning the id->href mapping (html files only)
"""
from collections import defaultdict
from bs4 import BeautifulSoup
import pprint
def parse_opf(filepath):
class OPFParser:
@staticmethod
def parse_opf(filepath):
"""
Parse the OPF file and return a defaultdict(dict) of {id: href}
Only items whose href ends with .html are kept
Args:
filepath (str): path to the OPF file
Returns:
@@ -24,7 +32,6 @@ def parse_opf(filepath):
result = defaultdict(dict)
with open(filepath, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'xml')
# Find the manifest section, iterate all items, and keep hrefs ending in .html
manifest = soup.find('manifest')
if manifest:
for item in manifest.find_all('item'):
@@ -34,6 +41,7 @@ def parse_opf(filepath):
result[id_] = href
return result
if __name__ == "__main__":
test_files = [
'./examples/epub_format_2/OEBPS/content.opf',
@@ -44,8 +52,7 @@ if __name__ == "__main__":
for file in test_files:
print(f"\n==== 测试文件: {file} ====")
try:
result = parse_opf(file)
pprint.pprint(result, indent=2, width=120, sort_dicts=False)
result = OPFParser.parse_opf(file)
# Added test: quickly print an href by id
test_ids = list(result.keys())[:3] # take the first three ids for the demo
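To make the manifest handling concrete, a self-contained sketch using an inline OPF fragment (the ids and hrefs are made up) that reproduces what parse_opf keeps:

```python
from bs4 import BeautifulSoup

# A made-up manifest fragment in the shape parse_opf expects.
OPF_SAMPLE = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
  <manifest>
    <item id="id15" href="text/part0001.html" media-type="application/xhtml+xml"/>
    <item id="id16" href="text/part0002.html" media-type="application/xhtml+xml"/>
    <item id="css" href="styles.css" media-type="text/css"/>
  </manifest>
</package>"""

soup = BeautifulSoup(OPF_SAMPLE, "xml")
id2href = {item.get("id"): item.get("href")
           for item in soup.find("manifest").find_all("item")
           if item.get("href", "").endswith(".html")}
print(id2href)  # {'id15': 'text/part0001.html', 'id16': 'text/part0002.html'}
```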

View File

@@ -1,6 +1,7 @@
"""
toc_parse.py
------------
toc_parse.py (OOP version)
--------------------------
Features:
- Parse the EPUB toc.ncx table of contents and recursively build the chapter tree
- Supports looking up the full label path by ref and filepos
@@ -8,43 +9,36 @@ toc_parse.py
- Compatible with multiple EPUB formats; supports batch testing
Depends on config.py for centralized path and configuration management
Main interface:
parse_navpoints(navpoints) # recursively parse navPoint nodes and return the chapter tree
find_label_path(node, ref, filepos, path) # look up the chapter label path for the given ref and filepos
find_section_by_selectedtext(html_path, selectedtext) # locate the chapter heading via the selected text
parse_html_title(html_path) # parse the html file title
Main interface: TOCParser
- parse_navpoints(navpoints): recursively parse navPoint nodes and return the chapter tree
- find_label_path(node, ref, filepos, path): look up the chapter label path for the given ref and filepos
- find_section_by_selectedtext(html_path, selectedtext): locate the chapter heading via the selected text
- parse_html_title(html_path): parse the html file title
Dependencies: BeautifulSoup4, pprint, os, typing
"""
import config
from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any
import pprint
import os
# ==== Helper: infer the containing chapter from where selectedtext appears in the html file ====
def find_section_by_selectedtext(html_path, selectedtext):
"""
Find where selectedtext appears in the html file, walk back to the nearest h1-h6 heading, and return that heading's text
Return None if no heading is found
"""
class TOCParser:
def __init__(self):
pass
@staticmethod
def find_section_by_selectedtext(html_path, selectedtext):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
# Search all text nodes for selectedtext
for elem in soup.find_all(string=True):
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
# Walk up through the parents looking for the nearest h1-h6
parent = elem.parent
while parent:
prev = parent.previous_sibling
# Look backwards through preceding siblings for an h1-h6
while prev:
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
return prev.get_text(strip=True)
prev = prev.previous_sibling
parent = parent.parent
# If not found, fall back to the first h1-h6 in the document
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
@@ -53,17 +47,13 @@ def find_section_by_selectedtext(html_path, selectedtext):
pass
return None
def parse_html_title(html_path):
"""
Parse the html file: prefer <title>, otherwise return the first h1/h2/h3/h4/h5/h6 in the body, or None
"""
@staticmethod
def parse_html_title(html_path):
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
# Prefer <title>
if soup.title and soup.title.string:
return soup.title.string.strip()
# Otherwise the first h1-h6 in the body
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
@@ -72,12 +62,8 @@ def parse_html_title(html_path):
pass
return None
def parse_navpoints(navpoints) -> Dict[str, dict]:
"""
Recursively parse navPoint nodes and return a nested dict structure
:param navpoints: list of navPoint nodes found by BeautifulSoup
:return: chapter tree structure
"""
@staticmethod
def parse_navpoints(navpoints):
result = {}
for navpoint in navpoints:
label = navpoint.navLabel.text.strip().strip('"“”')
@@ -90,81 +76,50 @@ def parse_navpoints(navpoints) -> Dict[str, dict]:
"label": label,
"ref": ref,
"filepos": filepos,
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
"children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
#pprint.pprint(result) # pretty-print result
return result
def find_label_path(
node: Any,
ref: str,
filepos: Optional[str] = None,
path: Optional[List[str]] = None
) -> Optional[str]:
"""
Look up the label path for the given ref and filepos in the nested dict structure
:param node: current node (a dict or a collection of dicts)
:param ref: html file name
:param filepos: file position, may be None
:param path: accumulated label path
:return: the full label path joined with " / ", or None if not found
"""
@staticmethod
def find_label_path(node, ref, filepos=None, path=None):
if path is None:
path = []
if isinstance(node, dict):
nodes = node.values() if "label" not in node else [node]
# 1. Prefer an exact match on ref and filepos
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
title = " / ".join(new_path)
#print(f'title ref={ref} filepos={filepos} -> {title}') #DBG
return title
title = find_label_path(v["children"], ref, filepos, new_path)
title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
if title:
#print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG
return title
# 2. If the lookup with filepos fails, fall back to the first chapter with the same ref (return as soon as ref matches)
if filepos is not None:
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
# print(f"对比 {v['ref']} == {ref}")
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
title = " / ".join(new_path)
#print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG
return title
title = find_label_path(v["children"], ref, None, new_path)
title = TOCParser.find_label_path(v["children"], ref, None, new_path)
if title:
#print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG
return title
# 3. If still nothing was found, try parsing the title of the html file that idref points to for a chapter label
# Only run this logic at the top-level call
if path == [] and ref and ref.endswith('.html'):
import os
# Automatically look for the html file in common directories, relative to the toc file's directory
caller_dir = os.path.dirname(os.path.abspath(__file__))
search_dirs = [caller_dir, os.getcwd()]
for d in search_dirs:
html_path = os.path.join(d, ref)
#print(f"查找 {html_path}")
if os.path.isfile(html_path):
title = parse_html_title(html_path)
title = TOCParser.parse_html_title(html_path)
if title:
return title
# Recursive search rooted at the toc file's directory
for d in search_dirs:
for root, _, files in os.walk(d):
if ref in files:
html_path = os.path.join(root, ref)
#print(f"2 查找 {html_path}")
title = parse_html_title(html_path)
title = TOCParser.parse_html_title(html_path)
if title:
return title
return None
@@ -182,8 +137,6 @@ if __name__ == "__main__":
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
]
for epub_dir, html_file, filepos in test_cases:
# Automatically find toc.ncx under the epub directory
import os
toc_path = None
for root, _, files in os.walk(epub_dir):
for f in files:
@@ -200,39 +153,32 @@ if __name__ == "__main__":
with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = find_label_path(toc_tree, html_file, filepos)
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
# The html file is not present in the toc; test parse_html_title directly
html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path):
title = parse_html_title(html_path)
title = TOCParser.parse_html_title(html_path)
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
# New: locate the chapter heading from selectedtext
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
section = find_section_by_selectedtext(html_path, selectedtext)
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
else:
print(f"未找到html文件: {html_path}")
except Exception as e:
print(f"测试失败: {e}")
# ==== New test: chapter lookup and html title parsing for the 变宋 notes ====
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
# Assume the note data is as follows
note_idref = 'text/part0002_split_003.html'
note_filepos = None
# Path to the 变宋 toc.ncx
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
import os
if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
# First try find_label_path to look up the chapter
label_path = find_label_path(toc_tree, note_idref, note_filepos)
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题")
else:
print(f"未找到toc.ncx: {bian_song_toc}")