This commit is contained in:
douboer 2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

View File

@ -1,136 +1,113 @@
""" """
annotationdata.py annotationdata.py (OOP版)
----------------- ------------------------
功能 功能
- 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记 - 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记
- 提供parse_location辅助函数解析笔记定位信息 - 提供parse_location辅助函数解析笔记定位信息
- 返回结构化的annotations数据便于后续章节定位与导出 - 返回结构化的annotations数据便于后续章节定位与导出
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口AnnotationManager
主要接口 - get_annotations(bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- get_annotations(db_path, bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos) - parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos)
依赖sqlite3, collections, re, os, datetime 依赖sqlite3, collections, re, os, datetime
""" """
import config import config
import sqlite3 import sqlite3
from collections import defaultdict
import re import re
import os import os
from collections import defaultdict
class AnnotationManager:
    """Read iBooks annotations from a local copy of AEAnnotation.sqlite.

    The database path defaults to ``config.LOCAL_ANNOTATION_DB`` when none
    is given.
    """

    def __init__(self, db_path=None):
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB

    @staticmethod
    def parse_location(location):
        """Parse a ZANNOTATIONLOCATION value into ``(idref, filepos)``.

        The first two ``[...]`` groups found in the string are returned in
        order; a missing group (or an empty/None location) yields ``None``
        in that position.
        """
        if not location:
            return None, None
        matches = re.findall(r'\[(.*?)\]', location)
        idref = matches[0] if matches else None
        filepos = matches[1] if len(matches) > 1 else None
        return idref, filepos

    @staticmethod
    def _format_creation_date(creationdate):
        """Convert a raw creation date into a ``Y/M/D`` string.

        Numbers and numeric strings are treated as seconds since
        2001-01-01; other strings are parsed from their leading
        ``YYYY-MM-DD``. ``None`` is returned unchanged, and values that
        cannot be converted fall back to ``str(creationdate)``.
        """
        import datetime

        # Explicit None check: 0 is a valid timestamp (the 2001-01-01 origin).
        if creationdate is None:
            return None
        origin = datetime.datetime(2001, 1, 1)
        try:
            if isinstance(creationdate, (int, float)):
                dt = origin + datetime.timedelta(seconds=creationdate)
            elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
                dt = origin + datetime.timedelta(seconds=float(creationdate))
            else:
                dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
        except (ValueError, TypeError, OverflowError):
            return str(creationdate)
        return f"{dt.year}/{dt.month}/{dt.day}"

    def get_annotations(self, bookid=None):
        """Return annotations as ``{assetid: {uuid: {...}}}``.

        Args:
            bookid: optional asset id; when given, only that book's rows are
                queried and the result always contains exactly that key
                (mapped to an empty dict if the book has no annotations).

        Each annotation dict has the keys ``creationdate``, ``filepos``,
        ``idref``, ``note`` and ``selectedtext``. Rows where both note and
        selected text are NULL are skipped.
        """
        # SQLite names its sidecar files by appending "-wal"/"-shm" to the
        # full database filename, whatever its extension is.
        wal_path = self.db_path + '-wal'
        shm_path = self.db_path + '-shm'
        for f in [self.db_path, wal_path, shm_path]:
            if not os.path.exists(f):
                print(f'警告: 缺少 {f},可能无法获取全部最新笔记')

        query = (
            'SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, '
            'ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID '
            'FROM ZAEANNOTATION'
        )
        params = ()
        if bookid is not None:
            query += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute(query, params).fetchall()
        finally:
            # Always release the handle, even when the query fails.
            conn.close()

        annotations = defaultdict(dict)
        for assetid, creationdate, location, note, selectedtext, uuid in rows:
            if note is None and selectedtext is None:
                continue
            idref, filepos = self.parse_location(location)
            annotations[str(assetid)][uuid] = {
                'creationdate': self._format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext
            }
        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), {})}
        return annotations
# 用法示例输出每本书的前3条笔记
if __name__ == "__main__":
    from pprint import pprint

    manager = AnnotationManager()

    # Smoke-test parse_location on a few representative epubcfi strings.
    sample_locations = (
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)',
    )
    for loc in sample_locations:
        idref, filepos = manager.parse_location(loc)
        print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")

    # Fetch and pretty-print every annotation of one specific asset id.
    test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    annotations = manager.get_annotations(bookid=test_bookid)
    print(f"\nAssetID={test_bookid} 的所有笔记:")
    pprint(annotations, indent=2, sort_dicts=False)

View File

@ -1,75 +1,66 @@
"""
booklist_parse.py
-----------------
功能
- 解析iBooks的Books.plist提取所有书籍元数据书名作者路径时间等
- 解析BKLibrary.sqlite获取每本书的最近打开时间苹果时间戳基准2001-01-01
依赖config.py 统一管理路径和配置项
主要接口
- parse_books_plist(plist_path)返回所有书籍元数据结构为{bk_id: {...}}
- get_books_last_open(db_path)返回所有书籍最近打开时间结构为{bk_id: {'last_open': 时间戳}}
依赖plistlib, collections, sqlite3, os, datetime
典型用法
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
books_open = get_books_last_open(config.LOCAL_LIBRARY_DB)
"""
import config import config
import plistlib import plistlib
from collections import defaultdict
class BookListManager:
    """Load iBooks book metadata (Books.plist) and last-open times (BKLibrary.sqlite).

    Paths default to ``config.LOCAL_BOOKS_PLIST`` and
    ``config.LOCAL_LIBRARY_DB``. Results of both getters are cached on the
    instance after the first successful read.
    """

    def __init__(self, plist_path=None, db_path=None):
        self.plist_path = plist_path or config.LOCAL_BOOKS_PLIST
        self.db_path = db_path or config.LOCAL_LIBRARY_DB
        self._booksinfo = None
        self._books_open = None

    def get_books_info(self):
        """Return ``{bk_id: {...metadata...}}`` parsed from the plist.

        Entries without a ``BKGeneratedItemId`` are skipped. Missing fields
        default to an empty string.
        """
        if self._booksinfo is not None:
            return self._booksinfo
        booksinfo = defaultdict(dict)
        with open(self.plist_path, 'rb') as f:
            plist_data = plistlib.load(f)
        for book in plist_data.get('Books', []):
            bk_id = book.get('BKGeneratedItemId')
            if not bk_id:
                continue
            booksinfo[bk_id] = {
                'displayname': book.get('BKDisplayName', ''),
                'author': book.get('artistName', ''),
                'type': book.get('BKBookType', ''),
                'bookid': bk_id,
                'itemname': book.get('itemName', ''),
                'path': book.get('path', ''),
                'date': book.get('BKInsertionDate', ''),
                'updatedate': book.get('updateDate', '')
            }
        self._booksinfo = booksinfo
        return booksinfo

    def get_books_last_open(self):
        """Return ``{asset_id: {'last_open': timestamp}}`` from the library DB.

        Returns an empty mapping when the database file does not exist.
        A database error is reported with a printed warning and yields
        whatever was collected so far (best effort).
        """
        if self._books_open is not None:
            return self._books_open
        books_open = defaultdict(dict)
        if not os.path.exists(self.db_path):
            return books_open
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''')
                for asset_id, last_open in cursor.fetchall():
                    if asset_id:
                        books_open[asset_id] = {
                            'last_open': last_open
                        }
            finally:
                # Close the handle even when the query itself fails.
                conn.close()
        except sqlite3.Error as e:
            print(f'警告: 读取BKLibrary.sqlite失败: {e}')
        self._books_open = books_open
        return books_open
if __name__ == '__main__': if __name__ == '__main__':
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) manager = BookListManager()
booksinfo = manager.get_books_info()
from pprint import pprint from pprint import pprint
print("\n【前三条示例】") print("\n【前三条示例】")
for k, v in list(booksinfo.items())[:3]: for k, v in list(booksinfo.items())[:3]:
@ -77,19 +68,10 @@ if __name__ == '__main__':
pprint(v, sort_dicts=False, indent=2) pprint(v, sort_dicts=False, indent=2)
print('-' * 60) print('-' * 60)
'''
print("\n【全部内容】")
for k, v in booksinfo.items():
print(f"{k}:")
pprint(v, sort_dicts=False, indent=2)
print('-' * 60)
'''
# 测试最近打开时间
print("\n【最近打开时间示例】") print("\n【最近打开时间示例】")
books_open = get_books_last_open() books_open = manager.get_books_last_open()
import datetime import datetime
for k, v in list(books_open.items())[:3]: for k, v in list(books_open.items())[:3]:
ts = v['last_open'] ts = v['last_open']
# 苹果时间戳基准2001-01-01
dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts) dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts)
print(f"{k}: {dt} (timestamp: {ts})") print(f"{k}: {dt} (timestamp: {ts})")

Binary file not shown.

View File

@ -158,31 +158,43 @@ answer = inquirer.fuzzy(
## 9.1 主要代码文件说明(细化) ## 9.1 主要代码文件说明(细化)
- `exportbooknotes.py` - `exportbooknotes.py`
- 采用 OOP 设计,核心类为 `BookNotesExporter`
- `build_booksnote(bookid=None)`:构建结构化笔记数据。
- `export_booksnote_to_md(booksnote, booksinfo, out_path=None)`:导出为 Markdown。
- `find_file_by_ext`、`get_toc_tree` 等辅助方法。
- 数据同步:自动复制 iBooks 数据库和元数据到本地。 - 数据同步:自动复制 iBooks 数据库和元数据到本地。
- 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。 - 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。
- 只处理用户选中书籍的笔记,按章节分组导出 Markdown。 - 只处理用户选中书籍的笔记,按章节分组导出 Markdown。
- 依赖核心解析模块,负责主流程调度。 - 依赖核心解析模块,负责主流程调度。
- `annotationdata.py` - `annotationdata.py`
- OOP 设计,核心类为 `AnnotationManager`
- `get_annotations(bookid=None)`:返回所有或指定 assetid 的笔记。
- `parse_location(location)`:静态方法,解析定位信息。
- 解析 AEAnnotation.sqlite提取所有或指定 assetid 的笔记。 - 解析 AEAnnotation.sqlite提取所有或指定 assetid 的笔记。
- 支持苹果时间戳转换,结构化输出。 - 支持苹果时间戳转换,结构化输出。
- parse_location 辅助函数,统一解析笔记定位信息。
- `booklist_parse.py` - `booklist_parse.py`
- OOP 设计,核心类为 `BookListManager`
- `get_books_info()`:获取书籍元数据。
- `get_books_last_open()`:获取每本书的最近打开时间。
- 解析 Books.plist获取书籍元数据书名、作者、路径、时间等 - 解析 Books.plist获取书籍元数据书名、作者、路径、时间等
- 解析 BKLibrary.sqlite获取每本书的最近打开时间zlastopendate苹果时间戳 - 解析 BKLibrary.sqlite获取每本书的最近打开时间。
- 提供统一数据接口,便于主流程排序和展示。
- `opf_parse.py` - `opf_parse.py`
- OOP 设计,核心类为 `OPFParser`
- `parse_opf(filepath)`:静态方法,返回 id->href 映射。
- 解析 epub 的 OPF 文件获取章节与文件映射关系idref -> href - 解析 epub 的 OPF 文件获取章节与文件映射关系idref -> href
- 支持多种 epub 目录结构。
- `toc_parse.py` - `toc_parse.py`
- OOP 设计,核心类为 `TOCParser`
- `parse_navpoints(navpoints)`:递归解析 navPoint 节点。
- `find_label_path(node, ref, filepos, path)`:查找章节路径。
- `find_section_by_selectedtext(html_path, selectedtext)`:通过选中文本定位章节标题。
- `parse_html_title(html_path)`:解析 html 文件标题。
- 解析 NCX 目录文件,递归构建章节树结构。 - 解析 NCX 目录文件,递归构建章节树结构。
- find_label_path支持通过 ref 和 filepos 查找完整 label 路径。
- find_section_by_selectedtext通过选中文本在 html 文件中定位章节标题。
- parse_html_title解析 html 文件标题。
- `backup/booksnote.py` - `backup/booksnote.py`
- 历史/备份脚本,辅助数据迁移或格式转换。 - 历史/备份脚本,辅助数据迁移或格式转换。

View File

@ -1,4 +1,4 @@
# 笔记导出 2025-08-15 13:25 # 笔记导出 2025-08-15 17:20
## 传统十论 ## 传统十论

View File

@ -1,31 +1,17 @@
""" """
exportbooknotes.py exportbooknotes.py (OOP版)
------------------ -------------------------
功能 功能
- 自动同步iBooks数据库和元数据文件到本地data目录 - 自动同步iBooks数据库和元数据文件到本地data目录
- 解析AEAnnotation.sqliteBooks.plistBKLibrary.sqlite构建结构化笔记数据 - 解析AEAnnotation.sqliteBooks.plistBKLibrary.sqlite构建结构化笔记数据
- 解析epub目录和章节信息定位每条笔记所属章节 - 解析epub目录和章节信息定位每条笔记所属章节
- 命令行菜单按最近打开时间降序展示书籍列表供用户选择导出 - 命令行菜单按最近打开时间降序展示书籍列表供用户选择导出
- 仅导出选中书籍的所有笔记按章节分组生成Markdown文件 - 仅导出选中书籍的所有笔记按章节分组生成Markdown文件
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口BookNotesExporter
主要数据流 - run()命令行交互式导出主流程
1. 数据同步到data目录 - build_booksnote(bookid=None)构建结构化笔记数据
2. 解析Books.plist获取书籍元数据 - export_booksnote_to_md(booksnote, booksinfo, out_path=None)导出为Markdown
3. 解析BKLibrary.sqlite获取最近打开时间
4. 菜单排序与显示书名+时间戳
5. 解析AEAnnotation.sqlite获取笔记
6. 解析epub目录定位章节
7. 导出Markdown文件
依赖Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
主要数据流
典型用法
python exportbooknotes.py
# 按提示选择书籍自动导出笔记到export_notes目录
""" """
import config import config
""" """
@ -40,117 +26,113 @@ booksnote = {
}}} }}}
} }
""" """
from collections import defaultdict
import os import os
from annotationdata import get_annotations from collections import defaultdict
from booklist_parse import parse_books_plist from annotationdata import AnnotationManager
from booklist_parse import BookListManager
from opf_parse import parse_opf from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path from toc_parse import TOCParser
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pprint import pprint
class BookNotesExporter:
    """Build chapter-grouped note data for iBooks books and export it as Markdown.

    Args:
        config_module: object exposing ``LOCAL_ANNOTATION_DB``,
            ``LOCAL_BOOKS_PLIST`` and ``LOCAL_LIBRARY_DB``; defaults to the
            project's ``config`` module.
    """

    def __init__(self, config_module=None):
        # Resolve the default lazily so the class does not bind to config
        # at definition time.
        if config_module is None:
            config_module = config
        self.config = config_module
        self.annotation_db = config_module.LOCAL_ANNOTATION_DB
        self.books_plist = config_module.LOCAL_BOOKS_PLIST
        self.library_db = config_module.LOCAL_LIBRARY_DB

    @staticmethod
    def find_file_by_ext(root, exts):
        """Return the first file under ``root`` (recursive) whose lowercased
        name ends with one of ``exts``, or ``None`` if there is none."""
        suffixes = tuple(exts)
        for dirpath, _, files in os.walk(root):
            for f in files:
                if f.lower().endswith(suffixes):
                    return os.path.join(dirpath, f)
        return None

    @staticmethod
    def get_toc_tree(toc_path):
        """Parse an NCX file into the nested dict built by
        ``TOCParser.parse_navpoints``; an NCX without ``navMap`` yields ``{}``."""
        with open(toc_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')
        nav_map = soup.find('navMap')
        if nav_map is None:
            # Malformed/empty NCX: behave like a book with no TOC entries.
            return {}
        nav_points = nav_map.find_all('navPoint', recursive=False)
        return TOCParser.parse_navpoints(nav_points)

    @staticmethod
    def _chapter_from_html(epub_path, href, selectedtext):
        """Fallback chapter lookup: find the heading around ``selectedtext``
        in the html file that ``href`` points to, else a placeholder label."""
        if href and selectedtext:
            # NOTE(review): href is joined to the epub root here; if OPF hrefs
            # are relative to the OPF's own directory this may miss files - confirm.
            html_path = os.path.join(epub_path, href.split('#')[0])
            if os.path.exists(html_path):
                section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
                if section:
                    return section
        return "(未找到章节)"

    def build_booksnote(self, bookid=None):
        """Return ``{assetid: {chapter: {uuid: {...}}}}`` for all books or one.

        Books are skipped when they are missing from Books.plist, when their
        ``path`` is not a directory, or when no ``.opf``/``.ncx`` is found
        under that path.
        """
        manager = AnnotationManager(self.annotation_db)
        annotations = manager.get_annotations(bookid=bookid)
        bl_manager = BookListManager(plist_path=self.books_plist)
        booksinfo = bl_manager.get_books_info()
        booksnote = defaultdict(lambda: defaultdict(dict))
        for assetid, notes in annotations.items():
            bookinfo = booksinfo.get(assetid)
            if not bookinfo:
                continue
            epub_path = bookinfo.get('path')
            if not epub_path or not os.path.isdir(epub_path):
                continue
            opf_path = self.find_file_by_ext(epub_path, ['.opf'])
            ncx_path = self.find_file_by_ext(epub_path, ['.ncx'])
            if not opf_path or not ncx_path:
                continue
            id2href = parse_opf(opf_path)
            toc_tree = self.get_toc_tree(ncx_path)
            for uuid, ann in notes.items():
                idref = ann['idref']
                filepos = ann['filepos']
                href = id2href.get(idref, idref)
                # href is None when the location had no [idref]; there is
                # nothing to look up in the TOC in that case.
                chapter = TOCParser.find_label_path(toc_tree, href, filepos) if href else None
                if chapter is None:
                    chapter = self._chapter_from_html(epub_path, href, ann.get('selectedtext'))
                booksnote[assetid][chapter][uuid] = {
                    'creationdate': ann['creationdate'],
                    'filepos': filepos,
                    'idref': href,
                    'note': ann['note'],
                    'selectedtext': ann['selectedtext']
                }
        return booksnote

    def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None):
        """Render ``booksnote`` as Markdown and optionally write it to a file.

        Layout: a title line with the export time, ``## book``,
        ``### chapter``, then for each note the selected text followed by
        ``> note`` when present.

        Args:
            booksnote: ``{assetid: {chapter: {uuid: annotation}}}``.
            booksinfo: ``{assetid: {...}}``; ``itemname`` is used as the
                book title, falling back to the asset id.
            out_path: when given, the Markdown is written there (UTF-8);
                missing parent directories are created.

        Returns:
            The Markdown text.
        """
        import datetime

        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        lines = [f'# 笔记导出 {now}\n']
        for assetid, chapters in booksnote.items():
            bookname = booksinfo.get(assetid, {}).get('itemname', assetid)
            lines.append(f'\n## {bookname}\n')
            for chapter, notes in chapters.items():
                lines.append(f'### {chapter}')
                for ann in notes.values():
                    sel = ann.get('selectedtext')
                    note = ann.get('note')
                    if sel:
                        lines.append(sel)
                    if note:
                        lines.append(f'> {note}')
                    lines.append('')
        md = '\n'.join(lines)
        if out_path:
            out_dir = os.path.dirname(out_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(md)
        return md
if __name__ == '__main__': if __name__ == '__main__':
import shutil import shutil
import os.path import os.path
from InquirerPy import inquirer # type: ignore
exporter = BookNotesExporter(config)
# 自动覆盖 ./data 下的数据库和plist文件源为iBooks真实路径 # 自动覆盖 ./data 下的数据库和plist文件源为iBooks真实路径
src_files = [ src_files = [
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB), (config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
@ -166,31 +148,19 @@ if __name__ == '__main__':
else: else:
print(f'file not found: {src} ') print(f'file not found: {src} ')
from booklist_parse import parse_books_plist
from InquirerPy import inquirer # type: ignore
# 先获取所有书籍元数据 # 先获取所有书籍元数据
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
booksinfo = manager.get_books_info()
# 构建书名列表优先displayname, 其次itemname, 否则assetid按parse_books_plist中的date字段排序
assetid2name = {} assetid2name = {}
assetid2lastopen = {} assetid2lastopen = {}
from booklist_parse import get_books_last_open last_open_times = manager.get_books_last_open()
# 获取所有书籍的最后打开时间(字典,值为{'last_open': 时间戳}
last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)
for assetid, info in booksinfo.items(): for assetid, info in booksinfo.items():
name = info.get('displayname') or info.get('itemname') or assetid name = info.get('displayname') or info.get('itemname') or assetid
# 如果书名中包含“-”,只取“-”前面的部分
if '-' in name: if '-' in name:
name = name.split('-', 1)[0].strip() name = name.split('-', 1)[0].strip()
assetid2name[assetid] = name assetid2name[assetid] = name
# 用 get_books_last_open 返回的时间戳排序如无则为0
ts = last_open_times.get(assetid, {}).get('last_open', 0) ts = last_open_times.get(assetid, {}).get('last_open', 0)
assetid2lastopen[assetid] = ts assetid2lastopen[assetid] = ts
# 按last_open时间戳降序排列
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True) sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids] choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
if not choices: if not choices:
@ -202,8 +172,6 @@ if __name__ == '__main__':
multiselect=False, multiselect=False,
instruction="上下键选择,输入可模糊筛选,回车确定" instruction="上下键选择,输入可模糊筛选,回车确定"
).execute() ).execute()
# 解析选中assetid
for aid, name in assetid2name.items(): for aid, name in assetid2name.items():
if answer.startswith(name): if answer.startswith(name):
selected_assetid = aid selected_assetid = aid
@ -211,10 +179,8 @@ if __name__ == '__main__':
else: else:
print("未找到选中书籍") print("未找到选中书籍")
exit(1) exit(1)
selected_booksnote = exporter.build_booksnote(bookid=selected_assetid)
# 只导出选中书的笔记
selected_booksnote = build_booksnote(bookid=selected_assetid)
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})} selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
out_path = f'export_notes/notes_export_{selected_assetid}.md' out_path = f'export_notes/notes_export_{selected_assetid}.md'
export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path) exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
print(f'{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}') print(f'{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')

View File

@ -1,38 +1,46 @@
# parseopf.py
# -----------------------------
# 用于解析EPUB电子书的OPF文件提取manifest部分所有id对应的html文件href。
# 支持批量测试和通过id快速查找href。
# 依赖BeautifulSoup4
# -----------------------------
from collections import defaultdict
from bs4 import BeautifulSoup
import pprint
"""
opf_parse.py (OOP version)
--------------------------
Purpose:
- Parse an EPUB's OPF file and extract, from its manifest, the href of
  every item whose href ends with "html", keyed by item id.
- Allow quick href lookup by id.

Depends on: BeautifulSoup4
Main interface: OPFParser
- parse_opf(filepath): static method returning the id -> href mapping.
"""
from collections import defaultdict
from bs4 import BeautifulSoup


class OPFParser:
    """Namespace for OPF manifest parsing."""

    @staticmethod
    def parse_opf(filepath):
        """Parse an OPF file and return its ``{id: href}`` mapping.

        Only manifest items whose href (trimmed, case-insensitive) ends with
        ``html`` are kept.

        Args:
            filepath (str): path to the OPF file (read as UTF-8).

        Returns:
            defaultdict: item id -> href; empty when there is no manifest.
        """
        result = defaultdict(dict)
        with open(filepath, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')
        manifest = soup.find('manifest')
        if manifest is None:
            return result
        for item in manifest.find_all('item'):
            id_ = item.get('id')
            # Store the trimmed href: it is the value the suffix test ran on,
            # and surrounding whitespace would break later path joins.
            href = (item.get('href') or '').strip()
            if id_ and href.lower().endswith('html'):
                result[id_] = href
        return result


def parse_opf(filepath):
    """Module-level function kept for older callers; delegates to
    ``OPFParser.parse_opf``."""
    return OPFParser.parse_opf(filepath)
if __name__ == "__main__": if __name__ == "__main__":
test_files = [ test_files = [
@ -44,8 +52,7 @@ if __name__ == "__main__":
for file in test_files: for file in test_files:
print(f"\n==== 测试文件: {file} ====") print(f"\n==== 测试文件: {file} ====")
try: try:
result = parse_opf(file) result = OPFParser.parse_opf(file)
pprint.pprint(result, indent=2, width=120, sort_dicts=False)
# 增加通过id快速打印href的测试 # 增加通过id快速打印href的测试
test_ids = list(result.keys())[:3] # 取前三个id做演示 test_ids = list(result.keys())[:3] # 取前三个id做演示

View File

@ -1,6 +1,7 @@
""" """
toc_parse.py toc_parse.py (OOP版)
------------ -------------------
功能 功能
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构 - 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构
- 支持通过ref和filepos查找完整label路径 - 支持通过ref和filepos查找完整label路径
@ -8,166 +9,120 @@ toc_parse.py
- 兼容多种EPUB格式支持批量测试 - 兼容多种EPUB格式支持批量测试
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口 主要接口TOCParser
parse_navpoints(navpoints) # 递归解析navPoint节点返回章节树结构 - parse_navpoints(navpoints)递归解析navPoint节点返回章节树结构
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径 - find_label_path(node, ref, filepos, path)查找指定ref和filepos的章节label路径
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题 - find_section_by_selectedtext(html_path, selectedtext)通过选中文本定位章节标题
parse_html_title(html_path) # 解析html文件标题 - parse_html_title(html_path)解析html文件标题
依赖BeautifulSoup4, pprint, os, typing 依赖BeautifulSoup4, pprint, os, typing
""" """
import config import config
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any import os
import pprint
# ==== 辅助函数根据selectedtext在html文件中的位置推断所在章节 ==== class TOCParser:
def find_section_by_selectedtext(html_path, selectedtext): def __init__(self):
"""
在html文件中查找selectedtext出现的位置向上回溯最近的h1-h6标题返回该标题文本
若未找到标题则返回None
"""
try:
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
# 在所有文本节点中查找selectedtext
for elem in soup.find_all(string=True):
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
# 回溯父节点查找最近的h1-h6
parent = elem.parent
while parent:
prev = parent.previous_sibling
# 向上查找同级前面的h1-h6
while prev:
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
return prev.get_text(strip=True)
prev = prev.previous_sibling
parent = parent.parent
# 若未找到尝试全局第一个h1-h6
for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag)
if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass pass
return None
def parse_html_title(html_path): @staticmethod
""" def find_section_by_selectedtext(html_path, selectedtext):
解析html文件优先返回<title>否则返回body第一个h1/h2/h3/h4/h5/h6或None try:
""" with open(html_path, 'r', encoding='utf-8') as f:
try: soup = BeautifulSoup(f, 'html.parser')
with open(html_path, 'r', encoding='utf-8') as f: for elem in soup.find_all(string=True):
soup = BeautifulSoup(f, 'html.parser') if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
# 优先<title> parent = elem.parent
if soup.title and soup.title.string: while parent:
return soup.title.string.strip() prev = parent.previous_sibling
# 其次正文第一个h1-h6 while prev:
for tag in ['h1','h2','h3','h4','h5','h6']: if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag) return prev.get_text(strip=True)
if h and h.get_text(strip=True): prev = prev.previous_sibling
return h.get_text(strip=True) parent = parent.parent
except Exception: for tag in ['h1','h2','h3','h4','h5','h6']:
pass h = soup.find(tag)
return None if h and h.get_text(strip=True):
return h.get_text(strip=True)
except Exception:
pass
return None
def parse_navpoints(navpoints) -> Dict[str, dict]: @staticmethod
""" def parse_html_title(html_path):
递归解析 navpoints 节点返回嵌套 dict 结构 try:
:param navpoints: BeautifulSoup 查找到的 navPoint 节点列表 with open(html_path, 'r', encoding='utf-8') as f:
:return: 章节树结构 soup = BeautifulSoup(f, 'html.parser')
""" if soup.title and soup.title.string:
result = {} return soup.title.string.strip()
for navpoint in navpoints: for tag in ['h1','h2','h3','h4','h5','h6']:
label = navpoint.navLabel.text.strip().strip('"“”') h = soup.find(tag)
src = navpoint.content["src"] if h and h.get_text(strip=True):
if "#" in src: return h.get_text(strip=True)
ref, filepos = src.split("#", 1) except Exception:
else: pass
ref, filepos = src, None return None
entry = {
"label": label,
"ref": ref,
"filepos": filepos,
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
#pprint.pprint(result) # 格式化打印result @staticmethod
def parse_navpoints(navpoints):
result = {}
for navpoint in navpoints:
label = navpoint.navLabel.text.strip().strip('"“”')
src = navpoint.content["src"]
if "#" in src:
ref, filepos = src.split("#", 1)
else:
ref, filepos = src, None
entry = {
"label": label,
"ref": ref,
"filepos": filepos,
"children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
}
result[navpoint.get("id")] = entry
return result
return result @staticmethod
def find_label_path(node, ref, filepos=None, path=None):
def find_label_path( if path is None:
node: Any, path = []
ref: str, if isinstance(node, dict):
filepos: Optional[str] = None, nodes = node.values() if "label" not in node else [node]
path: Optional[List[str]] = None
) -> Optional[str]:
"""
在嵌套 dict 结构中查找指定 ref filepos label 路径
:param node: 当前节点dict dict集合
:param ref: html文件名
:param filepos: 文件位置可为 None
:param path: label 路径累积
:return: / 分隔的完整 label 路径未找到返回 None
"""
if path is None:
path = []
if isinstance(node, dict):
nodes = node.values() if "label" not in node else [node]
# 1. 优先精确匹配ref和filepos
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
title = " / ".join(new_path)
#print(f'title ref={ref} filepos={filepos} -> {title}') #DBG
return title
title = find_label_path(v["children"], ref, filepos, new_path)
if title:
#print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG
return title
# 2. 如果带filepos查找失败回退到同ref下第一个章节即只要ref匹配就返回
if filepos is not None:
for v in nodes: for v in nodes:
if "label" in v: if "label" in v:
new_path = path + [v["label"]] new_path = path + [v["label"]]
# print(f"对比 {v['ref']} == {ref}") if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
title = " / ".join(new_path) title = " / ".join(new_path)
#print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG
return title return title
title = find_label_path(v["children"], ref, None, new_path) title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
if title:
#print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG
return title
# 3. 若完全未找到尝试直接解析idref所指html文件标题获取章节label信息
# 仅在顶层调用时执行此逻辑
if path == [] and ref and ref.endswith('.html'):
import os
# 自动在常见目录下查找html文件以toc文件目录为基准
caller_dir = os.path.dirname(os.path.abspath(__file__))
search_dirs = [caller_dir, os.getcwd()]
for d in search_dirs:
html_path = os.path.join(d, ref)
#print(f"查找 {html_path}")
if os.path.isfile(html_path):
title = parse_html_title(html_path)
if title:
return title
# 递归查找以toc文件目录为根
for d in search_dirs:
for root, _, files in os.walk(d):
if ref in files:
html_path = os.path.join(root, ref)
#print(f"2 查找 {html_path}")
title = parse_html_title(html_path)
if title: if title:
return title return title
return None if filepos is not None:
for v in nodes:
if "label" in v:
new_path = path + [v["label"]]
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
title = " / ".join(new_path)
return title
title = TOCParser.find_label_path(v["children"], ref, None, new_path)
if title:
return title
if path == [] and ref and ref.endswith('.html'):
caller_dir = os.path.dirname(os.path.abspath(__file__))
search_dirs = [caller_dir, os.getcwd()]
for d in search_dirs:
html_path = os.path.join(d, ref)
if os.path.isfile(html_path):
title = TOCParser.parse_html_title(html_path)
if title:
return title
for d in search_dirs:
for root, _, files in os.walk(d):
if ref in files:
html_path = os.path.join(root, ref)
title = TOCParser.parse_html_title(html_path)
if title:
return title
return None
if __name__ == "__main__": if __name__ == "__main__":
# ==== 批量测试指定toc/html/filepos列表 ==== # ==== 批量测试指定toc/html/filepos列表 ====
@ -182,8 +137,6 @@ if __name__ == "__main__":
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""], [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
] ]
for epub_dir, html_file, filepos in test_cases: for epub_dir, html_file, filepos in test_cases:
# 自动查找epub目录下的toc.ncx
import os
toc_path = None toc_path = None
for root, _, files in os.walk(epub_dir): for root, _, files in os.walk(epub_dir):
for f in files: for f in files:
@ -200,39 +153,32 @@ if __name__ == "__main__":
with open(toc_path, "r", encoding="utf-8") as f: with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml") soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap") nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = find_label_path(toc_tree, html_file, filepos) label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}") print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
# tocb中不存在html直接测试parse_html_title
html_path = os.path.join(epub_dir, html_file.split('#')[0]) html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path): if os.path.exists(html_path):
title = parse_html_title(html_path) title = TOCParser.parse_html_title(html_path)
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}") print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
# 新增根据selectedtext定位章节标题
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以' selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
section = find_section_by_selectedtext(html_path, selectedtext) section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}") print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
else: else:
print(f"未找到html文件: {html_path}") print(f"未找到html文件: {html_path}")
except Exception as e: except Exception as e:
print(f"测试失败: {e}") print(f"测试失败: {e}")
# ==== 新增测试变宋笔记章节定位和html标题解析 ====
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====") print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
# 假设笔记数据如下
note_idref = 'text/part0002_split_003.html' note_idref = 'text/part0002_split_003.html'
note_filepos = None note_filepos = None
# 变宋toc.ncx路径
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx" bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
import os
if os.path.exists(bian_song_toc): if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f: with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml") soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap") nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
# 先尝试用find_label_path查找章节 label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
label_path = find_label_path(toc_tree, note_idref, note_filepos)
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题") print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题")
else: else:
print(f"未找到toc.ncx: {bian_song_toc}") print(f"未找到toc.ncx: {bian_song_toc}")