This commit is contained in:
douboer
2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

View File

@@ -1,31 +1,17 @@
"""
exportbooknotes.py
------------------
exportbooknotes.py (OOP版)
-------------------------
功能:
- 自动同步iBooks数据库和元数据文件到本地data目录。
- 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite构建结构化笔记数据。
- 解析epub目录和章节信息定位每条笔记所属章节。
- 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。
- 仅导出选中书籍的所有笔记按章节分组生成Markdown文件。
依赖config.py 统一管理路径和配置项。
主要数据流:
1. 数据同步到data目录
2. 解析Books.plist获取书籍元数据
3. 解析BKLibrary.sqlite获取最近打开时间
4. 菜单排序与显示(书名+时间戳)
5. 解析AEAnnotation.sqlite获取笔记
6. 解析epub目录定位章节
7. 导出Markdown文件
依赖Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
主要数据流:
典型用法:
python exportbooknotes.py
# 按提示选择书籍自动导出笔记到export_notes目录
主要接口:BookNotesExporter
- run():命令行交互式导出主流程
- build_booksnote(bookid=None):构建结构化笔记数据
- export_booksnote_to_md(booksnote, booksinfo, out_path=None):导出为Markdown
"""
import config
"""
@@ -40,117 +26,113 @@ booksnote = {
}}}
}
"""
from collections import defaultdict
import os
from annotationdata import get_annotations
from booklist_parse import parse_books_plist
from collections import defaultdict
from annotationdata import AnnotationManager
from booklist_parse import BookListManager
from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path
from toc_parse import TOCParser
from bs4 import BeautifulSoup
from pprint import pprint
def find_file_by_ext(root, exts):
    """Return the path of the first file under ``root`` ending in one of ``exts``.

    The tree is walked with ``os.walk`` and the first file whose lower-cased
    name ends with any of the given suffixes is returned, so matching is
    case-insensitive on both the file name and the suffixes.

    Args:
        root: Directory to search recursively.
        exts: A single suffix such as ``'.opf'`` or an iterable of suffixes.

    Returns:
        The joined path of the first matching file, or ``None`` when no file
        matches (this includes a missing ``root`` and an empty ``exts``).
    """
    # A bare string would otherwise be iterated one character at a time.
    if isinstance(exts, str):
        exts = [exts]
    # File names are lower-cased before the comparison, so the suffixes have
    # to be lower-cased as well or an upper-case suffix could never match.
    suffixes = tuple(ext.lower() for ext in exts)
    if not suffixes:
        return None
    for dirpath, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(suffixes):
                return os.path.join(dirpath, name)
    return None
def get_toc_tree(toc_path):
with open(toc_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'xml')
nav_map = soup.find('navMap')
class BookNotesExporter:
def __init__(self, config_module=config):
self.config = config_module
self.annotation_db = config_module.LOCAL_ANNOTATION_DB
self.books_plist = config_module.LOCAL_BOOKS_PLIST
self.library_db = config_module.LOCAL_LIBRARY_DB
nav_points = nav_map.find_all('navPoint', recursive=False)
toc_tree = parse_navpoints(nav_points)
#pprint(toc_tree, indent=2, depth=5)
return toc_tree
@staticmethod
def find_file_by_ext(root, exts):
for dirpath, _, files in os.walk(root):
for f in files:
for ext in exts:
if f.lower().endswith(ext):
return os.path.join(dirpath, f)
return None
def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None):
# 支持只处理特定 assetid 的笔记
annotations = get_annotations(annotation_db, bookid=bookid)
booksinfo = parse_books_plist(books_plist)
booksnote = defaultdict(lambda: defaultdict(dict))
for assetid, notes in annotations.items():
# 获取epub路径
bookinfo = booksinfo.get(assetid)
if not bookinfo:
continue
epub_path = bookinfo.get('path')
if not epub_path or not os.path.isdir(epub_path):
continue
# 查找opf和ncx
opf_path = find_file_by_ext(epub_path, ['.opf'])
ncx_path = find_file_by_ext(epub_path, ['.ncx'])
if not opf_path or not ncx_path:
continue
id2href = parse_opf(opf_path)
toc_tree = get_toc_tree(ncx_path)
for uuid, ann in notes.items():
idref = ann['idref']
filepos = ann['filepos']
href = id2href.get(idref, idref)
chapter = find_label_path(toc_tree, href, filepos)
if chapter is None:
# 直接从html文件获取章节信息
html_path = os.path.join(epub_path, href.split('#')[0])
selectedtext = ann.get('selectedtext')
if os.path.exists(html_path) and selectedtext:
from toc_parse import find_section_by_selectedtext
section = find_section_by_selectedtext(html_path, selectedtext)
if section:
chapter = section
@staticmethod
def get_toc_tree(toc_path):
    """Parse an NCX file into the TOC tree produced by ``TOCParser``.

    Args:
        toc_path: Path of the ``.ncx`` file; it is read as UTF-8.

    Returns:
        The result of ``TOCParser.parse_navpoints`` applied to the direct
        ``navPoint`` children of the document's ``navMap``. When the
        document has no ``navMap`` the parser receives an empty list.
    """
    with open(toc_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')
    nav_map = soup.find('navMap')
    # find() yields None for a missing element; calling find_all on it would
    # raise AttributeError, so an NCX without a navMap gets no nav points.
    if nav_map is None:
        nav_points = []
    else:
        # recursive=False keeps only top-level entries; nested navPoints are
        # left for the parser to descend into.
        nav_points = nav_map.find_all('navPoint', recursive=False)
    return TOCParser.parse_navpoints(nav_points)
def build_booksnote(self, bookid=None):
manager = AnnotationManager(self.annotation_db)
annotations = manager.get_annotations(bookid=bookid)
bl_manager = BookListManager(plist_path=self.books_plist)
booksinfo = bl_manager.get_books_info()
booksnote = defaultdict(lambda: defaultdict(dict))
for assetid, notes in annotations.items():
bookinfo = booksinfo.get(assetid)
if not bookinfo:
continue
epub_path = bookinfo.get('path')
if not epub_path or not os.path.isdir(epub_path):
continue
opf_path = self.find_file_by_ext(epub_path, ['.opf'])
ncx_path = self.find_file_by_ext(epub_path, ['.ncx'])
if not opf_path or not ncx_path:
continue
id2href = parse_opf(opf_path)
toc_tree = self.get_toc_tree(ncx_path)
for uuid, ann in notes.items():
idref = ann['idref']
filepos = ann['filepos']
href = id2href.get(idref, idref)
chapter = TOCParser.find_label_path(toc_tree, href, filepos)
if chapter is None:
html_path = os.path.join(epub_path, href.split('#')[0])
selectedtext = ann.get('selectedtext')
if os.path.exists(html_path) and selectedtext:
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
if section:
chapter = section
else:
chapter = "(未找到章节)"
else:
chapter = "(未找到章节)"
else:
chapter = "(未找到章节)"
booksnote[assetid][chapter][uuid] = {
'creationdate': ann['creationdate'],
'filepos': filepos,
'idref': href,
'note': ann['note'],
'selectedtext': ann['selectedtext']
}
return booksnote
booksnote[assetid][chapter][uuid] = {
'creationdate': ann['creationdate'],
'filepos': filepos,
'idref': href,
'note': ann['note'],
'selectedtext': ann['selectedtext']
}
return booksnote
import datetime
def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
    """Render a ``booksnote`` structure as Markdown and optionally save it.

    Output layout::

        # 笔记导出 <export time>
        ## <book name>
        ### <chapter>
        <selectedtext>
        > <note>        (only when the annotation has a note)

    Args:
        booksnote: Mapping ``assetid -> chapter -> uuid -> annotation``. Each
            annotation is a mapping from which ``selectedtext`` and ``note``
            are read.
        booksinfo: Mapping ``assetid -> book info``. The ``itemname`` entry is
            used as the book heading, falling back to the asset id.
        out_path: Optional output file. When truthy, the Markdown is written
            there as UTF-8 and missing parent directories are created first.

    Returns:
        The generated Markdown text.
    """
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    lines = [f'# 笔记导出 {now}\n']
    for assetid, chapters in booksnote.items():
        bookname = booksinfo.get(assetid, {}).get('itemname', assetid)
        lines.append(f'\n## {bookname}\n')
        for chapter, notes in chapters.items():
            lines.append(f'### {chapter}')
            for ann in notes.values():
                sel = ann.get('selectedtext')
                note = ann.get('note')
                if sel:
                    lines.append(sel)
                if note:
                    lines.append(f'> {note}')
                # Blank line so consecutive annotations stay separate paragraphs.
                lines.append('')
    md = '\n'.join(lines)
    if out_path:
        # open() does not create directories; without this the export fails
        # when the target folder has not been created yet.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(md)
    return md
def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None):
    """Render a ``booksnote`` structure as Markdown and optionally save it.

    The text starts with a ``# 笔记导出 <time>`` title, followed by one
    ``##`` heading per book and one ``###`` heading per chapter. Each
    annotation contributes its selected text, then ``> note`` when a note
    exists, then a blank line.

    Args:
        booksnote: Mapping ``assetid -> chapter -> uuid -> annotation``. Each
            annotation is a mapping from which ``selectedtext`` and ``note``
            are read.
        booksinfo: Mapping ``assetid -> book info``. The ``itemname`` entry is
            used as the book heading, falling back to the asset id.
        out_path: Optional output file. When truthy, the Markdown is written
            there as UTF-8 and missing parent directories are created first.

    Returns:
        The generated Markdown text.
    """
    import datetime
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    lines = [f'# 笔记导出 {now}\n']
    for assetid, chapters in booksnote.items():
        bookname = booksinfo.get(assetid, {}).get('itemname', assetid)
        lines.append(f'\n## {bookname}\n')
        for chapter, notes in chapters.items():
            lines.append(f'### {chapter}')
            for ann in notes.values():
                sel = ann.get('selectedtext')
                note = ann.get('note')
                if sel:
                    lines.append(sel)
                if note:
                    lines.append(f'> {note}')
                # Blank line so consecutive annotations stay separate paragraphs.
                lines.append('')
    md = '\n'.join(lines)
    if out_path:
        # open() does not create directories; without this the export fails
        # when the target folder has not been created yet.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(md)
    return md
if __name__ == '__main__':
import shutil
import os.path
from InquirerPy import inquirer # type: ignore
exporter = BookNotesExporter(config)
# 自动覆盖 ./data 下的数据库和plist文件源为iBooks真实路径
src_files = [
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
@@ -166,31 +148,19 @@ if __name__ == '__main__':
else:
print(f'file not found: {src} ')
from booklist_parse import parse_books_plist
from InquirerPy import inquirer # type: ignore
# 先获取所有书籍元数据
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
# 构建书名列表优先displayname, 其次itemname, 否则assetid按parse_books_plist中的date字段排序
manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
booksinfo = manager.get_books_info()
assetid2name = {}
assetid2lastopen = {}
from booklist_parse import get_books_last_open
# 获取所有书籍的最后打开时间(字典,值为{'last_open': 时间戳}
last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)
last_open_times = manager.get_books_last_open()
for assetid, info in booksinfo.items():
name = info.get('displayname') or info.get('itemname') or assetid
# 如果书名中包含“-”,只取“-”前面的部分
if '-' in name:
name = name.split('-', 1)[0].strip()
assetid2name[assetid] = name
# 用 get_books_last_open 返回的时间戳排序如无则为0
ts = last_open_times.get(assetid, {}).get('last_open', 0)
assetid2lastopen[assetid] = ts
# 按last_open时间戳降序排列
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
if not choices:
@@ -202,8 +172,6 @@ if __name__ == '__main__':
multiselect=False,
instruction="上下键选择,输入可模糊筛选,回车确定"
).execute()
# 解析选中assetid
for aid, name in assetid2name.items():
if answer.startswith(name):
selected_assetid = aid
@@ -211,10 +179,8 @@ if __name__ == '__main__':
else:
print("未找到选中书籍")
exit(1)
# 只导出选中书的笔记
selected_booksnote = build_booksnote(bookid=selected_assetid)
selected_booksnote = exporter.build_booksnote(bookid=selected_assetid)
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
out_path = f'export_notes/notes_export_{selected_assetid}.md'
export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
print(f'{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')