iBook/exportbooknotes.py

221 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
exportbooknotes.py
------------------
功能:
- 自动同步iBooks数据库和元数据文件到本地data目录。
- 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite构建结构化笔记数据。
- 解析epub目录和章节信息定位每条笔记所属章节。
- 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。
- 仅导出选中书籍的所有笔记按章节分组生成Markdown文件。
依赖config.py 统一管理路径和配置项。
主要数据流:
1. 数据同步到data目录
2. 解析Books.plist获取书籍元数据
3. 解析BKLibrary.sqlite获取最近打开时间
4. 菜单排序与显示(书名+时间戳)
5. 解析AEAnnotation.sqlite获取笔记
6. 解析epub目录定位章节
7. 导出Markdown文件
依赖Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
主要数据流:
典型用法:
python exportbooknotes.py
# 按提示选择书籍自动导出笔记到export_notes目录
"""
import config
"""
Auto-generated booksnote data structure:
booksnote = {
assetid: { label_path: { uuid: {
'creationdate': '2023/7/12',
'filepos': None,
'idref': '008.xhtml',
'note': None,
'selectedtext': '這就是宣傳的恐怖之處'
}}}
}
"""
from collections import defaultdict
import os
from annotationdata import get_annotations
from booklist_parse import parse_books_plist
from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path
from bs4 import BeautifulSoup
from pprint import pprint
def find_file_by_ext(root, exts):
    """Return the first file under ``root`` whose name ends with one of ``exts``.

    The tree is traversed with ``os.walk``. File names are lower-cased before
    the suffix comparison, so the entries of ``exts`` should be lower case.

    Returns:
        The joined path of the first match, or ``None`` when nothing matches.
    """
    suffixes = tuple(exts)
    for folder, _subdirs, filenames in os.walk(root):
        for filename in filenames:
            if filename.lower().endswith(suffixes):
                return os.path.join(folder, filename)
    return None
def get_toc_tree(toc_path):
    """Parse an NCX file and return the tree built by ``parse_navpoints``.

    Args:
        toc_path: Path to the ``.ncx`` file; it is read as UTF-8 text.

    Returns:
        The result of ``parse_navpoints`` applied to the direct ``navPoint``
        children of the document's ``navMap`` element. When the document has
        no ``navMap`` element, ``parse_navpoints`` receives an empty list,
        i.e. the file is treated like one whose ``navMap`` has no entries.
    """
    with open(toc_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')
    nav_map = soup.find('navMap')
    if nav_map is None:
        # soup.find returns None when the element is absent; calling
        # find_all on it would raise AttributeError.
        nav_points = []
    else:
        # Only top-level entries here; nested navPoints are left to
        # parse_navpoints.
        nav_points = nav_map.find_all('navPoint', recursive=False)
    return parse_navpoints(nav_points)
def _chapter_from_html(epub_path, opf_path, href, selectedtext):
    """Fallback chapter lookup that searches the annotated HTML file itself.

    Returns the (truthy) result of ``find_section_by_selectedtext`` for the
    first existing candidate file, or ``None`` when the file cannot be
    located, there is no selected text, or the helper finds nothing.
    """
    if not href or not selectedtext:
        return None
    rel_path = href.split('#')[0]
    if not rel_path:
        # href was only a fragment; there is no file to open.
        return None
    # Try the EPUB root first, then the directory containing the OPF file:
    # manifest hrefs are normally relative to the package document.
    # NOTE(review): whether parse_opf already rebases hrefs is not visible
    # here, hence both locations are tried.
    base_dirs = [epub_path]
    opf_dir = os.path.dirname(opf_path)
    if opf_dir and opf_dir != epub_path:
        base_dirs.append(opf_dir)
    for base_dir in base_dirs:
        html_path = os.path.join(base_dir, rel_path)
        if os.path.isfile(html_path):
            # Imported lazily so the dependency is only needed on this path.
            from toc_parse import find_section_by_selectedtext
            return find_section_by_selectedtext(html_path, selectedtext) or None
    return None


def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None):
    """Group annotations by book and chapter.

    Args:
        annotation_db: Path passed to ``get_annotations``.
        books_plist: Path passed to ``parse_books_plist``.
        bookid: Forwarded to ``get_annotations`` to restrict the result to a
            single asset id; ``None`` means no restriction is requested.

    Returns:
        A nested ``defaultdict`` of the form
        ``{assetid: {chapter: {uuid: {...annotation fields...}}}}``. Each leaf
        holds ``creationdate``, ``filepos``, ``idref`` (the resolved href),
        ``note`` and ``selectedtext``. Books without metadata, without an
        unpacked EPUB directory, or without both an ``.opf`` and an ``.ncx``
        file are skipped.
    """
    annotations = get_annotations(annotation_db, bookid=bookid)
    booksinfo = parse_books_plist(books_plist)
    booksnote = defaultdict(lambda: defaultdict(dict))
    for assetid, notes in annotations.items():
        bookinfo = booksinfo.get(assetid)
        if not bookinfo:
            continue
        # Only EPUBs stored as a directory can be inspected.
        epub_path = bookinfo.get('path')
        if not epub_path or not os.path.isdir(epub_path):
            continue
        opf_path = find_file_by_ext(epub_path, ['.opf'])
        ncx_path = find_file_by_ext(epub_path, ['.ncx'])
        if not opf_path or not ncx_path:
            continue
        id2href = parse_opf(opf_path)
        toc_tree = get_toc_tree(ncx_path)
        for uuid, ann in notes.items():
            idref = ann['idref']
            filepos = ann['filepos']
            # Fall back to the raw idref when the manifest has no entry.
            href = id2href.get(idref, idref)
            chapter = find_label_path(toc_tree, href, filepos)
            if chapter is None:
                # Not found in the NCX: try the HTML file directly.
                chapter = _chapter_from_html(
                    epub_path, opf_path, href, ann.get('selectedtext')
                ) or "(未找到章节)"
            booksnote[assetid][chapter][uuid] = {
                'creationdate': ann['creationdate'],
                'filepos': filepos,
                'idref': href,
                'note': ann['note'],
                'selectedtext': ann['selectedtext'],
            }
    return booksnote
import datetime
def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
    """Render a ``booksnote`` structure as Markdown and optionally write it.

    Output layout::

        # 笔记导出 <export time>
        ## <book name>
        ### <chapter>
        <selected text>
        > <note>            (only when a note exists)

    Args:
        booksnote: Mapping ``{assetid: {chapter: {uuid: annotation}}}`` where
            each annotation is a dict read via ``selectedtext`` and ``note``.
        booksinfo: Mapping ``{assetid: {...}}``; the ``itemname`` entry is
            used as the book title, falling back to the asset id when it is
            missing or empty.
        out_path: When truthy, the Markdown is also written to this path as
            UTF-8. Missing parent directories are created.

    Returns:
        The generated Markdown text.
    """
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    lines = [f'# 笔记导出 {now}\n']
    for assetid, chapters in booksnote.items():
        # "or" guards against an itemname that is present but None/empty.
        bookname = (booksinfo.get(assetid) or {}).get('itemname') or assetid
        lines.append(f'\n## {bookname}\n')
        for chapter, notes in chapters.items():
            lines.append(f'### {chapter}')
            for ann in notes.values():
                sel = ann.get('selectedtext')
                note = ann.get('note')
                if sel:
                    lines.append(sel)
                if note:
                    lines.append(f'> {note}')
                # Blank line separates consecutive annotations.
                lines.append('')
    md = '\n'.join(lines)
    if out_path:
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # open() does not create intermediate directories.
            os.makedirs(out_dir, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(md)
    return md
def main():
    """Interactive entry point: sync data, let the user pick a book, export it.

    Steps:
        1. Copy the iBooks database / plist files listed in ``config`` into
           their local counterparts (missing sources are reported, not fatal).
        2. Build a menu of all books from ``Books.plist``, ordered by the
           ``last_open`` value from ``get_books_last_open`` (newest first).
        3. Export the notes of the selected book to
           ``export_notes/notes_export_<assetid>.md``.

    Raises:
        SystemExit: with code 0 when there is nothing to choose from, and
            with code 1 when the selection cannot be mapped to a book.
    """
    import shutil

    from booklist_parse import get_books_last_open
    from InquirerPy import inquirer  # type: ignore

    # Overwrite the local copies with the files from the real iBooks location.
    src_files = [
        (config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
        (config.IBOOKS_ANNOTATION_SHM, config.LOCAL_ANNOTATION_SHM),
        (config.IBOOKS_ANNOTATION_WAL, config.LOCAL_ANNOTATION_WAL),
        (config.IBOOKS_LIBRARY_DB, config.LOCAL_LIBRARY_DB),
        (config.IBOOKS_BOOKS_PLIST, config.LOCAL_BOOKS_PLIST),
    ]
    for src, dst in src_files:
        if os.path.exists(src):
            dst_dir = os.path.dirname(dst)
            if dst_dir:
                # copy2 fails if the destination directory is missing.
                os.makedirs(dst_dir, exist_ok=True)
            shutil.copy2(src, dst)
            print(f'copy source data file to ./data : {dst}')
        else:
            print(f'file not found: {src} ')

    booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
    # Read as {assetid: {'last_open': timestamp}} by the lookups below.
    last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)

    assetid2name = {}
    assetid2lastopen = {}
    for assetid, info in booksinfo.items():
        name = info.get('displayname') or info.get('itemname') or assetid
        # Keep only the part before the first "-", unless that part is empty.
        if '-' in name:
            name = name.split('-', 1)[0].strip() or name
        assetid2name[assetid] = name
        # "or 0" also covers a stored None so sorting never mixes types.
        assetid2lastopen[assetid] = (last_open_times.get(assetid) or {}).get('last_open') or 0

    # Most recently opened first.
    sorted_assetids = sorted(assetid2name, key=assetid2lastopen.__getitem__, reverse=True)
    # Attach the asset id as the choice value so the selection is unambiguous
    # even when titles are duplicated or one title is a prefix of another.
    choices = [
        {"name": f"{assetid2name[aid]} [{assetid2lastopen[aid]}]", "value": aid}
        for aid in sorted_assetids
    ]
    if not choices:
        print("无可导出的笔记")
        raise SystemExit(0)

    selected_assetid = inquirer.fuzzy(
        message="请选择要导出的书名(支持模糊搜索):",
        choices=choices,
        multiselect=False,
        instruction="上下键选择,输入可模糊筛选,回车确定",
    ).execute()
    if selected_assetid not in booksinfo:
        print("未找到选中书籍")
        raise SystemExit(1)

    # Export only the notes of the selected book.
    selected_booksnote = build_booksnote(bookid=selected_assetid)
    selected_info = booksinfo.get(selected_assetid, {})
    selected_booksinfo = {selected_assetid: selected_info}
    os.makedirs('export_notes', exist_ok=True)
    out_path = f'export_notes/notes_export_{selected_assetid}.md'
    export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
    title = selected_info.get("displayname") or selected_info.get("itemname") or selected_assetid
    print(f'《{title}》 导出笔记 {out_path}')


if __name__ == '__main__':
    main()