Initial commit

2025-08-06 13:11:08 +08:00
commit 2f2f98cea1
723 changed files with 69242 additions and 0 deletions
--- a/exportbooknotes.py
+++ b/exportbooknotes.py
@@ -0,0 +1,180 @@
+"""
+自动生成 booksnote 数据结构：
+booksnote = {
+  assetid: { label_path: { uuid: {
+      'creationdate': '2023/7/12',
+      'filepos': None,
+      'idref': '008.xhtml',
+      'note': None,
+      'selectedtext': '這就是宣傳的恐怖之處'
+      }}}
+}
+"""
+from collections import defaultdict
+import os
+from annotationdata import get_annotations
+from booklist_parse import parse_books_plist
+from opf_parse import parse_opf
+from toc_parse import parse_navpoints, find_label_path
+from bs4 import BeautifulSoup
+
+def find_file_by_ext(root, exts):
+    """Return the first file under ``root`` whose name ends with one of ``exts``.
+
+    The tree is walked with ``os.walk`` and the search stops at the first hit,
+    so which file wins when several match depends on the walk order.
+
+    Args:
+        root: Directory that is searched recursively.
+        exts: Iterable of suffixes such as ``['.opf']``. Matching ignores case
+            on both the file name and the suffix.
+
+    Returns:
+        The path of the first matching file (``dirpath`` joined with the file
+        name), or ``None`` when nothing matches.
+    """
+    # File names are lower-cased before the comparison, so the suffixes have
+    # to be lower-cased as well; otherwise a suffix like '.OPF' never matches.
+    suffixes = tuple(ext.lower() for ext in exts)
+    if not suffixes:
+        return None
+    for dirpath, _, files in os.walk(root):
+        for fname in files:
+            # str.endswith accepts a tuple and tests every suffix in one call.
+            if fname.lower().endswith(suffixes):
+                return os.path.join(dirpath, fname)
+    return None
+
+def get_toc_tree(toc_path):
+    """Parse the NCX file at ``toc_path`` into a table-of-contents tree.
+
+    The file is read as UTF-8 and parsed as XML. The direct ``navPoint``
+    children of its ``navMap`` element are handed to ``parse_navpoints`` and
+    that function's result is returned unchanged.
+    """
+    with open(toc_path, 'r', encoding='utf-8') as ncx_file:
+        document = BeautifulSoup(ncx_file, 'xml')
+    # Only the top-level entries are collected here (recursive=False).
+    top_level_points = document.find('navMap').find_all('navPoint', recursive=False)
+    return parse_navpoints(top_level_points)
+
+def _chapter_from_html(search_roots, href, selectedtext):
+    """Derive a chapter label from the content file itself, or a placeholder."""
+    placeholder = "(未找到章节)"
+    if not href or not selectedtext:
+        return placeholder
+    relative = href.split('#')[0]  # drop the fragment, keep the file part
+    for base in search_roots:
+        html_path = os.path.join(base, relative)
+        if os.path.exists(html_path):
+            # Imported lazily: only needed when the TOC lookup failed.
+            from toc_parse import find_section_by_selectedtext
+            return find_section_by_selectedtext(html_path, selectedtext) or placeholder
+    return placeholder
+
+
+def build_booksnote(annotation_db='data/AEAnnotation.sqlite', books_plist='data/Books.plist'):
+    """Group all annotations by book and by chapter.
+
+    Args:
+        annotation_db: Path handed to ``get_annotations``.
+        books_plist: Path handed to ``parse_books_plist``.
+
+    Returns:
+        A nested ``defaultdict`` where ``booksnote[assetid][chapter][uuid]`` is
+        a dict with the keys ``creationdate``, ``filepos``, ``idref``, ``note``
+        and ``selectedtext``. ``idref`` holds the href resolved through the OPF
+        mapping (or the raw idref when the mapping has no entry for it).
+
+    A book is skipped when it has no entry in the plist, when its ``path`` is
+    not an existing directory, or when no ``.opf`` / ``.ncx`` file is found
+    under that directory.
+    """
+    annotations = get_annotations(annotation_db)
+    booksinfo = parse_books_plist(books_plist)
+    booksnote = defaultdict(lambda: defaultdict(dict))
+    for assetid, notes in annotations.items():
+        # Locate the unpacked EPUB directory of this book.
+        bookinfo = booksinfo.get(assetid)
+        if not bookinfo:
+            continue
+        epub_path = bookinfo.get('path')
+        if not epub_path or not os.path.isdir(epub_path):
+            continue
+        # Both the OPF (id -> href) and the NCX (table of contents) are needed.
+        opf_path = find_file_by_ext(epub_path, ['.opf'])
+        ncx_path = find_file_by_ext(epub_path, ['.ncx'])
+        if not opf_path or not ncx_path:
+            continue
+        id2href = parse_opf(opf_path)
+        toc_tree = get_toc_tree(ncx_path)
+        # Content files are looked up under the EPUB root first (the original
+        # behaviour) and then next to the OPF file: manifest hrefs are normally
+        # relative to the OPF document, which often sits in a sub-directory.
+        # NOTE(review): assumes parse_opf returns hrefs as written in the
+        # manifest - confirm against opf_parse.
+        search_roots = [epub_path]
+        opf_dir = os.path.dirname(opf_path)
+        if opf_dir and opf_dir != epub_path:
+            search_roots.append(opf_dir)
+        for uuid, ann in notes.items():
+            idref = ann['idref']
+            filepos = ann['filepos']
+            href = id2href.get(idref, idref)
+            chapter = find_label_path(toc_tree, href, filepos)
+            if chapter is None:
+                # The TOC has no entry: fall back to scanning the HTML file.
+                chapter = _chapter_from_html(search_roots, href, ann.get('selectedtext'))
+            booksnote[assetid][chapter][uuid] = {
+                'creationdate': ann['creationdate'],
+                'filepos': filepos,
+                'idref': href,
+                'note': ann['note'],
+                'selectedtext': ann['selectedtext']
+            }
+    return booksnote
+
+import datetime
+
+def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
+    """Render ``booksnote`` as Markdown and optionally write it to a file.
+
+    Layout of the generated document:
+        # 笔记导出 <export time>
+        ## <book name>
+        ### <chapter>
+        <selected text>
+        > <note>      (only when a note exists)
+
+    Args:
+        booksnote: Mapping ``assetid -> chapter -> uuid -> annotation dict``;
+            only the ``selectedtext`` and ``note`` keys of each annotation
+            are read.
+        booksinfo: Mapping ``assetid -> info dict``; ``itemname`` is used as
+            the book heading, falling back to the assetid when it is missing
+            or empty.
+        out_path: When truthy, the Markdown is also written to this path as
+            UTF-8; missing parent directories are created.
+
+    Returns:
+        The Markdown text.
+    """
+    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+    lines = [f'# 笔记导出 {now}\n']
+    for assetid, chapters in booksnote.items():
+        # `or` instead of a .get default: a stored None/'' must not become the title.
+        bookname = (booksinfo.get(assetid) or {}).get('itemname') or assetid
+        lines.append(f'\n## {bookname}\n')
+        for chapter, notes in chapters.items():
+            lines.append(f'### {chapter}')
+            for ann in notes.values():
+                sel = ann.get('selectedtext')
+                note = ann.get('note')
+                if sel:
+                    lines.append(sel)
+                if note:
+                    # Quote every line of the note; prefixing only the first
+                    # one lets the remaining lines fall out of the blockquote.
+                    for part in str(note).splitlines():
+                        lines.append(f'> {part}' if part else '>')
+                lines.append('')
+    md = '\n'.join(lines)
+    if out_path:
+        parent = os.path.dirname(out_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(out_path, 'w', encoding='utf-8') as f:
+            f.write(md)
+    return md
+
+
+if __name__ == '__main__':
+    # Interactive entry point: refresh the local data copies, let the user
+    # pick one book and export that book's notes to a Markdown file.
+    import re
+    import shutil
+    from InquirerPy import inquirer
+
+    def _date_sort_key(value):
+        """Turn a creation date into a tuple of ints so dates order numerically."""
+        # Plain string comparison mis-orders non-padded dates such as
+        # '2023/7/12' vs '2023/10/1'; comparing the numeric fields does not.
+        return tuple(int(part) for part in re.findall(r'\d+', str(value)))
+
+    # Overwrite the database and plist copies under ./data with the live
+    # Apple Books files.
+    os.makedirs('data', exist_ok=True)
+    src_files = [
+        (os.path.expanduser('~/Library/Containers/com.apple.iBooksX/Data/Documents/AEAnnotation/AEAnnotation_v10312011_1727_local.sqlite'), 'data/AEAnnotation.sqlite'),
+        (os.path.expanduser('~/Library/Containers/com.apple.iBooksX/Data/Documents/AEAnnotation/AEAnnotation_v10312011_1727_local.sqlite-shm'), 'data/AEAnnotation.sqlite-shm'),
+        (os.path.expanduser('~/Library/Containers/com.apple.iBooksX/Data/Documents/AEAnnotation/AEAnnotation_v10312011_1727_local.sqlite-wal'), 'data/AEAnnotation.sqlite-wal'),
+        (os.path.expanduser('~/Library/Containers/com.apple.iBooksX/Data/Documents/BKLibrary/BKLibrary-1-091020131601.sqlite'), 'data/BKLibrary.sqlite'),
+        (os.path.expanduser('~/Library/Containers/com.apple.BKAgentService/Data/Documents/iBooks/Books/Books.plist'), 'data/Books.plist')
+    ]
+    for src, dst in src_files:
+        if os.path.exists(src):
+            shutil.copy2(src, dst)
+            print(f'copy source data file to ./data : {dst}')
+        else:
+            print(f'file not found: {src} ')
+
+    booksnote = build_booksnote()
+    booksinfo = parse_books_plist('data/Books.plist')
+    # Display name per book (displayname, then itemname, then assetid) and the
+    # most recent note date per book, used for ordering the list.
+    assetid2name = {}
+    assetid2latest = {}
+    for assetid, chapters in booksnote.items():
+        info = booksinfo.get(assetid, {})
+        name = info.get('displayname') or info.get('itemname') or assetid
+        # Keep only the part before the first '-', unless that would be empty.
+        if '-' in name:
+            name = name.split('-', 1)[0].strip() or name
+        assetid2name[assetid] = name
+        assetid2latest[assetid] = max(
+            (
+                _date_sort_key(ann.get('creationdate'))
+                for notes in chapters.values()
+                for ann in notes.values()
+                if ann.get('creationdate')
+            ),
+            default=(),  # books without any dated note sort last
+        )
+    # Newest first.
+    sorted_assetids = sorted(assetid2name, key=assetid2latest.get, reverse=True)
+    # Each label embeds the assetid, so it is unique and maps back to exactly
+    # one book. Prefix matching on the name would pick the wrong book when one
+    # title is a prefix of another.
+    choice2assetid = {f"{assetid2name[aid]} [{aid}]": aid for aid in sorted_assetids}
+    choices = list(choice2assetid)
+    if not choices:
+        print("无可导出的笔记")
+        raise SystemExit(0)
+    answer = inquirer.fuzzy(
+        message="请选择要导出的书名（支持模糊搜索）:",
+        choices=choices,
+        multiselect=False,
+        instruction="上下键选择，输入可模糊筛选，回车确定"
+    ).execute()
+    selected_assetid = choice2assetid.get(answer)
+    if selected_assetid is None:
+        print("未找到选中书籍")
+        raise SystemExit(1)
+    # Export only the selected book.
+    selected_booksnote = {selected_assetid: booksnote[selected_assetid]}
+    selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
+    os.makedirs('export_notes', exist_ok=True)
+    out_path = f'export_notes/notes_export_{selected_assetid}.md'
+    export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
+    print(f'《{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')