Initial commit

2025-08-06 13:11:08 +08:00
commit 2f2f98cea1
723 changed files with 69242 additions and 0 deletions
--- a/backup/booksnote.py
+++ b/backup/booksnote.py
@@ -0,0 +1,60 @@
+"""
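+Build the booksnote data structure (book -> chapter -> annotation):
+booksnote = {
+  assetid: {chapter: {uuid: {
+      'chapter': label_path,
+      'creationdate': '2023/7/12',
+      'filepos': None,
+      'idref': '008.xhtml',
+      'note': None,
+      'selectedtext': '這就是宣傳的恐怖之處'
+      }}}
+}
+Each 'chapter' is the TOC label path, or "(未找到章节)" when none matches;
+'idref' holds the href that parse_opf maps the annotation's idref to
+(the raw idref when there is no mapping).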
+"""
+from collections import defaultdict
+from annotationdata import get_annotations
+from opf_parse import parse_opf
+from toc_parse import parse_navpoints, find_label_path
+import os
+from bs4 import BeautifulSoup
+
+def get_toc_tree(toc_path):
+    """Parse an NCX table-of-contents file into a navigation tree.
+
+    Args:
+        toc_path: Path to the NCX file; it is read as UTF-8 text.
+
+    Returns:
+        Whatever ``parse_navpoints`` builds from the direct ``navPoint``
+        children of the document's ``navMap`` element.
+
+    Raises:
+        ValueError: If the document contains no ``navMap`` element.
+    """
+    with open(toc_path, 'r', encoding='utf-8') as f:
+        soup = BeautifulSoup(f, 'xml')
+    nav_map = soup.find('navMap')
+    if nav_map is None:
+        # find() returns None when nothing matches; fail with a clear message
+        # instead of an AttributeError on the find_all call below.
+        raise ValueError(f'no navMap element found in {toc_path!r}')
+    # recursive=False keeps only the top-level entries; nested navPoints are
+    # left for parse_navpoints to handle.
+    return parse_navpoints(nav_map.find_all('navPoint', recursive=False))
+
+def build_booksnote(annotation_db='data/AEAnnotation.sqlite', opf_path=None, toc_path=None):
+    """Group annotations by book and by chapter.
+
+    Args:
+        annotation_db: Path passed to ``get_annotations``.
+        opf_path: Path passed to ``parse_opf``; required.
+        toc_path: Path passed to ``get_toc_tree``; required.
+
+    Returns:
+        A nested ``defaultdict`` indexed as ``booksnote[assetid][chapter][uuid]``.
+        Each leaf is a dict with the keys ``chapter``, ``creationdate``,
+        ``filepos``, ``idref``, ``note`` and ``selectedtext``.
+
+    Raises:
+        ValueError: If ``opf_path`` or ``toc_path`` is missing or empty.
+    """
+    # Validate the arguments first so a bad call fails before any file or
+    # database is read.
+    if not opf_path or not toc_path:
+        raise ValueError('必须提供OPF和TOC路径')
+    annotations = get_annotations(annotation_db)
+    id2href = parse_opf(opf_path)
+    toc_tree = get_toc_tree(toc_path)
+    booksnote = defaultdict(lambda: defaultdict(dict))
+    for assetid, notes in annotations.items():
+        for uuid, ann in notes.items():
+            idref = ann['idref']
+            filepos = ann['filepos']
+            # Fall back to the raw idref when parse_opf has no mapping for it.
+            href = id2href.get(idref, idref)
+            chapter = find_label_path(toc_tree, href, filepos)
+            if chapter is None:
+                chapter = "(未找到章节)"
+            # NOTE(review): chapter is used as a dict key, so find_label_path
+            # must return a hashable value - confirm against toc_parse.
+            booksnote[assetid][chapter][uuid] = {
+                'chapter': chapter,
+                'creationdate': ann['creationdate'],
+                'filepos': filepos,
+                # Stores the resolved href, not the original idref.
+                'idref': href,
+                'note': ann['note'],
+                'selectedtext': ann['selectedtext']
+            }
+    return booksnote
+
+if __name__ == '__main__':
+    from pprint import pprint
+
+    # Example input; point these at the OPF and NCX files of a real EPUB.
+    example_paths = {
+        'opf_path': './examples/epub_format_3/OEBPS/content.opf',
+        'toc_path': './examples/epub_format_3/OEBPS/toc.ncx',
+    }
+    pprint(build_booksnote(**example_paths), indent=2, width=120, sort_dicts=False)