# iBook/backup/booksnote.py

"""
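Build the booksnote data structure. Notes are grouped per book (assetid),
then per chapter, then keyed by annotation uuid. The chapter key is the label
path found in the TOC, or "(未找到章节)" when no chapter matches:
booksnote = {
  assetid: {chapter: {uuid: {
      'chapter': label_path,
      'creationdate': '2023/7/12',
      'filepos': None,
      'idref': '008.xhtml',
      'note': None,
      'selectedtext': '這就是宣傳的恐怖之處'
      }}}
}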
"""
from collections import defaultdict
from annotationdata import get_annotations
from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path
import os
from bs4 import BeautifulSoup

def get_toc_tree(toc_path):
    """Parse an NCX table-of-contents file into a navigation tree.

    Args:
        toc_path: Path to the NCX file (for example ``toc.ncx``); it is read
            as UTF-8 text.

    Returns:
        Whatever ``parse_navpoints`` builds from the top-level ``navPoint``
        elements found directly under the document's ``navMap``.

    Raises:
        ValueError: If the document contains no ``navMap`` element.
    """
    with open(toc_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')
    nav_map = soup.find('navMap')
    if nav_map is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.find_all below.
        raise ValueError(f'No navMap element found in TOC file: {toc_path}')
    # recursive=False keeps only the direct children of navMap; nested
    # navPoints are presumably walked by parse_navpoints itself -- confirm.
    return parse_navpoints(nav_map.find_all('navPoint', recursive=False))

def build_booksnote(annotation_db='data/AEAnnotation.sqlite', opf_path=None, toc_path=None):
    """Group every annotation by book and by the chapter it belongs to.

    Args:
        annotation_db: Path handed to ``get_annotations`` to load the
            annotations.
        opf_path: Path to the book's OPF file. Required.
        toc_path: Path to the book's NCX table-of-contents file. Required.

    Returns:
        A nested ``defaultdict`` where ``booksnote[assetid][chapter][uuid]``
        is a dict with the keys ``'chapter'``, ``'creationdate'``,
        ``'filepos'``, ``'idref'``, ``'note'`` and ``'selectedtext'``.
        ``'idref'`` holds the href resolved through the OPF mapping, or the
        raw idref when the mapping has no entry for it.

    Raises:
        ValueError: If ``opf_path`` or ``toc_path`` is missing or empty.

    NOTE(review): the single OPF/TOC pair is applied to every assetid
    returned by ``get_annotations``; confirm the database only holds notes
    for that one book, or that mismatches are acceptable.
    """
    # Validate the required paths first so a bad call fails before any of
    # the annotation database, OPF or TOC files are read.
    if not opf_path or not toc_path:
        raise ValueError('必须提供OPF和TOC路径')

    annotations = get_annotations(annotation_db)
    # Parse the OPF and the TOC once; both are reused for every annotation.
    id2href = parse_opf(opf_path)
    toc_tree = get_toc_tree(toc_path)

    booksnote = defaultdict(lambda: defaultdict(dict))
    # Walk every note of every book.
    for assetid, notes in annotations.items():
        for uuid, ann in notes.items():
            idref = ann['idref']
            filepos = ann['filepos']
            # Fall back to the idref itself when the OPF has no entry for it.
            href = id2href.get(idref, idref)
            chapter = find_label_path(toc_tree, href, filepos)
            if chapter is None:
                # Placeholder bucket for notes with no matching TOC entry.
                chapter = "(未找到章节)"
            booksnote[assetid][chapter][uuid] = {
                'chapter': chapter,
                'creationdate': ann['creationdate'],
                'filepos': filepos,
                'idref': href,
                'note': ann['note'],
                'selectedtext': ann['selectedtext'],
            }
    return booksnote

if __name__ == '__main__':
    from pprint import pprint

    # Example run: point both paths at the real extracted EPUB before use.
    result = build_booksnote(
        opf_path='./examples/epub_format_3/OEBPS/content.opf',
        toc_path='./examples/epub_format_3/OEBPS/toc.ncx',
    )
    pprint(result, indent=2, width=120, sort_dicts=False)