""" 生成 booksnote 数据结构: booksnote = { assetid: {uuid: { 'chapter': label_path, 'creationdate': '2023/7/12', 'filepos': None, 'idref': '008.xhtml', 'note': None, 'selectedtext': '這就是宣傳的恐怖之處' }} } """ from collections import defaultdict from annotationdata import get_annotations from opf_parse import parse_opf from toc_parse import parse_navpoints, find_label_path import os from bs4 import BeautifulSoup def get_toc_tree(toc_path): with open(toc_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'xml') nav_map = soup.find('navMap') return parse_navpoints(nav_map.find_all('navPoint', recursive=False)) def build_booksnote(annotation_db='data/AEAnnotation.sqlite', opf_path=None, toc_path=None): annotations = get_annotations(annotation_db) booksnote = defaultdict(lambda: defaultdict(dict)) # 解析OPF和TOC if not opf_path or not toc_path: raise ValueError('必须提供OPF和TOC路径') id2href = parse_opf(opf_path) toc_tree = get_toc_tree(toc_path) # 遍历所有笔记 for assetid, notes in annotations.items(): for uuid, ann in notes.items(): idref = ann['idref'] filepos = ann['filepos'] href = id2href.get(idref, idref) chapter = find_label_path(toc_tree, href, filepos) if chapter is None: chapter = "(未找到章节)" booksnote[assetid][chapter][uuid] = { 'chapter': chapter, 'creationdate': ann['creationdate'], 'filepos': filepos, 'idref': href, 'note': ann['note'], 'selectedtext': ann['selectedtext'] } return booksnote if __name__ == '__main__': # 示例:请根据实际epub路径填写 opf_path = './examples/epub_format_3/OEBPS/content.opf' toc_path = './examples/epub_format_3/OEBPS/toc.ncx' booksnote = build_booksnote(opf_path=opf_path, toc_path=toc_path) from pprint import pprint pprint(booksnote, indent=2, width=120, sort_dicts=False)