60 lines
2.1 KiB
Python
60 lines
2.1 KiB
Python
"""
Build the booksnote data structure.

Records are grouped first by book (assetid), then by chapter label, then by
annotation uuid:

booksnote = {
    assetid: {chapter: {uuid: {
        'chapter': label_path,
        'creationdate': '2023/7/12',
        'filepos': None,
        'idref': '008.xhtml',
        'note': None,
        'selectedtext': '這就是宣傳的恐怖之處'
    }}}
}
"""
|
||
from collections import defaultdict
|
||
from annotationdata import get_annotations
|
||
from opf_parse import parse_opf
|
||
from toc_parse import parse_navpoints, find_label_path
|
||
from bs4 import BeautifulSoup
|
||
|
||
def get_toc_tree(toc_path):
    """Parse an NCX table-of-contents file into a navigation tree.

    Args:
        toc_path: Path of the TOC file; it is opened as UTF-8 text and parsed
            as XML.

    Returns:
        Whatever ``parse_navpoints`` builds from the ``navPoint`` elements
        that are direct children of the document's ``navMap`` element.

    Raises:
        ValueError: If the document contains no ``navMap`` element.
    """
    with open(toc_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')

    nav_map = soup.find('navMap')
    if nav_map is None:
        # find() yields None when nothing matches; fail with a clear message
        # instead of an AttributeError on NoneType further down.
        raise ValueError(f'TOC文件中未找到navMap: {toc_path}')

    # recursive=False keeps only the top-level entries; nested navPoints are
    # presumably walked by parse_navpoints itself (confirm in toc_parse).
    return parse_navpoints(nav_map.find_all('navPoint', recursive=False))
|
||
|
||
def build_booksnote(annotation_db='data/AEAnnotation.sqlite', opf_path=None, toc_path=None):
    """Group annotations by book and by table-of-contents chapter.

    Args:
        annotation_db: Path handed to ``get_annotations``.
        opf_path: Path of the OPF file handed to ``parse_opf``. Required.
        toc_path: Path of the TOC file handed to ``get_toc_tree``. Required.

    Returns:
        A nested ``defaultdict`` shaped ``{assetid: {chapter: {uuid: record}}}``.
        Each record is a dict with the keys ``chapter``, ``creationdate``,
        ``filepos``, ``idref``, ``note`` and ``selectedtext``. Annotations for
        which ``find_label_path`` returns ``None`` are filed under the
        placeholder chapter ``"(未找到章节)"``.

    Raises:
        ValueError: If ``opf_path`` or ``toc_path`` is missing or empty.
    """
    # Validate the required paths first so a bad call fails fast, before the
    # annotation database is opened.
    if not opf_path or not toc_path:
        raise ValueError('必须提供OPF和TOC路径')

    annotations = get_annotations(annotation_db)

    # Parse the OPF manifest and the TOC.
    id2href = parse_opf(opf_path)
    toc_tree = get_toc_tree(toc_path)

    booksnote = defaultdict(lambda: defaultdict(dict))

    # Walk every annotation of every book.
    for assetid, notes in annotations.items():
        for uuid, ann in notes.items():
            idref = ann['idref']
            filepos = ann['filepos']
            # Fall back to the raw idref when the OPF mapping has no entry.
            href = id2href.get(idref, idref)
            chapter = find_label_path(toc_tree, href, filepos)
            if chapter is None:
                chapter = "(未找到章节)"
            booksnote[assetid][chapter][uuid] = {
                'chapter': chapter,
                'creationdate': ann['creationdate'],
                'filepos': filepos,
                # The resolved href is stored under the 'idref' key.
                'idref': href,
                'note': ann['note'],
                'selectedtext': ann['selectedtext'],
            }
    return booksnote
|
||
|
||
if __name__ == '__main__':
    from pprint import pprint

    # Example only: point these at the actual unpacked EPUB files.
    demo_paths = {
        'opf_path': './examples/epub_format_3/OEBPS/content.opf',
        'toc_path': './examples/epub_format_3/OEBPS/toc.ncx',
    }
    pprint(build_booksnote(**demo_paths), indent=2, width=120, sort_dicts=False)
|