"""annotationdata.py

Read notes and highlights from the iBooks ``AEAnnotation.sqlite`` database.

Features:
- Extract the annotations of every book, or of a single book selected by its
  asset id (``bookid``).
- Provide ``parse_location`` to pull the bracketed tokens out of an
  annotation's ``ZANNOTATIONLOCATION`` value.
- Return the annotations as nested dictionaries for later chapter lookup and
  export.

Public interface:
- ``get_annotations(db_path, bookid=None)``: return
  ``{assetid: {uuid: {...}}}`` for all books or for one asset id.
- ``parse_location(location)``: return ``(idref, filepos)``.

Dependencies: sqlite3, collections, contextlib, re, os, datetime, typing
"""
import datetime
import os
import re
import sqlite3
from collections import defaultdict
from contextlib import closing
from typing import Any, Dict, Optional, Tuple

# Reference point used for numeric creation dates (seconds since 2001-01-01).
_APPLE_EPOCH = datetime.datetime(2001, 1, 1)

# Non-greedy match of the text between one pair of square brackets.
_BRACKET_RE = re.compile(r'\[(.*?)\]')

# Columns are selected in the order in which get_annotations unpacks each row.
_SELECT_SQL = '''
    SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION,
           ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
    FROM ZAEANNOTATION
'''


def parse_location(location: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Extract ``(idref, filepos)`` from a ``ZANNOTATIONLOCATION`` value.

    The contents of the first ``[...]`` group become ``idref`` and the
    contents of the second ``[...]`` group become ``filepos``. Either element
    is ``None`` when the corresponding group is absent, and both are ``None``
    for an empty or missing location.

    >>> parse_location('epubcfi(/6/22[id15]!/4/156/1,:21,:157)')
    ('id15', None)
    >>> parse_location('epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8')
    ('id509', '4MLOS0-27b363c65bfe41ad8429f530566a2737')
    >>> parse_location(None)
    (None, None)
    """
    if not location:
        return None, None
    matches = _BRACKET_RE.findall(location)
    idref = matches[0] if matches else None
    filepos = matches[1] if len(matches) > 1 else None
    return idref, filepos


def _format_creation_date(creationdate: Any) -> Any:
    """Convert a raw ``ZANNOTATIONCREATIONDATE`` value to ``'Y/M/D'``.

    Numbers and numeric strings are treated as seconds since 2001-01-01.
    Any other value is parsed as a date whose first ten characters are
    ``YYYY-MM-DD``. Empty non-numeric values (``None``, ``''``) are returned
    unchanged, and values that cannot be converted are returned as ``str``.
    """
    is_number = isinstance(creationdate, (int, float))
    # Only an empty *non-numeric* value means "no date"; the number 0 is a
    # valid timestamp (2001-01-01) and must still be converted.
    if not is_number and not creationdate:
        return creationdate
    try:
        if is_number:
            moment = _APPLE_EPOCH + datetime.timedelta(seconds=creationdate)
        elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
            moment = _APPLE_EPOCH + datetime.timedelta(seconds=float(creationdate))
        else:
            moment = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
    except (TypeError, ValueError, OverflowError):
        # Best effort: keep whatever the database held, as text.
        return str(creationdate)
    return f"{moment.year}/{moment.month}/{moment.day}"


def get_annotations(db_path='./data/AEAnnotation.sqlite', bookid=None) -> Dict[str, Dict[str, Dict[str, Any]]]:
    """Load annotations from the database, grouped by asset id and UUID.

    Args:
        db_path: Path of the ``AEAnnotation.sqlite`` database file.
        bookid: Optional asset id. When given, only rows whose
            ``ZANNOTATIONASSETID`` equals it are read.

    Returns:
        ``{assetid: {uuid: {'creationdate', 'filepos', 'idref', 'note',
        'selectedtext'}}}``. Rows whose note and selected text are both
        ``None`` are skipped. With ``bookid`` set, the result always has
        exactly one key, ``str(bookid)``, mapped to an empty dict when the
        book has no annotations. Without ``bookid`` the result is a
        ``defaultdict(dict)``.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # SQLite names its write-ahead-log sidecars "<database file>-wal" and
    # "<database file>-shm", so the suffix is appended to the full file name.
    db_file = os.fspath(db_path)
    for path in (db_path, db_file + '-wal', db_file + '-shm'):
        if not os.path.exists(path):
            print(f'警告: 缺少 {path},可能无法获取全部最新笔记')

    query = _SELECT_SQL
    params = ()
    if bookid is not None:
        query += ' WHERE ZANNOTATIONASSETID=?'
        params = (bookid,)

    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(query, params).fetchall()

    annotations = defaultdict(dict)
    for assetid, creationdate, location, note, selectedtext, uuid in rows:
        # Skip rows that carry neither a note nor highlighted text.
        if note is None and selectedtext is None:
            continue
        idref, filepos = parse_location(location)
        annotations[str(assetid)][uuid] = {
            'creationdate': _format_creation_date(creationdate),
            'filepos': filepos,
            'idref': idref,
            'note': note,
            'selectedtext': selectedtext,
        }

    if bookid is not None:
        # Return only the requested book's entry, even when it is empty.
        return {str(bookid): annotations.get(str(bookid), {})}
    return annotations


# Usage example: print every annotation of one specific asset id.
if __name__ == "__main__":
    from pprint import pprint

    test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    annotations = get_annotations(bookid=test_bookid)
    print(f"\nAssetID={test_bookid} 的所有笔记:")
    pprint(annotations, indent=2, sort_dicts=False)