# iBook/annotationdata.py


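"""
annotationdata.py (OOP version)
-------------------------------
Features:
    - Parse the iBooks AEAnnotation.sqlite database and extract the notes of
      all books or of one specific book (assetid/bookid).
    - Provide the parse_location helper, which parses a note's location info.
    - Return structured annotation data for later chapter lookup and export.

Project dependency: config.py, which centralizes paths and configuration.
Main interface: AnnotationManager
    - get_annotations(bookid=None): return the notes of all books or of the
      given assetid, shaped as {assetid: {uuid: {...}}}
    - parse_location(location): parse ZANNOTATIONLOCATION and return
      (idref, filepos)
Standard-library dependencies include: sqlite3, collections, re, os, datetime
"""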
import datetime
import os
import re
import sqlite3
from collections import defaultdict
from contextlib import closing

import config

class AnnotationManager:
    """Read highlights and notes from an iBooks AEAnnotation SQLite database.

    Attributes:
        db_path: Path of the SQLite file that is queried. Falls back to
            ``config.LOCAL_ANNOTATION_DB`` when no path is supplied.
    """

    # Matches the text enclosed in each pair of square brackets.
    _BRACKETED = re.compile(r'\[(.*?)\]')

    # Column order must match the row unpacking in get_annotations.
    _SELECT_SQL = (
        'SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, '
        'ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, '
        'ZANNOTATIONUUID FROM ZAEANNOTATION'
    )

    # Numeric creation dates are seconds counted from 2001-01-01.
    _APPLE_EPOCH = datetime.datetime(2001, 1, 1)

    def __init__(self, db_path=None):
        """Remember which database file to read.

        Args:
            db_path: Path to the annotation database. When falsy,
                ``config.LOCAL_ANNOTATION_DB`` is used instead.
        """
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB

    @staticmethod
    def parse_location(location):
        """Parse a ZANNOTATIONLOCATION value into ``(idref, filepos)``.

        The first ``[...]`` group in the string becomes ``idref`` and the
        second one becomes ``filepos``.

        Args:
            location: Location string such as ``epubcfi(/6/22[id15]!/4/...)``;
                may be ``None`` or empty.

        Returns:
            A ``(idref, filepos)`` tuple; each element is ``None`` when the
            corresponding bracket group is absent.
        """
        if not location:
            return None, None
        matches = AnnotationManager._BRACKETED.findall(location)
        idref = matches[0] if matches else None
        filepos = matches[1] if len(matches) > 1 else None
        return idref, filepos

    @staticmethod
    def _format_creation_date(creationdate):
        """Render a raw creation date as ``"Y/M/D"`` without zero padding.

        Numbers and numeric strings are treated as seconds since 2001-01-01;
        any other value is expected to start with ``YYYY-MM-DD``. Falsy values
        are returned unchanged, and values that cannot be converted are
        returned as ``str(creationdate)``.
        """
        if not creationdate:
            return creationdate
        try:
            if isinstance(creationdate, (int, float)):
                moment = AnnotationManager._APPLE_EPOCH + datetime.timedelta(
                    seconds=creationdate)
            elif (isinstance(creationdate, str)
                  and creationdate.replace('.', '', 1).isdigit()):
                moment = AnnotationManager._APPLE_EPOCH + datetime.timedelta(
                    seconds=float(creationdate))
            else:
                moment = datetime.datetime.strptime(
                    creationdate[:10], "%Y-%m-%d")
        except (ValueError, TypeError, OverflowError):
            # Best effort: keep the raw value visible instead of failing.
            return str(creationdate)
        return f"{moment.year}/{moment.month}/{moment.day}"

    def get_annotations(self, bookid=None):
        """Return the stored annotations, optionally limited to one book.

        Rows whose note and selected text are both ``None`` are skipped.
        A warning is printed for the database file and for each of its
        ``-wal`` / ``-shm`` companion files that does not exist.

        Args:
            bookid: Asset id to filter on, or ``None`` for every book.

        Returns:
            A mapping ``{assetid: {uuid: {...}}}`` whose inner dicts carry the
            keys ``creationdate``, ``filepos``, ``idref``, ``note`` and
            ``selectedtext``. With ``bookid`` given, a plain dict holding the
            single key ``str(bookid)`` (mapped to ``{}`` when nothing
            matched); otherwise a ``defaultdict(dict)`` keyed by asset id.

        Raises:
            sqlite3.Error: Propagated unchanged when the database cannot be
                opened or queried.
        """
        # SQLite names its WAL companions "<database file>-wal" and "-shm",
        # so they are derived from the full file name, not a fixed suffix.
        for path in (self.db_path, f'{self.db_path}-wal',
                     f'{self.db_path}-shm'):
            if not os.path.exists(path):
                print(f'警告: 缺少 {path}，可能无法获取全部最新笔记')

        sql, params = self._SELECT_SQL, ()
        if bookid is not None:
            sql += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        # closing() guarantees the connection is released even if the query
        # raises (for example when the table is missing).
        with closing(sqlite3.connect(self.db_path)) as conn:
            rows = conn.execute(sql, params).fetchall()

        annotations = defaultdict(dict)
        for assetid, creationdate, location, note, selectedtext, uuid in rows:
            if note is None and selectedtext is None:
                continue
            idref, filepos = self.parse_location(location)
            annotations[str(assetid)][uuid] = {
                'creationdate': self._format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext
            }

        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), {})}
        return annotations

if __name__ == "__main__":
    from pprint import pprint

    manager = AnnotationManager()

    # Smoke-check parse_location against a few sample location strings.
    sample_locations = (
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)',
    )
    for sample in sample_locations:
        parsed_idref, parsed_filepos = manager.parse_location(sample)
        print(f"location: {sample}\n  idref: {parsed_idref}\n  filepos: {parsed_filepos}\n")

    # Fetch and dump the notes of one specific asset id only.
    target_asset = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    book_notes = manager.get_annotations(bookid=target_asset)
    print(f"\nAssetID={target_asset} 的所有笔记:")
    pprint(book_notes, indent=2, sort_dicts=False)