# iBook/annotationdata.py


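"""
annotationdata.py (OOP version)
-------------------------------
Purpose:
    - Parse the iBooks AEAnnotation.sqlite database and extract the annotations of all books or of one specific book (assetid/bookid).
    - Provide the parse_location helper, which decodes an annotation's location string.
    - Return structured annotation data that later steps use for chapter lookup and export.
    - Use the EPUB CFI parser so that annotations are ordered correctly by position.

Configuration: config.py centralizes paths and settings.
Main interface: AnnotationManager
    - get_annotations(bookid=None): returns the annotations of all books or of the given assetid, shaped as {assetid: [{...}, ...]}, with each list sorted by CFI position
    - parse_location(location): parses ZANNOTATIONLOCATION and returns (idref, filepos)
Dependencies: sqlite3, collections, re, os, datetime, epub_cfi_parser
"""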
import config
import sqlite3
import re
import os
from collections import defaultdict
from epub_cfi_parser import EpubCFIParser

class AnnotationManager:
    """
    Reader for iBooks annotations.

    Extracts the user's notes and highlights from the iBooks
    AEAnnotation.sqlite database, either for every book or for one book,
    and decodes the location string stored with each annotation.
    """

    # Captures the text inside every [...] group of a location string.
    _BRACKET_RE = re.compile(r'\[(.*?)\]')

    # Layouts tried, in order, for creation dates stored as date text.
    _DATE_FORMATS = ("%Y-%m-%d", "%Y/%m/%d")

    # Shared SELECT; a WHERE clause is appended when one book is requested.
    _SELECT_SQL = (
        'SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, '
        'ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID, '
        'ZPLABSOLUTEPHYSICALLOCATION '
        'FROM ZAEANNOTATION'
    )

    def __init__(self, db_path=None):
        """
        Create a manager bound to one annotation database.

        Args:
            db_path (str, optional): Path of the database file. Falls back
                to config.LOCAL_ANNOTATION_DB when omitted or empty.
        """
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB
        # The missing-WAL hint is printed at most once per manager.
        self._wal_warning_shown = False

    @staticmethod
    def parse_location(location):
        """
        Extract the chapter id and in-file position from a location string.

        The first two bracketed groups of the string (normally an epubcfi
        expression taken from ZANNOTATIONLOCATION) are returned in order.

        Args:
            location (str): Location string of the annotation.

        Returns:
            tuple: (idref, filepos)
                - idref (str | None): content of the first [...] group
                - filepos (str | None): content of the second [...] group

        Examples:
            >>> AnnotationManager.parse_location('epubcfi(/6/746[id509]!/4[abc]/10,/2/1:0,/7:8)')
            ('id509', 'abc')
        """
        if not location:
            return None, None
        matches = AnnotationManager._BRACKET_RE.findall(location)
        idref = matches[0] if matches else None
        filepos = matches[1] if len(matches) > 1 else None
        return idref, filepos

    @staticmethod
    def _sidecar_paths(db_path):
        """Return the WAL and SHM file paths that accompany *db_path*."""
        # SQLite names these files by appending a suffix to the full database
        # file name, so the extension must not be stripped or replaced.
        base = os.fspath(db_path)
        return [base + '-wal', base + '-shm']

    @staticmethod
    def _format_creation_date(creationdate):
        """
        Convert a stored creation date to 'YYYY-MM-DD HH:MM:SS'.

        Numbers and numeric strings are treated as seconds counted from
        2001-01-01 (Apple timestamps). Other values are parsed from their
        first ten characters as 'YYYY-MM-DD' or 'YYYY/M/D'.

        Args:
            creationdate: Raw value of ZANNOTATIONCREATIONDATE.

        Returns:
            The formatted string; the value unchanged when it is falsy
            (None, 0, ''); or str(creationdate) when it cannot be parsed.
        """
        if not creationdate:
            return creationdate

        # Function-scope import: datetime is only needed by this helper.
        import datetime

        try:
            if isinstance(creationdate, (int, float)):
                seconds = creationdate
            elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
                seconds = float(creationdate)
            else:
                seconds = None

            if seconds is not None:
                moment = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=seconds)
            else:
                # Only the date part is kept for textual dates.
                date_part = creationdate[:10].strip()
                moment = None
                for fmt in AnnotationManager._DATE_FORMATS:
                    try:
                        moment = datetime.datetime.strptime(date_part, fmt)
                        break
                    except ValueError:
                        continue
                if moment is None:
                    return str(creationdate)
            return moment.strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, TypeError, OverflowError, AttributeError):
            # Best effort: keep the raw value readable instead of failing.
            return str(creationdate)

    def get_annotations(self, bookid=None):
        """
        Load annotations from the database, sorted by CFI position.

        Reads the notes and highlights of every book, or of one book, from
        the ZAEANNOTATION table, converts creation dates and decodes the
        location string of each record.

        Args:
            bookid (str, optional): Asset id of the book. When None the
                annotations of all books are returned.

        Returns:
            dict: Mapping of asset id to a sorted list of annotations:
                {
                    assetid: [
                        {
                            'uuid': unique id of the annotation,
                            'creationdate': formatted creation date,
                            'filepos': position inside the file,
                            'idref': chapter identifier,
                            'note': note text,
                            'selectedtext': highlighted text,
                            'location': CFI location string,
                            'chapter_info': result of
                                EpubCFIParser.extract_chapter_info,
                            'physical_location': physical position, 0
                                when missing (fallback sort value)
                        }
                    ]
                }
            When bookid is given the result has exactly one key,
            str(bookid), whose list may be empty.

        Note:
            - Checks whether the WAL side files (-wal, -shm) exist and
              prints a hint once when they are missing.
            - Records with neither a note nor selected text are skipped.
            - The connection is closed even when the query fails.
        """
        missing_files = [
            os.path.basename(path)
            for path in self._sidecar_paths(self.db_path)
            if not os.path.exists(path)
        ]
        # getattr keeps this working for instances created without __init__.
        if missing_files and not getattr(self, '_wal_warning_shown', False):
            print(f'提示: 缺少WAL文件 {", ".join(missing_files)}，这是正常的（数据库未被其他进程打开时）')
            self._wal_warning_shown = True

        query = self._SELECT_SQL
        params = ()
        if bookid is not None:
            query += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute(query, params).fetchall()
        finally:
            conn.close()

        annotations = defaultdict(list)
        for assetid, creationdate, location, note, selectedtext, uuid, physical_location in rows:
            # Skip empty records before doing any parsing work.
            if note is None and selectedtext is None:
                continue

            idref, filepos = self.parse_location(location)
            annotations[str(assetid)].append({
                'uuid': uuid,
                'creationdate': self._format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext,
                'location': location or "",  # CFI string
                'chapter_info': EpubCFIParser.extract_chapter_info(location or ""),
                'physical_location': physical_location or 0,  # fallback sort value
            })

        # Order the annotations of every book by reading position.
        for book_annotations in annotations.values():
            book_annotations.sort(key=self._create_annotation_sort_key)

        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), [])}
        return annotations

    def _create_annotation_sort_key(self, annotation: dict) -> tuple:
        """
        Build the sort key of one annotation.

        The CFI location is preferred. When it is missing or cannot be
        turned into a key, the physical location and creation date are
        used instead, and such annotations sort after all CFI-keyed ones.

        Args:
            annotation: Annotation dictionary as built by get_annotations.

        Returns:
            tuple: (0, cfi_key) or (1, physical_location, creation_date).
        """
        cfi = annotation.get('location') or ''
        if cfi:
            try:
                return (0, EpubCFIParser.create_sort_key(cfi))
            except Exception as e:
                # The parser's exception types are not visible here, so any
                # failure falls through to the positional fallback below.
                print(f"CFI 排序失败: {cfi} -> {e}")

        physical_location = annotation.get('physical_location') or 0
        creation_date = annotation.get('creationdate')
        # Normalize to str so that None, numbers and text never get compared
        # with each other when two physical locations tie.
        creation_text = '' if creation_date is None else str(creation_date)
        return (1, physical_location, creation_text)

if __name__ == "__main__":
    # Manual smoke test with two parts:
    #   1. parse_location on several location strings
    #   2. get_annotations for one specific book
    from pprint import pprint

    manager = AnnotationManager()

    # Part 1: location parsing
    print("=== 测试位置解析功能 ===")
    sample_locations = (
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)',
    )
    for sample in sample_locations:
        parsed_idref, parsed_filepos = manager.parse_location(sample)
        print(f"location: {sample}\n  idref: {parsed_idref}\n  filepos: {parsed_filepos}\n")

    # Part 2: annotations of a single book
    print("=== 测试笔记获取功能 ===")
    sample_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    book_annotations = manager.get_annotations(bookid=sample_bookid)
    print(f"\nAssetID={sample_bookid} 的所有笔记:")
    pprint(book_annotations, indent=2, sort_dicts=False)