# iBook/annotationdata.py

"""
annotationdata.py
-----------------
功能：
    - 解析iBooks的AEAnnotation.sqlite数据库，提取所有或指定书籍（assetid/bookid）的笔记。
    - 提供parse_location辅助函数，解析笔记定位信息。
    - 返回结构化的annotations数据，便于后续章节定位与导出。

依赖：config.py 统一管理路径和配置项。

主要接口：
    - get_annotations(db_path, bookid=None)：返回所有或指定assetid的笔记，结构为{assetid: {uuid: {...}}}
    - parse_location(location)：解析ZANNOTATIONLOCATION，返回(idref, filepos)

依赖：sqlite3, collections, re, os, datetime
"""
import config

import sqlite3
from collections import defaultdict
import re
import os

def parse_location(location):
    """Split a ZANNOTATIONLOCATION string into ``(idref, filepos)``.

    The text inside the first ``[...]`` pair becomes ``idref`` and the text
    inside the second pair becomes ``filepos``. Any further bracketed
    segments are ignored.

    Args:
        location: Raw location string (for example an ``epubcfi(...)``
            value), or a falsy value such as ``None`` or ``""``.

    Returns:
        A ``(idref, filepos)`` tuple. Each element is ``None`` when the
        corresponding bracketed segment is absent.
    """
    if not location:
        return None, None
    bracketed = re.findall(r'\[(.*?)\]', location)
    # Pad with None so a location with fewer than two segments still unpacks.
    idref, filepos = (bracketed + [None, None])[:2]
    return idref, filepos

def _format_creation_date(creationdate):
    """Render a raw ZANNOTATIONCREATIONDATE value as a ``Y/M/D`` string.

    Numbers and numeric strings are treated as Apple timestamps, i.e. seconds
    counted from 2001-01-01. Any other value is expected to start with a
    ``YYYY-MM-DD`` date.

    Args:
        creationdate: Value read from the database column.

    Returns:
        ``"year/month/day"`` without zero padding. ``None`` and empty values
        are returned unchanged, and a value that cannot be converted is
        returned as ``str(creationdate)``.
    """
    import datetime

    is_number = isinstance(creationdate, (int, float))
    # Only None / empty text means "no date". A numeric 0 is a valid timestamp
    # (the 2001-01-01 origin itself) and must still be converted.
    if not creationdate and not is_number:
        return creationdate
    try:
        origin = datetime.datetime(2001, 1, 1)
        if is_number:
            dt = origin + datetime.timedelta(seconds=creationdate)
        elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
            dt = origin + datetime.timedelta(seconds=float(creationdate))
        else:
            dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
    except (ValueError, TypeError, OverflowError):
        # Best effort: keep the raw value visible instead of failing the export.
        return str(creationdate)
    return f"{dt.year}/{dt.month}/{dt.day}"


def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
    """Read annotations from the ZAEANNOTATION table of an iBooks database.

    Rows whose note and selected text are both NULL are skipped.

    Args:
        db_path: Path of the AEAnnotation SQLite database.
        bookid: Asset id to restrict the query to. ``None`` returns the
            annotations of every book.

    Returns:
        A mapping ``{assetid: {uuid: {...}}}`` where each inner dict has the
        keys ``creationdate``, ``filepos``, ``idref``, ``note`` and
        ``selectedtext``. When ``bookid`` is given, the result is a plain dict
        with exactly one key, ``str(bookid)``, whose value is empty if the
        book has no annotations. Otherwise a ``defaultdict(dict)`` keyed by
        ``str(assetid)`` is returned.

    Raises:
        sqlite3.Error: Propagated unchanged when connecting or querying fails.
    """
    # SQLite names the WAL-mode side files "<database>-wal" / "<database>-shm",
    # so derive them from the full path rather than assuming a ".sqlite" suffix.
    for f in [db_path, f'{db_path}-wal', f'{db_path}-shm']:
        if not os.path.exists(f):
            print(f'警告: 缺少 {f}，可能无法获取全部最新笔记')

    query = '''
        SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION,
               ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
        FROM ZAEANNOTATION
    '''
    params = ()
    if bookid is not None:
        query += ' WHERE ZANNOTATIONASSETID=?'
        params = (bookid,)

    annotations = defaultdict(dict)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(query, params)
        for assetid, creationdate, location, note, selectedtext, uuid in cursor:
            # Skip rows that carry neither a note nor highlighted text before
            # doing any conversion work on them.
            if note is None and selectedtext is None:
                continue
            idref, filepos = parse_location(location)
            annotations[str(assetid)][uuid] = {
                'creationdate': _format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext
            }
    finally:
        # Release the connection even when the query or row handling raises.
        conn.close()

    if bookid is not None:
        # Return only the structure for the requested book.
        return {str(bookid): annotations.get(str(bookid), {})}
    return annotations

# Usage example: pretty-print every annotation stored for one asset id.
if __name__ == "__main__":
    from pprint import pprint

    # Disabled manual check of parse_location:
    #
    # test_locations = [
    #     'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
    #     'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
    #     'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
    # ]
    # for loc in test_locations:
    #     idref, filepos = parse_location(loc)
    #     print(f"location: {loc}\n  idref: {idref}\n  filepos: {filepos}\n")

    # Fetch the annotations of a single asset id only.
    test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    annotations = get_annotations(bookid=test_bookid)

    # Dump the whole structure for that book in insertion order.
    print(f"\nAssetID={test_bookid} 的所有笔记:")
    pprint(annotations, indent=2, sort_dicts=False)

    # Disabled alternative output: the first 3 annotations of each book.
    #
    # book_notes = defaultdict(list)
    # for assetid, notes_dict in annotations.items():
    #     for uuid, ann in notes_dict.items():
    #         book_notes[assetid].append({**ann, 'uuid': uuid})
    # for assetid, notes in book_notes.items():
    #     print(f"\nAssetID: {assetid}")
    #     for i, note in enumerate(notes[:3]):
    #         print(f"  笔记{i+1}:")
    #         print(f"    creationdate: {note['creationdate']}")
    #         print(f"    idref: {note['idref']}")
    #         print(f"    filepos: {note['filepos']}")
    #         print(f"    note: {note['note']}")
    #         print(f"    selectedtext: {note['selectedtext']}")
    #         print(f"    uuid: {note['uuid']}")