# iBook/annotationdata.py

import sqlite3
from collections import defaultdict
import re
import os

def parse_location(location):
    """
    Split a ZANNOTATIONLOCATION string into ``(idref, filepos)``.

    The first two ``[...]`` groups found in *location* (for example in an
    ``epubcfi(...)`` value) are returned in order: the first as ``idref``,
    the second as ``filepos``. A missing group is reported as ``None``;
    an empty or ``None`` *location* yields ``(None, None)``.
    """
    if not location:
        return None, None
    # Non-greedy match so each bracket pair is captured separately.
    bracketed = re.findall(r'\[(.*?)\]', location)
    # Pad with None so the unpacking works when fewer than two groups exist.
    idref, filepos = (bracketed + [None, None])[:2]
    return idref, filepos

def _format_creation_date(creationdate):
    """Render a ZANNOTATIONCREATIONDATE value as ``"Y/M/D"`` (no zero padding).

    Accepted inputs:
    - ``int``/``float`` or a numeric string: seconds counted from
      2001-01-01 (the Apple timestamp base used by this file).
    - any other string: its first 10 characters are parsed as ``YYYY-MM-DD``.

    Falsy values (``None``, ``0``, ``''``) are returned unchanged. Values
    that cannot be converted fall back to ``str(creationdate)``.
    """
    import datetime

    if not creationdate:
        return creationdate
    try:
        if isinstance(creationdate, (int, float)):
            offset_seconds = creationdate
        elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
            offset_seconds = float(creationdate)
        else:
            dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
            return f"{dt.year}/{dt.month}/{dt.day}"
        dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=offset_seconds)
        return f"{dt.year}/{dt.month}/{dt.day}"
    except (ValueError, TypeError, OverflowError):
        # Best effort: unparsable text, non-string/number values (e.g. bytes)
        # and out-of-range timestamps are passed through as plain text.
        return str(creationdate)


def get_annotations(db_path='./data/AEAnnotation.sqlite'):
    """Load annotations from the ZAEANNOTATION table of an SQLite database.

    Args:
        db_path: Path (``str`` or ``os.PathLike``) of the SQLite file.

    Returns:
        A ``defaultdict(dict)`` mapping ``str(asset id)`` to a dict keyed by
        annotation UUID. Each value is a dict with the keys ``creationdate``,
        ``filepos``, ``idref``, ``note`` and ``selectedtext``. Rows whose note
        and selected text are both NULL are skipped.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.

    A warning is printed for the database file and for each of its WAL
    companion files that does not exist.
    """
    db_file = os.fspath(db_path)
    # SQLite names the WAL companions by appending "-wal"/"-shm" to the full
    # database filename, whatever its extension is.
    for f in (db_file, db_file + '-wal', db_file + '-shm'):
        if not os.path.exists(f):
            print(f'警告: 缺少 {f}，可能无法获取全部最新笔记')

    annotations = defaultdict(dict)
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
            FROM ZAEANNOTATION
        ''')
        for assetid, creationdate, location, note, selectedtext, uuid in cursor:
            # Skip rows that carry neither a note nor highlighted text,
            # before doing any parsing work on them.
            if note is None and selectedtext is None:
                continue
            idref, filepos = parse_location(location)
            annotations[str(assetid)][uuid] = {
                'creationdate': _format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext
            }
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return annotations

# Usage example: dump every annotation returned by get_annotations().
# (The per-book "first 3 notes" listing is kept further down as disabled code.)
if __name__ == "__main__":
    # Manual check for parse_location. The triple-quoted string below is
    # disabled code: as a bare string expression it has no effect at runtime.
    '''
    test_locations = [
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
    ]
    for loc in test_locations:
        idref, filepos = parse_location(loc)
        print(f"location: {loc}\n  idref: {idref}\n  filepos: {filepos}\n")
    '''

    # Load annotations from the default database path.
    annotations = get_annotations()

    # Pretty-print all annotations under an "all notes" heading, keeping
    # the dict insertion order (sort_dicts=False).
    from pprint import pprint
    print("\n所有笔记:")
    pprint(annotations, indent=2, sort_dicts=False)

    # Print the first 3 notes of each book. Disabled: the string literal
    # below is never executed.
    '''
    book_notes = defaultdict(list)
    for assetid, notes_dict in annotations.items():
        for uuid, ann in notes_dict.items():
            book_notes[assetid].append({**ann, 'uuid': uuid})
    for assetid, notes in book_notes.items():
        print(f"\nAssetID: {assetid}")
        for i, note in enumerate(notes[:3]):
            print(f"  笔记{i+1}:")
            print(f"    creationdate: {note['creationdate']}")
            print(f"    idref: {note['idref']}")
            print(f"    filepos: {note['filepos']}")
            print(f"    note: {note['note']}")
            print(f"    selectedtext: {note['selectedtext']}")
            print(f"    uuid: {note['uuid']}")
    '''