iBook/annotationdata.py

137 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
annotationdata.py
-----------------
功能:
- 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记。
- 提供parse_location辅助函数解析笔记定位信息。
- 返回结构化的annotations数据便于后续章节定位与导出。
依赖config.py 统一管理路径和配置项。
主要接口:
- get_annotations(db_path, bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos)
依赖sqlite3, collections, re, os, datetime
"""
import config
import sqlite3
from collections import defaultdict
import re
import os
def parse_location(location):
"""
解析ZANNOTATIONLOCATION返回(idref, filepos)
- epubcfi(...)格式优先提取[]内内容为idref
- 其他格式兼容原逻辑
"""
idref = None
filepos = None
if not location:
return idref, filepos
# 统一处理,提取前两个[]内容
matches = re.findall(r'\[(.*?)\]', location) if location else []
idref = matches[0] if len(matches) > 0 else None
filepos = matches[1] if len(matches) > 1 else None
return idref, filepos
def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
# 检查WAL模式相关文件
base = db_path.rsplit('.', 1)[0]
wal_path = base + '.sqlite-wal'
shm_path = base + '.sqlite-shm'
for f in [db_path, wal_path, shm_path]:
if not os.path.exists(f):
print(f'警告: 缺少 {f},可能无法获取全部最新笔记')
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
if bookid is not None:
cursor.execute('''
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
FROM ZAEANNOTATION WHERE ZANNOTATIONASSETID=?
''', (bookid,))
else:
cursor.execute('''
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
FROM ZAEANNOTATION
''')
rows = cursor.fetchall()
annotations = defaultdict(dict)
import datetime
for row in rows:
assetid, creationdate, location, note, selectedtext, uuid = row
# 转换 creationdate 格式支持苹果时间戳以2001-01-01为基准
date_str = creationdate
if creationdate:
try:
origin = datetime.datetime(2001, 1, 1)
# 苹果时间戳 float/int 或数字字符串
if isinstance(creationdate, (int, float)):
dt = origin + datetime.timedelta(seconds=creationdate)
elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
dt = origin + datetime.timedelta(seconds=float(creationdate))
else:
dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
date_str = f"{dt.year}/{dt.month}/{dt.day}"
except Exception:
date_str = str(creationdate)
idref, filepos = parse_location(location)
# 跳过note和selectedtext都为None的笔记
if note is None and selectedtext is None:
continue
annotations[str(assetid)][uuid] = {
'creationdate': date_str,
'filepos': filepos,
'idref': idref,
'note': note,
'selectedtext': selectedtext
}
conn.close()
if bookid is not None:
# 只返回特定bookid的笔记结构
return {str(bookid): annotations.get(str(bookid), {})}
return annotations
# 用法示例输出每本书的前3条笔记
if __name__ == "__main__":
# 测试 parse_location
'''
test_locations = [
'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
]
for loc in test_locations:
idref, filepos = parse_location(loc)
print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
'''
# 测试只获取特定 assetid 的笔记
test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
annotations = get_annotations(bookid=test_bookid)
# 格式化打印该书的所有笔记
from pprint import pprint
print(f"\nAssetID={test_bookid} 的所有笔记:")
pprint(annotations, indent=2, sort_dicts=False)
# 输出每本书的前3条笔记
'''
book_notes = defaultdict(list)
for assetid, notes_dict in annotations.items():
for uuid, ann in notes_dict.items():
book_notes[assetid].append({**ann, 'uuid': uuid})
for assetid, notes in book_notes.items():
print(f"\nAssetID: {assetid}")
for i, note in enumerate(notes[:3]):
print(f" 笔记{i+1}:")
print(f" creationdate: {note['creationdate']}")
print(f" idref: {note['idref']}")
print(f" filepos: {note['filepos']}")
print(f" note: {note['note']}")
print(f" selectedtext: {note['selectedtext']}")
print(f" uuid: {note['uuid']}")
'''