iBook/annotationdata.py

196 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
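annotationdata.py (object-oriented version)
-------------------------------------------
Purpose:
- Parse the iBooks AEAnnotation.sqlite database and extract the annotations of
  every book, or of a single book selected by its asset id (assetid / bookid).
- Provide the parse_location helper for decoding an annotation's location string.
- Return structured annotation data for later chapter lookup and export.

Configuration: paths and settings are managed centrally in config.py.

Main interface: AnnotationManager
- get_annotations(bookid=None): return the annotations of all books, or of the
  given asset id, shaped as {assetid: {uuid: {...}}}.
- parse_location(location): parse a ZANNOTATIONLOCATION value and return
  (idref, filepos).

Dependencies: sqlite3, collections, re, os, datetime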
"""
import datetime
import os
import re
import sqlite3
from collections import defaultdict

import config
class AnnotationManager:
    """Manager for iBooks annotations (notes and highlights).

    Reads the ``ZAEANNOTATION`` table of an iBooks ``AEAnnotation`` SQLite
    database and returns the annotations of every book, or of one book, as
    nested dictionaries.  Also offers :meth:`parse_location` for pulling the
    bracketed identifiers out of an annotation location string.
    """

    # Columns are listed in the order in which get_annotations unpacks a row.
    _SELECT_SQL = (
        'SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, '
        'ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, '
        'ZANNOTATIONUUID FROM ZAEANNOTATION'
    )

    def __init__(self, db_path=None):
        """Create a manager bound to one annotation database.

        Args:
            db_path (str, optional): Path of the SQLite database file.  When
                omitted (or falsy), ``config.LOCAL_ANNOTATION_DB`` is used.
        """
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB

    @staticmethod
    def parse_location(location):
        """Extract the first two bracketed identifiers from a location string.

        The location is expected to look like an epubcfi expression; the text
        inside the first ``[...]`` is returned as the chapter reference and the
        text inside the second ``[...]`` as the position within that chapter.

        Args:
            location (str): Value of the ``ZANNOTATIONLOCATION`` column.  May
                be ``None`` or empty.

        Returns:
            tuple: ``(idref, filepos)``.  Each element is ``None`` when the
            corresponding bracketed part is absent.

        Examples:
            >>> AnnotationManager.parse_location(
            ...     'epubcfi(/6/746[id509]!/4[abc]/10,/2/1:0,/7:8)')
            ('id509', 'abc')
            >>> AnnotationManager.parse_location('epubcfi(/6/22[id15]!/4/156)')
            ('id15', None)
        """
        if not location:
            return None, None
        bracketed = re.findall(r'\[(.*?)\]', location)
        idref = bracketed[0] if bracketed else None
        filepos = bracketed[1] if len(bracketed) > 1 else None
        return idref, filepos

    @staticmethod
    def _parse_date_text(text):
        """Parse a textual date; raise ``ValueError`` if no layout matches."""
        stripped = text.strip()
        first_token = stripped.split(None, 1)[0] if stripped else ''
        # The dash layout keeps the historical "first ten characters" rule;
        # the slash layout covers values such as '2025/9/6'.
        attempts = (
            (stripped[:10], '%Y-%m-%d'),
            (first_token, '%Y/%m/%d'),
        )
        for candidate, layout in attempts:
            try:
                return datetime.datetime.strptime(candidate, layout)
            except ValueError:
                continue
        raise ValueError(f'unrecognised date text: {text!r}')

    @staticmethod
    def _format_creation_date(creationdate):
        """Render a creation date as ``'YYYY-MM-DD HH:MM:SS'`` when possible.

        Numbers, and strings that parse as numbers, are treated as seconds
        counted from 2001-01-01 00:00:00 (presumably UTC, the Core Data
        reference date; no timezone conversion is applied).  Other strings are
        parsed as ``YYYY-MM-DD`` or ``YYYY/MM/DD`` dates; any time-of-day part
        is ignored, so the result is midnight of that date.

        Empty values (``None``, ``''``) are returned unchanged, and anything
        that cannot be interpreted is returned as ``str(creationdate)``.
        """
        if isinstance(creationdate, (int, float)):
            # Checked first so that a timestamp of 0 is formatted, not skipped.
            seconds = creationdate
        elif not creationdate:
            return creationdate
        elif isinstance(creationdate, str):
            try:
                seconds = float(creationdate)
            except ValueError:
                seconds = None
        else:
            return str(creationdate)

        try:
            if seconds is not None:
                moment = (datetime.datetime(2001, 1, 1)
                          + datetime.timedelta(seconds=seconds))
            else:
                moment = AnnotationManager._parse_date_text(creationdate)
            return moment.strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, OverflowError):
            # NaN/inf/out-of-range timestamps and unknown text layouts:
            # best effort, hand the raw value back as text.
            return str(creationdate)

    def get_annotations(self, bookid=None):
        """Load annotations from the database.

        Args:
            bookid (str, optional): Asset id of the book whose annotations are
                wanted.  ``None`` selects the annotations of every book.

        Returns:
            dict: Nested mapping of the form::

                {
                    assetid: {
                        uuid: {
                            'creationdate': formatted creation date,
                            'filepos': position inside the chapter,
                            'idref': chapter reference,
                            'note': note text,
                            'selectedtext': highlighted text,
                        }
                    }
                }

            When ``bookid`` is given the result always contains exactly one
            key, ``str(bookid)``, mapped to a possibly empty dict.  Otherwise
            the ``defaultdict`` used for grouping is returned.

        Raises:
            sqlite3.Error: If the database cannot be opened or queried.

        Note:
            - A warning is printed for each of the database file and its
              ``-wal`` / ``-shm`` companions that does not exist.
            - Rows whose note and selected text are both ``NULL`` are skipped.
        """
        # SQLite names the WAL companions by appending a suffix to the full
        # database filename, whatever its extension.
        db_file = os.fspath(self.db_path)
        for path in (self.db_path, db_file + '-wal', db_file + '-shm'):
            if not os.path.exists(path):
                print(f'警告: 缺少 {path},可能无法获取全部最新笔记')

        sql = self._SELECT_SQL
        params = ()
        if bookid is not None:
            sql += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        annotations = defaultdict(dict)
        conn = sqlite3.connect(self.db_path)
        try:
            for row in conn.execute(sql, params):
                assetid, creationdate, location, note, selectedtext, uuid = row
                # Skip empty records before doing any parsing work.
                if note is None and selectedtext is None:
                    continue
                idref, filepos = self.parse_location(location)
                annotations[str(assetid)][uuid] = {
                    'creationdate': self._format_creation_date(creationdate),
                    'filepos': filepos,
                    'idref': idref,
                    'note': note,
                    'selectedtext': selectedtext,
                }
        finally:
            # Release the connection even when the query or a row fails.
            conn.close()

        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), {})}
        return annotations
if __name__ == "__main__":
    # Manual smoke test with two parts:
    #   1. run parse_location over a few sample location strings;
    #   2. fetch and pretty-print the annotations of one hard-coded book.
    from pprint import pprint

    manager = AnnotationManager()

    # Part 1: location parsing.
    print("=== 测试位置解析功能 ===")
    sample_locations = (
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)',
    )
    for sample in sample_locations:
        chapter_ref, position = manager.parse_location(sample)
        print(f"location: {sample}\n idref: {chapter_ref}\n filepos: {position}\n")

    # Part 2: annotations of a single book.
    print("=== 测试笔记获取功能 ===")
    test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    book_notes = manager.get_annotations(bookid=test_bookid)
    print(f"\nAssetID={test_bookid} 的所有笔记:")
    pprint(book_notes, indent=2, sort_dicts=False)