258 lines
10 KiB
Python
258 lines
10 KiB
Python
|
||
"""
|
||
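annotationdata.py (OOP version)
-------------------------------
Purpose:
- Read the iBooks AEAnnotation.sqlite database and extract the notes of every
  book, or of one book selected by its asset id (assetid/bookid).
- Provide the parse_location helper, which decodes a note's location string.
- Return structured annotation data for later chapter lookup and export.
- Use the EPUB CFI parser so that notes are ordered by their position in the book.

Configuration: paths and settings are managed centrally in config.py.

Main interface: AnnotationManager
- get_annotations(bookid=None): return the notes of all books, or of the given
  assetid, as {assetid: [ {...}, ... ]}; each per-book list is sorted by CFI position.
- parse_location(location): parse ZANNOTATIONLOCATION and return (idref, filepos).

Dependencies: sqlite3, collections, re, os, datetime, epub_cfi_parser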
|
||
"""
|
||
import config
|
||
import sqlite3
|
||
import re
|
||
import os
|
||
from collections import defaultdict
|
||
from epub_cfi_parser import EpubCFIParser
|
||
|
||
class AnnotationManager:
    """Read highlights and notes from an iBooks ``AEAnnotation.sqlite`` database.

    The manager can return the annotations of every book or of a single book
    (selected by asset id), and offers a helper that decodes the location
    string stored with each annotation.
    """

    # Captures the text inside every ``[...]`` group of a location string.
    _BRACKET_RE = re.compile(r'\[(.*?)\]')

    # Column order here must match the tuple unpacking in ``get_annotations``.
    _SELECT_SQL = '''
        SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION,
               ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID,
               ZPLABSOLUTEPHYSICALLOCATION
        FROM ZAEANNOTATION
    '''

    def __init__(self, db_path=None):
        """Create a manager bound to one annotation database.

        Args:
            db_path (str, optional): Path of the SQLite database file. When
                omitted (or falsy), ``config.LOCAL_ANNOTATION_DB`` is used.
        """
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB

    @staticmethod
    def parse_location(location):
        """Extract the chapter id and in-file anchor from a location string.

        The first two ``[...]`` groups found in ``location`` are returned, in
        order of appearance; no other part of the string is interpreted.

        Args:
            location (str): Value of ``ZANNOTATIONLOCATION``, normally an
                ``epubcfi(...)`` string. ``None`` and ``''`` are accepted.

        Returns:
            tuple: ``(idref, filepos)`` where ``idref`` is the first bracketed
            value and ``filepos`` the second. Either is ``None`` when the
            corresponding group is absent.

        Examples:
            >>> AnnotationManager.parse_location(
            ...     'epubcfi(/6/746[id509]!/4[abc]/10,/2/1:0,/7:8)')
            ('id509', 'abc')
        """
        if not location:
            return None, None
        matches = AnnotationManager._BRACKET_RE.findall(location)
        idref = matches[0] if matches else None
        filepos = matches[1] if len(matches) > 1 else None
        return idref, filepos

    def get_annotations(self, bookid=None):
        """Load annotations from the database, sorted per book by CFI position.

        Rows that have neither a note nor selected text (both ``NULL``) are
        skipped. Creation dates are rendered as ``'YYYY-MM-DD HH:MM:SS'`` when
        they can be interpreted (see ``_format_creation_date``).

        Args:
            bookid (str, optional): Asset id of a single book. ``None`` loads
                the annotations of every book.

        Returns:
            dict: Mapping of asset id (as ``str``) to a sorted list of
            annotation dicts with the keys ``uuid``, ``creationdate``,
            ``filepos``, ``idref``, ``note``, ``selectedtext``, ``location``
            (CFI string, ``''`` when missing), ``chapter_info`` (the value
            returned by ``EpubCFIParser.extract_chapter_info``) and
            ``physical_location`` (``0`` when missing).

            When ``bookid`` is given the result is a plain dict with exactly
            that one key (an empty list if the book has no annotations).
            Otherwise the ``defaultdict(list)`` used for grouping is returned.

        Raises:
            sqlite3.Error: If the database cannot be opened or queried.

        Note:
            A one-time hint is printed when the ``-wal`` / ``-shm`` companion
            files of the database are not present.
        """
        self._warn_about_missing_wal_files()

        sql = self._SELECT_SQL
        params = ()
        if bookid is not None:
            sql += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute(sql, params).fetchall()
        finally:
            # Always release the handle, even when the query fails.
            conn.close()

        annotations = defaultdict(list)
        for (assetid, creationdate, location, note, selectedtext,
             uuid, physical_location) in rows:
            # Skip empty records before doing any per-row parsing work.
            if note is None and selectedtext is None:
                continue

            idref, filepos = self.parse_location(location)
            annotations[str(assetid)].append({
                'uuid': uuid,
                'creationdate': self._format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext,
                'location': location or "",  # CFI string
                'chapter_info': EpubCFIParser.extract_chapter_info(location or ""),
                # Kept as a fallback ordering when the CFI cannot be used.
                'physical_location': physical_location or 0,
            })

        for book_annotations in annotations.values():
            book_annotations.sort(key=self._create_annotation_sort_key)

        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), [])}
        return annotations

    def _warn_about_missing_wal_files(self):
        """Print a hint, at most once per instance, if WAL side files are absent.

        SQLite names these files by appending ``-wal`` and ``-shm`` to the full
        database file name, so the check works for any file extension.
        """
        if getattr(self, '_wal_warning_shown', False):
            return

        db_file = os.fspath(self.db_path)
        missing_files = [
            os.path.basename(db_file + suffix)
            for suffix in ('-wal', '-shm')
            if not os.path.exists(db_file + suffix)
        ]
        if missing_files:
            print(f'提示: 缺少WAL文件 {", ".join(missing_files)},这是正常的(数据库未被其他进程打开时)')
            self._wal_warning_shown = True

    @staticmethod
    def _format_creation_date(creationdate):
        """Render a stored creation date as ``'YYYY-MM-DD HH:MM:SS'``.

        Numbers and purely numeric strings are treated as seconds since
        2001-01-01 00:00:00 (naive; no time-zone conversion is applied).
        Other strings are parsed as a calendar date in ``YYYY-MM-DD`` or
        ``YYYY/M/D`` form; any time-of-day part is ignored.

        Args:
            creationdate: Raw ``ZANNOTATIONCREATIONDATE`` value.

        Returns:
            The formatted string; the unchanged input when it is falsy
            (``None``, ``0``, ``''``); or ``str(creationdate)`` when the value
            cannot be interpreted.
        """
        import datetime  # function-scope: not imported at module level

        if not creationdate:
            return creationdate

        output_format = '%Y-%m-%d %H:%M:%S'
        origin = datetime.datetime(2001, 1, 1)
        try:
            if isinstance(creationdate, (int, float)):
                moment = origin + datetime.timedelta(seconds=creationdate)
                return moment.strftime(output_format)
            if not isinstance(creationdate, str):
                return str(creationdate)
            if creationdate.replace('.', '', 1).isdigit():
                moment = origin + datetime.timedelta(seconds=float(creationdate))
                return moment.strftime(output_format)
        except (ValueError, TypeError, OverflowError):
            # NaN, infinite or out-of-range timestamps: keep the raw value.
            return str(creationdate)

        text = creationdate.strip()
        candidates = (
            (text[:10], '%Y-%m-%d'),
            (text.partition(' ')[0], '%Y/%m/%d'),
        )
        for date_part, date_format in candidates:
            try:
                moment = datetime.datetime.strptime(date_part, date_format)
            except ValueError:
                continue
            return moment.strftime(output_format)
        return str(creationdate)

    def _create_annotation_sort_key(self, annotation: dict) -> tuple:
        """Build the sort key used to order the annotations of one book.

        Annotations whose CFI can be turned into a key sort first (priority 0)
        by that key; the rest sort afterwards (priority 1) by physical
        location and then by creation date.

        Args:
            annotation: Annotation dict as built by ``get_annotations``.

        Returns:
            tuple: ``(0, cfi_key)`` or ``(1, physical_location, creation_date)``
            where ``creation_date`` is always a ``str`` so that keys of the
            same priority stay mutually comparable.
        """
        cfi = annotation.get('location', '')
        if cfi:
            try:
                return (0, EpubCFIParser.create_sort_key(cfi))
            except Exception as e:
                # Best effort: a CFI the parser rejects must not abort the
                # whole sort, so report it and fall back to the secondary key.
                print(f"CFI 排序失败: {cfi} -> {e}")

        physical_location = annotation.get('physical_location') or 0
        creation_date = annotation.get('creationdate')
        return (
            1,
            physical_location,
            '' if creation_date is None else str(creation_date),
        )
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test for this module. It runs two checks:
    #   1. parse_location on a few sample location strings
    #   2. get_annotations for one specific book
    from pprint import pprint

    demo_manager = AnnotationManager()

    # Check 1: location parsing.
    print("=== 测试位置解析功能 ===")
    sample_locations = (
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)',
    )
    for sample in sample_locations:
        chapter_id, anchor = demo_manager.parse_location(sample)
        print(f"location: {sample}\n idref: {chapter_id}\n filepos: {anchor}\n")

    # Check 2: fetch the annotations of a single book and dump them.
    print("=== 测试笔记获取功能 ===")
    demo_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    book_notes = demo_manager.get_annotations(bookid=demo_bookid)
    print(f"\nAssetID={demo_bookid} 的所有笔记:")
    pprint(book_notes, indent=2, sort_dicts=False)
|