This commit is contained in:
douboer
2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

View File

@@ -1,136 +1,113 @@
"""
annotationdata.py
-----------------
annotationdata.py (OOP版)
------------------------
功能:
- 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记。
- 提供parse_location辅助函数解析笔记定位信息。
- 返回结构化的annotations数据便于后续章节定位与导出。
依赖config.py 统一管理路径和配置项。
主要接口:
- get_annotations(db_path, bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
主要接口AnnotationManager
- get_annotations(bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos)
依赖sqlite3, collections, re, os, datetime
"""
import datetime
import os
import re
import sqlite3
from collections import defaultdict

import config
# Reference date for numeric creation timestamps: they are interpreted as
# seconds elapsed since 2001-01-01 (naive datetime, no timezone applied).
_APPLE_EPOCH = datetime.datetime(2001, 1, 1)

# Non-greedy capture of the text inside every "[...]" group of a location.
_BRACKET_PATTERN = re.compile(r'\[(.*?)\]')

# Column order here must match the tuple unpacking in get_annotations().
_SELECT_ANNOTATIONS = '''
    SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
    FROM ZAEANNOTATION
'''


def _format_creation_date(creationdate):
    """Render a ZANNOTATIONCREATIONDATE value as ``"Y/M/D"`` (no zero padding).

    Numbers and numeric strings are treated as seconds since 2001-01-01; any
    other value is expected to start with a ``YYYY-MM-DD`` date.

    Returns:
        The formatted date string. Falsy inputs (``None``, ``0``, ``""``) are
        returned unchanged, and values that cannot be converted come back as
        ``str(creationdate)``.
    """
    if not creationdate:
        return creationdate
    try:
        if isinstance(creationdate, (int, float)):
            moment = _APPLE_EPOCH + datetime.timedelta(seconds=creationdate)
        elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
            moment = _APPLE_EPOCH + datetime.timedelta(seconds=float(creationdate))
        else:
            moment = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
    except (ValueError, TypeError, OverflowError):
        # Best effort: an unparseable value is passed through as text rather
        # than aborting the whole export.
        return str(creationdate)
    return f"{moment.year}/{moment.month}/{moment.day}"


def parse_location(location):
    """Extract ``(idref, filepos)`` from a ZANNOTATIONLOCATION string.

    The contents of the first two ``[...]`` groups are returned in order, e.g.
    ``epubcfi(/6/22[id15]!/4[abc]/1)`` gives ``("id15", "abc")``.

    Args:
        location: The raw location string; may be ``None`` or empty.

    Returns:
        A 2-tuple ``(idref, filepos)``. Each element is ``None`` when the
        corresponding bracket group is absent.
    """
    if not location:
        return None, None
    groups = _BRACKET_PATTERN.findall(location)
    idref = groups[0] if groups else None
    filepos = groups[1] if len(groups) > 1 else None
    return idref, filepos


class AnnotationManager:
    """Reads notes and highlights from an iBooks AEAnnotation.sqlite database."""

    def __init__(self, db_path=None):
        """Store the database path.

        Args:
            db_path: Path of the annotation database. When falsy, the value of
                ``config.LOCAL_ANNOTATION_DB`` is used instead.
        """
        self.db_path = db_path or config.LOCAL_ANNOTATION_DB

    @staticmethod
    def parse_location(location):
        """Return ``(idref, filepos)`` for *location*; same contract as the
        module-level :func:`parse_location`."""
        return parse_location(location)

    def _warn_about_missing_files(self):
        """Print a warning for the database file and each WAL sidecar that is absent."""
        db_file = os.fspath(self.db_path)
        # SQLite names its sidecar files "<database file>-wal" and
        # "<database file>-shm", whatever the database's own extension is.
        for path in (db_file, db_file + '-wal', db_file + '-shm'):
            if not os.path.exists(path):
                print(f'警告: 缺少 {path},可能无法获取全部最新笔记')

    def get_annotations(self, bookid=None):
        """Load annotations, optionally restricted to one book.

        Rows whose note and selected text are both ``None`` are skipped.

        Args:
            bookid: Asset id to filter on; ``None`` loads every book.

        Returns:
            ``{assetid: {uuid: {...}}}`` where each inner dict has the keys
            ``creationdate``, ``filepos``, ``idref``, ``note`` and
            ``selectedtext``. With *bookid* given, the result is a plain dict
            holding exactly that one key (mapped to ``{}`` if the book has no
            notes); otherwise it is a ``defaultdict(dict)``.

        Raises:
            sqlite3.Error: If the database cannot be opened or queried.
        """
        self._warn_about_missing_files()

        query, params = _SELECT_ANNOTATIONS, ()
        if bookid is not None:
            query += ' WHERE ZANNOTATIONASSETID=?'
            params = (bookid,)

        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute(query, params).fetchall()
        finally:
            # Close even when the query fails, so the handle is never leaked.
            conn.close()

        annotations = defaultdict(dict)
        for assetid, creationdate, location, note, selectedtext, uuid in rows:
            if note is None and selectedtext is None:
                continue
            idref, filepos = self.parse_location(location)
            annotations[str(assetid)][uuid] = {
                'creationdate': _format_creation_date(creationdate),
                'filepos': filepos,
                'idref': idref,
                'note': note,
                'selectedtext': selectedtext,
            }

        if bookid is not None:
            return {str(bookid): annotations.get(str(bookid), {})}
        return annotations


def get_annotations(db_path=None, bookid=None):
    """Function-style entry point kept for callers that predate AnnotationManager.

    Args:
        db_path: Path of the annotation database. When falsy,
            ``config.LOCAL_ANNOTATION_DB`` is used (looked up at call time).
        bookid: Asset id to filter on; ``None`` loads every book.

    Returns:
        Whatever :meth:`AnnotationManager.get_annotations` returns.
    """
    return AnnotationManager(db_path).get_annotations(bookid=bookid)
# Usage example: print every note of one book. Two further samples are kept
# below as inert string blocks; they do not run.
if __name__ == "__main__":
    from pprint import pprint

    manager = AnnotationManager()

    # Disabled sample: try parse_location on a few location strings.
    '''
    test_locations = [
        'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
        'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
        'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
    ]
    for loc in test_locations:
        idref, filepos = manager.parse_location(loc)
        print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
    '''

    # Fetch the notes of a single assetid only. The database is read once;
    # the result of an earlier duplicate query was discarded immediately.
    test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
    annotations = manager.get_annotations(bookid=test_bookid)

    # Pretty-print all notes of that book.
    print(f"\nAssetID={test_bookid} 的所有笔记:")
    pprint(annotations, indent=2, sort_dicts=False)

    # Disabled sample: print the first 3 notes of every book.
    '''
    book_notes = defaultdict(list)
    for assetid, notes_dict in annotations.items():
        for uuid, ann in notes_dict.items():
            book_notes[assetid].append({**ann, 'uuid': uuid})
    for assetid, notes in book_notes.items():
        print(f"\nAssetID: {assetid}")
        for i, note in enumerate(notes[:3]):
            print(f" 笔记{i+1}:")
            print(f" creationdate: {note['creationdate']}")
            print(f" idref: {note['idref']}")
            print(f" filepos: {note['filepos']}")
            print(f" note: {note['note']}")
            print(f" selectedtext: {note['selectedtext']}")
            print(f" uuid: {note['uuid']}")
    '''