'update'
This commit is contained in:
parent
0bc6844209
commit
4e3b8abc34
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,136 +1,113 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
annotationdata.py
|
annotationdata.py (OOP版)
|
||||||
-----------------
|
------------------------
|
||||||
功能:
|
功能:
|
||||||
- 解析iBooks的AEAnnotation.sqlite数据库,提取所有或指定书籍(assetid/bookid)的笔记。
|
- 解析iBooks的AEAnnotation.sqlite数据库,提取所有或指定书籍(assetid/bookid)的笔记。
|
||||||
- 提供parse_location辅助函数,解析笔记定位信息。
|
- 提供parse_location辅助函数,解析笔记定位信息。
|
||||||
- 返回结构化的annotations数据,便于后续章节定位与导出。
|
- 返回结构化的annotations数据,便于后续章节定位与导出。
|
||||||
|
|
||||||
依赖:config.py 统一管理路径和配置项。
|
依赖:config.py 统一管理路径和配置项。
|
||||||
|
主要接口:AnnotationManager
|
||||||
主要接口:
|
- get_annotations(bookid=None):返回所有或指定assetid的笔记,结构为{assetid: {uuid: {...}}}
|
||||||
- get_annotations(db_path, bookid=None):返回所有或指定assetid的笔记,结构为{assetid: {uuid: {...}}}
|
|
||||||
- parse_location(location):解析ZANNOTATIONLOCATION,返回(idref, filepos)
|
- parse_location(location):解析ZANNOTATIONLOCATION,返回(idref, filepos)
|
||||||
|
|
||||||
依赖:sqlite3, collections, re, os, datetime
|
依赖:sqlite3, collections, re, os, datetime
|
||||||
"""
|
"""
|
||||||
import config
|
import config
|
||||||
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from collections import defaultdict
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
def parse_location(location):
|
class AnnotationManager:
|
||||||
"""
|
def __init__(self, db_path=None):
|
||||||
解析ZANNOTATIONLOCATION,返回(idref, filepos)
|
self.db_path = db_path or config.LOCAL_ANNOTATION_DB
|
||||||
- epubcfi(...)格式优先提取[]内内容为idref
|
|
||||||
- 其他格式兼容原逻辑
|
@staticmethod
|
||||||
"""
|
def parse_location(location):
|
||||||
idref = None
|
"""
|
||||||
filepos = None
|
解析ZANNOTATIONLOCATION,返回(idref, filepos)
|
||||||
if not location:
|
- epubcfi(...)格式优先提取[]内内容为idref
|
||||||
|
- 其他格式兼容原逻辑
|
||||||
|
"""
|
||||||
|
idref = None
|
||||||
|
filepos = None
|
||||||
|
if not location:
|
||||||
|
return idref, filepos
|
||||||
|
matches = re.findall(r'\[(.*?)\]', location) if location else []
|
||||||
|
idref = matches[0] if len(matches) > 0 else None
|
||||||
|
filepos = matches[1] if len(matches) > 1 else None
|
||||||
return idref, filepos
|
return idref, filepos
|
||||||
# 统一处理,提取前两个[]内容
|
|
||||||
matches = re.findall(r'\[(.*?)\]', location) if location else []
|
|
||||||
idref = matches[0] if len(matches) > 0 else None
|
|
||||||
filepos = matches[1] if len(matches) > 1 else None
|
|
||||||
return idref, filepos
|
|
||||||
|
|
||||||
def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
|
def get_annotations(self, bookid=None):
|
||||||
# 检查WAL模式相关文件
|
# 检查WAL模式相关文件
|
||||||
base = db_path.rsplit('.', 1)[0]
|
base = self.db_path.rsplit('.', 1)[0]
|
||||||
wal_path = base + '.sqlite-wal'
|
wal_path = base + '.sqlite-wal'
|
||||||
shm_path = base + '.sqlite-shm'
|
shm_path = base + '.sqlite-shm'
|
||||||
for f in [db_path, wal_path, shm_path]:
|
for f in [self.db_path, wal_path, shm_path]:
|
||||||
if not os.path.exists(f):
|
if not os.path.exists(f):
|
||||||
print(f'警告: 缺少 {f},可能无法获取全部最新笔记')
|
print(f'警告: 缺少 {f},可能无法获取全部最新笔记')
|
||||||
conn = sqlite3.connect(db_path)
|
conn = sqlite3.connect(self.db_path)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
if bookid is not None:
|
if bookid is not None:
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
|
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
|
||||||
FROM ZAEANNOTATION WHERE ZANNOTATIONASSETID=?
|
FROM ZAEANNOTATION WHERE ZANNOTATIONASSETID=?
|
||||||
''', (bookid,))
|
''', (bookid,))
|
||||||
else:
|
else:
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
|
SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID
|
||||||
FROM ZAEANNOTATION
|
FROM ZAEANNOTATION
|
||||||
''')
|
''')
|
||||||
rows = cursor.fetchall()
|
rows = cursor.fetchall()
|
||||||
annotations = defaultdict(dict)
|
annotations = defaultdict(dict)
|
||||||
import datetime
|
import datetime
|
||||||
for row in rows:
|
for row in rows:
|
||||||
assetid, creationdate, location, note, selectedtext, uuid = row
|
assetid, creationdate, location, note, selectedtext, uuid = row
|
||||||
# 转换 creationdate 格式,支持苹果时间戳(以2001-01-01为基准)
|
# 转换 creationdate 格式,支持苹果时间戳(以2001-01-01为基准)
|
||||||
date_str = creationdate
|
date_str = creationdate
|
||||||
if creationdate:
|
if creationdate:
|
||||||
try:
|
try:
|
||||||
origin = datetime.datetime(2001, 1, 1)
|
origin = datetime.datetime(2001, 1, 1)
|
||||||
# 苹果时间戳 float/int 或数字字符串
|
if isinstance(creationdate, (int, float)):
|
||||||
if isinstance(creationdate, (int, float)):
|
dt = origin + datetime.timedelta(seconds=creationdate)
|
||||||
dt = origin + datetime.timedelta(seconds=creationdate)
|
elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
|
||||||
elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
|
dt = origin + datetime.timedelta(seconds=float(creationdate))
|
||||||
dt = origin + datetime.timedelta(seconds=float(creationdate))
|
else:
|
||||||
else:
|
dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
|
||||||
dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d")
|
date_str = f"{dt.year}/{dt.month}/{dt.day}"
|
||||||
date_str = f"{dt.year}/{dt.month}/{dt.day}"
|
except Exception:
|
||||||
except Exception:
|
date_str = str(creationdate)
|
||||||
date_str = str(creationdate)
|
idref, filepos = self.parse_location(location)
|
||||||
idref, filepos = parse_location(location)
|
if note is None and selectedtext is None:
|
||||||
# 跳过note和selectedtext都为None的笔记
|
continue
|
||||||
if note is None and selectedtext is None:
|
annotations[str(assetid)][uuid] = {
|
||||||
continue
|
'creationdate': date_str,
|
||||||
annotations[str(assetid)][uuid] = {
|
'filepos': filepos,
|
||||||
'creationdate': date_str,
|
'idref': idref,
|
||||||
'filepos': filepos,
|
'note': note,
|
||||||
'idref': idref,
|
'selectedtext': selectedtext
|
||||||
'note': note,
|
}
|
||||||
'selectedtext': selectedtext
|
conn.close()
|
||||||
}
|
if bookid is not None:
|
||||||
conn.close()
|
return {str(bookid): annotations.get(str(bookid), {})}
|
||||||
if bookid is not None:
|
return annotations
|
||||||
# 只返回特定bookid的笔记结构
|
|
||||||
return {str(bookid): annotations.get(str(bookid), {})}
|
|
||||||
return annotations
|
|
||||||
|
|
||||||
# 用法示例:输出每本书的前3条笔记
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
manager = AnnotationManager()
|
||||||
# 测试 parse_location
|
# 测试 parse_location
|
||||||
'''
|
|
||||||
test_locations = [
|
test_locations = [
|
||||||
'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
|
'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
|
||||||
'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
|
'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
|
||||||
'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
|
'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
|
||||||
]
|
]
|
||||||
for loc in test_locations:
|
for loc in test_locations:
|
||||||
idref, filepos = parse_location(loc)
|
idref, filepos = manager.parse_location(loc)
|
||||||
print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
|
print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
|
||||||
'''
|
|
||||||
|
|
||||||
# 测试只获取特定 assetid 的笔记
|
# 测试只获取特定 assetid 的笔记
|
||||||
test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
|
test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
|
||||||
annotations = get_annotations(bookid=test_bookid)
|
annotations = manager.get_annotations(bookid=test_bookid)
|
||||||
|
|
||||||
# 格式化打印该书的所有笔记
|
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
print(f"\nAssetID={test_bookid} 的所有笔记:")
|
print(f"\nAssetID={test_bookid} 的所有笔记:")
|
||||||
pprint(annotations, indent=2, sort_dicts=False)
|
pprint(annotations, indent=2, sort_dicts=False)
|
||||||
|
|
||||||
# 输出每本书的前3条笔记
|
|
||||||
'''
|
|
||||||
book_notes = defaultdict(list)
|
|
||||||
for assetid, notes_dict in annotations.items():
|
|
||||||
for uuid, ann in notes_dict.items():
|
|
||||||
book_notes[assetid].append({**ann, 'uuid': uuid})
|
|
||||||
for assetid, notes in book_notes.items():
|
|
||||||
print(f"\nAssetID: {assetid}")
|
|
||||||
for i, note in enumerate(notes[:3]):
|
|
||||||
print(f" 笔记{i+1}:")
|
|
||||||
print(f" creationdate: {note['creationdate']}")
|
|
||||||
print(f" idref: {note['idref']}")
|
|
||||||
print(f" filepos: {note['filepos']}")
|
|
||||||
print(f" note: {note['note']}")
|
|
||||||
print(f" selectedtext: {note['selectedtext']}")
|
|
||||||
print(f" uuid: {note['uuid']}")
|
|
||||||
'''
|
|
||||||
|
|
|
@ -1,75 +1,66 @@
|
||||||
"""
|
|
||||||
booklist_parse.py
|
|
||||||
-----------------
|
|
||||||
功能:
|
|
||||||
- 解析iBooks的Books.plist,提取所有书籍元数据(书名、作者、路径、时间等)。
|
|
||||||
- 解析BKLibrary.sqlite,获取每本书的最近打开时间(苹果时间戳,基准2001-01-01)。
|
|
||||||
|
|
||||||
依赖:config.py 统一管理路径和配置项。
|
|
||||||
|
|
||||||
主要接口:
|
|
||||||
- parse_books_plist(plist_path):返回所有书籍元数据,结构为{bk_id: {...}}
|
|
||||||
- get_books_last_open(db_path):返回所有书籍最近打开时间,结构为{bk_id: {'last_open': 时间戳}}
|
|
||||||
|
|
||||||
依赖:plistlib, collections, sqlite3, os, datetime
|
|
||||||
|
|
||||||
典型用法:
|
|
||||||
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
|
|
||||||
books_open = get_books_last_open(config.LOCAL_LIBRARY_DB)
|
|
||||||
"""
|
|
||||||
import config
|
import config
|
||||||
import plistlib
|
import plistlib
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST):
|
|
||||||
booksinfo = defaultdict(dict)
|
|
||||||
with open(plist_path, 'rb') as f: plist_data = plistlib.load(f)
|
|
||||||
for book in plist_data.get('Books', []):
|
|
||||||
bk_id = book.get('BKGeneratedItemId')
|
|
||||||
if not bk_id: continue
|
|
||||||
booksinfo[bk_id] = {
|
|
||||||
'displayname': book.get('BKDisplayName', ''),
|
|
||||||
'author': book.get('artistName', ''),
|
|
||||||
'type': book.get('BKBookType', ''),
|
|
||||||
'bookid': bk_id,
|
|
||||||
'itemname': book.get('itemName', ''),
|
|
||||||
'path': book.get('path', ''),
|
|
||||||
'date': book.get('BKInsertionDate',''),
|
|
||||||
'updatedate': book.get('updateDate','')
|
|
||||||
}
|
|
||||||
return booksinfo
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import os
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
def get_books_last_open(db_path=config.LOCAL_LIBRARY_DB):
|
class BookListManager:
|
||||||
"""
|
def __init__(self, plist_path=None, db_path=None):
|
||||||
从BKLibrary.sqlite获取书籍最近打开时间
|
self.plist_path = plist_path or config.LOCAL_BOOKS_PLIST
|
||||||
返回:defaultdict(dict),bk_id为索引,包含最近打开时间
|
self.db_path = db_path or config.LOCAL_LIBRARY_DB
|
||||||
"""
|
self._booksinfo = None
|
||||||
books_open = defaultdict(dict)
|
self._books_open = None
|
||||||
if not os.path.exists(db_path):
|
|
||||||
|
def get_books_info(self):
|
||||||
|
if self._booksinfo is not None:
|
||||||
|
return self._booksinfo
|
||||||
|
booksinfo = defaultdict(dict)
|
||||||
|
with open(self.plist_path, 'rb') as f:
|
||||||
|
plist_data = plistlib.load(f)
|
||||||
|
for book in plist_data.get('Books', []):
|
||||||
|
bk_id = book.get('BKGeneratedItemId')
|
||||||
|
if not bk_id:
|
||||||
|
continue
|
||||||
|
booksinfo[bk_id] = {
|
||||||
|
'displayname': book.get('BKDisplayName', ''),
|
||||||
|
'author': book.get('artistName', ''),
|
||||||
|
'type': book.get('BKBookType', ''),
|
||||||
|
'bookid': bk_id,
|
||||||
|
'itemname': book.get('itemName', ''),
|
||||||
|
'path': book.get('path', ''),
|
||||||
|
'date': book.get('BKInsertionDate',''),
|
||||||
|
'updatedate': book.get('updateDate','')
|
||||||
|
}
|
||||||
|
self._booksinfo = booksinfo
|
||||||
|
return booksinfo
|
||||||
|
|
||||||
|
def get_books_last_open(self):
|
||||||
|
if self._books_open is not None:
|
||||||
|
return self._books_open
|
||||||
|
books_open = defaultdict(dict)
|
||||||
|
if not os.path.exists(self.db_path):
|
||||||
|
return books_open
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''')
|
||||||
|
rows = cursor.fetchall()
|
||||||
|
for row in rows:
|
||||||
|
asset_id, last_open = row
|
||||||
|
if asset_id:
|
||||||
|
books_open[asset_id] = {
|
||||||
|
'last_open': last_open
|
||||||
|
}
|
||||||
|
conn.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(f'警告: 读取BKLibrary.sqlite失败: {e}')
|
||||||
|
self._books_open = books_open
|
||||||
return books_open
|
return books_open
|
||||||
|
|
||||||
try:
|
|
||||||
conn = sqlite3.connect(db_path)
|
|
||||||
cursor = conn.cursor()
|
|
||||||
# ZBKLIBRARYASSET表包含书籍信息
|
|
||||||
cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''')
|
|
||||||
rows = cursor.fetchall()
|
|
||||||
for row in rows:
|
|
||||||
asset_id, last_open = row
|
|
||||||
if asset_id:
|
|
||||||
books_open[asset_id] = {
|
|
||||||
'last_open': last_open # 苹果时间戳,基准时间为2001-01-01
|
|
||||||
}
|
|
||||||
conn.close()
|
|
||||||
except Exception as e:
|
|
||||||
print(f'警告: 读取BKLibrary.sqlite失败: {e}')
|
|
||||||
|
|
||||||
return books_open
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
|
manager = BookListManager()
|
||||||
|
booksinfo = manager.get_books_info()
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
print("\n【前三条示例】")
|
print("\n【前三条示例】")
|
||||||
for k, v in list(booksinfo.items())[:3]:
|
for k, v in list(booksinfo.items())[:3]:
|
||||||
|
@ -77,19 +68,10 @@ if __name__ == '__main__':
|
||||||
pprint(v, sort_dicts=False, indent=2)
|
pprint(v, sort_dicts=False, indent=2)
|
||||||
print('-' * 60)
|
print('-' * 60)
|
||||||
|
|
||||||
'''
|
|
||||||
print("\n【全部内容】")
|
|
||||||
for k, v in booksinfo.items():
|
|
||||||
print(f"{k}:")
|
|
||||||
pprint(v, sort_dicts=False, indent=2)
|
|
||||||
print('-' * 60)
|
|
||||||
'''
|
|
||||||
# 测试最近打开时间
|
|
||||||
print("\n【最近打开时间示例】")
|
print("\n【最近打开时间示例】")
|
||||||
books_open = get_books_last_open()
|
books_open = manager.get_books_last_open()
|
||||||
import datetime
|
import datetime
|
||||||
for k, v in list(books_open.items())[:3]:
|
for k, v in list(books_open.items())[:3]:
|
||||||
ts = v['last_open']
|
ts = v['last_open']
|
||||||
# 苹果时间戳,基准2001-01-01
|
|
||||||
dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts)
|
dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts)
|
||||||
print(f"{k}: {dt} (timestamp: {ts})")
|
print(f"{k}: {dt} (timestamp: {ts})")
|
BIN
data/Books.plist
BIN
data/Books.plist
Binary file not shown.
|
@ -158,31 +158,43 @@ answer = inquirer.fuzzy(
|
||||||
|
|
||||||
## 9.1 主要代码文件说明(细化)
|
## 9.1 主要代码文件说明(细化)
|
||||||
|
|
||||||
|
|
||||||
- `exportbooknotes.py`
|
- `exportbooknotes.py`
|
||||||
|
- 采用 OOP 设计,核心类为 `BookNotesExporter`:
|
||||||
|
- `build_booksnote(bookid=None)`:构建结构化笔记数据。
|
||||||
|
- `export_booksnote_to_md(booksnote, booksinfo, out_path=None)`:导出为 Markdown。
|
||||||
|
- `find_file_by_ext`、`get_toc_tree` 等辅助方法。
|
||||||
- 数据同步:自动复制 iBooks 数据库和元数据到本地。
|
- 数据同步:自动复制 iBooks 数据库和元数据到本地。
|
||||||
- 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。
|
- 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。
|
||||||
- 只处理用户选中书籍的笔记,按章节分组导出 Markdown。
|
- 只处理用户选中书籍的笔记,按章节分组导出 Markdown。
|
||||||
- 依赖核心解析模块,负责主流程调度。
|
- 依赖核心解析模块,负责主流程调度。
|
||||||
|
|
||||||
- `annotationdata.py`
|
- `annotationdata.py`
|
||||||
|
- OOP 设计,核心类为 `AnnotationManager`:
|
||||||
|
- `get_annotations(bookid=None)`:返回所有或指定 assetid 的笔记。
|
||||||
|
- `parse_location(location)`:静态方法,解析定位信息。
|
||||||
- 解析 AEAnnotation.sqlite,提取所有或指定 assetid 的笔记。
|
- 解析 AEAnnotation.sqlite,提取所有或指定 assetid 的笔记。
|
||||||
- 支持苹果时间戳转换,结构化输出。
|
- 支持苹果时间戳转换,结构化输出。
|
||||||
- parse_location 辅助函数,统一解析笔记定位信息。
|
|
||||||
|
|
||||||
- `booklist_parse.py`
|
- `booklist_parse.py`
|
||||||
|
- OOP 设计,核心类为 `BookListManager`:
|
||||||
|
- `get_books_info()`:获取书籍元数据。
|
||||||
|
- `get_books_last_open()`:获取每本书的最近打开时间。
|
||||||
- 解析 Books.plist,获取书籍元数据(书名、作者、路径、时间等)。
|
- 解析 Books.plist,获取书籍元数据(书名、作者、路径、时间等)。
|
||||||
- 解析 BKLibrary.sqlite,获取每本书的最近打开时间(zlastopendate,苹果时间戳)。
|
- 解析 BKLibrary.sqlite,获取每本书的最近打开时间。
|
||||||
- 提供统一数据接口,便于主流程排序和展示。
|
|
||||||
|
|
||||||
- `opf_parse.py`
|
- `opf_parse.py`
|
||||||
|
- OOP 设计,核心类为 `OPFParser`:
|
||||||
|
- `parse_opf(filepath)`:静态方法,返回 id->href 映射。
|
||||||
- 解析 epub 的 OPF 文件,获取章节与文件映射关系(idref -> href)。
|
- 解析 epub 的 OPF 文件,获取章节与文件映射关系(idref -> href)。
|
||||||
- 支持多种 epub 目录结构。
|
|
||||||
|
|
||||||
- `toc_parse.py`
|
- `toc_parse.py`
|
||||||
|
- OOP 设计,核心类为 `TOCParser`:
|
||||||
|
- `parse_navpoints(navpoints)`:递归解析 navPoint 节点。
|
||||||
|
- `find_label_path(node, ref, filepos, path)`:查找章节路径。
|
||||||
|
- `find_section_by_selectedtext(html_path, selectedtext)`:通过选中文本定位章节标题。
|
||||||
|
- `parse_html_title(html_path)`:解析 html 文件标题。
|
||||||
- 解析 NCX 目录文件,递归构建章节树结构。
|
- 解析 NCX 目录文件,递归构建章节树结构。
|
||||||
- find_label_path:支持通过 ref 和 filepos 查找完整 label 路径。
|
|
||||||
- find_section_by_selectedtext:通过选中文本在 html 文件中定位章节标题。
|
|
||||||
- parse_html_title:解析 html 文件标题。
|
|
||||||
|
|
||||||
- `backup/booksnote.py`
|
- `backup/booksnote.py`
|
||||||
- 历史/备份脚本,辅助数据迁移或格式转换。
|
- 历史/备份脚本,辅助数据迁移或格式转换。
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# 笔记导出 2025-08-15 13:25
|
# 笔记导出 2025-08-15 17:20
|
||||||
|
|
||||||
|
|
||||||
## 传统十论
|
## 传统十论
|
||||||
|
|
|
@ -1,31 +1,17 @@
|
||||||
"""
|
"""
|
||||||
exportbooknotes.py
|
exportbooknotes.py (OOP版)
|
||||||
------------------
|
-------------------------
|
||||||
功能:
|
功能:
|
||||||
- 自动同步iBooks数据库和元数据文件到本地data目录。
|
- 自动同步iBooks数据库和元数据文件到本地data目录。
|
||||||
- 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite,构建结构化笔记数据。
|
- 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite,构建结构化笔记数据。
|
||||||
- 解析epub目录和章节信息,定位每条笔记所属章节。
|
- 解析epub目录和章节信息,定位每条笔记所属章节。
|
||||||
- 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。
|
- 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。
|
||||||
- 仅导出选中书籍的所有笔记,按章节分组,生成Markdown文件。
|
- 仅导出选中书籍的所有笔记,按章节分组,生成Markdown文件。
|
||||||
|
|
||||||
依赖:config.py 统一管理路径和配置项。
|
依赖:config.py 统一管理路径和配置项。
|
||||||
|
主要接口:BookNotesExporter
|
||||||
主要数据流:
|
- run():命令行交互式导出主流程
|
||||||
1. 数据同步到data目录
|
- build_booksnote(bookid=None):构建结构化笔记数据
|
||||||
2. 解析Books.plist获取书籍元数据
|
- export_booksnote_to_md(booksnote, booksinfo, out_path=None):导出为Markdown
|
||||||
3. 解析BKLibrary.sqlite获取最近打开时间
|
|
||||||
4. 菜单排序与显示(书名+时间戳)
|
|
||||||
5. 解析AEAnnotation.sqlite获取笔记
|
|
||||||
6. 解析epub目录,定位章节
|
|
||||||
7. 导出Markdown文件
|
|
||||||
|
|
||||||
依赖:Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
|
|
||||||
|
|
||||||
主要数据流:
|
|
||||||
|
|
||||||
典型用法:
|
|
||||||
python exportbooknotes.py
|
|
||||||
# 按提示选择书籍,自动导出笔记到export_notes目录
|
|
||||||
"""
|
"""
|
||||||
import config
|
import config
|
||||||
"""
|
"""
|
||||||
|
@ -40,117 +26,113 @@ booksnote = {
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
|
||||||
import os
|
import os
|
||||||
from annotationdata import get_annotations
|
from collections import defaultdict
|
||||||
from booklist_parse import parse_books_plist
|
from annotationdata import AnnotationManager
|
||||||
|
from booklist_parse import BookListManager
|
||||||
from opf_parse import parse_opf
|
from opf_parse import parse_opf
|
||||||
from toc_parse import parse_navpoints, find_label_path
|
from toc_parse import TOCParser
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
def find_file_by_ext(root, exts):
|
|
||||||
"""在root下递归查找第一个指定后缀的文件"""
|
|
||||||
for dirpath, _, files in os.walk(root):
|
|
||||||
for f in files:
|
|
||||||
for ext in exts:
|
|
||||||
if f.lower().endswith(ext):
|
|
||||||
return os.path.join(dirpath, f)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_toc_tree(toc_path):
|
class BookNotesExporter:
|
||||||
with open(toc_path, 'r', encoding='utf-8') as f:
|
def __init__(self, config_module=config):
|
||||||
soup = BeautifulSoup(f, 'xml')
|
self.config = config_module
|
||||||
nav_map = soup.find('navMap')
|
self.annotation_db = config_module.LOCAL_ANNOTATION_DB
|
||||||
|
self.books_plist = config_module.LOCAL_BOOKS_PLIST
|
||||||
|
self.library_db = config_module.LOCAL_LIBRARY_DB
|
||||||
|
|
||||||
nav_points = nav_map.find_all('navPoint', recursive=False)
|
@staticmethod
|
||||||
toc_tree = parse_navpoints(nav_points)
|
def find_file_by_ext(root, exts):
|
||||||
#pprint(toc_tree, indent=2, depth=5)
|
for dirpath, _, files in os.walk(root):
|
||||||
return toc_tree
|
for f in files:
|
||||||
|
for ext in exts:
|
||||||
|
if f.lower().endswith(ext):
|
||||||
|
return os.path.join(dirpath, f)
|
||||||
|
return None
|
||||||
|
|
||||||
def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None):
|
@staticmethod
|
||||||
# 支持只处理特定 assetid 的笔记
|
def get_toc_tree(toc_path):
|
||||||
annotations = get_annotations(annotation_db, bookid=bookid)
|
with open(toc_path, 'r', encoding='utf-8') as f:
|
||||||
booksinfo = parse_books_plist(books_plist)
|
soup = BeautifulSoup(f, 'xml')
|
||||||
booksnote = defaultdict(lambda: defaultdict(dict))
|
nav_map = soup.find('navMap')
|
||||||
for assetid, notes in annotations.items():
|
nav_points = nav_map.find_all('navPoint', recursive=False)
|
||||||
# 获取epub路径
|
toc_tree = TOCParser.parse_navpoints(nav_points)
|
||||||
bookinfo = booksinfo.get(assetid)
|
return toc_tree
|
||||||
if not bookinfo:
|
|
||||||
continue
|
def build_booksnote(self, bookid=None):
|
||||||
epub_path = bookinfo.get('path')
|
manager = AnnotationManager(self.annotation_db)
|
||||||
if not epub_path or not os.path.isdir(epub_path):
|
annotations = manager.get_annotations(bookid=bookid)
|
||||||
continue
|
bl_manager = BookListManager(plist_path=self.books_plist)
|
||||||
# 查找opf和ncx
|
booksinfo = bl_manager.get_books_info()
|
||||||
opf_path = find_file_by_ext(epub_path, ['.opf'])
|
booksnote = defaultdict(lambda: defaultdict(dict))
|
||||||
ncx_path = find_file_by_ext(epub_path, ['.ncx'])
|
for assetid, notes in annotations.items():
|
||||||
if not opf_path or not ncx_path:
|
bookinfo = booksinfo.get(assetid)
|
||||||
continue
|
if not bookinfo:
|
||||||
id2href = parse_opf(opf_path)
|
continue
|
||||||
toc_tree = get_toc_tree(ncx_path)
|
epub_path = bookinfo.get('path')
|
||||||
for uuid, ann in notes.items():
|
if not epub_path or not os.path.isdir(epub_path):
|
||||||
idref = ann['idref']
|
continue
|
||||||
filepos = ann['filepos']
|
opf_path = self.find_file_by_ext(epub_path, ['.opf'])
|
||||||
href = id2href.get(idref, idref)
|
ncx_path = self.find_file_by_ext(epub_path, ['.ncx'])
|
||||||
chapter = find_label_path(toc_tree, href, filepos)
|
if not opf_path or not ncx_path:
|
||||||
if chapter is None:
|
continue
|
||||||
# 直接从html文件获取章节信息
|
id2href = parse_opf(opf_path)
|
||||||
html_path = os.path.join(epub_path, href.split('#')[0])
|
toc_tree = self.get_toc_tree(ncx_path)
|
||||||
selectedtext = ann.get('selectedtext')
|
for uuid, ann in notes.items():
|
||||||
if os.path.exists(html_path) and selectedtext:
|
idref = ann['idref']
|
||||||
from toc_parse import find_section_by_selectedtext
|
filepos = ann['filepos']
|
||||||
section = find_section_by_selectedtext(html_path, selectedtext)
|
href = id2href.get(idref, idref)
|
||||||
if section:
|
chapter = TOCParser.find_label_path(toc_tree, href, filepos)
|
||||||
chapter = section
|
if chapter is None:
|
||||||
|
html_path = os.path.join(epub_path, href.split('#')[0])
|
||||||
|
selectedtext = ann.get('selectedtext')
|
||||||
|
if os.path.exists(html_path) and selectedtext:
|
||||||
|
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
|
||||||
|
if section:
|
||||||
|
chapter = section
|
||||||
|
else:
|
||||||
|
chapter = "(未找到章节)"
|
||||||
else:
|
else:
|
||||||
chapter = "(未找到章节)"
|
chapter = "(未找到章节)"
|
||||||
else:
|
booksnote[assetid][chapter][uuid] = {
|
||||||
chapter = "(未找到章节)"
|
'creationdate': ann['creationdate'],
|
||||||
booksnote[assetid][chapter][uuid] = {
|
'filepos': filepos,
|
||||||
'creationdate': ann['creationdate'],
|
'idref': href,
|
||||||
'filepos': filepos,
|
'note': ann['note'],
|
||||||
'idref': href,
|
'selectedtext': ann['selectedtext']
|
||||||
'note': ann['note'],
|
}
|
||||||
'selectedtext': ann['selectedtext']
|
return booksnote
|
||||||
}
|
|
||||||
return booksnote
|
|
||||||
|
|
||||||
import datetime
|
def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None):
|
||||||
|
import datetime
|
||||||
def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
|
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
|
||||||
"""
|
lines = [f'# 笔记导出 {now}\n']
|
||||||
依据booksnote结构导出markdown文件,格式:
|
for assetid, chapters in booksnote.items():
|
||||||
# “笔记导出”+导出时间
|
bookname = booksinfo.get(assetid, {}).get('itemname', assetid)
|
||||||
## 书名
|
lines.append(f'\n## {bookname}\n')
|
||||||
### chapter
|
for chapter, notes in chapters.items():
|
||||||
selectedtext
|
lines.append(f'### {chapter}')
|
||||||
> note (如果存在)
|
for uuid, ann in notes.items():
|
||||||
"""
|
sel = ann.get('selectedtext')
|
||||||
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
|
note = ann.get('note')
|
||||||
lines = [f'# 笔记导出 {now}\n']
|
if sel:
|
||||||
for assetid, chapters in booksnote.items():
|
lines.append(sel)
|
||||||
bookname = booksinfo.get(assetid, {}).get('itemname', assetid)
|
if note:
|
||||||
lines.append(f'\n## {bookname}\n')
|
lines.append(f'> {note}')
|
||||||
for chapter, notes in chapters.items():
|
lines.append('')
|
||||||
lines.append(f'### {chapter}')
|
md = '\n'.join(lines)
|
||||||
for uuid, ann in notes.items():
|
if out_path:
|
||||||
sel = ann.get('selectedtext')
|
with open(out_path, 'w', encoding='utf-8') as f:
|
||||||
note = ann.get('note')
|
f.write(md)
|
||||||
if sel:
|
return md
|
||||||
lines.append(sel)
|
|
||||||
if note:
|
|
||||||
lines.append(f'> {note}')
|
|
||||||
lines.append('')
|
|
||||||
md = '\n'.join(lines)
|
|
||||||
if out_path:
|
|
||||||
with open(out_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(md)
|
|
||||||
return md
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import shutil
|
import shutil
|
||||||
import os.path
|
import os.path
|
||||||
|
from InquirerPy import inquirer # type: ignore
|
||||||
|
exporter = BookNotesExporter(config)
|
||||||
# 自动覆盖 ./data 下的数据库和plist文件,源为iBooks真实路径
|
# 自动覆盖 ./data 下的数据库和plist文件,源为iBooks真实路径
|
||||||
src_files = [
|
src_files = [
|
||||||
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
|
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
|
||||||
|
@ -166,31 +148,19 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
print(f'file not found: {src} ')
|
print(f'file not found: {src} ')
|
||||||
|
|
||||||
from booklist_parse import parse_books_plist
|
|
||||||
from InquirerPy import inquirer # type: ignore
|
|
||||||
|
|
||||||
# 先获取所有书籍元数据
|
# 先获取所有书籍元数据
|
||||||
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
|
manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
|
||||||
|
booksinfo = manager.get_books_info()
|
||||||
# 构建书名列表(优先displayname, 其次itemname, 否则assetid),按parse_books_plist中的date字段排序
|
|
||||||
assetid2name = {}
|
assetid2name = {}
|
||||||
assetid2lastopen = {}
|
assetid2lastopen = {}
|
||||||
from booklist_parse import get_books_last_open
|
last_open_times = manager.get_books_last_open()
|
||||||
|
|
||||||
# 获取所有书籍的最后打开时间(字典,值为{'last_open': 时间戳})
|
|
||||||
last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)
|
|
||||||
|
|
||||||
for assetid, info in booksinfo.items():
|
for assetid, info in booksinfo.items():
|
||||||
name = info.get('displayname') or info.get('itemname') or assetid
|
name = info.get('displayname') or info.get('itemname') or assetid
|
||||||
# 如果书名中包含“-”,只取“-”前面的部分
|
|
||||||
if '-' in name:
|
if '-' in name:
|
||||||
name = name.split('-', 1)[0].strip()
|
name = name.split('-', 1)[0].strip()
|
||||||
assetid2name[assetid] = name
|
assetid2name[assetid] = name
|
||||||
# 用 get_books_last_open 返回的时间戳排序,如无则为0
|
|
||||||
ts = last_open_times.get(assetid, {}).get('last_open', 0)
|
ts = last_open_times.get(assetid, {}).get('last_open', 0)
|
||||||
assetid2lastopen[assetid] = ts
|
assetid2lastopen[assetid] = ts
|
||||||
|
|
||||||
# 按last_open时间戳降序排列
|
|
||||||
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
|
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
|
||||||
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
|
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
|
||||||
if not choices:
|
if not choices:
|
||||||
|
@ -202,8 +172,6 @@ if __name__ == '__main__':
|
||||||
multiselect=False,
|
multiselect=False,
|
||||||
instruction="上下键选择,输入可模糊筛选,回车确定"
|
instruction="上下键选择,输入可模糊筛选,回车确定"
|
||||||
).execute()
|
).execute()
|
||||||
|
|
||||||
# 解析选中assetid
|
|
||||||
for aid, name in assetid2name.items():
|
for aid, name in assetid2name.items():
|
||||||
if answer.startswith(name):
|
if answer.startswith(name):
|
||||||
selected_assetid = aid
|
selected_assetid = aid
|
||||||
|
@ -211,10 +179,8 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
print("未找到选中书籍")
|
print("未找到选中书籍")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
selected_booksnote = exporter.build_booksnote(bookid=selected_assetid)
|
||||||
# 只导出选中书的笔记
|
|
||||||
selected_booksnote = build_booksnote(bookid=selected_assetid)
|
|
||||||
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
|
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
|
||||||
out_path = f'export_notes/notes_export_{selected_assetid}.md'
|
out_path = f'export_notes/notes_export_{selected_assetid}.md'
|
||||||
export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
|
exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
|
||||||
print(f'《{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')
|
print(f'《{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')
|
||||||
|
|
75
opf_parse.py
75
opf_parse.py
|
@ -1,38 +1,46 @@
|
||||||
|
|
||||||
# parseopf.py
|
|
||||||
# -----------------------------
|
|
||||||
# 用于解析EPUB电子书的OPF文件,提取manifest部分所有id对应的html文件href。
|
|
||||||
# 支持批量测试和通过id快速查找href。
|
|
||||||
# 依赖:BeautifulSoup4
|
|
||||||
# -----------------------------
|
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import pprint
|
|
||||||
|
|
||||||
|
|
||||||
def parse_opf(filepath):
|
def parse_opf(filepath):
|
||||||
"""
|
"""
|
||||||
解析OPF文件,返回{id: href}的defaultdict(dict)结构。
|
兼容旧代码的顶层函数,实际调用 OPFParser.parse_opf。
|
||||||
仅保留href以.html结尾的项。
|
|
||||||
|
|
||||||
参数:
|
|
||||||
filepath (str): OPF文件路径
|
|
||||||
返回:
|
|
||||||
defaultdict(dict): id到href的映射(仅html文件)
|
|
||||||
"""
|
"""
|
||||||
result = defaultdict(dict)
|
return OPFParser.parse_opf(filepath)
|
||||||
with open(filepath, 'r', encoding='utf-8') as f:
|
|
||||||
soup = BeautifulSoup(f, 'xml')
|
"""
|
||||||
# 查找manifest部分,遍历所有item,筛选html结尾的href
|
opf_parse.py (OOP版)
|
||||||
manifest = soup.find('manifest')
|
-------------------
|
||||||
if manifest:
|
功能:
|
||||||
for item in manifest.find_all('item'):
|
- 解析EPUB电子书的OPF文件,提取manifest部分所有id对应的html文件href。
|
||||||
id_ = item.get('id')
|
- 支持通过id快速查找href。
|
||||||
href = item.get('href')
|
- 支持批量测试。
|
||||||
if id_ and href and href.strip().lower().endswith('html'):
|
依赖:BeautifulSoup4
|
||||||
result[id_] = href
|
主要接口:OPFParser
|
||||||
return result
|
- parse_opf(filepath):静态方法,返回id->href映射(仅html文件)。
|
||||||
|
"""
|
||||||
|
from collections import defaultdict
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
class OPFParser:
|
||||||
|
@staticmethod
|
||||||
|
def parse_opf(filepath):
|
||||||
|
"""
|
||||||
|
解析OPF文件,返回{id: href}的defaultdict(dict)结构。
|
||||||
|
仅保留href以.html结尾的项。
|
||||||
|
参数:
|
||||||
|
filepath (str): OPF文件路径
|
||||||
|
返回:
|
||||||
|
defaultdict(dict): id到href的映射(仅html文件)
|
||||||
|
"""
|
||||||
|
result = defaultdict(dict)
|
||||||
|
with open(filepath, 'r', encoding='utf-8') as f:
|
||||||
|
soup = BeautifulSoup(f, 'xml')
|
||||||
|
manifest = soup.find('manifest')
|
||||||
|
if manifest:
|
||||||
|
for item in manifest.find_all('item'):
|
||||||
|
id_ = item.get('id')
|
||||||
|
href = item.get('href')
|
||||||
|
if id_ and href and href.strip().lower().endswith('html'):
|
||||||
|
result[id_] = href
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_files = [
|
test_files = [
|
||||||
|
@ -44,8 +52,7 @@ if __name__ == "__main__":
|
||||||
for file in test_files:
|
for file in test_files:
|
||||||
print(f"\n==== 测试文件: {file} ====")
|
print(f"\n==== 测试文件: {file} ====")
|
||||||
try:
|
try:
|
||||||
result = parse_opf(file)
|
result = OPFParser.parse_opf(file)
|
||||||
pprint.pprint(result, indent=2, width=120, sort_dicts=False)
|
|
||||||
|
|
||||||
# 增加通过id快速打印href的测试
|
# 增加通过id快速打印href的测试
|
||||||
test_ids = list(result.keys())[:3] # 取前三个id做演示
|
test_ids = list(result.keys())[:3] # 取前三个id做演示
|
||||||
|
|
268
toc_parse.py
268
toc_parse.py
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
toc_parse.py
|
toc_parse.py (OOP版)
|
||||||
------------
|
-------------------
|
||||||
功能:
|
功能:
|
||||||
- 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。
|
- 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。
|
||||||
- 支持通过ref和filepos查找完整label路径。
|
- 支持通过ref和filepos查找完整label路径。
|
||||||
|
@ -8,166 +9,120 @@ toc_parse.py
|
||||||
- 兼容多种EPUB格式,支持批量测试。
|
- 兼容多种EPUB格式,支持批量测试。
|
||||||
|
|
||||||
依赖:config.py 统一管理路径和配置项。
|
依赖:config.py 统一管理路径和配置项。
|
||||||
主要接口:
|
主要接口:TOCParser
|
||||||
parse_navpoints(navpoints) # 递归解析navPoint节点,返回章节树结构。
|
- parse_navpoints(navpoints):递归解析navPoint节点,返回章节树结构。
|
||||||
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。
|
- find_label_path(node, ref, filepos, path):查找指定ref和filepos的章节label路径。
|
||||||
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。
|
- find_section_by_selectedtext(html_path, selectedtext):通过选中文本定位章节标题。
|
||||||
parse_html_title(html_path) # 解析html文件标题。
|
- parse_html_title(html_path):解析html文件标题。
|
||||||
依赖:BeautifulSoup4, pprint, os, typing
|
依赖:BeautifulSoup4, pprint, os, typing
|
||||||
"""
|
"""
|
||||||
import config
|
import config
|
||||||
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from typing import Dict, Optional, List, Any
|
import os
|
||||||
import pprint
|
|
||||||
|
|
||||||
# ==== 辅助函数:根据selectedtext在html文件中的位置推断所在章节 ====
|
class TOCParser:
|
||||||
def find_section_by_selectedtext(html_path, selectedtext):
|
def __init__(self):
|
||||||
"""
|
|
||||||
在html文件中查找selectedtext出现的位置,向上回溯最近的h1-h6标题,返回该标题文本。
|
|
||||||
若未找到标题,则返回None。
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
with open(html_path, 'r', encoding='utf-8') as f:
|
|
||||||
soup = BeautifulSoup(f, 'html.parser')
|
|
||||||
# 在所有文本节点中查找selectedtext
|
|
||||||
for elem in soup.find_all(string=True):
|
|
||||||
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
|
|
||||||
# 回溯父节点,查找最近的h1-h6
|
|
||||||
parent = elem.parent
|
|
||||||
while parent:
|
|
||||||
prev = parent.previous_sibling
|
|
||||||
# 向上查找同级前面的h1-h6
|
|
||||||
while prev:
|
|
||||||
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
|
|
||||||
return prev.get_text(strip=True)
|
|
||||||
prev = prev.previous_sibling
|
|
||||||
parent = parent.parent
|
|
||||||
# 若未找到,尝试全局第一个h1-h6
|
|
||||||
for tag in ['h1','h2','h3','h4','h5','h6']:
|
|
||||||
h = soup.find(tag)
|
|
||||||
if h and h.get_text(strip=True):
|
|
||||||
return h.get_text(strip=True)
|
|
||||||
except Exception:
|
|
||||||
pass
|
pass
|
||||||
return None
|
|
||||||
|
|
||||||
def parse_html_title(html_path):
|
@staticmethod
|
||||||
"""
|
def find_section_by_selectedtext(html_path, selectedtext):
|
||||||
解析html文件,优先返回<title>,否则返回body第一个h1/h2/h3/h4/h5/h6或None。
|
try:
|
||||||
"""
|
with open(html_path, 'r', encoding='utf-8') as f:
|
||||||
try:
|
soup = BeautifulSoup(f, 'html.parser')
|
||||||
with open(html_path, 'r', encoding='utf-8') as f:
|
for elem in soup.find_all(string=True):
|
||||||
soup = BeautifulSoup(f, 'html.parser')
|
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
|
||||||
# 优先<title>
|
parent = elem.parent
|
||||||
if soup.title and soup.title.string:
|
while parent:
|
||||||
return soup.title.string.strip()
|
prev = parent.previous_sibling
|
||||||
# 其次正文第一个h1-h6
|
while prev:
|
||||||
for tag in ['h1','h2','h3','h4','h5','h6']:
|
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
|
||||||
h = soup.find(tag)
|
return prev.get_text(strip=True)
|
||||||
if h and h.get_text(strip=True):
|
prev = prev.previous_sibling
|
||||||
return h.get_text(strip=True)
|
parent = parent.parent
|
||||||
except Exception:
|
for tag in ['h1','h2','h3','h4','h5','h6']:
|
||||||
pass
|
h = soup.find(tag)
|
||||||
return None
|
if h and h.get_text(strip=True):
|
||||||
|
return h.get_text(strip=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
def parse_navpoints(navpoints) -> Dict[str, dict]:
|
@staticmethod
|
||||||
"""
|
def parse_html_title(html_path):
|
||||||
递归解析 navpoints 节点,返回嵌套 dict 结构。
|
try:
|
||||||
:param navpoints: BeautifulSoup 查找到的 navPoint 节点列表
|
with open(html_path, 'r', encoding='utf-8') as f:
|
||||||
:return: 章节树结构
|
soup = BeautifulSoup(f, 'html.parser')
|
||||||
"""
|
if soup.title and soup.title.string:
|
||||||
result = {}
|
return soup.title.string.strip()
|
||||||
for navpoint in navpoints:
|
for tag in ['h1','h2','h3','h4','h5','h6']:
|
||||||
label = navpoint.navLabel.text.strip().strip('"“”')
|
h = soup.find(tag)
|
||||||
src = navpoint.content["src"]
|
if h and h.get_text(strip=True):
|
||||||
if "#" in src:
|
return h.get_text(strip=True)
|
||||||
ref, filepos = src.split("#", 1)
|
except Exception:
|
||||||
else:
|
pass
|
||||||
ref, filepos = src, None
|
return None
|
||||||
entry = {
|
|
||||||
"label": label,
|
|
||||||
"ref": ref,
|
|
||||||
"filepos": filepos,
|
|
||||||
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False))
|
|
||||||
}
|
|
||||||
result[navpoint.get("id")] = entry
|
|
||||||
|
|
||||||
#pprint.pprint(result) # 格式化打印result
|
@staticmethod
|
||||||
|
def parse_navpoints(navpoints):
|
||||||
|
result = {}
|
||||||
|
for navpoint in navpoints:
|
||||||
|
label = navpoint.navLabel.text.strip().strip('"“”')
|
||||||
|
src = navpoint.content["src"]
|
||||||
|
if "#" in src:
|
||||||
|
ref, filepos = src.split("#", 1)
|
||||||
|
else:
|
||||||
|
ref, filepos = src, None
|
||||||
|
entry = {
|
||||||
|
"label": label,
|
||||||
|
"ref": ref,
|
||||||
|
"filepos": filepos,
|
||||||
|
"children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
|
||||||
|
}
|
||||||
|
result[navpoint.get("id")] = entry
|
||||||
|
return result
|
||||||
|
|
||||||
return result
|
@staticmethod
|
||||||
|
def find_label_path(node, ref, filepos=None, path=None):
|
||||||
def find_label_path(
|
if path is None:
|
||||||
node: Any,
|
path = []
|
||||||
ref: str,
|
if isinstance(node, dict):
|
||||||
filepos: Optional[str] = None,
|
nodes = node.values() if "label" not in node else [node]
|
||||||
path: Optional[List[str]] = None
|
|
||||||
) -> Optional[str]:
|
|
||||||
"""
|
|
||||||
在嵌套 dict 结构中查找指定 ref 和 filepos 的 label 路径。
|
|
||||||
:param node: 当前节点(dict 或 dict集合)
|
|
||||||
:param ref: html文件名
|
|
||||||
:param filepos: 文件位置,可为 None
|
|
||||||
:param path: label 路径累积
|
|
||||||
:return: 以 / 分隔的完整 label 路径,未找到返回 None
|
|
||||||
"""
|
|
||||||
if path is None:
|
|
||||||
path = []
|
|
||||||
if isinstance(node, dict):
|
|
||||||
nodes = node.values() if "label" not in node else [node]
|
|
||||||
# 1. 优先精确匹配ref和filepos
|
|
||||||
for v in nodes:
|
|
||||||
if "label" in v:
|
|
||||||
new_path = path + [v["label"]]
|
|
||||||
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
|
|
||||||
title = " / ".join(new_path)
|
|
||||||
#print(f'title ref={ref} filepos={filepos} -> {title}') #DBG
|
|
||||||
return title
|
|
||||||
title = find_label_path(v["children"], ref, filepos, new_path)
|
|
||||||
if title:
|
|
||||||
#print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG
|
|
||||||
return title
|
|
||||||
|
|
||||||
# 2. 如果带filepos查找失败,回退到同ref下第一个章节(即只要ref匹配就返回)
|
|
||||||
if filepos is not None:
|
|
||||||
for v in nodes:
|
for v in nodes:
|
||||||
if "label" in v:
|
if "label" in v:
|
||||||
new_path = path + [v["label"]]
|
new_path = path + [v["label"]]
|
||||||
# print(f"对比 {v['ref']} == {ref}")
|
if v["ref"] == ref and (filepos is None or v["filepos"] == filepos):
|
||||||
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
|
|
||||||
title = " / ".join(new_path)
|
title = " / ".join(new_path)
|
||||||
#print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG
|
|
||||||
return title
|
return title
|
||||||
title = find_label_path(v["children"], ref, None, new_path)
|
title = TOCParser.find_label_path(v["children"], ref, filepos, new_path)
|
||||||
if title:
|
|
||||||
#print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG
|
|
||||||
return title
|
|
||||||
|
|
||||||
# 3. 若完全未找到,尝试直接解析idref所指html文件标题,获取章节label信息
|
|
||||||
# 仅在顶层调用时执行此逻辑
|
|
||||||
if path == [] and ref and ref.endswith('.html'):
|
|
||||||
import os
|
|
||||||
# 自动在常见目录下查找html文件(以toc文件目录为基准)
|
|
||||||
caller_dir = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
search_dirs = [caller_dir, os.getcwd()]
|
|
||||||
for d in search_dirs:
|
|
||||||
html_path = os.path.join(d, ref)
|
|
||||||
#print(f"查找 {html_path}")
|
|
||||||
if os.path.isfile(html_path):
|
|
||||||
title = parse_html_title(html_path)
|
|
||||||
if title:
|
|
||||||
return title
|
|
||||||
# 递归查找(以toc文件目录为根)
|
|
||||||
for d in search_dirs:
|
|
||||||
for root, _, files in os.walk(d):
|
|
||||||
if ref in files:
|
|
||||||
html_path = os.path.join(root, ref)
|
|
||||||
#print(f"2 查找 {html_path}")
|
|
||||||
title = parse_html_title(html_path)
|
|
||||||
if title:
|
if title:
|
||||||
return title
|
return title
|
||||||
return None
|
if filepos is not None:
|
||||||
|
for v in nodes:
|
||||||
|
if "label" in v:
|
||||||
|
new_path = path + [v["label"]]
|
||||||
|
if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]:
|
||||||
|
title = " / ".join(new_path)
|
||||||
|
return title
|
||||||
|
title = TOCParser.find_label_path(v["children"], ref, None, new_path)
|
||||||
|
if title:
|
||||||
|
return title
|
||||||
|
if path == [] and ref and ref.endswith('.html'):
|
||||||
|
caller_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
search_dirs = [caller_dir, os.getcwd()]
|
||||||
|
for d in search_dirs:
|
||||||
|
html_path = os.path.join(d, ref)
|
||||||
|
if os.path.isfile(html_path):
|
||||||
|
title = TOCParser.parse_html_title(html_path)
|
||||||
|
if title:
|
||||||
|
return title
|
||||||
|
for d in search_dirs:
|
||||||
|
for root, _, files in os.walk(d):
|
||||||
|
if ref in files:
|
||||||
|
html_path = os.path.join(root, ref)
|
||||||
|
title = TOCParser.parse_html_title(html_path)
|
||||||
|
if title:
|
||||||
|
return title
|
||||||
|
return None
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# ==== 批量测试指定toc/html/filepos列表 ====
|
# ==== 批量测试指定toc/html/filepos列表 ====
|
||||||
|
@ -182,8 +137,6 @@ if __name__ == "__main__":
|
||||||
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
|
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
|
||||||
]
|
]
|
||||||
for epub_dir, html_file, filepos in test_cases:
|
for epub_dir, html_file, filepos in test_cases:
|
||||||
# 自动查找epub目录下的toc.ncx
|
|
||||||
import os
|
|
||||||
toc_path = None
|
toc_path = None
|
||||||
for root, _, files in os.walk(epub_dir):
|
for root, _, files in os.walk(epub_dir):
|
||||||
for f in files:
|
for f in files:
|
||||||
|
@ -200,39 +153,32 @@ if __name__ == "__main__":
|
||||||
with open(toc_path, "r", encoding="utf-8") as f:
|
with open(toc_path, "r", encoding="utf-8") as f:
|
||||||
soup = BeautifulSoup(f, "xml")
|
soup = BeautifulSoup(f, "xml")
|
||||||
nav_map = soup.find("navMap")
|
nav_map = soup.find("navMap")
|
||||||
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
|
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
|
||||||
label_path = find_label_path(toc_tree, html_file, filepos)
|
label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
|
||||||
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
|
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
|
||||||
|
|
||||||
# tocb中不存在html,直接测试parse_html_title
|
|
||||||
html_path = os.path.join(epub_dir, html_file.split('#')[0])
|
html_path = os.path.join(epub_dir, html_file.split('#')[0])
|
||||||
if os.path.exists(html_path):
|
if os.path.exists(html_path):
|
||||||
title = parse_html_title(html_path)
|
title = TOCParser.parse_html_title(html_path)
|
||||||
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
|
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
|
||||||
# 新增:根据selectedtext定位章节标题
|
|
||||||
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
|
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
|
||||||
section = find_section_by_selectedtext(html_path, selectedtext)
|
section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
|
||||||
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
|
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
|
||||||
else:
|
else:
|
||||||
print(f"未找到html文件: {html_path}")
|
print(f"未找到html文件: {html_path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"测试失败: {e}")
|
print(f"测试失败: {e}")
|
||||||
|
|
||||||
# ==== 新增:测试变宋笔记章节定位和html标题解析 ====
|
|
||||||
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
|
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
|
||||||
# 假设笔记数据如下
|
|
||||||
note_idref = 'text/part0002_split_003.html'
|
note_idref = 'text/part0002_split_003.html'
|
||||||
note_filepos = None
|
note_filepos = None
|
||||||
# 变宋toc.ncx路径
|
|
||||||
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
|
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
|
||||||
import os
|
|
||||||
if os.path.exists(bian_song_toc):
|
if os.path.exists(bian_song_toc):
|
||||||
with open(bian_song_toc, "r", encoding="utf-8") as f:
|
with open(bian_song_toc, "r", encoding="utf-8") as f:
|
||||||
soup = BeautifulSoup(f, "xml")
|
soup = BeautifulSoup(f, "xml")
|
||||||
nav_map = soup.find("navMap")
|
nav_map = soup.find("navMap")
|
||||||
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False))
|
toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
|
||||||
# 先尝试用find_label_path查找章节
|
label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
|
||||||
label_path = find_label_path(toc_tree, note_idref, note_filepos)
|
|
||||||
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节,尝试解析html标题")
|
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节,尝试解析html标题")
|
||||||
else:
|
else:
|
||||||
print(f"未找到toc.ncx: {bian_song_toc}")
|
print(f"未找到toc.ncx: {bian_song_toc}")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue