This commit is contained in:
douboer 2025-08-15 17:20:30 +08:00
parent 0bc6844209
commit 4e3b8abc34
12 changed files with 406 additions and 516 deletions

View File

@ -1,26 +1,29 @@
""" """
annotationdata.py annotationdata.py (OOP版)
----------------- ------------------------
功能 功能
- 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记 - 解析iBooks的AEAnnotation.sqlite数据库提取所有或指定书籍assetid/bookid的笔记
- 提供parse_location辅助函数解析笔记定位信息 - 提供parse_location辅助函数解析笔记定位信息
- 返回结构化的annotations数据便于后续章节定位与导出 - 返回结构化的annotations数据便于后续章节定位与导出
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口AnnotationManager
主要接口 - get_annotations(bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- get_annotations(db_path, bookid=None)返回所有或指定assetid的笔记结构为{assetid: {uuid: {...}}}
- parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos) - parse_location(location)解析ZANNOTATIONLOCATION返回(idref, filepos)
依赖sqlite3, collections, re, os, datetime 依赖sqlite3, collections, re, os, datetime
""" """
import config import config
import sqlite3 import sqlite3
from collections import defaultdict
import re import re
import os import os
from collections import defaultdict
class AnnotationManager:
def __init__(self, db_path=None):
    """Remember which annotation database this manager reads.

    Args:
        db_path: Path to the annotation SQLite file. Any falsy value
            (None, empty string) selects config.LOCAL_ANNOTATION_DB.
    """
    if not db_path:
        db_path = config.LOCAL_ANNOTATION_DB
    self.db_path = db_path
@staticmethod
def parse_location(location): def parse_location(location):
""" """
解析ZANNOTATIONLOCATION返回(idref, filepos) 解析ZANNOTATIONLOCATION返回(idref, filepos)
@ -31,21 +34,20 @@ def parse_location(location):
filepos = None filepos = None
if not location: if not location:
return idref, filepos return idref, filepos
# 统一处理,提取前两个[]内容
matches = re.findall(r'\[(.*?)\]', location) if location else [] matches = re.findall(r'\[(.*?)\]', location) if location else []
idref = matches[0] if len(matches) > 0 else None idref = matches[0] if len(matches) > 0 else None
filepos = matches[1] if len(matches) > 1 else None filepos = matches[1] if len(matches) > 1 else None
return idref, filepos return idref, filepos
def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None): def get_annotations(self, bookid=None):
# 检查WAL模式相关文件 # 检查WAL模式相关文件
base = db_path.rsplit('.', 1)[0] base = self.db_path.rsplit('.', 1)[0]
wal_path = base + '.sqlite-wal' wal_path = base + '.sqlite-wal'
shm_path = base + '.sqlite-shm' shm_path = base + '.sqlite-shm'
for f in [db_path, wal_path, shm_path]: for f in [self.db_path, wal_path, shm_path]:
if not os.path.exists(f): if not os.path.exists(f):
print(f'警告: 缺少 {f},可能无法获取全部最新笔记') print(f'警告: 缺少 {f},可能无法获取全部最新笔记')
conn = sqlite3.connect(db_path) conn = sqlite3.connect(self.db_path)
cursor = conn.cursor() cursor = conn.cursor()
if bookid is not None: if bookid is not None:
cursor.execute(''' cursor.execute('''
@ -67,7 +69,6 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
if creationdate: if creationdate:
try: try:
origin = datetime.datetime(2001, 1, 1) origin = datetime.datetime(2001, 1, 1)
# 苹果时间戳 float/int 或数字字符串
if isinstance(creationdate, (int, float)): if isinstance(creationdate, (int, float)):
dt = origin + datetime.timedelta(seconds=creationdate) dt = origin + datetime.timedelta(seconds=creationdate)
elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit(): elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit():
@ -77,8 +78,7 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
date_str = f"{dt.year}/{dt.month}/{dt.day}" date_str = f"{dt.year}/{dt.month}/{dt.day}"
except Exception: except Exception:
date_str = str(creationdate) date_str = str(creationdate)
idref, filepos = parse_location(location) idref, filepos = self.parse_location(location)
# 跳过note和selectedtext都为None的笔记
if note is None and selectedtext is None: if note is None and selectedtext is None:
continue continue
annotations[str(assetid)][uuid] = { annotations[str(assetid)][uuid] = {
@ -90,47 +90,24 @@ def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None):
} }
conn.close() conn.close()
if bookid is not None: if bookid is not None:
# 只返回特定bookid的笔记结构
return {str(bookid): annotations.get(str(bookid), {})} return {str(bookid): annotations.get(str(bookid), {})}
return annotations return annotations
# 用法示例输出每本书的前3条笔记
if __name__ == "__main__": if __name__ == "__main__":
manager = AnnotationManager()
# 测试 parse_location # 测试 parse_location
'''
test_locations = [ test_locations = [
'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8', 'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8',
'epubcfi(/6/22[id15]!/4/156/1,:21,:157)', 'epubcfi(/6/22[id15]!/4/156/1,:21,:157)',
'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)' 'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)'
] ]
for loc in test_locations: for loc in test_locations:
idref, filepos = parse_location(loc) idref, filepos = manager.parse_location(loc)
print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n") print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n")
'''
# 测试只获取特定 assetid 的笔记 # 测试只获取特定 assetid 的笔记
test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C" test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C"
annotations = get_annotations(bookid=test_bookid) annotations = manager.get_annotations(bookid=test_bookid)
# 格式化打印该书的所有笔记
from pprint import pprint from pprint import pprint
print(f"\nAssetID={test_bookid} 的所有笔记:") print(f"\nAssetID={test_bookid} 的所有笔记:")
pprint(annotations, indent=2, sort_dicts=False) pprint(annotations, indent=2, sort_dicts=False)
# 输出每本书的前3条笔记
'''
book_notes = defaultdict(list)
for assetid, notes_dict in annotations.items():
for uuid, ann in notes_dict.items():
book_notes[assetid].append({**ann, 'uuid': uuid})
for assetid, notes in book_notes.items():
print(f"\nAssetID: {assetid}")
for i, note in enumerate(notes[:3]):
print(f" 笔记{i+1}:")
print(f" creationdate: {note['creationdate']}")
print(f" idref: {note['idref']}")
print(f" filepos: {note['filepos']}")
print(f" note: {note['note']}")
print(f" selectedtext: {note['selectedtext']}")
print(f" uuid: {note['uuid']}")
'''

View File

@ -1,32 +1,27 @@
"""
booklist_parse.py
-----------------
功能
- 解析iBooks的Books.plist提取所有书籍元数据书名作者路径时间等
- 解析BKLibrary.sqlite获取每本书的最近打开时间苹果时间戳基准2001-01-01
依赖config.py 统一管理路径和配置项
主要接口
- parse_books_plist(plist_path)返回所有书籍元数据结构为{bk_id: {...}}
- get_books_last_open(db_path)返回所有书籍最近打开时间结构为{bk_id: {'last_open': 时间戳}}
依赖plistlib, collections, sqlite3, os, datetime
典型用法
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST)
books_open = get_books_last_open(config.LOCAL_LIBRARY_DB)
"""
import config import config
import plistlib import plistlib
import sqlite3
import os
from collections import defaultdict from collections import defaultdict
def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST): class BookListManager:
def __init__(self, plist_path=None, db_path=None):
    """Store the input paths and start with empty result caches.

    Args:
        plist_path: Path to the books plist. Any falsy value selects
            config.LOCAL_BOOKS_PLIST.
        db_path: Path to the library SQLite file. Any falsy value selects
            config.LOCAL_LIBRARY_DB.
    """
    if not plist_path:
        plist_path = config.LOCAL_BOOKS_PLIST
    if not db_path:
        db_path = config.LOCAL_LIBRARY_DB
    self.plist_path = plist_path
    self.db_path = db_path
    # Both caches stay None until the matching getter has run once.
    self._booksinfo = None
    self._books_open = None
def get_books_info(self):
if self._booksinfo is not None:
return self._booksinfo
booksinfo = defaultdict(dict) booksinfo = defaultdict(dict)
with open(plist_path, 'rb') as f: plist_data = plistlib.load(f) with open(self.plist_path, 'rb') as f:
plist_data = plistlib.load(f)
for book in plist_data.get('Books', []): for book in plist_data.get('Books', []):
bk_id = book.get('BKGeneratedItemId') bk_id = book.get('BKGeneratedItemId')
if not bk_id: continue if not bk_id:
continue
booksinfo[bk_id] = { booksinfo[bk_id] = {
'displayname': book.get('BKDisplayName', ''), 'displayname': book.get('BKDisplayName', ''),
'author': book.get('artistName', ''), 'author': book.get('artistName', ''),
@ -37,39 +32,35 @@ def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST):
'date': book.get('BKInsertionDate',''), 'date': book.get('BKInsertionDate',''),
'updatedate': book.get('updateDate','') 'updatedate': book.get('updateDate','')
} }
self._booksinfo = booksinfo
return booksinfo return booksinfo
import sqlite3
import os
def get_books_last_open(self):
    """Return the last-open value of every book recorded in the library DB.

    Reads ZASSETID / zlastopendate from the ZBKLIBRARYASSET table of the
    SQLite file at ``self.db_path`` and keeps rows whose asset id is truthy.
    The stored value is returned untouched (no date conversion here).

    Caching: once the database file has been found, the result (even an
    empty one after a read error) is stored in ``self._books_open`` and
    returned by every later call. When the file does not exist, an empty
    mapping is returned WITHOUT caching, so a later call retries.

    Returns:
        defaultdict(dict): ``{asset_id: {'last_open': value}}``.
    """
    if self._books_open is not None:
        return self._books_open
    books_open = defaultdict(dict)
    if not os.path.exists(self.db_path):
        return books_open
    try:
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''')
            for asset_id, last_open in cursor.fetchall():
                if asset_id:
                    books_open[asset_id] = {'last_open': last_open}
        finally:
            # Close on every path; previously a failing query leaked the
            # connection because close() was only reached on success.
            conn.close()
    except Exception as e:
        # Best effort: a broken or unexpected database only produces a warning.
        print(f'警告: 读取BKLibrary.sqlite失败: {e}')
    self._books_open = books_open
    return books_open
if __name__ == '__main__': if __name__ == '__main__':
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) manager = BookListManager()
booksinfo = manager.get_books_info()
from pprint import pprint from pprint import pprint
print("\n【前三条示例】") print("\n【前三条示例】")
for k, v in list(booksinfo.items())[:3]: for k, v in list(booksinfo.items())[:3]:
@ -77,19 +68,10 @@ if __name__ == '__main__':
pprint(v, sort_dicts=False, indent=2) pprint(v, sort_dicts=False, indent=2)
print('-' * 60) print('-' * 60)
'''
print("\n【全部内容】")
for k, v in booksinfo.items():
print(f"{k}:")
pprint(v, sort_dicts=False, indent=2)
print('-' * 60)
'''
# 测试最近打开时间
print("\n【最近打开时间示例】") print("\n【最近打开时间示例】")
books_open = get_books_last_open() books_open = manager.get_books_last_open()
import datetime import datetime
for k, v in list(books_open.items())[:3]: for k, v in list(books_open.items())[:3]:
ts = v['last_open'] ts = v['last_open']
# 苹果时间戳基准2001-01-01
dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts) dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts)
print(f"{k}: {dt} (timestamp: {ts})") print(f"{k}: {dt} (timestamp: {ts})")

Binary file not shown.

View File

@ -158,31 +158,43 @@ answer = inquirer.fuzzy(
## 9.1 主要代码文件说明(细化) ## 9.1 主要代码文件说明(细化)
- `exportbooknotes.py` - `exportbooknotes.py`
- 采用 OOP 设计,核心类为 `BookNotesExporter`
- `build_booksnote(bookid=None)`:构建结构化笔记数据。
- `export_booksnote_to_md(booksnote, booksinfo, out_path=None)`:导出为 Markdown。
- `find_file_by_ext`、`get_toc_tree` 等辅助方法。
- 数据同步:自动复制 iBooks 数据库和元数据到本地。 - 数据同步:自动复制 iBooks 数据库和元数据到本地。
- 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。 - 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。
- 只处理用户选中书籍的笔记,按章节分组导出 Markdown。 - 只处理用户选中书籍的笔记,按章节分组导出 Markdown。
- 依赖核心解析模块,负责主流程调度。 - 依赖核心解析模块,负责主流程调度。
- `annotationdata.py` - `annotationdata.py`
- OOP 设计,核心类为 `AnnotationManager`
- `get_annotations(bookid=None)`:返回所有或指定 assetid 的笔记。
- `parse_location(location)`:静态方法,解析定位信息。
- 解析 AEAnnotation.sqlite提取所有或指定 assetid 的笔记。 - 解析 AEAnnotation.sqlite提取所有或指定 assetid 的笔记。
- 支持苹果时间戳转换,结构化输出。 - 支持苹果时间戳转换,结构化输出。
- parse_location 辅助函数,统一解析笔记定位信息。
- `booklist_parse.py` - `booklist_parse.py`
- OOP 设计,核心类为 `BookListManager`
- `get_books_info()`:获取书籍元数据。
- `get_books_last_open()`:获取每本书的最近打开时间。
- 解析 Books.plist获取书籍元数据书名、作者、路径、时间等 - 解析 Books.plist获取书籍元数据书名、作者、路径、时间等
- 解析 BKLibrary.sqlite获取每本书的最近打开时间zlastopendate苹果时间戳 - 解析 BKLibrary.sqlite获取每本书的最近打开时间。
- 提供统一数据接口,便于主流程排序和展示。
- `opf_parse.py` - `opf_parse.py`
- OOP 设计,核心类为 `OPFParser`
- `parse_opf(filepath)`:静态方法,返回 id->href 映射。
- 解析 epub 的 OPF 文件获取章节与文件映射关系idref -> href - 解析 epub 的 OPF 文件获取章节与文件映射关系idref -> href
- 支持多种 epub 目录结构。
- `toc_parse.py` - `toc_parse.py`
- OOP 设计,核心类为 `TOCParser`
- `parse_navpoints(navpoints)`:递归解析 navPoint 节点。
- `find_label_path(node, ref, filepos, path)`:查找章节路径。
- `find_section_by_selectedtext(html_path, selectedtext)`:通过选中文本定位章节标题。
- `parse_html_title(html_path)`:解析 html 文件标题。
- 解析 NCX 目录文件,递归构建章节树结构。 - 解析 NCX 目录文件,递归构建章节树结构。
- find_label_path支持通过 ref 和 filepos 查找完整 label 路径。
- find_section_by_selectedtext通过选中文本在 html 文件中定位章节标题。
- parse_html_title解析 html 文件标题。
- `backup/booksnote.py` - `backup/booksnote.py`
- 历史/备份脚本,辅助数据迁移或格式转换。 - 历史/备份脚本,辅助数据迁移或格式转换。

View File

@ -1,4 +1,4 @@
# 笔记导出 2025-08-15 13:25 # 笔记导出 2025-08-15 17:20
## 传统十论 ## 传统十论

View File

@ -1,31 +1,17 @@
""" """
exportbooknotes.py exportbooknotes.py (OOP版)
------------------ -------------------------
功能 功能
- 自动同步iBooks数据库和元数据文件到本地data目录 - 自动同步iBooks数据库和元数据文件到本地data目录
- 解析AEAnnotation.sqliteBooks.plistBKLibrary.sqlite构建结构化笔记数据 - 解析AEAnnotation.sqliteBooks.plistBKLibrary.sqlite构建结构化笔记数据
- 解析epub目录和章节信息定位每条笔记所属章节 - 解析epub目录和章节信息定位每条笔记所属章节
- 命令行菜单按最近打开时间降序展示书籍列表供用户选择导出 - 命令行菜单按最近打开时间降序展示书籍列表供用户选择导出
- 仅导出选中书籍的所有笔记按章节分组生成Markdown文件 - 仅导出选中书籍的所有笔记按章节分组生成Markdown文件
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口BookNotesExporter
主要数据流 - run()命令行交互式导出主流程
1. 数据同步到data目录 - build_booksnote(bookid=None)构建结构化笔记数据
2. 解析Books.plist获取书籍元数据 - export_booksnote_to_md(booksnote, booksinfo, out_path=None)导出为Markdown
3. 解析BKLibrary.sqlite获取最近打开时间
4. 菜单排序与显示书名+时间戳
5. 解析AEAnnotation.sqlite获取笔记
6. 解析epub目录定位章节
7. 导出Markdown文件
依赖Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3
主要数据流
典型用法
python exportbooknotes.py
# 按提示选择书籍自动导出笔记到export_notes目录
""" """
import config import config
""" """
@ -40,17 +26,24 @@ booksnote = {
}}} }}}
} }
""" """
from collections import defaultdict
import os import os
from annotationdata import get_annotations from collections import defaultdict
from booklist_parse import parse_books_plist from annotationdata import AnnotationManager
from booklist_parse import BookListManager
from opf_parse import parse_opf from opf_parse import parse_opf
from toc_parse import parse_navpoints, find_label_path from toc_parse import TOCParser
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pprint import pprint
class BookNotesExporter:
def __init__(self, config_module=config):
    """Bind the exporter to a configuration object.

    Args:
        config_module: Object exposing LOCAL_ANNOTATION_DB,
            LOCAL_BOOKS_PLIST and LOCAL_LIBRARY_DB attributes; defaults to
            the imported ``config`` module.
    """
    cfg = config_module
    self.config = cfg
    # Copy the three local paths once so methods do not need to go through cfg.
    self.annotation_db, self.books_plist, self.library_db = (
        cfg.LOCAL_ANNOTATION_DB,
        cfg.LOCAL_BOOKS_PLIST,
        cfg.LOCAL_LIBRARY_DB,
    )
@staticmethod
def find_file_by_ext(root, exts): def find_file_by_ext(root, exts):
"""在root下递归查找第一个指定后缀的文件"""
for dirpath, _, files in os.walk(root): for dirpath, _, files in os.walk(root):
for f in files: for f in files:
for ext in exts: for ext in exts:
@ -58,48 +51,44 @@ def find_file_by_ext(root, exts):
return os.path.join(dirpath, f) return os.path.join(dirpath, f)
return None return None
@staticmethod
def get_toc_tree(toc_path):
    """Parse an NCX table-of-contents file into a nested chapter tree.

    Args:
        toc_path: Path to the .ncx file; it is read as UTF-8 XML.

    Returns:
        Whatever ``TOCParser.parse_navpoints`` builds from the direct
        ``navPoint`` children of ``<navMap>``. When the document has no
        ``<navMap>`` element an empty dict is returned, which is also what
        ``parse_navpoints`` yields for an empty node list.
    """
    with open(toc_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')
    nav_map = soup.find('navMap')
    if nav_map is None:
        # find() returns None for a missing tag; calling find_all on it
        # would raise AttributeError on a malformed NCX.
        return {}
    nav_points = nav_map.find_all('navPoint', recursive=False)
    return TOCParser.parse_navpoints(nav_points)
def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None): def build_booksnote(self, bookid=None):
# 支持只处理特定 assetid 的笔记 manager = AnnotationManager(self.annotation_db)
annotations = get_annotations(annotation_db, bookid=bookid) annotations = manager.get_annotations(bookid=bookid)
booksinfo = parse_books_plist(books_plist) bl_manager = BookListManager(plist_path=self.books_plist)
booksinfo = bl_manager.get_books_info()
booksnote = defaultdict(lambda: defaultdict(dict)) booksnote = defaultdict(lambda: defaultdict(dict))
for assetid, notes in annotations.items(): for assetid, notes in annotations.items():
# 获取epub路径
bookinfo = booksinfo.get(assetid) bookinfo = booksinfo.get(assetid)
if not bookinfo: if not bookinfo:
continue continue
epub_path = bookinfo.get('path') epub_path = bookinfo.get('path')
if not epub_path or not os.path.isdir(epub_path): if not epub_path or not os.path.isdir(epub_path):
continue continue
# 查找opf和ncx opf_path = self.find_file_by_ext(epub_path, ['.opf'])
opf_path = find_file_by_ext(epub_path, ['.opf']) ncx_path = self.find_file_by_ext(epub_path, ['.ncx'])
ncx_path = find_file_by_ext(epub_path, ['.ncx'])
if not opf_path or not ncx_path: if not opf_path or not ncx_path:
continue continue
id2href = parse_opf(opf_path) id2href = parse_opf(opf_path)
toc_tree = get_toc_tree(ncx_path) toc_tree = self.get_toc_tree(ncx_path)
for uuid, ann in notes.items(): for uuid, ann in notes.items():
idref = ann['idref'] idref = ann['idref']
filepos = ann['filepos'] filepos = ann['filepos']
href = id2href.get(idref, idref) href = id2href.get(idref, idref)
chapter = find_label_path(toc_tree, href, filepos) chapter = TOCParser.find_label_path(toc_tree, href, filepos)
if chapter is None: if chapter is None:
# 直接从html文件获取章节信息
html_path = os.path.join(epub_path, href.split('#')[0]) html_path = os.path.join(epub_path, href.split('#')[0])
selectedtext = ann.get('selectedtext') selectedtext = ann.get('selectedtext')
if os.path.exists(html_path) and selectedtext: if os.path.exists(html_path) and selectedtext:
from toc_parse import find_section_by_selectedtext section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
section = find_section_by_selectedtext(html_path, selectedtext)
if section: if section:
chapter = section chapter = section
else: else:
@ -115,17 +104,8 @@ def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config
} }
return booksnote return booksnote
def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None):
import datetime import datetime
def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
"""
依据booksnote结构导出markdown文件格式
# “笔记导出”+导出时间
## 书名
### chapter
selectedtext
> note (如果存在)
"""
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
lines = [f'# 笔记导出 {now}\n'] lines = [f'# 笔记导出 {now}\n']
for assetid, chapters in booksnote.items(): for assetid, chapters in booksnote.items():
@ -151,6 +131,8 @@ def export_booksnote_to_md(booksnote, booksinfo, out_path=None):
if __name__ == '__main__': if __name__ == '__main__':
import shutil import shutil
import os.path import os.path
from InquirerPy import inquirer # type: ignore
exporter = BookNotesExporter(config)
# 自动覆盖 ./data 下的数据库和plist文件源为iBooks真实路径 # 自动覆盖 ./data 下的数据库和plist文件源为iBooks真实路径
src_files = [ src_files = [
(config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB), (config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB),
@ -166,31 +148,19 @@ if __name__ == '__main__':
else: else:
print(f'file not found: {src} ') print(f'file not found: {src} ')
from booklist_parse import parse_books_plist
from InquirerPy import inquirer # type: ignore
# 先获取所有书籍元数据 # 先获取所有书籍元数据
booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB)
booksinfo = manager.get_books_info()
# 构建书名列表优先displayname, 其次itemname, 否则assetid按parse_books_plist中的date字段排序
assetid2name = {} assetid2name = {}
assetid2lastopen = {} assetid2lastopen = {}
from booklist_parse import get_books_last_open last_open_times = manager.get_books_last_open()
# 获取所有书籍的最后打开时间(字典,值为{'last_open': 时间戳}
last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB)
for assetid, info in booksinfo.items(): for assetid, info in booksinfo.items():
name = info.get('displayname') or info.get('itemname') or assetid name = info.get('displayname') or info.get('itemname') or assetid
# 如果书名中包含“-”,只取“-”前面的部分
if '-' in name: if '-' in name:
name = name.split('-', 1)[0].strip() name = name.split('-', 1)[0].strip()
assetid2name[assetid] = name assetid2name[assetid] = name
# 用 get_books_last_open 返回的时间戳排序如无则为0
ts = last_open_times.get(assetid, {}).get('last_open', 0) ts = last_open_times.get(assetid, {}).get('last_open', 0)
assetid2lastopen[assetid] = ts assetid2lastopen[assetid] = ts
# 按last_open时间戳降序排列
sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True) sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True)
choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids] choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids]
if not choices: if not choices:
@ -202,8 +172,6 @@ if __name__ == '__main__':
multiselect=False, multiselect=False,
instruction="上下键选择,输入可模糊筛选,回车确定" instruction="上下键选择,输入可模糊筛选,回车确定"
).execute() ).execute()
# 解析选中assetid
for aid, name in assetid2name.items(): for aid, name in assetid2name.items():
if answer.startswith(name): if answer.startswith(name):
selected_assetid = aid selected_assetid = aid
@ -211,10 +179,8 @@ if __name__ == '__main__':
else: else:
print("未找到选中书籍") print("未找到选中书籍")
exit(1) exit(1)
selected_booksnote = exporter.build_booksnote(bookid=selected_assetid)
# 只导出选中书的笔记
selected_booksnote = build_booksnote(bookid=selected_assetid)
selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})} selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})}
out_path = f'export_notes/notes_export_{selected_assetid}.md' out_path = f'export_notes/notes_export_{selected_assetid}.md'
export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path) exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path)
print(f'{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}') print(f'{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}')

View File

@ -1,21 +1,29 @@
def parse_opf(filepath):
    """
    Top-level function kept for compatibility with older callers; it only
    delegates to OPFParser.parse_opf and returns its result unchanged.
    """
    return OPFParser.parse_opf(filepath)
# parseopf.py """
# ----------------------------- opf_parse.py (OOP版)
# 用于解析EPUB电子书的OPF文件提取manifest部分所有id对应的html文件href。 -------------------
# 支持批量测试和通过id快速查找href。 功能
# 依赖BeautifulSoup4 - 解析EPUB电子书的OPF文件提取manifest部分所有id对应的html文件href
# ----------------------------- - 支持通过id快速查找href
- 支持批量测试
依赖BeautifulSoup4
主要接口OPFParser
- parse_opf(filepath)静态方法返回id->href映射仅html文件
"""
from collections import defaultdict from collections import defaultdict
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import pprint
class OPFParser:
@staticmethod
def parse_opf(filepath): def parse_opf(filepath):
""" """
解析OPF文件返回{id: href}的defaultdict(dict)结构 解析OPF文件返回{id: href}的defaultdict(dict)结构
仅保留href以.html结尾的项 仅保留href以.html结尾的项
参数 参数
filepath (str): OPF文件路径 filepath (str): OPF文件路径
返回 返回
@ -24,7 +32,6 @@ def parse_opf(filepath):
result = defaultdict(dict) result = defaultdict(dict)
with open(filepath, 'r', encoding='utf-8') as f: with open(filepath, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'xml') soup = BeautifulSoup(f, 'xml')
# 查找manifest部分遍历所有item筛选html结尾的href
manifest = soup.find('manifest') manifest = soup.find('manifest')
if manifest: if manifest:
for item in manifest.find_all('item'): for item in manifest.find_all('item'):
@ -34,6 +41,7 @@ def parse_opf(filepath):
result[id_] = href result[id_] = href
return result return result
if __name__ == "__main__": if __name__ == "__main__":
test_files = [ test_files = [
'./examples/epub_format_2/OEBPS/content.opf', './examples/epub_format_2/OEBPS/content.opf',
@ -44,8 +52,7 @@ if __name__ == "__main__":
for file in test_files: for file in test_files:
print(f"\n==== 测试文件: {file} ====") print(f"\n==== 测试文件: {file} ====")
try: try:
result = parse_opf(file) result = OPFParser.parse_opf(file)
pprint.pprint(result, indent=2, width=120, sort_dicts=False)
# 增加通过id快速打印href的测试 # 增加通过id快速打印href的测试
test_ids = list(result.keys())[:3] # 取前三个id做演示 test_ids = list(result.keys())[:3] # 取前三个id做演示

View File

@ -1,6 +1,7 @@
""" """
toc_parse.py toc_parse.py (OOP版)
------------ -------------------
功能 功能
- 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构 - 解析EPUB电子书的toc.ncx目录文件递归构建章节树结构
- 支持通过ref和filepos查找完整label路径 - 支持通过ref和filepos查找完整label路径
@ -8,43 +9,36 @@ toc_parse.py
- 兼容多种EPUB格式支持批量测试 - 兼容多种EPUB格式支持批量测试
依赖config.py 统一管理路径和配置项 依赖config.py 统一管理路径和配置项
主要接口 主要接口TOCParser
parse_navpoints(navpoints) # 递归解析navPoint节点返回章节树结构 - parse_navpoints(navpoints)递归解析navPoint节点返回章节树结构
find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径 - find_label_path(node, ref, filepos, path)查找指定ref和filepos的章节label路径
find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题 - find_section_by_selectedtext(html_path, selectedtext)通过选中文本定位章节标题
parse_html_title(html_path) # 解析html文件标题 - parse_html_title(html_path)解析html文件标题
依赖BeautifulSoup4, pprint, os, typing 依赖BeautifulSoup4, pprint, os, typing
""" """
import config import config
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any import os
import pprint
# ==== 辅助函数根据selectedtext在html文件中的位置推断所在章节 ==== class TOCParser:
def __init__(self):
    """Create a parser instance; there is no per-instance state to set up."""
    pass
@staticmethod
def find_section_by_selectedtext(html_path, selectedtext): def find_section_by_selectedtext(html_path, selectedtext):
"""
在html文件中查找selectedtext出现的位置向上回溯最近的h1-h6标题返回该标题文本
若未找到标题则返回None
"""
try: try:
with open(html_path, 'r', encoding='utf-8') as f: with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser') soup = BeautifulSoup(f, 'html.parser')
# 在所有文本节点中查找selectedtext
for elem in soup.find_all(string=True): for elem in soup.find_all(string=True):
if selectedtext and selectedtext.strip() and selectedtext.strip() in elem: if selectedtext and selectedtext.strip() and selectedtext.strip() in elem:
# 回溯父节点查找最近的h1-h6
parent = elem.parent parent = elem.parent
while parent: while parent:
prev = parent.previous_sibling prev = parent.previous_sibling
# 向上查找同级前面的h1-h6
while prev: while prev:
if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']: if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']:
return prev.get_text(strip=True) return prev.get_text(strip=True)
prev = prev.previous_sibling prev = prev.previous_sibling
parent = parent.parent parent = parent.parent
# 若未找到尝试全局第一个h1-h6
for tag in ['h1','h2','h3','h4','h5','h6']: for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag) h = soup.find(tag)
if h and h.get_text(strip=True): if h and h.get_text(strip=True):
@ -53,17 +47,13 @@ def find_section_by_selectedtext(html_path, selectedtext):
pass pass
return None return None
@staticmethod
def parse_html_title(html_path): def parse_html_title(html_path):
"""
解析html文件优先返回<title>否则返回body第一个h1/h2/h3/h4/h5/h6或None
"""
try: try:
with open(html_path, 'r', encoding='utf-8') as f: with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser') soup = BeautifulSoup(f, 'html.parser')
# 优先<title>
if soup.title and soup.title.string: if soup.title and soup.title.string:
return soup.title.string.strip() return soup.title.string.strip()
# 其次正文第一个h1-h6
for tag in ['h1','h2','h3','h4','h5','h6']: for tag in ['h1','h2','h3','h4','h5','h6']:
h = soup.find(tag) h = soup.find(tag)
if h and h.get_text(strip=True): if h and h.get_text(strip=True):
@ -72,12 +62,8 @@ def parse_html_title(html_path):
pass pass
return None return None
def parse_navpoints(navpoints) -> Dict[str, dict]: @staticmethod
""" def parse_navpoints(navpoints):
递归解析 navpoints 节点返回嵌套 dict 结构
:param navpoints: BeautifulSoup 查找到的 navPoint 节点列表
:return: 章节树结构
"""
result = {} result = {}
for navpoint in navpoints: for navpoint in navpoints:
label = navpoint.navLabel.text.strip().strip('"“”') label = navpoint.navLabel.text.strip().strip('"“”')
@ -90,81 +76,50 @@ def parse_navpoints(navpoints) -> Dict[str, dict]:
"label": label, "label": label,
"ref": ref, "ref": ref,
"filepos": filepos, "filepos": filepos,
"children": parse_navpoints(navpoint.find_all("navPoint", recursive=False)) "children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False))
} }
result[navpoint.get("id")] = entry result[navpoint.get("id")] = entry
#pprint.pprint(result) # 格式化打印result
return result return result
def find_label_path( @staticmethod
def find_label_path(node, ref, filepos=None, path=None):
    """Resolve a (ref, filepos) pair to a " / "-joined chapter label path.

    Lookup order:
      1. Exact match: an entry whose "ref" equals ``ref`` and, when
         ``filepos`` is given, whose "filepos" equals it. Children are
         searched depth-first.
      2. Only when ``filepos`` was given: the first entry that points into
         the same file (text before "#" compared on both sides).
      3. Only for a top-level call (empty ``path``) with a ref ending in
         ".html": look for that file under this module's directory and the
         current working directory and use its HTML title.

    Args:
        node: A single entry dict (has a "label" key) or a mapping of such
            entries, each with "label", "ref", "filepos" and "children".
            Anything that is not a dict is skipped by steps 1 and 2.
        ref: Target href, optionally with a "#fragment".
        filepos: Optional position that must match the entry's "filepos".
        path: Labels of the ancestors already visited (used as a prefix).

    Returns:
        The label path or HTML title as a string, or None when nothing matched.
    """
    def _search(current, wanted_filepos, trail):
        # Steps 1 and 2 for one subtree; recursion stays local so the
        # filesystem fallback below is only ever run by the outermost call.
        if not isinstance(current, dict):
            return None
        entries = [current] if "label" in current else list(current.values())
        for entry in entries:
            if "label" not in entry:
                continue
            here = trail + [entry["label"]]
            if entry["ref"] == ref and (wanted_filepos is None or entry["filepos"] == wanted_filepos):
                return " / ".join(here)
            found = _search(entry["children"], wanted_filepos, here)
            if found:
                return found
        # A None/empty ref cannot name a file, so the same-file pass is skipped
        # instead of failing on ref.split().
        if wanted_filepos is not None and ref:
            ref_file = ref.split("#", 1)[0]
            for entry in entries:
                if "label" not in entry:
                    continue
                here = trail + [entry["label"]]
                if entry["ref"].split("#", 1)[0] == ref_file:
                    return " / ".join(here)
                found = _search(entry["children"], None, here)
                if found:
                    return found
        return None

    prefix = [] if path is None else path
    title = _search(node, filepos, prefix)
    if title:
        return title

    if not prefix and ref and ref.endswith('.html'):
        caller_dir = os.path.dirname(os.path.abspath(__file__))
        search_dirs = [caller_dir, os.getcwd()]
        for d in search_dirs:
            html_path = os.path.join(d, ref)
            if os.path.isfile(html_path):
                title = TOCParser.parse_html_title(html_path)
                if title:
                    return title
        # os.walk yields bare file names, so a ref with a directory part
        # (e.g. "text/ch1.html") must be matched on its basename; the suffix
        # check then keeps only files that sit under the same relative path.
        rel = os.path.normpath(ref)
        base = os.path.basename(rel)
        suffix = os.sep + rel
        for d in search_dirs:
            for root, _, files in os.walk(d):
                if base not in files:
                    continue
                html_path = os.path.join(root, base)
                if not os.path.normpath(html_path).endswith(suffix):
                    continue
                title = TOCParser.parse_html_title(html_path)
                if title:
                    return title
    return None
@ -182,8 +137,6 @@ if __name__ == "__main__":
[config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""], [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""],
] ]
for epub_dir, html_file, filepos in test_cases: for epub_dir, html_file, filepos in test_cases:
# 自动查找epub目录下的toc.ncx
import os
toc_path = None toc_path = None
for root, _, files in os.walk(epub_dir): for root, _, files in os.walk(epub_dir):
for f in files: for f in files:
@ -200,39 +153,32 @@ if __name__ == "__main__":
with open(toc_path, "r", encoding="utf-8") as f: with open(toc_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml") soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap") nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
label_path = find_label_path(toc_tree, html_file, filepos) label_path = TOCParser.find_label_path(toc_tree, html_file, filepos)
print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}") print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}")
# tocb中不存在html直接测试parse_html_title
html_path = os.path.join(epub_dir, html_file.split('#')[0]) html_path = os.path.join(epub_dir, html_file.split('#')[0])
if os.path.exists(html_path): if os.path.exists(html_path):
title = parse_html_title(html_path) title = TOCParser.parse_html_title(html_path)
print(f"解析html标题: {html_path} => {title if title else '未找到标题'}") print(f"解析html标题: {html_path} => {title if title else '未找到标题'}")
# 新增根据selectedtext定位章节标题
selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以' selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以'
section = find_section_by_selectedtext(html_path, selectedtext) section = TOCParser.find_section_by_selectedtext(html_path, selectedtext)
print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}") print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}")
else: else:
print(f"未找到html文件: {html_path}") print(f"未找到html文件: {html_path}")
except Exception as e: except Exception as e:
print(f"测试失败: {e}") print(f"测试失败: {e}")
# ==== 新增测试变宋笔记章节定位和html标题解析 ====
print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====") print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====")
# 假设笔记数据如下
note_idref = 'text/part0002_split_003.html' note_idref = 'text/part0002_split_003.html'
note_filepos = None note_filepos = None
# 变宋toc.ncx路径
bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx" bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx"
import os
if os.path.exists(bian_song_toc): if os.path.exists(bian_song_toc):
with open(bian_song_toc, "r", encoding="utf-8") as f: with open(bian_song_toc, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "xml") soup = BeautifulSoup(f, "xml")
nav_map = soup.find("navMap") nav_map = soup.find("navMap")
toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False))
# 先尝试用find_label_path查找章节 label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos)
label_path = find_label_path(toc_tree, note_idref, note_filepos)
print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题") print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节尝试解析html标题")
else: else:
print(f"未找到toc.ncx: {bian_song_toc}") print(f"未找到toc.ncx: {bian_song_toc}")