diff --git a/__pycache__/annotationdata.cpython-312.pyc b/__pycache__/annotationdata.cpython-312.pyc index 41b3f8f..e515272 100644 Binary files a/__pycache__/annotationdata.cpython-312.pyc and b/__pycache__/annotationdata.cpython-312.pyc differ diff --git a/__pycache__/booklist_parse.cpython-312.pyc b/__pycache__/booklist_parse.cpython-312.pyc index 9d811e5..60acbb2 100644 Binary files a/__pycache__/booklist_parse.cpython-312.pyc and b/__pycache__/booklist_parse.cpython-312.pyc differ diff --git a/__pycache__/opf_parse.cpython-312.pyc b/__pycache__/opf_parse.cpython-312.pyc index 4948c4c..b4070f6 100644 Binary files a/__pycache__/opf_parse.cpython-312.pyc and b/__pycache__/opf_parse.cpython-312.pyc differ diff --git a/__pycache__/toc_parse.cpython-312.pyc b/__pycache__/toc_parse.cpython-312.pyc index 6a4aef8..a328671 100644 Binary files a/__pycache__/toc_parse.cpython-312.pyc and b/__pycache__/toc_parse.cpython-312.pyc differ diff --git a/annotationdata.py b/annotationdata.py index f0f6e38..090e406 100644 --- a/annotationdata.py +++ b/annotationdata.py @@ -1,136 +1,113 @@ + """ -annotationdata.py ------------------ +annotationdata.py (OOP版) +------------------------ 功能: - 解析iBooks的AEAnnotation.sqlite数据库,提取所有或指定书籍(assetid/bookid)的笔记。 - 提供parse_location辅助函数,解析笔记定位信息。 - 返回结构化的annotations数据,便于后续章节定位与导出。 依赖:config.py 统一管理路径和配置项。 - -主要接口: - - get_annotations(db_path, bookid=None):返回所有或指定assetid的笔记,结构为{assetid: {uuid: {...}}} +主要接口:AnnotationManager + - get_annotations(bookid=None):返回所有或指定assetid的笔记,结构为{assetid: {uuid: {...}}} - parse_location(location):解析ZANNOTATIONLOCATION,返回(idref, filepos) - 依赖:sqlite3, collections, re, os, datetime """ import config - import sqlite3 -from collections import defaultdict import re import os +from collections import defaultdict -def parse_location(location): - """ - 解析ZANNOTATIONLOCATION,返回(idref, filepos) - - epubcfi(...)格式优先提取[]内内容为idref - - 其他格式兼容原逻辑 - """ - idref = None - filepos = None - if not location: +class AnnotationManager: + def __init__(self, db_path=None): + self.db_path = db_path or config.LOCAL_ANNOTATION_DB + + @staticmethod + def parse_location(location): + """ + 解析ZANNOTATIONLOCATION,返回(idref, filepos) + - epubcfi(...)格式优先提取[]内内容为idref + - 其他格式兼容原逻辑 + """ + idref = None + filepos = None + if not location: + return idref, filepos + matches = re.findall(r'\[(.*?)\]', location) if location else [] + idref = matches[0] if len(matches) > 0 else None + filepos = matches[1] if len(matches) > 1 else None return idref, filepos - # 统一处理,提取前两个[]内容 - matches = re.findall(r'\[(.*?)\]', location) if location else [] - idref = matches[0] if len(matches) > 0 else None - filepos = matches[1] if len(matches) > 1 else None - return idref, filepos -def get_annotations(db_path=config.LOCAL_ANNOTATION_DB, bookid=None): - # 检查WAL模式相关文件 - base = db_path.rsplit('.', 1)[0] - wal_path = base + '.sqlite-wal' - shm_path = base + '.sqlite-shm' - for f in [db_path, wal_path, shm_path]: - if not os.path.exists(f): - print(f'警告: 缺少 {f},可能无法获取全部最新笔记') - conn = sqlite3.connect(db_path) - cursor = conn.cursor() - if bookid is not None: - cursor.execute(''' - SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID - FROM ZAEANNOTATION WHERE ZANNOTATIONASSETID=? - ''', (bookid,)) - else: - cursor.execute(''' - SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID - FROM ZAEANNOTATION - ''') - rows = cursor.fetchall() - annotations = defaultdict(dict) - import datetime - for row in rows: - assetid, creationdate, location, note, selectedtext, uuid = row - # 转换 creationdate 格式,支持苹果时间戳(以2001-01-01为基准) - date_str = creationdate - if creationdate: - try: - origin = datetime.datetime(2001, 1, 1) - # 苹果时间戳 float/int 或数字字符串 - if isinstance(creationdate, (int, float)): - dt = origin + datetime.timedelta(seconds=creationdate) - elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit(): - dt = origin + datetime.timedelta(seconds=float(creationdate)) - else: - dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d") - date_str = f"{dt.year}/{dt.month}/{dt.day}" - except Exception: - date_str = str(creationdate) - idref, filepos = parse_location(location) - # 跳过note和selectedtext都为None的笔记 - if note is None and selectedtext is None: - continue - annotations[str(assetid)][uuid] = { - 'creationdate': date_str, - 'filepos': filepos, - 'idref': idref, - 'note': note, - 'selectedtext': selectedtext - } - conn.close() - if bookid is not None: - # 只返回特定bookid的笔记结构 - return {str(bookid): annotations.get(str(bookid), {})} - return annotations + def get_annotations(self, bookid=None): + # 检查WAL模式相关文件 + base = self.db_path.rsplit('.', 1)[0] + wal_path = base + '.sqlite-wal' + shm_path = base + '.sqlite-shm' + for f in [self.db_path, wal_path, shm_path]: + if not os.path.exists(f): + print(f'警告: 缺少 {f},可能无法获取全部最新笔记') + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + if bookid is not None: + cursor.execute(''' + SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID + FROM ZAEANNOTATION WHERE ZANNOTATIONASSETID=? + ''', (bookid,)) + else: + cursor.execute(''' + SELECT ZANNOTATIONASSETID, ZANNOTATIONCREATIONDATE, ZANNOTATIONLOCATION, ZANNOTATIONNOTE, ZANNOTATIONSELECTEDTEXT, ZANNOTATIONUUID + FROM ZAEANNOTATION + ''') + rows = cursor.fetchall() + annotations = defaultdict(dict) + import datetime + for row in rows: + assetid, creationdate, location, note, selectedtext, uuid = row + # 转换 creationdate 格式,支持苹果时间戳(以2001-01-01为基准) + date_str = creationdate + if creationdate: + try: + origin = datetime.datetime(2001, 1, 1) + if isinstance(creationdate, (int, float)): + dt = origin + datetime.timedelta(seconds=creationdate) + elif isinstance(creationdate, str) and creationdate.replace('.', '', 1).isdigit(): + dt = origin + datetime.timedelta(seconds=float(creationdate)) + else: + dt = datetime.datetime.strptime(creationdate[:10], "%Y-%m-%d") + date_str = f"{dt.year}/{dt.month}/{dt.day}" + except Exception: + date_str = str(creationdate) + idref, filepos = self.parse_location(location) + if note is None and selectedtext is None: + continue + annotations[str(assetid)][uuid] = { + 'creationdate': date_str, + 'filepos': filepos, + 'idref': idref, + 'note': note, + 'selectedtext': selectedtext + } + conn.close() + if bookid is not None: + return {str(bookid): annotations.get(str(bookid), {})} + return annotations -# 用法示例:输出每本书的前3条笔记 if __name__ == "__main__": + manager = AnnotationManager() # 测试 parse_location - ''' test_locations = [ 'epubcfi(/6/746[id509]!/4[4MLOS0-27b363c65bfe41ad8429f530566a2737]/10,/2/1:0,/7:8', 'epubcfi(/6/22[id15]!/4/156/1,:21,:157)', 'epubcfi(/6/764[id518]!/4[4V8DU0-27b363c65bfe41ad8429f530566a2737]/56,/1:0,/3:2)' ] for loc in test_locations: - idref, filepos = parse_location(loc) + idref, filepos = manager.parse_location(loc) print(f"location: {loc}\n idref: {idref}\n filepos: {filepos}\n") - ''' # 测试只获取特定 assetid 的笔记 test_bookid = "B18FCD9F90FD43C2373AE52BAEF9A77C" - annotations = get_annotations(bookid=test_bookid) - - # 格式化打印该书的所有笔记 + annotations = manager.get_annotations(bookid=test_bookid) from pprint import pprint print(f"\nAssetID={test_bookid} 的所有笔记:") pprint(annotations, indent=2, sort_dicts=False) - - # 输出每本书的前3条笔记 - ''' - book_notes = defaultdict(list) - for assetid, notes_dict in annotations.items(): - for uuid, ann in notes_dict.items(): - book_notes[assetid].append({**ann, 'uuid': uuid}) - for assetid, notes in book_notes.items(): - print(f"\nAssetID: {assetid}") - for i, note in enumerate(notes[:3]): - print(f" 笔记{i+1}:") - print(f" creationdate: {note['creationdate']}") - print(f" idref: {note['idref']}") - print(f" filepos: {note['filepos']}") - print(f" note: {note['note']}") - print(f" selectedtext: {note['selectedtext']}") - print(f" uuid: {note['uuid']}") - ''' diff --git a/booklist_parse.py b/booklist_parse.py index 28ca146..fb1c9ab 100644 --- a/booklist_parse.py +++ b/booklist_parse.py @@ -1,75 +1,66 @@ -""" -booklist_parse.py ------------------ -功能: - - 解析iBooks的Books.plist,提取所有书籍元数据(书名、作者、路径、时间等)。 - - 解析BKLibrary.sqlite,获取每本书的最近打开时间(苹果时间戳,基准2001-01-01)。 -依赖:config.py 统一管理路径和配置项。 - -主要接口: - - parse_books_plist(plist_path):返回所有书籍元数据,结构为{bk_id: {...}} - - get_books_last_open(db_path):返回所有书籍最近打开时间,结构为{bk_id: {'last_open': 时间戳}} - -依赖:plistlib, collections, sqlite3, os, datetime - -典型用法: - booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) - books_open = get_books_last_open(config.LOCAL_LIBRARY_DB) -""" import config import plistlib -from collections import defaultdict - -def parse_books_plist(plist_path=config.LOCAL_BOOKS_PLIST): - booksinfo = defaultdict(dict) - with open(plist_path, 'rb') as f: plist_data = plistlib.load(f) - for book in plist_data.get('Books', []): - bk_id = book.get('BKGeneratedItemId') - if not bk_id: continue - booksinfo[bk_id] = { - 'displayname': book.get('BKDisplayName', ''), - 'author': book.get('artistName', ''), - 'type': book.get('BKBookType', ''), - 'bookid': bk_id, - 'itemname': book.get('itemName', ''), - 'path': book.get('path', ''), - 'date': book.get('BKInsertionDate',''), - 'updatedate': book.get('updateDate','') - } - return booksinfo import sqlite3 import os +from collections import defaultdict -def get_books_last_open(db_path=config.LOCAL_LIBRARY_DB): - """ - 从BKLibrary.sqlite获取书籍最近打开时间 - 返回:defaultdict(dict),bk_id为索引,包含最近打开时间 - """ - books_open = defaultdict(dict) - if not os.path.exists(db_path): +class BookListManager: + def __init__(self, plist_path=None, db_path=None): + self.plist_path = plist_path or config.LOCAL_BOOKS_PLIST + self.db_path = db_path or config.LOCAL_LIBRARY_DB + self._booksinfo = None + self._books_open = None + + def get_books_info(self): + if self._booksinfo is not None: + return self._booksinfo + booksinfo = defaultdict(dict) + with open(self.plist_path, 'rb') as f: + plist_data = plistlib.load(f) + for book in plist_data.get('Books', []): + bk_id = book.get('BKGeneratedItemId') + if not bk_id: + continue + booksinfo[bk_id] = { + 'displayname': book.get('BKDisplayName', ''), + 'author': book.get('artistName', ''), + 'type': book.get('BKBookType', ''), + 'bookid': bk_id, + 'itemname': book.get('itemName', ''), + 'path': book.get('path', ''), + 'date': book.get('BKInsertionDate',''), + 'updatedate': book.get('updateDate','') + } + self._booksinfo = booksinfo + return booksinfo + + def get_books_last_open(self): + if self._books_open is not None: + return self._books_open + books_open = defaultdict(dict) + if not os.path.exists(self.db_path): + return books_open + try: + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''') + rows = cursor.fetchall() + for row in rows: + asset_id, last_open = row + if asset_id: + books_open[asset_id] = { + 'last_open': last_open + } + conn.close() + except Exception as e: + print(f'警告: 读取BKLibrary.sqlite失败: {e}') + self._books_open = books_open return books_open - try: - conn = sqlite3.connect(db_path) - cursor = conn.cursor() - # ZBKLIBRARYASSET表包含书籍信息 - cursor.execute(''' SELECT ZASSETID, zlastopendate FROM ZBKLIBRARYASSET WHERE zlastopendate IS NOT NULL ''') - rows = cursor.fetchall() - for row in rows: - asset_id, last_open = row - if asset_id: - books_open[asset_id] = { - 'last_open': last_open # 苹果时间戳,基准时间为2001-01-01 - } - conn.close() - except Exception as e: - print(f'警告: 读取BKLibrary.sqlite失败: {e}') - - return books_open - if __name__ == '__main__': - booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) + manager = BookListManager() + booksinfo = manager.get_books_info() from pprint import pprint print("\n【前三条示例】") for k, v in list(booksinfo.items())[:3]: @@ -77,19 +68,10 @@ if __name__ == '__main__': pprint(v, sort_dicts=False, indent=2) print('-' * 60) - ''' - print("\n【全部内容】") - for k, v in booksinfo.items(): - print(f"{k}:") - pprint(v, sort_dicts=False, indent=2) - print('-' * 60) - ''' - # 测试最近打开时间 print("\n【最近打开时间示例】") - books_open = get_books_last_open() + books_open = manager.get_books_last_open() import datetime for k, v in list(books_open.items())[:3]: ts = v['last_open'] - # 苹果时间戳,基准2001-01-01 dt = datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=ts) print(f"{k}: {dt} (timestamp: {ts})") \ No newline at end of file diff --git a/data/Books.plist b/data/Books.plist index fcb2e39..4966ff0 100644 Binary files a/data/Books.plist and b/data/Books.plist differ diff --git a/detaildesign.md b/detaildesign.md index 92acacf..5dc875e 100644 --- a/detaildesign.md +++ b/detaildesign.md @@ -158,31 +158,43 @@ answer = inquirer.fuzzy( ## 9.1 主要代码文件说明(细化) + - `exportbooknotes.py` + - 采用 OOP 设计,核心类为 `BookNotesExporter`: + - `build_booksnote(bookid=None)`:构建结构化笔记数据。 + - `export_booksnote_to_md(booksnote, booksinfo, out_path=None)`:导出为 Markdown。 + - `find_file_by_ext`、`get_toc_tree` 等辅助方法。 - 数据同步:自动复制 iBooks 数据库和元数据到本地。 - 菜单交互:按最近打开时间戳排序,显示“书名 [时间戳]”,支持模糊搜索。 - 只处理用户选中书籍的笔记,按章节分组导出 Markdown。 - 依赖核心解析模块,负责主流程调度。 - `annotationdata.py` + - OOP 设计,核心类为 `AnnotationManager`: + - `get_annotations(bookid=None)`:返回所有或指定 assetid 的笔记。 + - `parse_location(location)`:静态方法,解析定位信息。 - 解析 AEAnnotation.sqlite,提取所有或指定 assetid 的笔记。 - 支持苹果时间戳转换,结构化输出。 - - parse_location 辅助函数,统一解析笔记定位信息。 - `booklist_parse.py` + - OOP 设计,核心类为 `BookListManager`: + - `get_books_info()`:获取书籍元数据。 + - `get_books_last_open()`:获取每本书的最近打开时间。 - 解析 Books.plist,获取书籍元数据(书名、作者、路径、时间等)。 - - 解析 BKLibrary.sqlite,获取每本书的最近打开时间(zlastopendate,苹果时间戳)。 - - 提供统一数据接口,便于主流程排序和展示。 + - 解析 BKLibrary.sqlite,获取每本书的最近打开时间。 - `opf_parse.py` + - OOP 设计,核心类为 `OPFParser`: + - `parse_opf(filepath)`:静态方法,返回 id->href 映射。 - 解析 epub 的 OPF 文件,获取章节与文件映射关系(idref -> href)。 - - 支持多种 epub 目录结构。 - `toc_parse.py` + - OOP 设计,核心类为 `TOCParser`: + - `parse_navpoints(navpoints)`:递归解析 navPoint 节点。 + - `find_label_path(node, ref, filepos, path)`:查找章节路径。 + - `find_section_by_selectedtext(html_path, selectedtext)`:通过选中文本定位章节标题。 + - `parse_html_title(html_path)`:解析 html 文件标题。 - 解析 NCX 目录文件,递归构建章节树结构。 - - find_label_path:支持通过 ref 和 filepos 查找完整 label 路径。 - - find_section_by_selectedtext:通过选中文本在 html 文件中定位章节标题。 - - parse_html_title:解析 html 文件标题。 - `backup/booksnote.py` - 历史/备份脚本,辅助数据迁移或格式转换。 diff --git a/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md b/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md index d414ade..8a33fc7 100644 --- a/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md +++ b/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md @@ -1,4 +1,4 @@ -# 笔记导出 2025-08-15 13:25 +# 笔记导出 2025-08-15 17:20 ## 传统十论 diff --git a/exportbooknotes.py b/exportbooknotes.py index e9f8dd3..90ad1b8 100644 --- a/exportbooknotes.py +++ b/exportbooknotes.py @@ -1,31 +1,17 @@ """ -exportbooknotes.py ------------------- +exportbooknotes.py (OOP版) +------------------------- 功能: - 自动同步iBooks数据库和元数据文件到本地data目录。 - 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite,构建结构化笔记数据。 - 解析epub目录和章节信息,定位每条笔记所属章节。 - 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。 - 仅导出选中书籍的所有笔记,按章节分组,生成Markdown文件。 - 依赖:config.py 统一管理路径和配置项。 - -主要数据流: - 1. 数据同步到data目录 - 2. 解析Books.plist获取书籍元数据 - 3. 解析BKLibrary.sqlite获取最近打开时间 - 4. 菜单排序与显示(书名+时间戳) - 5. 解析AEAnnotation.sqlite获取笔记 - 6. 解析epub目录,定位章节 - 7. 导出Markdown文件 - -依赖:Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3 - -主要数据流: - -典型用法: - python exportbooknotes.py - # 按提示选择书籍,自动导出笔记到export_notes目录 +主要接口:BookNotesExporter + - run():命令行交互式导出主流程 + - build_booksnote(bookid=None):构建结构化笔记数据 + - export_booksnote_to_md(booksnote, booksinfo, out_path=None):导出为Markdown """ import config """ @@ -40,117 +26,113 @@ booksnote = { }}} } """ -from collections import defaultdict import os -from annotationdata import get_annotations -from booklist_parse import parse_books_plist +from collections import defaultdict +from annotationdata import AnnotationManager +from booklist_parse import BookListManager from opf_parse import parse_opf -from toc_parse import parse_navpoints, find_label_path +from toc_parse import TOCParser from bs4 import BeautifulSoup -from pprint import pprint -def find_file_by_ext(root, exts): - """在root下递归查找第一个指定后缀的文件""" - for dirpath, _, files in os.walk(root): - for f in files: - for ext in exts: - if f.lower().endswith(ext): - return os.path.join(dirpath, f) - return None -def get_toc_tree(toc_path): - with open(toc_path, 'r', encoding='utf-8') as f: - soup = BeautifulSoup(f, 'xml') - nav_map = soup.find('navMap') +class BookNotesExporter: + def __init__(self, config_module=config): + self.config = config_module + self.annotation_db = config_module.LOCAL_ANNOTATION_DB + self.books_plist = config_module.LOCAL_BOOKS_PLIST + self.library_db = config_module.LOCAL_LIBRARY_DB - nav_points = nav_map.find_all('navPoint', recursive=False) - toc_tree = parse_navpoints(nav_points) - #pprint(toc_tree, indent=2, depth=5) - return toc_tree + @staticmethod + def find_file_by_ext(root, exts): + for dirpath, _, files in os.walk(root): + for f in files: + for ext in exts: + if f.lower().endswith(ext): + return os.path.join(dirpath, f) + return None -def build_booksnote(annotation_db=config.LOCAL_ANNOTATION_DB, books_plist=config.LOCAL_BOOKS_PLIST, bookid=None): - # 支持只处理特定 assetid 的笔记 - annotations = get_annotations(annotation_db, bookid=bookid) - booksinfo = parse_books_plist(books_plist) - booksnote = defaultdict(lambda: defaultdict(dict)) - for assetid, notes in annotations.items(): - # 获取epub路径 - bookinfo = booksinfo.get(assetid) - if not bookinfo: - continue - epub_path = bookinfo.get('path') - if not epub_path or not os.path.isdir(epub_path): - continue - # 查找opf和ncx - opf_path = find_file_by_ext(epub_path, ['.opf']) - ncx_path = find_file_by_ext(epub_path, ['.ncx']) - if not opf_path or not ncx_path: - continue - id2href = parse_opf(opf_path) - toc_tree = get_toc_tree(ncx_path) - for uuid, ann in notes.items(): - idref = ann['idref'] - filepos = ann['filepos'] - href = id2href.get(idref, idref) - chapter = find_label_path(toc_tree, href, filepos) - if chapter is None: - # 直接从html文件获取章节信息 - html_path = os.path.join(epub_path, href.split('#')[0]) - selectedtext = ann.get('selectedtext') - if os.path.exists(html_path) and selectedtext: - from toc_parse import find_section_by_selectedtext - section = find_section_by_selectedtext(html_path, selectedtext) - if section: - chapter = section + @staticmethod + def get_toc_tree(toc_path): + with open(toc_path, 'r', encoding='utf-8') as f: + soup = BeautifulSoup(f, 'xml') + nav_map = soup.find('navMap') + nav_points = nav_map.find_all('navPoint', recursive=False) + toc_tree = TOCParser.parse_navpoints(nav_points) + return toc_tree + + def build_booksnote(self, bookid=None): + manager = AnnotationManager(self.annotation_db) + annotations = manager.get_annotations(bookid=bookid) + bl_manager = BookListManager(plist_path=self.books_plist) + booksinfo = bl_manager.get_books_info() + booksnote = defaultdict(lambda: defaultdict(dict)) + for assetid, notes in annotations.items(): + bookinfo = booksinfo.get(assetid) + if not bookinfo: + continue + epub_path = bookinfo.get('path') + if not epub_path or not os.path.isdir(epub_path): + continue + opf_path = self.find_file_by_ext(epub_path, ['.opf']) + ncx_path = self.find_file_by_ext(epub_path, ['.ncx']) + if not opf_path or not ncx_path: + continue + id2href = parse_opf(opf_path) + toc_tree = self.get_toc_tree(ncx_path) + for uuid, ann in notes.items(): + idref = ann['idref'] + filepos = ann['filepos'] + href = id2href.get(idref, idref) + chapter = TOCParser.find_label_path(toc_tree, href, filepos) + if chapter is None: + html_path = os.path.join(epub_path, href.split('#')[0]) + selectedtext = ann.get('selectedtext') + if os.path.exists(html_path) and selectedtext: + section = TOCParser.find_section_by_selectedtext(html_path, selectedtext) + if section: + chapter = section + else: + chapter = "(未找到章节)" else: chapter = "(未找到章节)" - else: - chapter = "(未找到章节)" - booksnote[assetid][chapter][uuid] = { - 'creationdate': ann['creationdate'], - 'filepos': filepos, - 'idref': href, - 'note': ann['note'], - 'selectedtext': ann['selectedtext'] - } - return booksnote + booksnote[assetid][chapter][uuid] = { + 'creationdate': ann['creationdate'], + 'filepos': filepos, + 'idref': href, + 'note': ann['note'], + 'selectedtext': ann['selectedtext'] + } + return booksnote -import datetime - -def export_booksnote_to_md(booksnote, booksinfo, out_path=None): - """ - 依据booksnote结构导出markdown文件,格式: - # “笔记导出”+导出时间 - ## 书名 - ### chapter - selectedtext - > note (如果存在) - """ - now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') - lines = [f'# 笔记导出 {now}\n'] - for assetid, chapters in booksnote.items(): - bookname = booksinfo.get(assetid, {}).get('itemname', assetid) - lines.append(f'\n## {bookname}\n') - for chapter, notes in chapters.items(): - lines.append(f'### {chapter}') - for uuid, ann in notes.items(): - sel = ann.get('selectedtext') - note = ann.get('note') - if sel: - lines.append(sel) - if note: - lines.append(f'> {note}') - lines.append('') - md = '\n'.join(lines) - if out_path: - with open(out_path, 'w', encoding='utf-8') as f: - f.write(md) - return md + def export_booksnote_to_md(self, booksnote, booksinfo, out_path=None): + import datetime + now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + lines = [f'# 笔记导出 {now}\n'] + for assetid, chapters in booksnote.items(): + bookname = booksinfo.get(assetid, {}).get('itemname', assetid) + lines.append(f'\n## {bookname}\n') + for chapter, notes in chapters.items(): + lines.append(f'### {chapter}') + for uuid, ann in notes.items(): + sel = ann.get('selectedtext') + note = ann.get('note') + if sel: + lines.append(sel) + if note: + lines.append(f'> {note}') + lines.append('') + md = '\n'.join(lines) + if out_path: + with open(out_path, 'w', encoding='utf-8') as f: + f.write(md) + return md if __name__ == '__main__': import shutil import os.path + from InquirerPy import inquirer # type: ignore + exporter = BookNotesExporter(config) # 自动覆盖 ./data 下的数据库和plist文件,源为iBooks真实路径 src_files = [ (config.IBOOKS_ANNOTATION_DB, config.LOCAL_ANNOTATION_DB), @@ -166,31 +148,19 @@ if __name__ == '__main__': else: print(f'file not found: {src} ') - from booklist_parse import parse_books_plist - from InquirerPy import inquirer # type: ignore - # 先获取所有书籍元数据 - booksinfo = parse_books_plist(config.LOCAL_BOOKS_PLIST) - - # 构建书名列表(优先displayname, 其次itemname, 否则assetid),按parse_books_plist中的date字段排序 + manager = BookListManager(plist_path=config.LOCAL_BOOKS_PLIST, db_path=config.LOCAL_LIBRARY_DB) + booksinfo = manager.get_books_info() assetid2name = {} assetid2lastopen = {} - from booklist_parse import get_books_last_open - - # 获取所有书籍的最后打开时间(字典,值为{'last_open': 时间戳}) - last_open_times = get_books_last_open(config.LOCAL_LIBRARY_DB) - + last_open_times = manager.get_books_last_open() for assetid, info in booksinfo.items(): name = info.get('displayname') or info.get('itemname') or assetid - # 如果书名中包含“-”,只取“-”前面的部分 if '-' in name: name = name.split('-', 1)[0].strip() assetid2name[assetid] = name - # 用 get_books_last_open 返回的时间戳排序,如无则为0 ts = last_open_times.get(assetid, {}).get('last_open', 0) assetid2lastopen[assetid] = ts - - # 按last_open时间戳降序排列 sorted_assetids = sorted(assetid2name.keys(), key=lambda aid: assetid2lastopen[aid], reverse=True) choices = [f"{assetid2name[aid]} [{assetid2lastopen[aid]}]" for aid in sorted_assetids] if not choices: @@ -202,8 +172,6 @@ if __name__ == '__main__': multiselect=False, instruction="上下键选择,输入可模糊筛选,回车确定" ).execute() - - # 解析选中assetid for aid, name in assetid2name.items(): if answer.startswith(name): selected_assetid = aid @@ -211,10 +179,8 @@ if __name__ == '__main__': else: print("未找到选中书籍") exit(1) - - # 只导出选中书的笔记 - selected_booksnote = build_booksnote(bookid=selected_assetid) + selected_booksnote = exporter.build_booksnote(bookid=selected_assetid) selected_booksinfo = {selected_assetid: booksinfo.get(selected_assetid, {})} out_path = f'export_notes/notes_export_{selected_assetid}.md' - export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path) + exporter.export_booksnote_to_md(selected_booksnote, selected_booksinfo, out_path) print(f'《{selected_booksinfo[selected_assetid].get("displayname") or selected_booksinfo[selected_assetid].get("itemname") or selected_assetid}》 导出笔记 {out_path}') diff --git a/opf_parse.py b/opf_parse.py index b711311..08aece5 100644 --- a/opf_parse.py +++ b/opf_parse.py @@ -1,38 +1,46 @@ - -# parseopf.py -# ----------------------------- -# 用于解析EPUB电子书的OPF文件,提取manifest部分所有id对应的html文件href。 -# 支持批量测试和通过id快速查找href。 -# 依赖:BeautifulSoup4 -# ----------------------------- - -from collections import defaultdict -from bs4 import BeautifulSoup -import pprint - - def parse_opf(filepath): """ - 解析OPF文件,返回{id: href}的defaultdict(dict)结构。 - 仅保留href以.html结尾的项。 - - 参数: - filepath (str): OPF文件路径 - 返回: - defaultdict(dict): id到href的映射(仅html文件) + 兼容旧代码的顶层函数,实际调用 OPFParser.parse_opf。 """ - result = defaultdict(dict) - with open(filepath, 'r', encoding='utf-8') as f: - soup = BeautifulSoup(f, 'xml') - # 查找manifest部分,遍历所有item,筛选html结尾的href - manifest = soup.find('manifest') - if manifest: - for item in manifest.find_all('item'): - id_ = item.get('id') - href = item.get('href') - if id_ and href and href.strip().lower().endswith('html'): - result[id_] = href - return result + return OPFParser.parse_opf(filepath) + +""" +opf_parse.py (OOP版) +------------------- +功能: + - 解析EPUB电子书的OPF文件,提取manifest部分所有id对应的html文件href。 + - 支持通过id快速查找href。 + - 支持批量测试。 +依赖:BeautifulSoup4 +主要接口:OPFParser + - parse_opf(filepath):静态方法,返回id->href映射(仅html文件)。 +""" +from collections import defaultdict +from bs4 import BeautifulSoup + +class OPFParser: + @staticmethod + def parse_opf(filepath): + """ + 解析OPF文件,返回{id: href}的defaultdict(dict)结构。 + 仅保留href以.html结尾的项。 + 参数: + filepath (str): OPF文件路径 + 返回: + defaultdict(dict): id到href的映射(仅html文件) + """ + result = defaultdict(dict) + with open(filepath, 'r', encoding='utf-8') as f: + soup = BeautifulSoup(f, 'xml') + manifest = soup.find('manifest') + if manifest: + for item in manifest.find_all('item'): + id_ = item.get('id') + href = item.get('href') + if id_ and href and href.strip().lower().endswith('html'): + result[id_] = href + return result + if __name__ == "__main__": test_files = [ @@ -44,8 +52,7 @@ if __name__ == "__main__": for file in test_files: print(f"\n==== 测试文件: {file} ====") try: - result = parse_opf(file) - pprint.pprint(result, indent=2, width=120, sort_dicts=False) + result = OPFParser.parse_opf(file) # 增加通过id快速打印href的测试 test_ids = list(result.keys())[:3] # 取前三个id做演示 diff --git a/toc_parse.py b/toc_parse.py index 426e619..c1a7a55 100644 --- a/toc_parse.py +++ b/toc_parse.py @@ -1,6 +1,7 @@ + """ -toc_parse.py ------------- +toc_parse.py (OOP版) +------------------- 功能: - 解析EPUB电子书的toc.ncx目录文件,递归构建章节树结构。 - 支持通过ref和filepos查找完整label路径。 @@ -8,166 +9,120 @@ toc_parse.py - 兼容多种EPUB格式,支持批量测试。 依赖:config.py 统一管理路径和配置项。 -主要接口: - parse_navpoints(navpoints) # 递归解析navPoint节点,返回章节树结构。 - find_label_path(node, ref, filepos, path) # 查找指定ref和filepos的章节label路径。 - find_section_by_selectedtext(html_path, selectedtext) # 通过选中文本定位章节标题。 - parse_html_title(html_path) # 解析html文件标题。 +主要接口:TOCParser + - parse_navpoints(navpoints):递归解析navPoint节点,返回章节树结构。 + - find_label_path(node, ref, filepos, path):查找指定ref和filepos的章节label路径。 + - find_section_by_selectedtext(html_path, selectedtext):通过选中文本定位章节标题。 + - parse_html_title(html_path):解析html文件标题。 依赖:BeautifulSoup4, pprint, os, typing """ import config - - from bs4 import BeautifulSoup -from typing import Dict, Optional, List, Any -import pprint +import os -# ==== 辅助函数:根据selectedtext在html文件中的位置推断所在章节 ==== -def find_section_by_selectedtext(html_path, selectedtext): - """ - 在html文件中查找selectedtext出现的位置,向上回溯最近的h1-h6标题,返回该标题文本。 - 若未找到标题,则返回None。 - """ - try: - with open(html_path, 'r', encoding='utf-8') as f: - soup = BeautifulSoup(f, 'html.parser') - # 在所有文本节点中查找selectedtext - for elem in soup.find_all(string=True): - if selectedtext and selectedtext.strip() and selectedtext.strip() in elem: - # 回溯父节点,查找最近的h1-h6 - parent = elem.parent - while parent: - prev = parent.previous_sibling - # 向上查找同级前面的h1-h6 - while prev: - if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']: - return prev.get_text(strip=True) - prev = prev.previous_sibling - parent = parent.parent - # 若未找到,尝试全局第一个h1-h6 - for tag in ['h1','h2','h3','h4','h5','h6']: - h = soup.find(tag) - if h and h.get_text(strip=True): - return h.get_text(strip=True) - except Exception: +class TOCParser: + def __init__(self): pass - return None -def parse_html_title(html_path): - """ - 解析html文件,优先返回,否则返回body第一个h1/h2/h3/h4/h5/h6或None。 - """ - try: - with open(html_path, 'r', encoding='utf-8') as f: - soup = BeautifulSoup(f, 'html.parser') - # 优先<title> - if soup.title and soup.title.string: - return soup.title.string.strip() - # 其次正文第一个h1-h6 - for tag in ['h1','h2','h3','h4','h5','h6']: - h = soup.find(tag) - if h and h.get_text(strip=True): - return h.get_text(strip=True) - except Exception: - pass - return None + @staticmethod + def find_section_by_selectedtext(html_path, selectedtext): + try: + with open(html_path, 'r', encoding='utf-8') as f: + soup = BeautifulSoup(f, 'html.parser') + for elem in soup.find_all(string=True): + if selectedtext and selectedtext.strip() and selectedtext.strip() in elem: + parent = elem.parent + while parent: + prev = parent.previous_sibling + while prev: + if prev.name and prev.name.lower() in ['h1','h2','h3','h4','h5','h6']: + return prev.get_text(strip=True) + prev = prev.previous_sibling + parent = parent.parent + for tag in ['h1','h2','h3','h4','h5','h6']: + h = soup.find(tag) + if h and h.get_text(strip=True): + return h.get_text(strip=True) + except Exception: + pass + return None -def parse_navpoints(navpoints) -> Dict[str, dict]: - """ - 递归解析 navpoints 节点,返回嵌套 dict 结构。 - :param navpoints: BeautifulSoup 查找到的 navPoint 节点列表 - :return: 章节树结构 - """ - result = {} - for navpoint in navpoints: - label = navpoint.navLabel.text.strip().strip('"“”') - src = navpoint.content["src"] - if "#" in src: - ref, filepos = src.split("#", 1) - else: - ref, filepos = src, None - entry = { - "label": label, - "ref": ref, - "filepos": filepos, - "children": parse_navpoints(navpoint.find_all("navPoint", recursive=False)) - } - result[navpoint.get("id")] = entry + @staticmethod + def parse_html_title(html_path): + try: + with open(html_path, 'r', encoding='utf-8') as f: + soup = BeautifulSoup(f, 'html.parser') + if soup.title and soup.title.string: + return soup.title.string.strip() + for tag in ['h1','h2','h3','h4','h5','h6']: + h = soup.find(tag) + if h and h.get_text(strip=True): + return h.get_text(strip=True) + except Exception: + pass + return None - #pprint.pprint(result) # 格式化打印result + @staticmethod + def parse_navpoints(navpoints): + result = {} + for navpoint in navpoints: + label = navpoint.navLabel.text.strip().strip('"“”') + src = navpoint.content["src"] + if "#" in src: + ref, filepos = src.split("#", 1) + else: + ref, filepos = src, None + entry = { + "label": label, + "ref": ref, + "filepos": filepos, + "children": TOCParser.parse_navpoints(navpoint.find_all("navPoint", recursive=False)) + } + result[navpoint.get("id")] = entry + return result - return result - -def find_label_path( - node: Any, - ref: str, - filepos: Optional[str] = None, - path: Optional[List[str]] = None -) -> Optional[str]: - """ - 在嵌套 dict 结构中查找指定 ref 和 filepos 的 label 路径。 - :param node: 当前节点(dict 或 dict集合) - :param ref: html文件名 - :param filepos: 文件位置,可为 None - :param path: label 路径累积 - :return: 以 / 分隔的完整 label 路径,未找到返回 None - """ - if path is None: - path = [] - if isinstance(node, dict): - nodes = node.values() if "label" not in node else [node] - # 1. 优先精确匹配ref和filepos - for v in nodes: - if "label" in v: - new_path = path + [v["label"]] - if v["ref"] == ref and (filepos is None or v["filepos"] == filepos): - title = " / ".join(new_path) - #print(f'title ref={ref} filepos={filepos} -> {title}') #DBG - return title - title = find_label_path(v["children"], ref, filepos, new_path) - if title: - #print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG - return title - - # 2. 如果带filepos查找失败,回退到同ref下第一个章节(即只要ref匹配就返回) - if filepos is not None: + @staticmethod + def find_label_path(node, ref, filepos=None, path=None): + if path is None: + path = [] + if isinstance(node, dict): + nodes = node.values() if "label" not in node else [node] for v in nodes: if "label" in v: new_path = path + [v["label"]] - # print(f"对比 {v['ref']} == {ref}") - if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]: + if v["ref"] == ref and (filepos is None or v["filepos"] == filepos): title = " / ".join(new_path) - #print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG return title - title = find_label_path(v["children"], ref, None, new_path) - if title: - #print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG - return title - - # 3. 若完全未找到,尝试直接解析idref所指html文件标题,获取章节label信息 - # 仅在顶层调用时执行此逻辑 - if path == [] and ref and ref.endswith('.html'): - import os - # 自动在常见目录下查找html文件(以toc文件目录为基准) - caller_dir = os.path.dirname(os.path.abspath(__file__)) - search_dirs = [caller_dir, os.getcwd()] - for d in search_dirs: - html_path = os.path.join(d, ref) - #print(f"查找 {html_path}") - if os.path.isfile(html_path): - title = parse_html_title(html_path) - if title: - return title - # 递归查找(以toc文件目录为根) - for d in search_dirs: - for root, _, files in os.walk(d): - if ref in files: - html_path = os.path.join(root, ref) - #print(f"2 查找 {html_path}") - title = parse_html_title(html_path) + title = TOCParser.find_label_path(v["children"], ref, filepos, new_path) if title: return title - return None + if filepos is not None: + for v in nodes: + if "label" in v: + new_path = path + [v["label"]] + if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]: + title = " / ".join(new_path) + return title + title = TOCParser.find_label_path(v["children"], ref, None, new_path) + if title: + return title + if path == [] and ref and ref.endswith('.html'): + caller_dir = os.path.dirname(os.path.abspath(__file__)) + search_dirs = [caller_dir, os.getcwd()] + for d in search_dirs: + html_path = os.path.join(d, ref) + if os.path.isfile(html_path): + title = TOCParser.parse_html_title(html_path) + if title: + return title + for d in search_dirs: + for root, _, files in os.walk(d): + if ref in files: + html_path = os.path.join(root, ref) + title = TOCParser.parse_html_title(html_path) + if title: + return title + return None if __name__ == "__main__": # ==== 批量测试指定toc/html/filepos列表 ==== @@ -182,8 +137,6 @@ if __name__ == "__main__": [config.EXAMPLES_DIR + "/政治哲學的12堂Podcast", "ch1.xhtml#_idParaDest-4", ""], ] for epub_dir, html_file, filepos in test_cases: - # 自动查找epub目录下的toc.ncx - import os toc_path = None for root, _, files in os.walk(epub_dir): for f in files: @@ -200,39 +153,32 @@ if __name__ == "__main__": with open(toc_path, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "xml") nav_map = soup.find("navMap") - toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) - label_path = find_label_path(toc_tree, html_file, filepos) + toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False)) + label_path = TOCParser.find_label_path(toc_tree, html_file, filepos) print(f"find_label_path: {label_path if label_path else '未找到章节/标题'}") - - # tocb中不存在html,直接测试parse_html_title html_path = os.path.join(epub_dir, html_file.split('#')[0]) if os.path.exists(html_path): - title = parse_html_title(html_path) + title = TOCParser.parse_html_title(html_path) print(f"解析html标题: {html_path} => {title if title else '未找到标题'}") - # 新增:根据selectedtext定位章节标题 selectedtext = '从变法思想看,王安石变法最大的魅力是“民不加赋而国用足”:老百姓上缴的税率不增,国库的总收入仍可以' - section = find_section_by_selectedtext(html_path, selectedtext) + section = TOCParser.find_section_by_selectedtext(html_path, selectedtext) print(f"selectedtext定位到的章节标题: {section if section else '未找到相关标题'}") else: print(f"未找到html文件: {html_path}") except Exception as e: print(f"测试失败: {e}") - # ==== 新增:测试变宋笔记章节定位和html标题解析 ==== print("\n==== 测试: 变宋笔记章节定位和html标题解析 ====") - # 假设笔记数据如下 note_idref = 'text/part0002_split_003.html' note_filepos = None - # 变宋toc.ncx路径 bian_song_toc = config.EXAMPLES_DIR + "/变宋/toc.ncx" - import os if os.path.exists(bian_song_toc): with open(bian_song_toc, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "xml") nav_map = soup.find("navMap") - toc_tree = parse_navpoints(nav_map.find_all("navPoint", recursive=False)) - # 先尝试用find_label_path查找章节 - label_path = find_label_path(toc_tree, note_idref, note_filepos) + toc_tree = TOCParser.parse_navpoints(nav_map.find_all("navPoint", recursive=False)) + label_path = TOCParser.find_label_path(toc_tree, note_idref, note_filepos) print(f"查找 {note_idref}: ", label_path if label_path else "未找到章节,尝试解析html标题") else: print(f"未找到toc.ncx: {bian_song_toc}") +