diff --git a/__pycache__/toc_parse.cpython-312.pyc b/__pycache__/toc_parse.cpython-312.pyc index 01f4f7b..9e5f492 100644 Binary files a/__pycache__/toc_parse.cpython-312.pyc and b/__pycache__/toc_parse.cpython-312.pyc differ diff --git a/annotationdata.py b/annotationdata.py index ef1a192..5fa5343 100644 --- a/annotationdata.py +++ b/annotationdata.py @@ -1,3 +1,18 @@ +""" +annotationdata.py +----------------- +功能: + - 解析iBooks的AEAnnotation.sqlite数据库,提取所有或指定书籍(assetid/bookid)的笔记。 + - 提供parse_location辅助函数,解析笔记定位信息。 + - 返回结构化的annotations数据,便于后续章节定位与导出。 + +主要接口: + - get_annotations(db_path, bookid=None):返回所有或指定assetid的笔记,结构为{assetid: {uuid: {...}}} + - parse_location(location):解析ZANNOTATIONLOCATION,返回(idref, filepos) + +依赖:sqlite3, collections, re, os, datetime +""" + import sqlite3 from collections import defaultdict import re diff --git a/booklist_parse.py b/booklist_parse.py index 7342f65..17b0444 100644 --- a/booklist_parse.py +++ b/booklist_parse.py @@ -1,3 +1,20 @@ +""" +booklist_parse.py +----------------- +功能: + - 解析iBooks的Books.plist,提取所有书籍元数据(书名、作者、路径、时间等)。 + - 解析BKLibrary.sqlite,获取每本书的最近打开时间(苹果时间戳,基准2001-01-01)。 + +主要接口: + - parse_books_plist(plist_path):返回所有书籍元数据,结构为{bk_id: {...}} + - get_books_last_open(db_path):返回所有书籍最近打开时间,结构为{bk_id: {'last_open': 时间戳}} + +依赖:plistlib, collections, sqlite3, os, datetime + +典型用法: + booksinfo = parse_books_plist('./data/Books.plist') + books_open = get_books_last_open('data/BKLibrary.sqlite') +""" import plistlib from collections import defaultdict diff --git a/data/AEAnnotation.sqlite b/data/AEAnnotation.sqlite index 962b917..c1a9fdd 100644 Binary files a/data/AEAnnotation.sqlite and b/data/AEAnnotation.sqlite differ diff --git a/data/AEAnnotation.sqlite-shm b/data/AEAnnotation.sqlite-shm deleted file mode 100644 index ee3a182..0000000 Binary files a/data/AEAnnotation.sqlite-shm and /dev/null differ diff --git a/data/AEAnnotation.sqlite-wal b/data/AEAnnotation.sqlite-wal deleted file mode 100644 index 2771830..0000000 Binary files a/data/AEAnnotation.sqlite-wal and /dev/null differ diff --git a/data/Books.plist b/data/Books.plist index e7588e3..b4f1fd4 100644 Binary files a/data/Books.plist and b/data/Books.plist differ diff --git a/export_notes/notes_export_474FB2345D27062AE3A4DF339A30498E.md b/export_notes/notes_export_474FB2345D27062AE3A4DF339A30498E.md index c2971aa..fc95f80 100644 --- a/export_notes/notes_export_474FB2345D27062AE3A4DF339A30498E.md +++ b/export_notes/notes_export_474FB2345D27062AE3A4DF339A30498E.md @@ -1,4 +1,4 @@ -# 笔记导出 2025-08-12 15:03 +# 笔记导出 2025-08-12 19:48 ## 明夷待访录·破邪论(精)--中华经典名著全本全注全译 (中华书局) diff --git a/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md b/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md index 47bdc14..0f75c0e 100644 --- a/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md +++ b/export_notes/notes_export_B18FCD9F90FD43C2373AE52BAEF9A77C.md @@ -1,4 +1,4 @@ -# 笔记导出 2025-08-12 15:02 +# 笔记导出 2025-08-12 21:16 ## 传统十论 diff --git a/exportbooknotes.py b/exportbooknotes.py index b1934e4..a66214a 100644 --- a/exportbooknotes.py +++ b/exportbooknotes.py @@ -1,4 +1,29 @@ """ +exportbooknotes.py +------------------ +功能: + - 自动同步iBooks数据库和元数据文件到本地data目录。 + - 解析AEAnnotation.sqlite、Books.plist、BKLibrary.sqlite,构建结构化笔记数据。 + - 解析epub目录和章节信息,定位每条笔记所属章节。 + - 命令行菜单按最近打开时间降序展示书籍列表,供用户选择导出。 + - 仅导出选中书籍的所有笔记,按章节分组,生成Markdown文件。 + +主要数据流: + 1. 数据同步到data目录 + 2. 解析Books.plist获取书籍元数据 + 3. 解析BKLibrary.sqlite获取最近打开时间 + 4. 菜单排序与显示(书名+时间戳) + 5. 解析AEAnnotation.sqlite获取笔记 + 6. 解析epub目录,定位章节 + 7. 导出Markdown文件 + +依赖:Python 3, InquirerPy, bs4, shutil, os, datetime, sqlite3 + +典型用法: + python exportbooknotes.py + # 按提示选择书籍,自动导出笔记到export_notes目录 +""" +""" 自动生成 booksnote 数据结构: booksnote = { assetid: { label_path: { uuid: { @@ -17,6 +42,7 @@ from booklist_parse import parse_books_plist from opf_parse import parse_opf from toc_parse import parse_navpoints, find_label_path from bs4 import BeautifulSoup +from pprint import pprint def find_file_by_ext(root, exts): """在root下递归查找第一个指定后缀的文件""" @@ -31,7 +57,11 @@ def get_toc_tree(toc_path): with open(toc_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'xml') nav_map = soup.find('navMap') - return parse_navpoints(nav_map.find_all('navPoint', recursive=False)) + + nav_points = nav_map.find_all('navPoint', recursive=False) + toc_tree = parse_navpoints(nav_points) + #pprint(toc_tree, indent=2, depth=5) + return toc_tree def build_booksnote(annotation_db='data/AEAnnotation.sqlite', books_plist='data/Books.plist', bookid=None): # 支持只处理特定 assetid 的笔记 @@ -132,7 +162,7 @@ if __name__ == '__main__': print(f'file not found: {src} ') from booklist_parse import parse_books_plist - from InquirerPy import inquirer + from InquirerPy import inquirer # type: ignore # 先获取所有书籍元数据 booksinfo = parse_books_plist('data/Books.plist') diff --git a/toc_parse.py b/toc_parse.py index 30bfc3e..0bd5256 100644 --- a/toc_parse.py +++ b/toc_parse.py @@ -109,10 +109,14 @@ def find_label_path( if "label" in v: new_path = path + [v["label"]] if v["ref"] == ref and (filepos is None or v["filepos"] == filepos): - return " / ".join(new_path) - found = find_label_path(v["children"], ref, filepos, new_path) - if found: - return found + title = " / ".join(new_path) + #print(f'title ref={ref} filepos={filepos} -> {title}') #DBG + return title + title = find_label_path(v["children"], ref, filepos, new_path) + if title: + #print(f'title1 ref={ref} filepos={filepos} -> {title}') #DBG + return title + # 2. 如果带filepos查找失败,回退到同ref下第一个章节(即只要ref匹配就返回) if filepos is not None: for v in nodes: @@ -120,10 +124,13 @@ def find_label_path( new_path = path + [v["label"]] # print(f"对比 {v['ref']} == {ref}") if v["ref"].split("#", 1)[0] == ref.split("#", 1)[0]: - return " / ".join(new_path) - found = find_label_path(v["children"], ref, None, new_path) - if found: - return found + title = " / ".join(new_path) + #print(f'title3 ref={ref} filepos={filepos} -> {title}') #DBG + return title + title = find_label_path(v["children"], ref, None, new_path) + if title: + #print(f'title4 ref={ref} filepos={filepos} -> {title}') #DBG + return title # 3. 若完全未找到,尝试直接解析idref所指html文件标题,获取章节label信息 # 仅在顶层调用时执行此逻辑