Initial commit

2025-08-06 13:11:08 +08:00
commit 2f2f98cea1
723 changed files with 69242 additions and 0 deletions
--- a/opf_parse.py
+++ b/opf_parse.py
@@ -0,0 +1,57 @@
+
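+# opf_parse.py
+# -----------------------------
+# Parses the OPF file of an EPUB e-book and extracts, from its manifest section, the href of the HTML file for each item id.
+# Supports batch testing and quick href lookup by id.
+# Dependencies: BeautifulSoup4 (the 'xml' parser it uses also requires lxml)
+# -----------------------------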
+
+from collections import defaultdict
+from bs4 import BeautifulSoup
+import pprint
+
+
+def parse_opf(filepath: str) -> defaultdict:
+    """Map manifest item ids to the hrefs of HTML documents in an OPF file.
+
+    Keeps each manifest ``<item>`` that has an ``id`` and an ``href`` whose
+    stripped, lower-cased value ends with ``html`` (``.html``, ``.xhtml``).
+
+    Args:
+        filepath: Path to the OPF file.
+    Returns:
+        ``defaultdict(dict)`` of id -> whitespace-stripped href, empty without
+        a manifest. Indexing a missing id inserts ``{}``; probe with ``.get()``.
+    """
+    result = defaultdict(dict)
+    # Binary mode lets BeautifulSoup detect the encoding instead of assuming UTF-8.
+    with open(filepath, 'rb') as f:
+        soup = BeautifulSoup(f, 'xml')
+    manifest = soup.find('manifest')
+    for item in (manifest.find_all('item') if manifest else ()):
+        id_, href = item.get('id'), (item.get('href') or '').strip()
+        if id_ and href.lower().endswith('html'):
+            result[id_] = href
+    return result
+
+if __name__ == "__main__":
+    # Demo driver: parse each bundled sample OPF and dump its id -> href mapping.
+    sample_paths = (
+        './examples/epub_format_2/OEBPS/content.opf',
+        './examples/epub_format_4/item/standard.opf',
+        './examples/epub_format_3/OEBPS/content.opf',
+        './examples/epub_format_1/content.opf',
+    )
+    for opf_path in sample_paths:
+        print(f"\n==== 测试文件: {opf_path} ====")
+        try:
+            mapping = parse_opf(opf_path)
+            pprint.pprint(mapping, indent=2, width=120, sort_dicts=False)
+            # Exercise direct lookup by id on the first three entries only.
+            for position, item_id in enumerate(mapping):
+                if position >= 3:
+                    break
+                print(f"id: {item_id} -> href: {mapping[item_id]}")
+        except Exception as err:  # report and continue with the next sample
+            print(f"解析失败: {err}")
+