Initial commit

This commit is contained in:
douboer
2025-08-06 13:11:08 +08:00
commit 2f2f98cea1
723 changed files with 69242 additions and 0 deletions

57
opf_parse.py Normal file
View File

@@ -0,0 +1,57 @@
# opf_parse.py
# -----------------------------
# 用于解析EPUB电子书的OPF文件提取manifest部分所有id对应的html文件href。
# 支持批量测试和通过id快速查找href。
# 依赖BeautifulSoup4
# -----------------------------
from collections import defaultdict
from bs4 import BeautifulSoup
import pprint
def parse_opf(filepath: str) -> defaultdict:
    """
    Parse an EPUB OPF file and map manifest item ids to their HTML hrefs.

    Only ``<item>`` elements inside the first ``<manifest>`` element are
    considered, and only those carrying both an ``id`` and an ``href`` whose
    value ends with ``html`` (case-insensitive, surrounding whitespace
    ignored). The suffix test has no leading dot, so ``.xhtml`` entries are
    kept as well as ``.html`` ones.

    Args:
        filepath: Path to the OPF file.

    Returns:
        A ``defaultdict(dict)`` mapping each item id to its whitespace-trimmed
        href. Because it is a defaultdict, indexing with an unknown id returns
        (and stores) an empty dict instead of raising ``KeyError``; use
        ``.get()`` or ``in`` to probe for ids. The mapping is empty when the
        document has no ``<manifest>`` element.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    result = defaultdict(dict)
    # Read raw bytes so the parser can honour a BOM or the encoding named in
    # the XML declaration instead of assuming the file is UTF-8.
    with open(filepath, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    manifest = soup.find('manifest')
    if manifest is None:
        return result
    for item in manifest.find_all('item'):
        id_ = item.get('id')
        href = item.get('href')
        if not id_ or not href:
            continue
        # Store the trimmed value so the result matches what the filter tested.
        href = href.strip()
        if href.lower().endswith('html'):
            result[id_] = href
    return result
if __name__ == "__main__":
    # Sample OPF paths to run the parser against.
    sample_paths = (
        './examples/epub_format_2/OEBPS/content.opf',
        './examples/epub_format_4/item/standard.opf',
        './examples/epub_format_3/OEBPS/content.opf',
        './examples/epub_format_1/content.opf',
    )
    for opf_path in sample_paths:
        print(f"\n==== 测试文件: {opf_path} ====")
        try:
            mapping = parse_opf(opf_path)
            pprint.pprint(mapping, indent=2, width=120, sort_dicts=False)
            # Demonstrate direct id -> href lookup using the first three ids.
            for item_id in list(mapping)[:3]:
                print(f"id: {item_id} -> href: {mapping[item_id]}")
        except Exception as err:
            # Report the failure and keep going with the remaining samples.
            print(f"解析失败: {err}")