iBook/opf_parse.py

58 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

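# opf_parse.py
# -----------------------------
# Parses the OPF file of an EPUB e-book and extracts, for every item id in the
# manifest section, the href of the corresponding HTML file.
# Supports batch testing and quick href lookup by id.
# Depends on BeautifulSoup4.
# -----------------------------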
from collections import defaultdict
from bs4 import BeautifulSoup
import pprint
def parse_opf(filepath):
    """
    Parse an EPUB OPF file and map manifest item ids to HTML hrefs.

    Only ``<item>`` elements found under the first ``<manifest>`` element are
    considered. An item is kept when it has a non-empty ``id`` and an ``href``
    whose whitespace-stripped, lower-cased value ends with ``.html`` or
    ``.xhtml``. If the same id occurs more than once, the last one wins.

    Args:
        filepath (str): Path to the OPF file. The file is read as UTF-8 text.

    Returns:
        defaultdict(dict): Mapping of item id to href (surrounding whitespace
        removed). The mapping is empty when there is no manifest. Because the
        container is a ``defaultdict(dict)``, indexing it with an unknown id
        returns (and stores) an empty dict instead of raising ``KeyError``;
        use ``.get()`` or ``in`` to probe for an id without side effects.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Match on a real extension (leading dot) so that an extension-less name
    # that merely ends in "html" is not mistaken for an HTML document.
    html_suffixes = ('.html', '.xhtml')

    result = defaultdict(dict)
    with open(filepath, 'r', encoding='utf-8') as f:
        # The document is fully parsed here, so the file can be closed
        # before the tree is walked.
        soup = BeautifulSoup(f, 'xml')

    manifest = soup.find('manifest')
    if manifest is None:
        return result

    for item in manifest.find_all('item'):
        item_id = item.get('id')
        # Normalise once so the value that is tested is the value stored.
        href = (item.get('href') or '').strip()
        if item_id and href.lower().endswith(html_suffixes):
            result[item_id] = href
    return result
if __name__ == "__main__":
    # Sample OPF paths to run through the parser as a batch.
    sample_paths = (
        './examples/epub_format_2/OEBPS/content.opf',
        './examples/epub_format_4/item/standard.opf',
        './examples/epub_format_3/OEBPS/content.opf',
        './examples/epub_format_1/content.opf',
    )
    for opf_path in sample_paths:
        print(f"\n==== 测试文件: {opf_path} ====")
        try:
            id_to_href = parse_opf(opf_path)
            pprint.pprint(id_to_href, indent=2, width=120, sort_dicts=False)
            # Demonstrate lookup by id using the first three ids.
            for sample_id in list(id_to_href)[:3]:
                print(f"id: {sample_id} -> href: {id_to_href[sample_id]}")
        except Exception as err:
            # Report the failure and continue with the remaining files.
            print(f"解析失败: {err}")