iBook/opf_parse.py

65 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

def parse_opf(filepath):
    """Module-level entry point kept for compatibility with older code.

    Forwards ``filepath`` unchanged to ``OPFParser.parse_opf`` and returns
    whatever that call returns.
    """
    mapping = OPFParser.parse_opf(filepath)
    return mapping
"""
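opf_parse.py (OOP version)
--------------------------
Features:
- Parse the OPF file of an EPUB e-book and extract, for every id in the
  manifest section, the href of the corresponding HTML file.
- Support fast href lookup by id.
- Support batch testing.
Depends on: BeautifulSoup4
Main interface: OPFParser
- parse_opf(filepath): static method; returns an id -> href mapping (HTML files only).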
"""
from collections import defaultdict
from bs4 import BeautifulSoup
class OPFParser:
    """Namespace for reading the manifest of an EPUB OPF package file."""

    @staticmethod
    def parse_opf(filepath):
        """Parse an OPF file and map manifest item ids to their HTML hrefs.

        Only ``<item>`` elements found under the first ``<manifest>`` element
        that carry a non-empty ``id`` and an ``href`` ending in ``html`` are
        kept. The suffix check is case-insensitive and ignores surrounding
        whitespace, so both ``.html`` and ``.xhtml`` documents match.

        Args:
            filepath: Path of the OPF file to read.

        Returns:
            defaultdict(dict): ``{id: href}`` for the matching items, with the
            href stored without surrounding whitespace. The mapping is empty
            when the document has no ``<manifest>`` element. Because the
            container is a ``defaultdict(dict)``, indexing a missing id with
            ``[]`` inserts and returns an empty dict; use ``in`` or ``.get()``
            to probe for an id without modifying the mapping.

        Raises:
            OSError: If the file cannot be opened.
        """
        result = defaultdict(dict)
        # Open in binary mode so the XML parser can honour the BOM / encoding
        # declaration of the document instead of assuming UTF-8.
        with open(filepath, 'rb') as f:
            soup = BeautifulSoup(f, 'xml')
        manifest = soup.find('manifest')
        if manifest is None:
            return result
        for item in manifest.find_all('item'):
            id_ = item.get('id')
            href = item.get('href')
            if not id_ or not href:
                continue
            # Store the same trimmed value that the suffix test is applied to,
            # so an accepted href is directly usable as a path.
            href = href.strip()
            if href.lower().endswith('html'):
                result[id_] = href
        return result
if __name__ == "__main__":
    # Sample OPF paths used for a quick manual check of the parser.
    test_files = [
        './examples/epub_format_2/OEBPS/content.opf',
        './examples/epub_format_4/item/standard.opf',
        './examples/epub_format_3/OEBPS/content.opf',
        './examples/epub_format_1/content.opf',
    ]
    for opf_path in test_files:
        print(f"\n==== 测试文件: {opf_path} ====")
        try:
            id_to_href = OPFParser.parse_opf(opf_path)
            # Demonstrate href lookup by id using at most the first three ids.
            for sample_id in list(id_to_href)[:3]:
                print(f"id: {sample_id} -> href: {id_to_href[sample_id]}")
        except Exception as err:
            # Report the failure and continue with the next sample file.
            print(f"解析失败: {err}")