# iBook/opf_parse.py

def parse_opf(filepath):
    """Module-level entry point kept for compatibility with older code.

    Delegates to ``OPFParser.parse_opf`` and returns its result unchanged.

    Args:
        filepath: Path of the OPF file, passed through as-is.

    Returns:
        Whatever ``OPFParser.parse_opf(filepath)`` returns.
    """
    mapping = OPFParser.parse_opf(filepath)
    return mapping

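"""
opf_parse.py (OOP version)
--------------------------
Purpose:
    - Parse the OPF file of an EPUB e-book and extract, for every id in the
      manifest section, the href of the corresponding HTML file.
    - Support fast lookup of an href by its id.
    - Support batch testing.
Dependency: BeautifulSoup4 (the 'xml' parser it is asked for here is provided by lxml)
Main interface: OPFParser
    - parse_opf(filepath): static method; returns an id -> href mapping (HTML files only).
"""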
from collections import defaultdict
from bs4 import BeautifulSoup

class OPFParser:
    """Namespace for reading the manifest of an EPUB OPF file."""

    @staticmethod
    def parse_opf(filepath):
        """Parse an OPF file and map manifest item ids to their hrefs.

        Only ``<item>`` elements inside the first ``<manifest>`` element are
        considered. An item is kept when it has a non-empty ``id`` and
        ``href`` and the href, after stripping whitespace and lowercasing,
        ends with ``html`` (so both ``.html`` and ``.xhtml`` match). The href
        is stored exactly as it appears in the file, without stripping.

        Args:
            filepath (str): Path of the OPF file; it is read as UTF-8 text.

        Returns:
            defaultdict(dict): id -> href mapping. Because the container is a
            ``defaultdict(dict)``, looking up an unknown id yields (and
            inserts) an empty dict instead of raising ``KeyError``.
        """
        id_to_href = defaultdict(dict)

        with open(filepath, 'r', encoding='utf-8') as opf_file:
            document = BeautifulSoup(opf_file, 'xml')

        manifest_tag = document.find('manifest')
        if not manifest_tag:
            # No manifest section: nothing to collect.
            return id_to_href

        for entry in manifest_tag.find_all('item'):
            item_id = entry.get('id')
            target = entry.get('href')
            if not item_id or not target:
                continue
            if not target.strip().lower().endswith('html'):
                continue
            id_to_href[item_id] = target

        return id_to_href


if __name__ == "__main__":
    # Sample OPF files used for a quick manual batch run.
    test_files = [
        './examples/epub_format_2/OEBPS/content.opf',
        './examples/epub_format_4/item/standard.opf',
        './examples/epub_format_3/OEBPS/content.opf',
        './examples/epub_format_1/content.opf',
    ]
    for file in test_files:
        print(f"\n==== 测试文件: {file} ====")
        try:
            mapping = OPFParser.parse_opf(file)

            # Demonstrate id -> href lookup using the first three ids only.
            for sample_id in list(mapping)[:3]:
                sample_href = mapping[sample_id]
                print(f"id: {sample_id} -> href: {sample_href}")

        except Exception as e:
            # Report the failure and continue with the next sample file.
            print(f"解析失败: {e}")