# iBook/epub_cfi_parser.py

"""
EPUB CFI (Canonical Fragment Identifier) 解析器

基于 IDPF EPUB CFI 规范：https://idpf.org/epub/linking/cfi/epub-cfi.html
用于正确排序 iBooks 标注，按照 EPUB 文档中的真实位置顺序
"""

import re
from typing import List, Tuple, Optional


class EpubCFIParser:
    """Parse and order EPUB CFIs (Canonical Fragment Identifiers).

    Intended for the ``ZANNOTATIONLOCATION`` values of iBooks annotations so
    that they can be sorted in reading order.  Every method is static; the
    class only acts as a namespace.
    """

    # A bracketed assertion such as ``[chapter01]`` or ``[pre,post]``.  Inside
    # the brackets ``^`` escapes the next character, so ``^]`` does not close.
    _ASSERTION_RE = re.compile(r'\[(?:\^.|[^\]^])*\]')

    # First character offset (``:N``) in a path whose assertions were removed.
    _CHAR_OFFSET_RE = re.compile(r':(\d+)')

    @staticmethod
    def parse_cfi(cfi_string: str) -> Optional[Tuple[List[int], List[int], int]]:
        """
        Parse a CFI string into its positional components.

        For a range CFI (``parent,start,end``) the position of the range
        *start* is returned, i.e. the parent path joined with the start path.

        Args:
            cfi_string: CFI text such as "epubcfi(/6/14[chapter01]!/4/2/1:12)".
                The ``epubcfi(...)`` wrapper is optional.

        Returns:
            ``(spine_path, local_path, char_offset)`` or ``None`` when the
            input is empty, not a string, or does not start with a ``/`` step.
            - spine_path: step numbers before ``!``, e.g. ``[6, 14]``
            - local_path: step numbers after ``!``, e.g. ``[4, 2, 1]``
            - char_offset: the number after ``:``, or 0 when absent
        """
        if not cfi_string or not isinstance(cfi_string, str):
            return None

        cfi = cfi_string.strip()
        if cfi.startswith('epubcfi(') and cfi.endswith(')'):
            cfi = cfi[len('epubcfi('):-1]
        # Checked after unwrapping as well, so "epubcfi(garbage)" is rejected
        # instead of yielding an empty position that would sort first.
        if not cfi.startswith('/'):
            return None

        # Remove assertions first: they may legally contain '!', ',' or ':',
        # which must not be mistaken for CFI syntax below.
        cfi = EpubCFIParser._ASSERTION_RE.sub('', cfi)

        # Range CFI: keep "parent" + "start" and ignore the end of the range.
        parent, comma, remainder = cfi.partition(',')
        if comma:
            cfi = parent + remainder.partition(',')[0]

        # Everything before the first '!' addresses the document (spine),
        # everything after it addresses a position inside that document.
        spine_part, _, local_part = cfi.partition('!')

        try:
            char_offset = 0
            offset_match = EpubCFIParser._CHAR_OFFSET_RE.search(local_part)
            if offset_match:
                char_offset = int(offset_match.group(1))
                # Only the text before the offset holds path steps.
                local_part = local_part[:offset_match.start()]

            spine_path = EpubCFIParser._parse_path_numbers(spine_part)
            local_path = EpubCFIParser._parse_path_numbers(local_part)
        except ValueError as e:
            # int() can refuse pathological digit runs; treat that as a
            # parse failure rather than crashing the caller.
            print(f"CFI 解析错误: {cfi_string} -> {e}")
            return None

        return (spine_path, local_path, char_offset)

    @staticmethod
    def _parse_path_numbers(path_str: str) -> List[int]:
        """
        Extract the step numbers from a path string.

        A path such as '/6/14[chapter1]/2' yields [6, 14, 2]; a bracketed
        assertion directly following a step is skipped.

        Args:
            path_str: path text; may be empty.

        Returns:
            The step numbers in order of appearance (empty list if none).
        """
        if not path_str:
            return []

        # "/<digits>" optionally followed by "[...]"; only the digits are kept.
        pattern = r'/(\d+)(?:\[[^\]]*\])?'
        return [int(step) for step in re.findall(pattern, path_str)]

    @staticmethod
    def compare_cfi_positions(cfi1: str, cfi2: str) -> int:
        """
        Compare the document positions of two CFIs.

        Args:
            cfi1, cfi2: CFI strings to compare.

        Returns:
            -1 if cfi1 comes first, 0 if both are at the same position,
            1 if cfi1 comes later.  A CFI that cannot be parsed is ordered
            after any parsable one; two unparsable CFIs compare equal.
        """
        parsed1 = EpubCFIParser.parse_cfi(cfi1)
        parsed2 = EpubCFIParser.parse_cfi(cfi2)

        if parsed1 is None and parsed2 is None:
            return 0
        if parsed1 is None:
            return 1
        if parsed2 is None:
            return -1

        spine1, local1, offset1 = parsed1
        spine2, local2, offset2 = parsed2

        # Document (spine) order decides first, then the position inside the
        # document, and finally the character offset.
        spine_cmp = EpubCFIParser._compare_number_lists(spine1, spine2)
        if spine_cmp != 0:
            return spine_cmp

        local_cmp = EpubCFIParser._compare_number_lists(local1, local2)
        if local_cmp != 0:
            return local_cmp

        return (offset1 > offset2) - (offset1 < offset2)

    @staticmethod
    def _compare_number_lists(list1: List[int], list2: List[int]) -> int:
        """
        Compare two step lists element by element.

        The first differing step decides (smaller is earlier).  When one list
        is a prefix of the other, the shorter list is considered earlier.

        Args:
            list1, list2: step number lists to compare.

        Returns:
            -1 if list1 is earlier, 0 if equal, 1 if list1 is later.
        """
        # Python's built-in list ordering is exactly this lexicographic rule.
        return (list1 > list2) - (list1 < list2)

    @staticmethod
    def create_sort_key(cfi_string: str) -> Tuple:
        """
        Build a key for ``sorted()`` / ``list.sort()`` from a CFI.

        The key is ``(failed, spine_steps, local_steps, char_offset)`` with
        the step lists kept as nested tuples.  Keeping the components separate
        makes the resulting order agree with ``compare_cfi_positions``: a
        character offset is never compared against a path step.

        Args:
            cfi_string: CFI string.

        Returns:
            A tuple that orders parsable CFIs by document position and places
            unparsable ones after them.
        """
        parsed = EpubCFIParser.parse_cfi(cfi_string)
        if parsed is None:
            # Leading 1 sorts after every successfully parsed key (leading 0).
            return (1, (), (), 0)

        spine_path, local_path, char_offset = parsed
        return (0, tuple(spine_path), tuple(local_path), char_offset)

    @staticmethod
    def extract_chapter_info(cfi_string: str) -> str:
        """
        Derive a chapter label from a CFI.

        Args:
            cfi_string: CFI string.

        Returns:
            - "" for empty input
            - the ID assertion of the spine part when present: returned as-is
              if it starts with "chapter", otherwise prefixed with "章节_"
            - otherwise "第N章", where N is half of the second spine step
            - "未知章节" when none of the above applies
        """
        if not cfi_string:
            return ""

        # Only the part before '!' identifies the document; a bracket after
        # '!' is an element id inside the document, not a chapter id.
        spine_text = cfi_string.partition('!')[0]
        chapter_match = re.search(r'\[([^\]]+)\]', spine_text)
        if chapter_match:
            chapter_id = chapter_match.group(1)
            if chapter_id.startswith('chapter'):
                return chapter_id
            return f"章节_{chapter_id}"

        # No ID assertion: fall back to the numeric spine position.
        parsed = EpubCFIParser.parse_cfi(cfi_string)
        if parsed and parsed[0]:
            spine_path = parsed[0]
            if len(spine_path) >= 2:
                # Element steps are even numbers, so halving gives an ordinal.
                chapter_num = spine_path[1] // 2
                return f"第{chapter_num}章"

        return "未知章节"


# Manual demo functions: they print results and make no assertions
def test_cfi_parsing():
    """Print the parse result and chapter label for a set of sample CFIs.

    This is a manual demo: output goes to stdout and nothing is asserted.
    """
    samples = (
        "epubcfi(/6/14[chapter01]!/4/2/1:12)",
        "epubcfi(/6/14[chapter01]!/4/2/1:25)",
        "epubcfi(/6/16[chapter02]!/4/1:5)",
        "epubcfi(/6/14[chapter01]!/4/4/2:0)",
        "epubcfi(/6/14!/4/2/1:12)",  # no ID assertion
        "/6/14[chapter01]!/4/2/1:12",  # no epubcfi(...) wrapper
        "epubcfi(/6/2[cover]!/4:0)",  # cover
        "epubcfi(/6/18[chapter03]!/2/4:25)",  # chapter 3
    )

    print("=== CFI 解析测试 ===")
    for cfi in samples:
        result = EpubCFIParser.parse_cfi(cfi)
        chapter = EpubCFIParser.extract_chapter_info(cfi)

        print(f"输入: {cfi}")
        if not result:
            print("解析: 失败")
        else:
            spine, local, offset = result
            print(f"解析: spine={spine}, local={local}, offset={offset}")
        print(f"章节: {chapter}")
        print()

def test_cfi_sorting():
    """Print a list of sample CFIs before and after sorting by sort key.

    This is a manual demo: output goes to stdout and nothing is asserted.
    """

    def show(cfis):
        # One numbered line per CFI, followed by its chapter label.
        for i, cfi in enumerate(cfis):
            chapter = EpubCFIParser.extract_chapter_info(cfi)
            print(f"  {i+1}. {cfi} ({chapter})")

    unsorted_cfis = (
        "epubcfi(/6/16[chapter02]!/4/1:5)",     # chapter 2, start
        "epubcfi(/6/14[chapter01]!/4/2/1:25)",  # chapter 1, later offset
        "epubcfi(/6/14[chapter01]!/4/2/1:12)",  # chapter 1, earlier offset
        "epubcfi(/6/14[chapter01]!/4/4/2:0)",   # chapter 1, different paragraph
        "epubcfi(/6/18[chapter03]!/2:1)",       # chapter 3
        "epubcfi(/6/14[chapter01]!/4:0)",       # chapter 1, beginning
        "epubcfi(/6/2[cover]!/4:0)",            # cover
        "epubcfi(/6/16[chapter02]!/4/6/2:15)",  # chapter 2, later position
    )

    print("=== CFI 排序测试 ===")
    print("排序前:")
    show(unsorted_cfis)

    # Order a copy of the samples using the CFI sort key.
    ordered = list(unsorted_cfis)
    ordered.sort(key=EpubCFIParser.create_sort_key)

    print("\n排序后（应按文档阅读顺序）:")
    show(ordered)


# Run both print-based demos when this file is executed as a script.
if __name__ == "__main__":
    test_cfi_parsing()
    print()  # blank line between the two demo outputs
    test_cfi_sorting()