'update'

2025-10-21 10:46:03 +08:00
parent db9be32815
commit fb0f5ed9c5
20 changed files with 1869 additions and 103 deletions
--- a/epub_cfi_parser.py
+++ b/epub_cfi_parser.py
@@ -0,0 +1,311 @@
+"""
+EPUB CFI (Canonical Fragment Identifier) 解析器
+
+基于 IDPF EPUB CFI 规范：https://idpf.org/epub/linking/cfi/epub-cfi.html
+用于正确排序 iBooks 标注，按照 EPUB 文档中的真实位置顺序
+"""
+
+import re
+from typing import List, Tuple, Optional
+
+
+class EpubCFIParser:
+    """Parse and order EPUB Canonical Fragment Identifiers (CFIs).
+
+    Intended for values such as the iBooks ``ZANNOTATIONLOCATION`` field.
+    Every method is static; the class only serves as a namespace.
+    """
+
+    # Bracketed assertion such as ``[chapter01]``. ``^`` escapes the next
+    # character, so ``[a^]b]`` is treated as one assertion.
+    _ASSERTION_RE = re.compile(r'\[(?:\^.|[^\]^])*\]')
+    # Same as above, but capturing the (non-empty) assertion text.
+    _ID_RE = re.compile(r'\[((?:\^.|[^\]^])+)\]')
+    # One path step: a slash followed by its numeric index.
+    _STEP_RE = re.compile(r'/(\d+)')
+    # Character offset that terminates a path, e.g. ``:12``.
+    _OFFSET_RE = re.compile(r':(\d+)')
+
+    @staticmethod
+    def parse_cfi(cfi_string: str) -> Optional[Tuple[List[int], List[int], int]]:
+        """Parse a CFI string into its positional components.
+
+        Range CFIs of the form ``parent,start,end`` are resolved to the
+        start of the range (``parent`` followed by ``start``).
+
+        Args:
+            cfi_string: CFI text such as
+                ``"epubcfi(/6/14[chapter01]!/4/2/1:12)"``. The ``epubcfi(...)``
+                wrapper is optional; without it the text must start with ``/``.
+
+        Returns:
+            ``(spine_path, local_path, char_offset)`` or ``None`` when the
+            input is empty, has an unexpected prefix, or contains no numeric
+            spine step.
+            - spine_path: steps before the first ``!``, e.g. ``[6, 14]``.
+            - local_path: steps after the first ``!``, e.g. ``[4, 2, 1]``.
+            - char_offset: value after ``:`` in the local part, ``0`` if absent.
+        """
+        if not cfi_string:
+            return None
+
+        cfi = cfi_string.strip()
+        if cfi.startswith('epubcfi(') and cfi.endswith(')'):
+            cfi = cfi[8:-1]  # drop "epubcfi(" and the closing ")"
+        elif not cfi.startswith('/'):
+            return None
+
+        # Remove assertions first: their text may contain ':', ',', '/' or
+        # '!' and must not be mistaken for CFI syntax.
+        cfi = EpubCFIParser._ASSERTION_RE.sub('', cfi)
+
+        # Range CFI: keep only the start position (parent path + start path).
+        range_parts = cfi.split(',')
+        if len(range_parts) > 1:
+            cfi = range_parts[0] + range_parts[1]
+
+        # The first '!' separates the spine part from the in-document part.
+        spine_part, _, local_part = cfi.partition('!')
+
+        try:
+            spine_path = EpubCFIParser._parse_path_numbers(spine_part)
+            # A CFI without any spine step cannot be positioned at all.
+            if not spine_path:
+                return None
+
+            char_offset = 0
+            offset_match = EpubCFIParser._OFFSET_RE.search(local_part)
+            if offset_match:
+                char_offset = int(offset_match.group(1))
+                # Everything from the offset onwards is not part of the path.
+                local_part = local_part[:offset_match.start()]
+
+            local_path = EpubCFIParser._parse_path_numbers(local_part)
+        except ValueError as e:
+            # int() rejects absurdly long digit runs on recent Python versions.
+            print(f"CFI 解析错误: {cfi_string} -> {e}")
+            return None
+
+        return (spine_path, local_path, char_offset)
+
+    @staticmethod
+    def _parse_path_numbers(path_str: str) -> List[int]:
+        """Extract the numeric steps of a CFI path.
+
+        ``'/6/14[chapter1]/2'`` yields ``[6, 14, 2]``; bracketed assertions
+        are ignored.
+
+        Args:
+            path_str: Path text, possibly empty.
+
+        Returns:
+            The step numbers in order of appearance.
+        """
+        if not path_str:
+            return []
+
+        without_assertions = EpubCFIParser._ASSERTION_RE.sub('', path_str)
+        return [
+            int(step)
+            for step in EpubCFIParser._STEP_RE.findall(without_assertions)
+        ]
+
+    @staticmethod
+    def compare_cfi_positions(cfi1: str, cfi2: str) -> int:
+        """Compare the document positions of two CFIs.
+
+        The ordering is exactly the one induced by ``create_sort_key``:
+        spine path first, then local path, then character offset, with
+        unparseable CFIs placed after all parseable ones.
+
+        Args:
+            cfi1, cfi2: CFI strings to compare.
+
+        Returns:
+            -1 if ``cfi1`` comes first, 0 if both are at the same position
+            (or both fail to parse), 1 if ``cfi1`` comes later.
+        """
+        key1 = EpubCFIParser.create_sort_key(cfi1)
+        key2 = EpubCFIParser.create_sort_key(cfi2)
+        return (key1 > key2) - (key1 < key2)
+
+    @staticmethod
+    def _compare_number_lists(list1: List[int], list2: List[int]) -> int:
+        """Compare two step lists lexicographically.
+
+        Smaller numbers come first; when one list is a prefix of the other,
+        the shorter list comes first.
+
+        Args:
+            list1, list2: Step lists to compare.
+
+        Returns:
+            -1 if ``list1`` comes first, 0 if equal, 1 if ``list1`` comes later.
+        """
+        # Built-in list comparison is already lexicographic with
+        # shorter-prefix-first; copy so tuples and lists can be mixed.
+        left, right = list(list1), list(list2)
+        return (left > right) - (left < right)
+
+    @staticmethod
+    def create_sort_key(cfi_string: str) -> Tuple:
+        """Build a key for ``sorted()`` that orders CFIs by document position.
+
+        The key is ``(failed, spine_path, local_path, char_offset)`` with the
+        paths as tuples. Keeping the components separate stops a character
+        offset from being compared against a path step of a longer path.
+
+        Args:
+            cfi_string: CFI string.
+
+        Returns:
+            A tuple that compares consistently with ``compare_cfi_positions``;
+            unparseable CFIs sort after every parseable one.
+        """
+        parsed = EpubCFIParser.parse_cfi(cfi_string)
+        if parsed is None:
+            # Leading 1 puts failures last; the rest keeps the shape uniform.
+            return (1, (), (), 0)
+
+        spine_path, local_path, char_offset = parsed
+        return (0, tuple(spine_path), tuple(local_path), char_offset)
+
+    @staticmethod
+    def extract_chapter_info(cfi_string: str) -> str:
+        """Derive a chapter label from a CFI.
+
+        Only the spine part (before the first ``!``) is searched for an ID
+        assertion, so element IDs inside the document are not reported as
+        chapters.
+
+        Args:
+            cfi_string: CFI string.
+
+        Returns:
+            The ID itself when it starts with ``chapter``, the ID with a
+            fixed prefix otherwise, a label computed from the second spine
+            step when no ID is present, a fixed "unknown" label when nothing
+            can be derived, or ``""`` for empty input.
+        """
+        if not cfi_string:
+            return ""
+
+        spine_part = cfi_string.split('!', 1)[0]
+        id_match = EpubCFIParser._ID_RE.search(spine_part)
+        if id_match:
+            chapter_id = id_match.group(1)
+            if chapter_id.startswith('chapter'):
+                return chapter_id
+            return f"章节_{chapter_id}"
+
+        # No ID assertion: fall back to the second spine step.
+        parsed = EpubCFIParser.parse_cfi(cfi_string)
+        if parsed and len(parsed[0]) >= 2:
+            # Halve the step index to obtain the displayed chapter number.
+            chapter_num = parsed[0][1] // 2
+            return f"第{chapter_num}章"
+
+        return "未知章节"
+
+
+# 测试函数
+def test_cfi_parsing():
+    """Print the parsed components and chapter label of several sample CFIs."""
+    test_cases = (
+        "epubcfi(/6/14[chapter01]!/4/2/1:12)",
+        "epubcfi(/6/14[chapter01]!/4/2/1:25)",
+        "epubcfi(/6/16[chapter02]!/4/1:5)",
+        "epubcfi(/6/14[chapter01]!/4/4/2:0)",
+        "epubcfi(/6/14!/4/2/1:12)",  # no ID assertion
+        "/6/14[chapter01]!/4/2/1:12",  # no epubcfi(...) wrapper
+        "epubcfi(/6/2[cover]!/4:0)",  # cover
+        "epubcfi(/6/18[chapter03]!/2/4:25)",  # chapter 3
+    )
+
+    print("=== CFI 解析测试 ===")
+    for cfi in test_cases:
+        # Resolve both values before printing anything for this sample.
+        parsed = EpubCFIParser.parse_cfi(cfi)
+        chapter = EpubCFIParser.extract_chapter_info(cfi)
+
+        print(f"输入: {cfi}")
+        if not parsed:
+            print("解析: 失败")
+        else:
+            spine, local, offset = parsed
+            print(f"解析: spine={spine}, local={local}, offset={offset}")
+        print(f"章节: {chapter}")
+        print()
+
+
+def test_cfi_sorting():
+    """Print sample CFIs before and after sorting them with the CFI sort key."""
+
+    def show(cfis):
+        # Numbered listing of each CFI with its chapter label.
+        for i, cfi in enumerate(cfis):
+            chapter = EpubCFIParser.extract_chapter_info(cfi)
+            print(f"  {i+1}. {cfi} ({chapter})")
+
+    test_cfis = [
+        "epubcfi(/6/16[chapter02]!/4/1:5)",     # chapter 2, start
+        "epubcfi(/6/14[chapter01]!/4/2/1:25)",  # chapter 1, later position
+        "epubcfi(/6/14[chapter01]!/4/2/1:12)",  # chapter 1, earlier position
+        "epubcfi(/6/14[chapter01]!/4/4/2:0)",   # chapter 1, different paragraph
+        "epubcfi(/6/18[chapter03]!/2:1)",       # chapter 3
+        "epubcfi(/6/14[chapter01]!/4:0)",       # chapter 1, beginning
+        "epubcfi(/6/2[cover]!/4:0)",            # cover
+        "epubcfi(/6/16[chapter02]!/4/6/2:15)",  # chapter 2, later position
+    ]
+
+    print("=== CFI 排序测试 ===")
+    print("排序前:")
+    show(test_cfis)
+
+    # Sort a copy so the unsorted input list stays untouched.
+    sorted_cfis = list(test_cfis)
+    sorted_cfis.sort(key=EpubCFIParser.create_sort_key)
+
+    print("\n排序后（应按文档阅读顺序）:")
+    show(sorted_cfis)
+
+
+if __name__ == "__main__":
+    # Run both demos, separated by a single blank line.
+    for position, demo in enumerate((test_cfi_parsing, test_cfi_sorting)):
+        if position:
+            print()
+        demo()