'update'
This commit is contained in:
311
epub_cfi_parser.py
Normal file
311
epub_cfi_parser.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
EPUB CFI (Canonical Fragment Identifier) 解析器
|
||||
|
||||
基于 IDPF EPUB CFI 规范:https://idpf.org/epub/linking/cfi/epub-cfi.html
|
||||
用于正确排序 iBooks 标注,按照 EPUB 文档中的真实位置顺序
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
|
||||
class EpubCFIParser:
    """Parse and order EPUB CFI (Canonical Fragment Identifier) strings.

    Intended for the iBooks ``ZANNOTATIONLOCATION`` field, so annotations can
    be sorted by their real position in the EPUB document.
    """

    # Bracketed assertion such as ``[chapter01]``; inside it ``^`` escapes
    # the following character (so ``^]`` does not close the bracket).
    _ASSERTION_RE = re.compile(r'\[(?:\^.|[^\]^])*\]')
    # One path step: ``/<number>`` optionally followed by an assertion.
    _STEP_RE = re.compile(r'/(\d+)(?:\[(?:\^.|[^\]^])*\])?')
    # Character-offset terminal: ``:<number>``.
    _OFFSET_RE = re.compile(r':(\d+)')
    # Either an assertion (content captured) or the ``!`` indirection marker.
    _ASSERTION_OR_INDIRECTION_RE = re.compile(r'\[((?:\^.|[^\]^])*)\]|!')

    @staticmethod
    def parse_cfi(cfi_string: str) -> Optional[Tuple[List[int], List[int], int]]:
        """Parse a CFI string into its positional components.

        For a range CFI (``parent,start,end``) the start point of the range
        is returned. Bracketed assertions are ignored, so characters such as
        ``!``, ``:`` or ``/`` inside an ID do not disturb parsing.

        Args:
            cfi_string: A CFI such as ``"epubcfi(/6/14[chapter01]!/4/2/1:12)"``.
                The ``epubcfi(...)`` wrapper is optional.

        Returns:
            ``(spine_path, local_path, char_offset)`` or ``None`` when the
            input is empty or is not a CFI path:

            - spine_path: step numbers before ``!`` (e.g. ``[6, 14]``),
              locating the document/chapter.
            - local_path: step numbers after ``!`` (e.g. ``[4, 2, 1]``),
              locating the element inside the document.
            - char_offset: the ``:N`` character offset, ``0`` if absent.
        """
        if not cfi_string:
            return None

        cfi = cfi_string.strip()
        if cfi.startswith('epubcfi(') and cfi.endswith(')'):
            cfi = cfi[8:-1]  # drop "epubcfi(" and ")"
        # Validate after unwrapping so wrapped and bare inputs are treated
        # the same way.
        if not cfi.startswith('/'):
            return None

        # Assertions carry no positional information; removing them first
        # keeps the "!" / "," / ":" handling below unambiguous.
        cfi = EpubCFIParser._ASSERTION_RE.sub('', cfi)

        # Range CFI: "parent,start,end" -> use "parent" + "start".
        position = ''.join(cfi.split(',')[:2])

        # Spine part locates the document, local part the spot inside it.
        spine_part, _, local_part = position.partition('!')

        try:
            spine_path = EpubCFIParser._parse_path_numbers(spine_part)
            if not spine_path:
                # Starts with "/" but has no numeric step: not a usable CFI.
                return None

            char_offset = 0
            offset_match = EpubCFIParser._OFFSET_RE.search(local_part)
            if offset_match:
                char_offset = int(offset_match.group(1))
                # Everything from the offset onwards is not part of the path.
                local_part = local_part[:offset_match.start()]

            local_path = EpubCFIParser._parse_path_numbers(local_part)
        except ValueError as e:
            # int() can reject pathological digit runs (interpreter limit on
            # integer string length); treat that as an unparseable CFI.
            print(f"CFI 解析错误: {cfi_string} -> {e}")
            return None

        return (spine_path, local_path, char_offset)

    @staticmethod
    def _parse_path_numbers(path_str: str) -> List[int]:
        """Extract the step numbers from a CFI path fragment.

        ``'/6/14[chapter1]/2'`` yields ``[6, 14, 2]``; bracketed ID
        assertions (including ``^``-escaped characters) are skipped.

        Args:
            path_str: Path fragment; may be empty.

        Returns:
            A new list of step numbers, empty when there are none.
        """
        if not path_str:
            return []
        return [int(step) for step in EpubCFIParser._STEP_RE.findall(path_str)]

    @staticmethod
    def compare_cfi_positions(cfi1: str, cfi2: str) -> int:
        """Compare two CFIs by document position.

        Args:
            cfi1: First CFI string.
            cfi2: Second CFI string.

        Returns:
            ``-1`` if ``cfi1`` comes first, ``0`` if the positions are equal,
            ``1`` if ``cfi1`` comes later. An unparseable CFI sorts after any
            parseable one; two unparseable CFIs compare equal.
        """
        parsed1 = EpubCFIParser.parse_cfi(cfi1)
        parsed2 = EpubCFIParser.parse_cfi(cfi2)

        if parsed1 is None or parsed2 is None:
            # bool arithmetic: failed-vs-ok -> 1, ok-vs-failed -> -1, both -> 0
            return (parsed1 is None) - (parsed2 is None)

        spine1, local1, offset1 = parsed1
        spine2, local2, offset2 = parsed2

        # Spine (document order) first, then the path inside the document.
        for left, right in ((spine1, spine2), (local1, local2)):
            order = EpubCFIParser._compare_number_lists(left, right)
            if order:
                return order

        # Same element: the character offset decides.
        return (offset1 > offset2) - (offset1 < offset2)

    @staticmethod
    def _compare_number_lists(list1: List[int], list2: List[int]) -> int:
        """Compare two step lists element by element.

        Smaller step numbers come first; when one list is a prefix of the
        other, the shorter list comes first. (In CFI, even steps address
        elements and odd steps address text/inter-element positions.)

        Args:
            list1: First list of step numbers.
            list2: Second list of step numbers.

        Returns:
            ``-1`` if ``list1`` comes first, ``0`` if equal, ``1`` otherwise.
        """
        # Python's sequence ordering is exactly this lexicographic rule.
        return (list1 > list2) - (list1 < list2)

    @staticmethod
    def create_sort_key(cfi_string: str) -> Tuple:
        """Build a key for ``sorted()`` that orders CFIs by document position.

        The key keeps spine path, local path and character offset as separate
        components so an offset is never compared against a path step; the
        resulting order matches ``compare_cfi_positions``.

        Args:
            cfi_string: CFI string.

        Returns:
            ``(failed, spine_path, local_path, char_offset)`` where ``failed``
            is ``0`` for a parsed CFI and ``1`` otherwise, so unparseable
            CFIs sort last.
        """
        parsed = EpubCFIParser.parse_cfi(cfi_string)
        if parsed is None:
            return (1, (), (), 0)

        spine_path, local_path, char_offset = parsed
        return (0, tuple(spine_path), tuple(local_path), char_offset)

    @staticmethod
    def extract_chapter_info(cfi_string: str) -> str:
        """Derive a chapter label from a CFI.

        Uses the first non-empty ID assertion found in the spine part (before
        ``!``); IDs that belong to elements inside the document are ignored.
        Without such an assertion, the chapter number is inferred from the
        second spine step.

        Args:
            cfi_string: CFI string.

        Returns:
            The ID itself when it starts with ``"chapter"``, otherwise
            ``"章节_<id>"``; ``"第N章"`` when inferred from the spine path;
            ``"未知章节"`` when nothing can be determined; ``""`` for empty
            input.
        """
        if not cfi_string:
            return ""

        tokens = EpubCFIParser._ASSERTION_OR_INDIRECTION_RE.finditer(cfi_string)
        for token in tokens:
            if token.group(0) == '!':
                break  # end of the spine part
            chapter_id = token.group(1)
            if chapter_id:
                if chapter_id.startswith('chapter'):
                    return chapter_id
                return f"章节_{chapter_id}"

        # No ID assertion in the spine: infer from the spine path instead.
        parsed = EpubCFIParser.parse_cfi(cfi_string)
        if parsed and len(parsed[0]) >= 2:
            # Spine children are addressed by even steps, so halve the step.
            chapter_num = parsed[0][1] // 2
            return f"第{chapter_num}章"

        return "未知章节"
|
||||
|
||||
|
||||
# 测试函数
|
||||
def test_cfi_parsing():
    """Print the parse result and chapter label for a set of sample CFIs."""
    samples = (
        "epubcfi(/6/14[chapter01]!/4/2/1:12)",
        "epubcfi(/6/14[chapter01]!/4/2/1:25)",
        "epubcfi(/6/16[chapter02]!/4/1:5)",
        "epubcfi(/6/14[chapter01]!/4/4/2:0)",
        "epubcfi(/6/14!/4/2/1:12)",  # no ID assertion
        "/6/14[chapter01]!/4/2/1:12",  # no epubcfi(...) wrapper
        "epubcfi(/6/2[cover]!/4:0)",  # cover
        "epubcfi(/6/18[chapter03]!/2/4:25)",  # chapter 3
    )

    print("=== CFI 解析测试 ===")
    for sample in samples:
        # Run both lookups before printing, as any parser diagnostics should
        # appear ahead of this sample's report.
        result = EpubCFIParser.parse_cfi(sample)
        label = EpubCFIParser.extract_chapter_info(sample)

        if result:
            summary = (
                f"解析: spine={result[0]}, local={result[1]}, offset={result[2]}"
            )
        else:
            summary = "解析: 失败"

        print(f"输入: {sample}")
        print(summary)
        print(f"章节: {label}")
        print()
|
||||
|
||||
|
||||
def test_cfi_sorting():
    """Print a shuffled list of sample CFIs before and after sorting."""

    def show(entries):
        # Numbered listing (1-based) with the chapter label for each CFI.
        for position, entry in enumerate(entries, start=1):
            label = EpubCFIParser.extract_chapter_info(entry)
            print(f" {position}. {entry} ({label})")

    unordered = [
        "epubcfi(/6/16[chapter02]!/4/1:5)",  # start of chapter 2
        "epubcfi(/6/14[chapter01]!/4/2/1:25)",  # chapter 1, later offset
        "epubcfi(/6/14[chapter01]!/4/2/1:12)",  # chapter 1, earlier offset
        "epubcfi(/6/14[chapter01]!/4/4/2:0)",  # chapter 1, another paragraph
        "epubcfi(/6/18[chapter03]!/2:1)",  # chapter 3
        "epubcfi(/6/14[chapter01]!/4:0)",  # beginning of chapter 1
        "epubcfi(/6/2[cover]!/4:0)",  # cover
        "epubcfi(/6/16[chapter02]!/4/6/2:15)",  # chapter 2, later position
    ]

    print("=== CFI 排序测试 ===")
    print("排序前:")
    show(unordered)

    # Order by document position using the CFI sort key.
    in_reading_order = sorted(unordered, key=EpubCFIParser.create_sort_key)

    print("\n排序后(应按文档阅读顺序):")
    show(in_reading_order)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run each demo, with one blank line between them.
    for position, demo in enumerate((test_cfi_parsing, test_cfi_sorting)):
        if position:
            print()
        demo()
|
||||
Reference in New Issue
Block a user