Files
kman/mobiparse/mobi/mobi_ncx.py
2024-04-03 15:08:22 +08:00

133 lines
5.4 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
import os
from .unipath import pathof
from loguru import logger
import re
import json
# note: in Python 3, re requires the pattern to be of exactly the same type (str or bytes) as the data being searched,
# but the u"" prefix is not allowed for the pattern itself, only b""
'''
NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
to Digital Talking Books, eBooks, and general web content models.
This DTD is an XML application that layers navigation functionality on top of SMIL 2.0 content.
The NCX defines a navigation path/model that may be applied upon existing publications,
without modification of the existing publication source, so long as the navigation targets within
the source publication can be directly referenced via a URI.
http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
'''
from .mobi_utils import toBase32
from .mobi_index import MobiIndex
# Set to True to log the raw index tables and print every decoded NCX entry.
DEBUG_NCX = False


class ncxExtract:
    """Read the NCX (navigation / table of contents) index of a MOBI book.

    The constructor takes a MOBI header object ``mh`` and reads its ``sect``,
    ``ncxidx`` and ``codec`` attributes.  ``parseNCX`` turns the index
    referenced by ``ncxidx`` into a list of plain dictionaries, one per
    navigation entry.
    """

    # Index tag id -> [name of the entry field, position of the value inside
    # the tag's value list].  Tag 6 additionally consumes the following value
    # (see _build_entry).
    _TAG_FIELDNAME_MAP = {
        1: ["pos", 0],
        2: ["len", 0],
        3: ["noffs", 0],
        4: ["hlvl", 0],
        5: ["koffs", 0],
        6: ["pos_fid", 0],
        21: ["parent", 0],
        22: ["child1", 0],
        23: ["childn", 0],
    }

    def __init__(self, mh):
        """Keep the header ``mh`` and build a ``MobiIndex`` over its sections."""
        self.mh = mh
        self.sect = self.mh.sect
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        self.indx_data = None

    def parseNCX(self):
        """Parse the NCX index into a list of entry dictionaries.

        Each entry has the keys ``name``, ``pos``, ``len``, ``noffs``,
        ``text``, ``hlvl``, ``kind``, ``pos_fid``, ``parent``, ``child1``,
        ``childn`` and ``num`` (plus ``koffs`` when tag 5 is present).
        Notes carried over from the original author's debug sample:

        * ``num``     - record number of the entry
        * ``pos``     - position; position / 150 = real page number
        * ``hlvl``    - heading level, i.e. the level of the section
        * ``parent``  - record number of the previous-level section, -1 if none
        * ``child1`` / ``childn`` - first / last record number of the
          next-level sections, -1 if none
        * ``pos_fid`` - ``kindle:pos:fid:XXXX:off:XXXXXXXXXX`` string, or None

        The result is also stored in ``self.indx_data``.

        Returns:
            The list of entries; empty when ``self.ncxidx`` is 0xFFFFFFFF.
        """
        indx_data = []
        if self.ncxidx != 0xFFFFFFFF:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                logger.debug("ctoc_text {}".format(ctoc_text))
                logger.debug("outtbl {}".format(outtbl))
            for num, (text, tagMap) in enumerate(outtbl):
                entry = self._build_entry(num, text, tagMap, ctoc_text)
                indx_data.append(entry)
                if DEBUG_NCX:
                    self._dump_entry(entry)
        self.indx_data = indx_data
        return indx_data

    def _build_entry(self, num, text, tagMap, ctoc_text):
        """Build the dictionary for one index record from its tag map."""
        entry = {
            "name": text.decode("utf-8"),
            "pos": -1,
            "len": 0,
            "noffs": -1,
            "text": "Unknown Text",
            "hlvl": -1,
            "kind": "Unknown Kind",
            "pos_fid": None,
            "parent": -1,
            "child1": -1,
            "childn": -1,
            "num": num,
        }
        for tag, (fieldname, i) in self._TAG_FIELDNAME_MAP.items():
            if tag not in tagMap:
                continue
            fieldvalue = tagMap[tag][i]
            if tag == 6:
                # Tag 6 carries two values (fid, offset) that are combined
                # into a single kindle:pos:fid link.
                pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
                pos_off = toBase32(tagMap[tag][i + 1], 10).decode("utf-8")
                fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
            entry[fieldname] = fieldvalue
            if tag == 3:
                # noffs is the key of the entry's label in ctoc_text.
                entry["text"] = self._ctoc_string(ctoc_text, fieldvalue, "Unknown Text")
            elif tag == 5:
                # koffs is the key of the entry's kind in ctoc_text.
                entry["kind"] = self._ctoc_string(ctoc_text, fieldvalue, "Unknown Kind")
        return entry

    def _ctoc_string(self, ctoc_text, offset, default):
        """Return the ctoc_text value stored under ``offset`` as text.

        Values found in ``ctoc_text`` are decoded with the book codec.  The
        fallback ``default`` is already text (unicode_literals), so it must
        not be decoded: calling ``.decode`` on it raises AttributeError on
        Python 3.
        """
        raw = ctoc_text.get(offset, default)
        if isinstance(raw, bytes):
            raw = raw.decode(self.mh.codec)
        return raw

    @staticmethod
    def _dump_entry(entry):
        """Print one parsed entry (only used when DEBUG_NCX is enabled)."""
        print("record number: ", entry["num"])
        print(
            "name: ", entry["name"],
        )
        print("position", entry["pos"], " length: ", entry["len"])
        print("text: ", entry["text"])
        print("kind: ", entry["kind"])
        print("heading level: ", entry["hlvl"])
        print("parent:", entry["parent"])
        print(
            "first child: ", entry["child1"], " last child: ", entry["childn"]
        )
        print("pos_fid is ", entry["pos_fid"])
        print("\n\n")