#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr from loguru import logger if PY2: range = xrange array_format = b"B" if PY3: unichr = chr array_format = "B" import array import struct # note: struct pack, unpack, unpack_from all require bytestring format # data all the way up to at least python 2.7.5, python 3 okay with bytestring from .mobi_index import getVariableWidthValue, readTagSection, getTagMap from .mobi_utils import toHex DEBUG_DICT = True class InflectionData(object): def __init__(self, infldatas): self.infldatas = infldatas self.starts = [] self.counts = [] for idata in self.infldatas: (start,) = struct.unpack_from(b">L", idata, 0x14) (count,) = struct.unpack_from(b">L", idata, 0x18) self.starts.append(start) self.counts.append(count) def lookup(self, lookupvalue): i = 0 rvalue = lookupvalue while rvalue >= self.counts[i]: rvalue = rvalue - self.counts[i] i += 1 if i == len(self.counts): logger.debug("Error: Problem with multiple inflections data sections") return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] return rvalue, self.starts[i], self.counts[i], self.infldatas[i] def offsets(self, value): rvalue, start, count, data = self.lookup(value) (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue)) if rvalue + 1 < count: (nextOffset,) = struct.unpack_from( b">H", data, start + 4 + (2 * (rvalue + 1)) ) else: nextOffset = None return offset, nextOffset, data class dictSupport(object): def __init__(self, mh, sect): self.mh = mh self.header = mh.header self.sect = sect self.metaOrthIndex = mh.metaOrthIndex self.metaInflIndex = mh.metaInflIndex def parseHeader(self, data): "read INDX header" if not data[:4] == b"INDX": logger.debug("Warning: index section is not INDX") return False words = ( "len", "nul1", "type", "gen", "start", "count", "code", "lng", "total", "ordt", "ligt", "nligt", "nctoc", ) num = len(words) values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)]) header = {} for n in range(num): header[words[n]] = values[n] ordt1 = None ordt2 = None otype, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4) header["otype"] = otype header["oentries"] = oentries if DEBUG_DICT: logger.debug( "otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx) ) if header["code"] == 0xFDEA or oentries > 0: # some dictionaries seem to be codepage 65002 (0xFDEA) which seems # to be some sort of strange EBCDIC utf-8 or 16 encoded strings # So we need to look for them and store them away to process leading text # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries # we only ever seem to use the second but ... # # if otype = 0, ORDT table uses 16 bit values as offsets into the table # if otype = 1, ORDT table uses 8 bit values as offsets inot the table assert data[op1 : op1 + 4] == b"ORDT" assert data[op2 : op2 + 4] == b"ORDT" ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4) ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4) if DEBUG_DICT: logger.debug("parsed INDX header:") for key in header: logger.debug( key, "%x" % header[key], ) logger.debug("\n") return header, ordt1, ordt2 def getPositionMap(self): sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: logger.debug( "Info: Document contains orthographic index, handle as dictionary" ) if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) logger.debug("\nParsing metaInflIndexData") midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) metaIndexCount = midxhdr["count"] idatas = [] for j in range(metaIndexCount): idatas.append(sect.loadSection(metaInflIndex + 1 + j)) dinfl = InflectionData(idatas) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart = midxhdr["len"] inflectionControlByteCount, inflectionTagTable = readTagSection( tagSectionStart, metaInflIndexData ) if DEBUG_DICT: logger.debug("inflectionTagTable: %s" % inflectionTagTable) if self.hasTag(inflectionTagTable, 0x07): logger.debug( "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported" ) decodeInflection = False data = sect.loadSection(metaOrthIndex) logger.debug("\nParsing metaOrthIndex") idxhdr, hordt1, hordt2 = self.parseHeader(data) tagSectionStart = idxhdr["len"] controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount = idxhdr["count"] logger.debug("orthIndexCount is", orthIndexCount) if DEBUG_DICT: logger.debug("orthTagTable: %s" % tagTable) if hordt2 is not None: logger.debug( "orth entry uses ordt2 lookup table of type ", idxhdr["otype"] ) hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: logger.debug("Info: Index doesn't contain entry length tags") logger.debug("Read dictionary index data") for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) hdrinfo, ordt1, ordt2 = self.parseHeader(data) idxtPos = hdrinfo["start"] entryCount = hdrinfo["count"] idxPositions = [] for j in range(entryCount): (pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j + 1] textLength = ord(data[startPos : startPos + 1]) text = data[startPos + 1 : startPos + 1 + textLength] if hordt2 is not None: utext = "" if idxhdr["otype"] == 0: pattern = b">H" inc = 2 else: pattern = b">B" inc = 1 pos = 0 while pos < textLength: (off,) = struct.unpack_from(pattern, text, pos) if off < len(hordt2): utext += unichr(hordt2[off]) else: utext += unichr(off) pos += inc text = utext.encode("utf-8") tagMap = getTagMap( controlByteCount, tagTable, data, startPos + 1 + textLength, endPos, ) if 0x01 in tagMap: if decodeInflection and 0x2A in tagMap: inflectionGroups = self.getInflectionGroups( text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2A], ) else: inflectionGroups = b"" assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = ( b'' + inflectionGroups + b"" ) if entryStartPosition in positionMap: positionMap[entryStartPosition] = ( positionMap[entryStartPosition] + ml ) else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[0x02][0] if entryEndPosition in positionMap: positionMap[entryEndPosition] = ( b"" + positionMap[entryEndPosition] ) else: positionMap[entryEndPosition] = b"" else: indexTags = ( b'\n\n' + inflectionGroups + b"\n" ) if entryStartPosition in positionMap: positionMap[entryStartPosition] = ( positionMap[entryStartPosition] + indexTags ) else: positionMap[entryStartPosition] = indexTags return positionMap def hasTag(self, tagTable, tag): """ Test if tag table contains given tag. @param tagTable: The tag table. @param tag: The tag to search. @return: True if tag table contains given tag; False otherwise. """ for currentTag, _, _, _ in tagTable: if currentTag == tag: return True return False def getInflectionGroups( self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList ): """ Create string which contains the inflection groups with inflection rules as mobipocket tags. @param mainEntry: The word to inflect. @param controlByteCount: The number of control bytes. @param tagTable: The tag table. @param data: The Inflection data object to properly select the right inflection data section to use @param inflectionNames: The inflection rule name data. @param groupList: The list of inflection groups to process. @return: String with inflection groups and rules or empty string if required tags are not available. """ result = b"" for value in groupList: offset, nextOffset, data = dinfl.offsets(value) # First byte seems to be always 0x00 and must be skipped. assert ord(data[offset : offset + 1]) == 0x00 tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) # Make sure that the required tags are available. if 0x05 not in tagMap: logger.debug("Error: Required tag 0x05 not found in tagMap") return "" if 0x1A not in tagMap: logger.debug("Error: Required tag 0x1a not found in tagMap") return b"" result += b"" for i in range(len(tagMap[0x05])): # Get name of inflection rule. value = tagMap[0x05][i] consumed, textLength = getVariableWidthValue(inflectionNames, value) inflectionName = inflectionNames[ value + consumed : value + consumed + textLength ] # Get and apply inflection rule across possibly multiple inflection data sections value = tagMap[0x1A][i] rvalue, start, count, data = dinfl.lookup(value) (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue)) textLength = ord(data[offset : offset + 1]) inflection = self.applyInflectionRule( mainEntry, data, offset + 1, offset + 1 + textLength ) if inflection is not None: result += ( b' ' ) result += b"" return result def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): """ Apply inflection rule. @param mainEntry: The word to inflect. @param inflectionRuleData: The inflection rules. @param start: The start position of the inflection rule to use. @param end: The end position of the inflection rule to use. @return: The string with the inflected word or None if an error occurs. """ mode = -1 byteArray = array.array(array_format, mainEntry) position = len(byteArray) for charOffset in range(start, end): char = inflectionRuleData[charOffset : charOffset + 1] abyte = ord(char) if abyte >= 0x0A and abyte <= 0x13: # Move cursor backwards offset = abyte - 0x0A if mode not in [0x02, 0x03]: mode = 0x02 position = len(byteArray) position -= offset elif abyte > 0x13: if mode == -1: logger.debug( "Error: Unexpected first byte %i of inflection rule" % abyte ) return None elif position == -1: logger.debug( "Error: Unexpected first byte %i of inflection rule" % abyte ) return None else: if mode == 0x01: # Insert at word start byteArray.insert(position, abyte) position += 1 elif mode == 0x02: # Insert at word end byteArray.insert(position, abyte) elif mode == 0x03: # Delete at word end position -= 1 deleted = byteArray.pop(position) if bchr(deleted) != char: if DEBUG_DICT: logger.debug( "0x03: %s %s %s %s" % ( mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted), ) ) logger.debug( "Error: Delete operation of inflection rule failed" ) return None elif mode == 0x04: # Delete at word start deleted = byteArray.pop(position) if bchr(deleted) != char: if DEBUG_DICT: logger.debug( "0x03: %s %s %s %s" % ( mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted), ) ) logger.debug( "Error: Delete operation of inflection rule failed" ) return None else: logger.debug( "Error: Inflection rule mode %x is not implemented" % mode ) return None elif abyte == 0x01: # Insert at word start if mode not in [0x01, 0x04]: position = 0 mode = abyte elif abyte == 0x02: # Insert at word end if mode not in [0x02, 0x03]: position = len(byteArray) mode = abyte elif abyte == 0x03: # Delete at word end if mode not in [0x02, 0x03]: position = len(byteArray) mode = abyte elif abyte == 0x04: # Delete at word start if mode not in [0x01, 0x04]: position = 0 # Delete at word start mode = abyte else: logger.debug( "Error: Inflection rule mode %x is not implemented" % abyte ) return None return utf8_str(byteArray.tostring())