#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported on python >= 2.7.
""" set to True to use OrderedDict for MobiHeader.metadata."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import PY2, unicode_str, hexlify, bord

from loguru import logger

if PY2:
    range = xrange

import struct
import uuid

# import the mobiunpack support libraries
from .mobi_utils import getLanguage
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader


class unpackException(Exception):
    pass


def sortedHeaderKeys(mheader):
    hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
    return hdrkeys


# HD Containers have their own headers and their own EXTH.
# This is just guesswork so far, making the big assumption that
# metavalue key numbers remain the same in the CONT EXTH.
# Note: the layout of the CONT Header is still unknown,
# so just deal with their EXTH sections for now.
def dump_contexth(cpage, extheader):
    # determine text encoding
    codec = "windows-1252"
    codec_map = {
        1252: "windows-1252",
        65001: "utf-8",
    }
    if cpage in codec_map:
        codec = codec_map[cpage]
    if extheader == b"":
        return
    id_map_strings = {
        1: "Drm Server Id", 2: "Drm Commerce Id", 3: "Drm Ebookbase Book Id",
        4: "Drm Ebookbase Dep Id", 100: "Creator", 101: "Publisher",
        102: "Imprint", 103: "Description", 104: "ISBN", 105: "Subject",
        106: "Published", 107: "Review", 108: "Contributor", 109: "Rights",
        110: "SubjectCode", 111: "Type", 112: "Source", 113: "ASIN",
        114: "versionNumber", 117: "Adult", 118: "Retail-Price",
        119: "Retail-Currency", 120: "TSC", 122: "fixed-layout",
        123: "book-type", 124: "orientation-lock", 126: "original-resolution",
        127: "zero-gutter", 128: "zero-margin", 129: "MetadataResourceURI",
        132: "RegionMagnification", 150: "LendingEnabled", 200: "DictShortName",
        501: "cdeType", 502: "last_update_time", 503: "Updated_Title",
        504: "CDEContentKey", 505: "AmazonContentReference", 506: "Title-Language",
        507: "Title-Display-Direction", 508: "Title-Pronunciation",
        509: "Title-Collation", 510: "Secondary-Title",
        511: "Secondary-Title-Language", 512: "Secondary-Title-Direction",
        513: "Secondary-Title-Pronunciation", 514: "Secondary-Title-Collation",
        515: "Author-Language", 516: "Author-Display-Direction",
        517: "Author-Pronunciation", 518: "Author-Collation", 519: "Author-Type",
        520: "Publisher-Language", 521: "Publisher-Display-Direction",
        522: "Publisher-Pronunciation", 523: "Publisher-Collation",
        524: "Content-Language-Tag", 525: "primary-writing-mode",
        526: "NCX-Ingested-By-Software", 527: "page-progression-direction",
        528: "override-kindle-fonts", 529: "Compression-Upgraded",
        530: "Soft-Hyphens-In-Content", 531: "Dictionary_In_Langague",
        532: "Dictionary_Out_Language", 533: "Font_Converted",
        534: "Amazon_Creator_Info", 535: "Creator-Build-Tag",
        536: "HD-Media-Containers-Info",  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538: "Resource-Container-Fidelity", 539: "HD-Container-Mimetype", 540: "Sample-For_Special-Purpose", 541: "Kindletool-Operation-Information", 542: "Container_Id", 543: "Asset-Type", # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER 544: "Unknown_544", } id_map_values = { 115: "sample", 116: "StartOffset", 121: "Mobi8-Boundary-Section", 125: "Embedded-Record-Count", 130: "Offline-Sample", 131: "Metadata-Record-Offset", 201: "CoverOffset", 202: "ThumbOffset", 203: "HasFakeCover", 204: "Creator-Software", 205: "Creator-Major-Version", 206: "Creator-Minor-Version", 207: "Creator-Build-Number", 401: "Clipping-Limit", 402: "Publisher-Limit", 404: "Text-to-Speech-Disabled", 406: "Rental-Expiration-Time", } id_map_hexstrings = { 208: "Watermark_(hex)", 209: "Tamper-Proof-Keys_(hex)", 300: "Font-Signature_(hex)", 403: "Unknown_(403)_(hex)", 405: "Ownership-Type_(hex)", 407: "Unknown_(407)_(hex)", 420: "Multimedia-Content-Reference_(hex)", 450: "Locations_Match_(hex)", 451: "Full-Story-Length_(hex)", 452: "Sample-Start_Location_(hex)", 453: "Sample-End-Location_(hex)", } _length, num_items = struct.unpack(b">LL", extheader[4:12]) extheader = extheader[12:] pos = 0 for _ in range(num_items): id, size = struct.unpack(b">LL", extheader[pos : pos + 8]) content = extheader[pos + 8 : pos + size] if id in id_map_strings: name = id_map_strings[id] logger.debug( '\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors="replace")) ) elif id in id_map_values: name = id_map_values[id] if size == 9: (value,) = struct.unpack(b"B", content) logger.debug('\n Key: "%s"\n Value: 0x%01x' % (name, value)) elif size == 10: (value,) = struct.unpack(b">H", content) logger.debug('\n Key: "%s"\n Value: 0x%02x' % (name, value)) elif size == 12: (value,) = struct.unpack(b">L", content) logger.debug('\n Key: "%s"\n Value: 0x%04x' % (name, value)) else: logger.debug( "\nError: Value for %s has unexpected size of %s" % (name, size) ) elif id in id_map_hexstrings: name = id_map_hexstrings[id] logger.debug( '\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)) ) else: logger.debug("\nWarning: Unknown metadata with id %s found" % id) name = str(id) + " (hex)" logger.debug( ' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)) ) pos += size return class MobiHeader: # all values are packed in big endian format palmdoc_header = { "compression_type": (0x00, b">H", 2), "fill0": (0x02, b">H", 2), "text_length": (0x04, b">L", 4), "text_records": (0x08, b">H", 2), "max_section_size": (0x0A, b">H", 2), "read_pos ": (0x0C, b">L", 4), } mobi6_header = { "compression_type": (0x00, b">H", 2), "fill0": (0x02, b">H", 2), "text_length": (0x04, b">L", 4), "text_records": (0x08, b">H", 2), "max_section_size": (0x0A, b">H", 2), "crypto_type": (0x0C, b">H", 2), "fill1": (0x0E, b">H", 2), "magic": (0x10, b"4s", 4), "header_length (from MOBI)": (0x14, b">L", 4), "type": (0x18, b">L", 4), "codepage": (0x1C, b">L", 4), "unique_id": (0x20, b">L", 4), "version": (0x24, b">L", 4), "metaorthindex": (0x28, b">L", 4), "metainflindex": (0x2C, b">L", 4), "index_names": (0x30, b">L", 4), "index_keys": (0x34, b">L", 4), "extra_index0": (0x38, b">L", 4), "extra_index1": (0x3C, b">L", 4), "extra_index2": (0x40, b">L", 4), "extra_index3": (0x44, b">L", 4), "extra_index4": (0x48, b">L", 4), "extra_index5": (0x4C, b">L", 4), "first_nontext": (0x50, b">L", 4), "title_offset": (0x54, b">L", 4), "title_length": (0x58, b">L", 4), "language_code": (0x5C, b">L", 4), "dict_in_lang": (0x60, b">L", 4), "dict_out_lang": (0x64, b">L", 4), "min_version": (0x68, b">L", 4), 
"first_resc_offset": (0x6C, b">L", 4), "huff_offset": (0x70, b">L", 4), "huff_num": (0x74, b">L", 4), "huff_tbl_offset": (0x78, b">L", 4), "huff_tbl_len": (0x7C, b">L", 4), "exth_flags": (0x80, b">L", 4), "fill3_a": (0x84, b">L", 4), "fill3_b": (0x88, b">L", 4), "fill3_c": (0x8C, b">L", 4), "fill3_d": (0x90, b">L", 4), "fill3_e": (0x94, b">L", 4), "fill3_f": (0x98, b">L", 4), "fill3_g": (0x9C, b">L", 4), "fill3_h": (0xA0, b">L", 4), "unknown0": (0xA4, b">L", 4), "drm_offset": (0xA8, b">L", 4), "drm_count": (0xAC, b">L", 4), "drm_size": (0xB0, b">L", 4), "drm_flags": (0xB4, b">L", 4), "fill4_a": (0xB8, b">L", 4), "fill4_b": (0xBC, b">L", 4), "first_content": (0xC0, b">H", 2), "last_content": (0xC2, b">H", 2), "unknown0": (0xC4, b">L", 4), "fcis_offset": (0xC8, b">L", 4), "fcis_count": (0xCC, b">L", 4), "flis_offset": (0xD0, b">L", 4), "flis_count": (0xD4, b">L", 4), "unknown1": (0xD8, b">L", 4), "unknown2": (0xDC, b">L", 4), "srcs_offset": (0xE0, b">L", 4), "srcs_count": (0xE4, b">L", 4), "unknown3": (0xE8, b">L", 4), "unknown4": (0xEC, b">L", 4), "fill5": (0xF0, b">H", 2), "traildata_flags": (0xF2, b">H", 2), "ncx_index": (0xF4, b">L", 4), "unknown5": (0xF8, b">L", 4), "unknown6": (0xFC, b">L", 4), "datp_offset": (0x100, b">L", 4), "unknown7": (0x104, b">L", 4), "Unknown ": (0x108, b">L", 4), "Unknown ": (0x10C, b">L", 4), "Unknown ": (0x110, b">L", 4), "Unknown ": (0x114, b">L", 4), "Unknown ": (0x118, b">L", 4), "Unknown ": (0x11C, b">L", 4), "Unknown ": (0x120, b">L", 4), "Unknown ": (0x124, b">L", 4), "Unknown ": (0x128, b">L", 4), "Unknown ": (0x12C, b">L", 4), "Unknown ": (0x130, b">L", 4), "Unknown ": (0x134, b">L", 4), "Unknown ": (0x138, b">L", 4), "Unknown ": (0x11C, b">L", 4), } mobi8_header = { "compression_type": (0x00, b">H", 2), "fill0": (0x02, b">H", 2), "text_length": (0x04, b">L", 4), "text_records": (0x08, b">H", 2), "max_section_size": (0x0A, b">H", 2), "crypto_type": (0x0C, b">H", 2), "fill1": (0x0E, b">H", 2), "magic": (0x10, b"4s", 4), "header_length (from MOBI)": (0x14, b">L", 4), "type": (0x18, b">L", 4), "codepage": (0x1C, b">L", 4), "unique_id": (0x20, b">L", 4), "version": (0x24, b">L", 4), "metaorthindex": (0x28, b">L", 4), "metainflindex": (0x2C, b">L", 4), "index_names": (0x30, b">L", 4), "index_keys": (0x34, b">L", 4), "extra_index0": (0x38, b">L", 4), "extra_index1": (0x3C, b">L", 4), "extra_index2": (0x40, b">L", 4), "extra_index3": (0x44, b">L", 4), "extra_index4": (0x48, b">L", 4), "extra_index5": (0x4C, b">L", 4), "first_nontext": (0x50, b">L", 4), "title_offset": (0x54, b">L", 4), "title_length": (0x58, b">L", 4), "language_code": (0x5C, b">L", 4), "dict_in_lang": (0x60, b">L", 4), "dict_out_lang": (0x64, b">L", 4), "min_version": (0x68, b">L", 4), "first_resc_offset": (0x6C, b">L", 4), "huff_offset": (0x70, b">L", 4), "huff_num": (0x74, b">L", 4), "huff_tbl_offset": (0x78, b">L", 4), "huff_tbl_len": (0x7C, b">L", 4), "exth_flags": (0x80, b">L", 4), "fill3_a": (0x84, b">L", 4), "fill3_b": (0x88, b">L", 4), "fill3_c": (0x8C, b">L", 4), "fill3_d": (0x90, b">L", 4), "fill3_e": (0x94, b">L", 4), "fill3_f": (0x98, b">L", 4), "fill3_g": (0x9C, b">L", 4), "fill3_h": (0xA0, b">L", 4), "unknown0": (0xA4, b">L", 4), "drm_offset": (0xA8, b">L", 4), "drm_count": (0xAC, b">L", 4), "drm_size": (0xB0, b">L", 4), "drm_flags": (0xB4, b">L", 4), "fill4_a": (0xB8, b">L", 4), "fill4_b": (0xBC, b">L", 4), "fdst_offset": (0xC0, b">L", 4), "fdst_flow_count": (0xC4, b">L", 4), "fcis_offset": (0xC8, b">L", 4), "fcis_count": (0xCC, b">L", 4), "flis_offset": (0xD0, b">L", 4), 
"flis_count": (0xD4, b">L", 4), "unknown1": (0xD8, b">L", 4), "unknown2": (0xDC, b">L", 4), "srcs_offset": (0xE0, b">L", 4), "srcs_count": (0xE4, b">L", 4), "unknown3": (0xE8, b">L", 4), "unknown4": (0xEC, b">L", 4), "fill5": (0xF0, b">H", 2), "traildata_flags": (0xF2, b">H", 2), "ncx_index": (0xF4, b">L", 4), "fragment_index": (0xF8, b">L", 4), "skeleton_index": (0xFC, b">L", 4), "datp_offset": (0x100, b">L", 4), "guide_index": (0x104, b">L", 4), "Unknown ": (0x108, b">L", 4), "Unknown ": (0x10C, b">L", 4), "Unknown ": (0x110, b">L", 4), "Unknown ": (0x114, b">L", 4), "Unknown ": (0x118, b">L", 4), "Unknown ": (0x11C, b">L", 4), "Unknown ": (0x120, b">L", 4), "Unknown ": (0x124, b">L", 4), "Unknown ": (0x128, b">L", 4), "Unknown ": (0x12C, b">L", 4), "Unknown ": (0x130, b">L", 4), "Unknown ": (0x134, b">L", 4), "Unknown ": (0x138, b">L", 4), "Unknown ": (0x11C, b">L", 4), } palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header) mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header) mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header) id_map_strings = { 1: "Drm Server Id", 2: "Drm Commerce Id", 3: "Drm Ebookbase Book Id", 4: "Drm Ebookbase Dep Id", 100: "Creator", 101: "Publisher", 102: "Imprint", 103: "Description", 104: "ISBN", 105: "Subject", 106: "Published", 107: "Review", 108: "Contributor", 109: "Rights", 110: "SubjectCode", 111: "Type", 112: "Source", 113: "ASIN", 114: "versionNumber", 117: "Adult", 118: "Retail-Price", 119: "Retail-Currency", 120: "TSC", 122: "fixed-layout", 123: "book-type", 124: "orientation-lock", 126: "original-resolution", 127: "zero-gutter", 128: "zero-margin", 129: "MetadataResourceURI", 132: "RegionMagnification", 150: "LendingEnabled", 200: "DictShortName", 501: "cdeType", 502: "last_update_time", 503: "Updated_Title", 504: "CDEContentKey", 505: "AmazonContentReference", 506: "Title-Language", 507: "Title-Display-Direction", 508: "Title-Pronunciation", 509: "Title-Collation", 510: "Secondary-Title", 511: "Secondary-Title-Language", 512: "Secondary-Title-Direction", 513: "Secondary-Title-Pronunciation", 514: "Secondary-Title-Collation", 515: "Author-Language", 516: "Author-Display-Direction", 517: "Author-Pronunciation", 518: "Author-Collation", 519: "Author-Type", 520: "Publisher-Language", 521: "Publisher-Display-Direction", 522: "Publisher-Pronunciation", 523: "Publisher-Collation", 524: "Content-Language-Tag", 525: "primary-writing-mode", 526: "NCX-Ingested-By-Software", 527: "page-progression-direction", 528: "override-kindle-fonts", 529: "Compression-Upgraded", 530: "Soft-Hyphens-In-Content", 531: "Dictionary_In_Langague", 532: "Dictionary_Out_Language", 533: "Font_Converted", 534: "Amazon_Creator_Info", 535: "Creator-Build-Tag", 536: "HD-Media-Containers-Info", # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) 
538: "Resource-Container-Fidelity", 539: "HD-Container-Mimetype", 540: "Sample-For_Special-Purpose", 541: "Kindletool-Operation-Information", 542: "Container_Id", 543: "Asset-Type", # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER 544: "Unknown_544", } id_map_values = { 115: "sample", 116: "StartOffset", 121: "Mobi8-Boundary-Section", 125: "Embedded-Record-Count", 130: "Offline-Sample", 131: "Metadata-Record-Offset", 201: "CoverOffset", 202: "ThumbOffset", 203: "HasFakeCover", 204: "Creator-Software", 205: "Creator-Major-Version", 206: "Creator-Minor-Version", 207: "Creator-Build-Number", 401: "Clipping-Limit", 402: "Publisher-Limit", 404: "Text-to-Speech-Disabled", 406: "Rental-Expiration-Time", } id_map_hexstrings = { 208: "Watermark_(hex)", 209: "Tamper-Proof-Keys_(hex)", 300: "Font-Signature_(hex)", 403: "Unknown_(403)_(hex)", 405: "Ownership-Type_(hex)", 407: "Unknown_(407)_(hex)", 420: "Multimedia-Content-Reference_(hex)", 450: "Locations_Match_(hex)", 451: "Full-Story-Length_(hex)", 452: "Sample-Start_Location_(hex)", 453: "Sample-End-Location_(hex)", } def __init__(self, sect, sectNumber): self.sect = sect self.start = sectNumber self.header = self.sect.loadSection(self.start) if len(self.header) > 20 and self.header[16:20] == b"MOBI": self.sect.setsectiondescription(0, "Mobipocket Header") self.palm = False elif self.sect.ident == b"TEXtREAd": self.sect.setsectiondescription(0, "PalmDOC Header") self.palm = True else: raise unpackException("Unknown File Format") (self.records,) = struct.unpack_from(b">H", self.header, 0x8) # set defaults in case this is a PalmDOC self.title = self.sect.palmname.decode("latin-1", errors="replace") self.length = len(self.header) - 16 self.type = 3 self.codepage = 1252 self.codec = "windows-1252" self.unique_id = 0 self.version = 0 self.hasExth = False self.exth = b"" self.exth_offset = self.length + 16 self.exth_length = 0 self.crypto_type = 0 self.firstnontext = self.start + self.records + 1 self.firstresource = self.start + self.records + 1 self.ncxidx = 0xFFFFFFFF self.metaOrthIndex = 0xFFFFFFFF self.metaInflIndex = 0xFFFFFFFF self.skelidx = 0xFFFFFFFF self.fragidx = 0xFFFFFFFF self.guideidx = 0xFFFFFFFF self.fdst = 0xFFFFFFFF self.mlstart = self.sect.loadSection(self.start + 1)[:4] self.rawSize = 0 self.metadata = dict_() # set up for decompression/unpacking (self.compression,) = struct.unpack_from(b">H", self.header, 0x0) if self.compression == 0x4448: reader = HuffcdicReader() huffoff, huffnum = struct.unpack_from(b">LL", self.header, 0x70) huffoff = huffoff + self.start self.sect.setsectiondescription(huffoff, "Huffman Compression Seed") reader.loadHuff(self.sect.loadSection(huffoff)) for i in range(1, huffnum): self.sect.setsectiondescription( huffoff + i, "Huffman CDIC Compression Seed %d" % i ) reader.loadCdic(self.sect.loadSection(huffoff + i)) self.unpack = reader.unpack elif self.compression == 2: self.unpack = PalmdocReader().unpack elif self.compression == 1: self.unpack = UncompressedReader().unpack else: raise unpackException("invalid compression type: 0x%4x" % self.compression) if self.palm: return ( self.length, self.type, self.codepage, self.unique_id, self.version, ) = struct.unpack(b">LLLLL", self.header[20:40]) codec_map = { 1252: "windows-1252", 65001: "utf-8", } if self.codepage in codec_map: self.codec = codec_map[self.codepage] # title toff, tlen = struct.unpack(b">II", self.header[0x54:0x5C]) tend = toff + tlen self.title = self.header[toff:tend].decode(self.codec, errors="replace") (exth_flag,) = struct.unpack(b">L", 
        self.hasExth = exth_flag & 0x40
        self.exth_offset = self.length + 16
        self.exth_length = 0
        if self.hasExth:
            (self.exth_length,) = struct.unpack_from(
                b">L", self.header, self.exth_offset + 4
            )
            self.exth_length = (
                (self.exth_length + 3) >> 2
            ) << 2  # round to next 4 byte boundary
            self.exth = self.header[
                self.exth_offset : self.exth_offset + self.exth_length
            ]

        # parse the exth / metadata
        self.parseMetaData()

        # self.mlstart = self.sect.loadSection(self.start+1)
        # self.mlstart = self.mlstart[0:4]
        (self.crypto_type,) = struct.unpack_from(b">H", self.header, 0xC)

        # Start sector for additional files such as images, fonts, resources, etc
        # Can be missing so fall back to default set previously
        (ofst,) = struct.unpack_from(b">L", self.header, 0x6C)
        if ofst != 0xFFFFFFFF:
            self.firstresource = ofst + self.start
        (ofst,) = struct.unpack_from(b">L", self.header, 0x50)
        if ofst != 0xFFFFFFFF:
            self.firstnontext = ofst + self.start

        if self.isPrintReplica():
            return

        if self.version < 8:
            # Dictionary metaOrthIndex
            (self.metaOrthIndex,) = struct.unpack_from(b">L", self.header, 0x28)
            if self.metaOrthIndex != 0xFFFFFFFF:
                self.metaOrthIndex += self.start

            # Dictionary metaInflIndex
            (self.metaInflIndex,) = struct.unpack_from(b">L", self.header, 0x2C)
            if self.metaInflIndex != 0xFFFFFFFF:
                self.metaInflIndex += self.start

        # handle older headers without any ncxindex info and later
        # specifically 0xe4 headers
        if self.length + 16 < 0xF8:
            return

        # NCX Index
        (self.ncxidx,) = struct.unpack(b">L", self.header[0xF4:0xF8])
        if self.ncxidx != 0xFFFFFFFF:
            self.ncxidx += self.start

        # K8 specific Indexes
        if self.start != 0 or self.version == 8:
            # Index into file skeletons in RawML
            (self.skelidx,) = struct.unpack_from(b">L", self.header, 0xFC)
            if self.skelidx != 0xFFFFFFFF:
                self.skelidx += self.start

            # Index into fragment sections in RawML
            (self.fragidx,) = struct.unpack_from(b">L", self.header, 0xF8)
            if self.fragidx != 0xFFFFFFFF:
                self.fragidx += self.start

            # Index into Other files
            (self.guideidx,) = struct.unpack_from(b">L", self.header, 0x104)
            if self.guideidx != 0xFFFFFFFF:
                self.guideidx += self.start

            # dictionaries do not seem to use the same approach in K8's
            # so disable them
            self.metaOrthIndex = 0xFFFFFFFF
            self.metaInflIndex = 0xFFFFFFFF

            # need to use the FDST record to find out how to properly unpack
            # the rawML into pieces
            # it is simply a table of start and end locations for each flow piece
            (self.fdst,) = struct.unpack_from(b">L", self.header, 0xC0)
            (self.fdstcnt,) = struct.unpack_from(b">L", self.header, 0xC4)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdst = 0xFFFFFFFF
            if self.fdst != 0xFFFFFFFF:
                self.fdst += self.start
                # setting of fdst section description properly handled in mobi_kf8proc

    def dump_exth(self):
        # determine text encoding
        codec = self.codec
        if (not self.hasExth) or (self.exth_length == 0) or (self.exth == b""):
            return
        (num_items,) = struct.unpack(b">L", self.exth[8:12])
        pos = 12
        logger.debug("Key Size Description Value")
        for _ in range(num_items):
            id, size = struct.unpack(b">LL", self.exth[pos : pos + 8])
            contentsize = size - 8
            content = self.exth[pos + 8 : pos + size]
            if id in MobiHeader.id_map_strings:
                exth_name = MobiHeader.id_map_strings[id]
                logger.debug(
                    "{0: >3d} {1: >4d} {2: <30s} {3:s}".format(
                        id,
                        contentsize,
                        exth_name,
                        content.decode(codec, errors="replace"),
                    )
                )
            elif id in MobiHeader.id_map_values:
                exth_name = MobiHeader.id_map_values[id]
                if size == 9:
                    (value,) = struct.unpack(b"B", content)
                    logger.debug(
                        "{0:3d} byte {1:<30s} {2:d}".format(id, exth_name, value)
                    )
                elif size == 10:
                    (value,) = struct.unpack(b">H", content)
                    logger.debug(
                        "{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})".format(
                            id, exth_name, value
                        )
                    )
                elif size == 12:
                    (value,) = struct.unpack(b">L", content)
                    logger.debug(
                        "{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})".format(
                            id, exth_name, value
                        )
                    )
                else:
                    logger.debug(
                        "{0: >3d} {1: >4d} {2: <30s} (0x{3:s})".format(
                            id,
                            contentsize,
                            "Bad size for " + exth_name,
                            hexlify(content),
                        )
                    )
            elif id in MobiHeader.id_map_hexstrings:
                exth_name = MobiHeader.id_map_hexstrings[id]
                logger.debug(
                    "{0:3d} {1:4d} {2:<30s} 0x{3:s}".format(
                        id, contentsize, exth_name, hexlify(content)
                    )
                )
            else:
                exth_name = "Unknown EXTH ID {0:d}".format(id)
                logger.debug(
                    "{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(
                        id, contentsize, exth_name, hexlify(content)
                    )
                )
            pos += size
        return

    def dumpheader(self):
        # first 16 bytes are not part of the official mobiheader
        # but we will treat it as such
        # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
        logger.debug(
            "Dumping section %d, Mobipocket Header version: %d, total length %d"
            % (self.start, self.version, self.length + 16)
        )
        self.hdr = {}
        # set it up for the proper header version
        if self.version == 0:
            self.mobi_header = MobiHeader.palmdoc_header
            self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
        elif self.version < 8:
            self.mobi_header = MobiHeader.mobi6_header
            self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
        else:
            self.mobi_header = MobiHeader.mobi8_header
            self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys

        # parse the header information
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                (val,) = struct.unpack_from(format, self.header, pos)
                self.hdr[key] = val
val if "title_offset" in self.hdr: title_offset = self.hdr["title_offset"] title_length = self.hdr["title_length"] else: title_offset = 0 title_length = 0 if title_offset == 0: title_offset = len(self.header) title_length = 0 self.title = self.sect.palmname.decode("latin-1", errors="replace") else: self.title = self.header[title_offset : title_offset + title_length].decode( self.codec, errors="replace" ) # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary title_length = ((title_length + 2 + 3) >> 2) << 2 self.extra1 = self.header[self.exth_offset + self.exth_length : title_offset] self.extra2 = self.header[title_offset + title_length :] logger.debug("Mobipocket header from section %d" % self.start) logger.debug(" Offset Value Hex Dec Description") for key in self.mobi_header_sorted_keys: (pos, format, tot_len) = self.mobi_header[key] if pos < (self.length + 16): if key != "magic": fmt_string = ( "0x{0:0>3X} ({0:3d}){1: >" + str(9 - 2 * tot_len) + "s}0x{2:0>" + str(2 * tot_len) + "X} {2:10d} {3:s}" ) else: self.hdr[key] = unicode_str(self.hdr[key]) fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}" logger.debug(fmt_string.format(pos, " ", self.hdr[key], key)) logger.debug("") if self.exth_length > 0: logger.debug( "EXTH metadata, offset %d, padded length %d" % (self.exth_offset, self.exth_length) ) self.dump_exth() logger.debug("") if len(self.extra1) > 0: logger.debug( "Extra data between EXTH and Title, length %d" % len(self.extra1) ) logger.debug(hexlify(self.extra1)) logger.debug("") if title_length > 0: logger.debug( "Title in header at offset %d, padded length %d: '%s'" % (title_offset, title_length, self.title) ) logger.debug("") if len(self.extra2) > 0: print( "Extra data between Title and end of header, length %d" % len(self.extra2) ) #print(hexlify(self.extra2)) print("") def isPrintReplica(self): return self.mlstart[0:4] == b"%MOP" def isK8(self): return self.start != 0 or self.version == 8 def isEncrypted(self): return self.crypto_type != 0 def hasNCX(self): return self.ncxidx != 0xFFFFFFFF def isDictionary(self): return self.metaOrthIndex != 0xFFFFFFFF def getncxIndex(self): return self.ncxidx def decompress(self, data): return self.unpack(data) def Language(self): langcode = struct.unpack(b"!L", self.header[0x5C:0x60])[0] langid = langcode & 0xFF sublangid = (langcode >> 8) & 0xFF return getLanguage(langid, sublangid) def DictInLanguage(self): if self.isDictionary(): langcode = struct.unpack(b"!L", self.header[0x60:0x64])[0] langid = langcode & 0xFF sublangid = (langcode >> 10) & 0xFF if langid != 0: return getLanguage(langid, sublangid) return False def DictOutLanguage(self): if self.isDictionary(): langcode = struct.unpack(b"!L", self.header[0x64:0x68])[0] langid = langcode & 0xFF sublangid = (langcode >> 10) & 0xFF if langid != 0: return getLanguage(langid, sublangid) return False def getRawML(self): def getSizeOfTrailingDataEntry(data): num = 0 for v in data[-4:]: if bord(v) & 0x80: num = 0 num = (num << 7) | (bord(v) & 0x7F) return num def trimTrailingDataEntries(data): for _ in range(trailers): num = getSizeOfTrailingDataEntry(data) data = data[:-num] if multibyte: num = (ord(data[-1:]) & 3) + 1 data = data[:-num] return data multibyte = 0 trailers = 0 if self.sect.ident == b"BOOKMOBI": (mobi_length,) = struct.unpack_from(b">L", self.header, 0x14) (mobi_version,) = struct.unpack_from(b">L", self.header, 0x68) if (mobi_length >= 0xE4) and (mobi_version >= 5): (flags,) = struct.unpack_from(b">H", self.header, 0xF2) multibyte = 
                while flags > 1:
                    if flags & 2:
                        trailers += 1
                    flags = flags >> 1

        # get raw mobi markup language
        logger.debug("Unpacking raw markup language")
        dataList = []
        # offset = 0
        for i in range(1, self.records + 1):
            data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
            dataList.append(self.unpack(data))
            if self.isK8():
                self.sect.setsectiondescription(
                    self.start + i, "KF8 Text Section {0:d}".format(i)
                )
            elif self.version == 0:
                self.sect.setsectiondescription(
                    self.start + i, "PalmDOC Text Section {0:d}".format(i)
                )
            else:
                self.sect.setsectiondescription(
                    self.start + i, "Mobipocket Text Section {0:d}".format(i)
                )
        rawML = b"".join(dataList)
        self.rawSize = len(rawML)
        return rawML

    # all metadata is stored in a dictionary with key and returns a *list* of values
    # a list is used to allow for multiple creators, multiple contributors, etc
    def parseMetaData(self):
        def addValue(name, value):
            if name not in self.metadata:
                self.metadata[name] = [value]
            else:
                self.metadata[name].append(value)

        codec = self.codec
        if self.hasExth:
            extheader = self.exth
            _length, num_items = struct.unpack(b">LL", extheader[4:12])
            extheader = extheader[12:]
            pos = 0
            for _ in range(num_items):
                id, size = struct.unpack(b">LL", extheader[pos : pos + 8])
                content = extheader[pos + 8 : pos + size]
                if id in MobiHeader.id_map_strings:
                    name = MobiHeader.id_map_strings[id]
                    addValue(name, content.decode(codec, errors="replace"))
                elif id in MobiHeader.id_map_values:
                    name = MobiHeader.id_map_values[id]
                    if size == 9:
                        (value,) = struct.unpack(b"B", content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 10:
                        (value,) = struct.unpack(b">H", content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 12:
                        (value,) = struct.unpack(b">L", content)
                        # handle special case of missing CoverOffset or missing ThumbOffset
                        if id == 201 or id == 202:
                            if value != 0xFFFFFFFF:
                                addValue(name, unicode_str(str(value)))
                        else:
                            addValue(name, unicode_str(str(value)))
                    else:
                        logger.debug(
                            "Warning: Bad key, size, value combination detected in EXTH %s %s %s"
                            % (id, size, hexlify(content))
                        )
                        addValue(name, hexlify(content))
                elif id in MobiHeader.id_map_hexstrings:
                    name = MobiHeader.id_map_hexstrings[id]
                    addValue(name, hexlify(content))
                else:
                    name = unicode_str(str(id)) + " (hex)"
                    addValue(name, hexlify(content))
                pos += size

        # add the basics to the metadata each as a list element
        self.metadata["Language"] = [self.Language()]
        self.metadata["Title"] = [unicode_str(self.title, self.codec)]
        self.metadata["Codec"] = [self.codec]
        self.metadata["UniqueID"] = [unicode_str(str(self.unique_id))]
        # if no asin create one using a uuid
        if "ASIN" not in self.metadata:
            self.metadata["ASIN"] = [unicode_str(str(uuid.uuid4()))]
        # if no cdeType set it to "EBOK"
        if "cdeType" not in self.metadata:
            self.metadata["cdeType"] = ["EBOK"]

    def getMetaData(self):
        return self.metadata

    def describeHeader(self, DUMP):
        logger.debug("Mobi Version: %s" % self.version)
        logger.debug("Codec: %s" % self.codec)
        logger.debug("Title: %s" % self.title)
        if "Updated_Title" in self.metadata:
            logger.debug("EXTH Title: %s" % self.metadata["Updated_Title"][0])
        if self.compression == 0x4448:
            logger.debug("Huffdic compression")
        elif self.compression == 2:
            logger.debug("Palmdoc compression")
        elif self.compression == 1:
            logger.debug("No compression")
        if DUMP:
            self.dumpheader()
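

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): it illustrates the
# duck-typed "sect" interface that MobiHeader actually uses above
# (loadSection, setsectiondescription, ident, palmname) with a minimal
# in-memory PalmDOC stub. The _StubSectionizer class and the fabricated
# record data are illustrative assumptions only; real callers would pass the
# package's own section loader instead.
if __name__ == "__main__":

    class _StubSectionizer(object):
        """Bare-minimum stand-in for the package's section loader."""

        ident = b"TEXtREAd"          # PalmDOC ident; b"BOOKMOBI" would take the MOBI path
        palmname = b"Demo PalmDOC"   # used as the fallback title

        def __init__(self, sections):
            self._sections = sections

        def loadSection(self, n):
            return self._sections[n]

        def setsectiondescription(self, n, description):
            logger.debug("section %d: %s" % (n, description))

    text = b"Hello, PalmDOC world."
    # 16-byte PalmDOC record 0: compression=1 (uncompressed), fill, text
    # length, 1 text record, 4096-byte max record size, read position 0
    # (matches the palmdoc_header layout above).
    record0 = struct.pack(b">HHLHHL", 1, 0, len(text), 1, 4096, 0)

    mh = MobiHeader(_StubSectionizer([record0, text]), 0)
    print(mh.title, mh.version, mh.codec)
    print(mh.getRawML())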