pypdf.generic._utils 源代码

import codecs
from typing import Dict, List, Tuple, Union

from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject


def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into normalised RGB components.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``
            characters (every leading ``#`` is stripped).

    Returns:
        A ``(red, green, blue)`` tuple in which each component is the
        matching two-digit hex pair divided by 255.0.

    Raises:
        ValueError: If one of the three two-character slices is not a valid
            hexadecimal number (for instance when the string is too short).
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[offset : offset + 2], 16) / 255.0 for offset in (0, 2, 4)
    )
    return red, green, blue
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Parse a hexadecimal string (``<...>``) from ``stream``.

    The byte at the current position is consumed without being inspected
    (presumably the opening ``<`` -- confirm against callers). Tokens returned
    by ``read_non_whitespace`` are then paired up and converted from base 16
    until ``>`` is returned.

    Args:
        stream: The stream to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
        ValueError: If a collected pair is not valid hexadecimal.
    """
    stream.read(1)
    byte_values = []
    pending = b""
    # iter() with a sentinel stops as soon as the closing ">" is returned.
    for digit in iter(lambda: read_non_whitespace(stream), b">"):
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += digit
        if len(pending) == 2:
            byte_values.append(int(pending, base=16))
            pending = b""
    # An odd number of digits: the missing final digit is taken to be 0.
    if len(pending) == 1:
        pending += b"0"
    if pending != b"":
        byte_values.append(int(pending, base=16))
    return create_string_object(bytes(byte_values), forced_encoding)
# Single-character escape sequences recognised after a backslash by
# read_string_from_stream, mapped to the byte value they stand for.
# The first five denote control characters (\n, \r, \t, \b, \f) ...
__ESPACE_DICT__ = dict(
    zip((b"n", b"r", b"t", b"b", b"f"), b"\n\r\t\b\f")
)
# ... and every remaining entry simply stands for itself.
__ESPACE_DICT__.update(
    (bytes((code,)), code) for code in b"()/\\ %<>[]#_&$"
)

# Byte value of the backslash character.
__BACKSLASH_CODE__ = ord(b"\\")
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a literal string (``(...)``) from ``stream`` and resolve its escapes.

    The byte at the current position is consumed without being inspected
    (presumably the opening parenthesis -- confirm against callers). Bytes are
    then read one at a time until the parenthesis balancing it is found;
    nested unescaped parentheses are kept in the result.

    Backslash handling:

    * a character listed in ``__ESPACE_DICT__`` is replaced by its mapped byte;
    * one to three octal digits are replaced by the byte they encode; if the
      value exceeds 255 a literal backslash is emitted instead and the digits
      are re-read as ordinary characters;
    * a backslash followed by an end-of-line marker (CR, LF or CR LF) is a
      line continuation and contributes nothing;
    * any other character triggers a warning and both the backslash and the
      character are kept.

    Args:
        stream: The seekable stream to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.
    """
    stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # A dangling backslash at end of data is a truncated stream.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok in __ESPACE_DICT__:
                txt.append(__ESPACE_DICT__[tok])
                continue
            if b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                sav = stream.tell() - 1
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok has to be analyzed by the main loop; only
                        # step back if a byte was actually consumed.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                i = int(tok, base=8)
                if i > 255:
                    # Not representable as one byte: keep the backslash and
                    # rewind so the digits are read as plain characters.
                    txt.append(__BACKSLASH_CODE__)
                    stream.seek(sav)
                else:
                    txt.append(i)
                continue
            if tok in b"\n\r":
                # Escaped line break: nothing is added to the string. Only
                # CR LF forms a two-byte end-of-line marker, so a second byte
                # is consumed solely when an LF follows a CR; any other byte
                # (including another CR or LF) belongs to the string.
                if tok == b"\r":
                    ntok = stream.read(1)
                    if ntok and ntok != b"\n":
                        stream.seek(-1, 1)
                continue
            msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
            logger_warning(msg, __name__)
            txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Build the string object that best represents ``string``.

    Args:
        string: The data being wrapped.
        forced_encoding: How to interpret ``bytes`` input:

            * ``None`` -- auto-detect: UTF-16 (by BOM, or by a zero byte in
              the first or second position), then PDFDocEncoding, and
              finally fall back to raw bytes;
            * ``"bytes"`` -- keep the data as a ``ByteStringObject``;
            * any other ``str`` -- codec name handed to ``bytes.decode``;
            * ``list`` / ``dict`` -- per-byte lookup table; bytes whose
              lookup fails are decoded with ``"charmap"``.

    Returns:
        A ``TextStringObject`` when the data could be turned into text
        (``str`` input is wrapped directly), otherwise a
        ``ByteStringObject``. Text objects built from bytes keep the raw
        data in ``_original_bytes``.

    Raises:
        TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        text = ""
        for byte in string:
            try:
                text += forced_encoding[byte]
            except Exception:
                text += bytes((byte,)).decode("charmap")
        result = TextStringObject(text)
        result._original_bytes = string
        return result

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        result = TextStringObject(string.decode(forced_encoding))
        result._original_bytes = string
        return result

    try:
        # Pick the UTF-16 flavour (if any) and the BOM to remember for it.
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            result = TextStringObject(string.decode(codec))
            result._original_bytes = string
            result.autodetect_utf16 = True
            result.utf16_bom = bom
            return result

        # Some strings are text, some are just byte arrays; the only way to
        # find out whether the data is valid PDFDocEncoding is to try.
        result = TextStringObject(decode_pdfdocencoding(string))
        result._original_bytes = string
        result.autodetect_pdfdocencoding = True
        return result
    except UnicodeDecodeError:
        return ByteStringObject(string)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode ``byte_array`` using the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The raw bytes to decode.

    Returns:
        The decoded text, one table entry per input byte.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            i.e. it has no translation. The exception's ``object``,
            ``start`` and ``end`` identify the input and the offending byte.
    """
    chars = []
    for index, byte in enumerate(byte_array):
        char = _pdfdoc_encoding[byte]
        if char == "\u0000":
            # Report the real data and position; bytearray(<int>) would
            # build a zero-filled buffer of that length instead.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)