# Source code for pypdf._text_extraction

"""
Code related to text extraction.

Some parts are still in _page.py. In doubt, they will stay there.
"""

import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding

# Inclusive bounds of an extra, user-configurable range of code points that
# are laid out right to left. Characters are compared by code point, which is
# never negative, so the default -1/-1 range matches no character.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Extra code points that are inserted following the current writing direction
# (handled like punctuation). Empty by default.
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []

class OrientationNotFoundError(Exception):
    """Raised when the orientation of a text chunk is not among the requested ones."""

def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point as is.
            If set to a one-character string, it is converted to its code
            point with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        _max: The new maximum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point as is.
            If set to a one-character string, it is converted to its code
            point with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of code points.
            If set to a list, that list object is stored as is (not copied).
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string whose length is not
            exactly one (raised by ``ord``).
    """
    # The settings live at module level; without this declaration the
    # assignments below would only bind function locals and have no effect.
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two transformation matrices given as six coefficients each.

    The arithmetic corresponds to the 3x3 matrix product ``M x N`` where a
    list ``[a, b, c, d, e, f]`` stands for the matrix with rows
    ``[a, b, 0]``, ``[c, d, 0]`` and ``[e, f, 1]``.

    Args:
        m: Left operand; indices 0 to 5 are read.
        n: Right operand; indices 0 to 5 are read.

    Returns:
        A new six-element list with the coefficients of the product.
    """
    return [
        m[0] * n[0] + m[1] * n[2],
        m[0] * n[1] + m[1] * n[3],
        m[2] * n[0] + m[3] * n[2],
        m[2] * n[1] + m[3] * n[3],
        m[4] * n[0] + m[5] * n[2] + n[4],
        m[4] * n[1] + m[5] * n[3] + n[5],
    ]
def orient(m: List[float]) -> int:
    """
    Classify the rotation encoded by a transformation matrix.

    Args:
        m: Matrix coefficients; only ``m[1]`` and ``m[3]`` are inspected.

    Returns:
        ``0`` if ``m[3]`` is greater than ``1e-6``, ``180`` if it is lower
        than ``-1e-6``; otherwise ``90`` if ``m[1]`` is positive, else ``270``.
    """
    if m[3] > 1e-6:
        return 0
    if m[3] < -1e-6:
        return 180
    if m[1] > 0:
        return 90
    return 270


def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Insert a line break or a space when the text position has moved enough.

    The previous and current positions are compared along the axes selected
    by the current orientation. A large move across the text line flushes
    ``text`` into ``output`` followed by ``"\\n"``; otherwise a large enough
    move along the line appends a single space to ``text``.

    Args:
        text: Text accumulated for the current chunk.
        cmtm_prev: Previous ``(cm, tm)`` matrices.
        cmtm_matrix: Current ``(cm, tm)`` matrices.
        memo_cmtm: ``(cm, tm)`` matrices passed on to ``visitor_text``.
        cmap: Font mapping information; only ``cmap[3]`` is used here, and it
            is forwarded to ``visitor_text``.
        orientations: Orientations (in degrees) that are accepted.
        output: Text already emitted.
        font_size: Current font size.
        visitor_text: Optional callback invoked with the flushed text when a
            line break is inserted.
        str_widths: Width of the text written so far.
        spacewidth: Width of a space.
        str_height: Height of the text written so far.

    Returns:
        ``(text, output, cm_prev, tm_prev)`` where the two matrices are
        copies of the current ``cm`` and ``tm`` matrices.

    Raises:
        OrientationNotFoundError: If the current orientation is not in
            ``orientations``.
    """
    cm_prev, tm_prev = cmtm_prev[0], cmtm_prev[1]
    cm_matrix, tm_matrix = cmtm_matrix[0], cmtm_matrix[1]
    memo_cm, memo_tm = memo_cmtm[0], memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0] ** 2 + tm_prev[1] ** 2)
    scale_prev_y = math.sqrt(tm_prev[2] ** 2 + tm_prev[3] ** 2)
    scale_y = math.sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2)

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    else:
        # orient() only returns 0, 90, 180 or 270, so this is 90 or 270:
        # the axes are swapped.
        moved_height = delta_x
        moved_width = delta_y
    try:
        pending = output + text
        line_threshold = 0.8 * min(str_height * scale_prev_y, font_size * scale_y)
        if abs(moved_height) > line_threshold:
            # Nothing is done while no text has been produced at all, nor
            # when the last character is already a line break.
            if pending and pending[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
                text = ""
        elif (
            moved_width >= (spacewidth + str_widths) * scale_prev_x
            and pending
            and pending[-1] != " "
        ):
            text += " "
    except Exception:
        # Best effort, unchanged from the previous behaviour: any failure in
        # the checks above (including one raised by ``visitor_text``) is
        # ignored and extraction goes on.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev


def get_text_operands(
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...]
) -> Tuple[str, bool]:
    """
    Decode the first operand of a text-showing operator into a string.

    Args:
        operands: Operator operands; only ``operands[0]`` is used.
        cm_matrix: Current ``cm`` matrix.
        tm_matrix: Current ``tm`` matrix.
        cmap: Font mapping information; ``cmap[0]`` is either a codec name or
            a dictionary mapping byte values to strings.
        orientations: Orientations (in degrees) that are accepted.

    Returns:
        ``(text, is_str_operands)``. ``text`` is ``""`` when the orientation
        is not accepted or there is no operand. ``is_str_operands`` is
        ``True`` only when ``operands[0]`` was already a ``str`` and was
        returned unchanged.
    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            t = operands[0]
            is_str_operands = True
        else:
            t = ""
            # operands[0] is not a str in this branch, so at runtime this
            # always keeps operands[0] as is.
            tt: bytes = (
                encode_pdfdocencoding(operands[0])
                if isinstance(operands[0], str)
                else operands[0]
            )
            if isinstance(cmap[0], str):
                try:
                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if cmap[0] == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                # Bytes missing from the mapping are kept as the character
                # with the same code point. chr() is used because decoding a
                # lone byte >= 0x80 as UTF-8 raises UnicodeDecodeError; for
                # bytes below 0x80 the result is the same.
                t = "".join(
                    cmap[0][x] if x in cmap[0] else chr(x) for x in tt
                )
    return (t, is_str_operands)


def get_display_str(
    text: str,
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> Tuple[str, bool]:
    """
    Append the characters of ``text_operands`` to ``text`` in display order.

    Every character is first translated through ``cmap[1]``. Right-to-left
    characters are prepended, left-to-right characters are appended, and
    neutral characters follow the current direction. Whenever the direction
    switches, the text built so far is handed to ``visitor_text`` (if any)
    and ``text`` restarts empty.

    Args:
        text: Text built so far for the current chunk.
        cm_matrix: Current ``cm`` matrix, forwarded to ``visitor_text``.
        tm_matrix: Current ``tm`` matrix, forwarded to ``visitor_text``.
        cmap: Font mapping information; ``cmap[1]`` maps characters to their
            replacement and ``cmap[3]`` is forwarded to ``visitor_text``.
        text_operands: Characters to add.
        font_size: Current font size, forwarded to ``visitor_text``.
        rtl_dir: ``True`` if the current writing direction is right to left.
        visitor_text: Optional callback invoked on every direction switch.

    Returns:
        ``(text, rtl_dir)`` with the updated text and writing direction.
    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for char in text_operands:
        x = cmap[1].get(char, char)
        # x can be a sequence of bytes ; ex: habibi.pdf
        # Multi-character replacements get the code 1, which falls in the
        # "keep the current inserting order" case below.
        xx = ord(x) if len(x) == 1 else 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)                        # up to "/": controls, space, punctuation
            or 0x3A <= xx <= 0x40               # ":" to "@" (digits 0x30-0x39 excluded)
            or 0x2000 <= xx <= 0x206F           # upper punctuation
            or 0x20A0 <= xx <= 0x21FF           # skips 0x2070-0x209F (indices/exponents)
            or xx in CUSTOM_RTL_SPECIAL_CHARS   # user-defined special characters
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                # Switching to right-to-left: flush what was built so far.
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                # Switching back to left-to-right: flush what was built so far.
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir