o
    f 1                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ	 ddl
mZ ddlmZ ejZeeZG dd de	jZG dd de	jZdS )	)PdfTextPagePdfTextSearcher    N)PdfiumError)PDFIUM_INFOc                       s   e Zd ZdZ fddZedd ZdddZdddZdddZ	dd Z
d ddZdd Zd!ddZdd Zd"ddZ  ZS )#r   z
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    c                       || _ || _t tj d S N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr   r	   	__class__ T/home/ubuntu/webapp/venv/lib/python3.10/site-packages/pypdfium2/_helpers/textpage.pyr         zPdfTextPage.__init__c                 C      | j S r   )r	   r   r   r   r   parent!      zPdfTextPage.parentr   c                 C   sp   ||krdS t | |}|dkr| |d ||d |S t | |}|dkr2| ||d ||d S ||||fS )Nr      )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_endr   r   r   r   &   s   z"PdfTextPage._get_active_text_ranger   ignoreFc                 C   s  ||fdkr|st d | j|dS |dkr|  | }| ||| d }|dkr-dS |\}}}}	||7 }|||	 8 }|d | }
dtj  k rNd	k rTn n|
d
9 }
|
d7 }
t|
d
 }t	|t
tj}t| |||}|
|ksJ d|
 d| |jd|d d
  jd|dS )a  
        Warning:
            .. versionchanged:: 4.28
               For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
        
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r   r   z]get_text_range() call with default params will be implicitly redirected to get_text_bounded()errorsr   r   r    i  i     zBuffer too small: z vs N	utf-16-le)warningswarnget_text_boundedcount_charsr   r   buildctypescreate_string_buffercastPOINTERc_ushortr   FPDFText_GetTextr   decode)r   indexcountr%   
force_thisactive_ranger    r!   r   r   in_countbuffer
buffer_ptr	out_countr   r   r   get_text_range6   s(   
 zPdfTextPage.get_text_rangeNc                 C   s   | j  }|du r|d }|du r|d }|du r|d }|du r%|d }| ||||f}tjg |ddR  }|dkr>dS t|d }	t|	ttj}
tjg ||
|R   |	j	j
d|dS )	a  
        Extract text from given boundaries in PDF coordinates.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        Nr   r   r'      r&   r(   r$   )r	   get_bboxr   FPDFText_GetBoundedTextr.   r/   r0   r1   r2   r   r4   )r   leftbottomrighttopr%   bboxargsn_charsr:   r;   r   r   r   r+   q   s"   
zPdfTextPage.get_text_boundedc                 C   s   t | }|dkrtd|S )zV
        Returns:
            int: The number of characters on the text page.
        r   zFailed to get character count.)r   FPDFText_CountCharsr   )r   rG   r   r   r   r,      s   
zPdfTextPage.count_charsc                 C   s"   t | ||}|dkrtd|S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r5   r6   n_rectsr   r   r   count_rects   s   zPdfTextPage.count_rectsc                 C   s"   t | ||||}|dk rdS |S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        r   N)r   FPDFText_GetCharIndexAtPos)r   xyx_toly_tolr5   r   r   r   	get_index   s   zPdfTextPage.get_indexc           	      C   s   |rt  }t | ||}|j|j|j|jf\}}}}n&t t t t f\}}}}t | |||||}|j	|j	|j	|j	f\}}}}|sHt
d||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxrA   rB   rC   rD   c_doubleFPDFText_GetCharBoxvaluer   )	r   r5   looserectoklbrtr   r   r   get_charbox   s   zPdfTextPage.get_charboxc                 C   sP   t  t  t  t  f\}}}}t| |||||}|std|j|j|j|jfS )al  
        Get the bounding box of a text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rT   r   FPDFText_GetRectr   rV   )r   r5   rZ   r[   r\   r]   rY   r   r   r   get_rect   s
   	zPdfTextPage.get_rectc                 C   s   t |dkr
tdd}|r|tjO }|r|tjO }|r!|tjO }|d d}t|t	tj
}t| |||}	t|	| }
| |
 |
S )au  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r(   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder.   r0   r1   r2   FPDFText_FindStartr   _add_kid)r   textr5   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearcherr   r   r   search   s   




zPdfTextPage.search)r   r   )r   r   r"   F)NNNNr"   r#   )F)r   FFF)__name__
__module____qualname____doc__r   propertyr   r   r=   r+   r,   rK   rQ   r^   r`   rs   __classcell__r   r   r   r   r      s    



; 

r   c                       sD   e Zd ZdZ fddZedd Zdd Zdd	 Zd
d Z	  Z
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                    r   r   )r   textpager
   r   r   FPDFText_FindClose)r   r   rz   r   r   r   r     r   zPdfTextSearcher.__init__c                 C   r   r   )rz   r   r   r   r   r     r   zPdfTextSearcher.parentc                 C   s,   || }|sd S t | }t | }||fS r   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcrY   r5   r6   r   r   r   _get_occurrence#  s   

zPdfTextSearcher._get_occurrencec                 C      |  tjS )z
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindNextr   r   r   r   get_next+     zPdfTextSearcher.get_nextc                 C   r   )z
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindPrevr   r   r   r   get_prev3  r   zPdfTextSearcher.get_prev)rt   ru   rv   rw   r   rx   r   r   r   r   ry   r   r   r   r   r     s    
r   )__all__r.   loggingr)   pypdfium2.rawr   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   pypdfium2.versionr   rT   	getLoggerrt   loggerAutoCloseabler   r   r   r   r   r   <module>   s   
 ~