cornsnake.util_pdf

Functions for extracting text from a PDF file and checking if a file is a PDF.

Documentation

 1"""
 2Functions for extracting text from a PDF file and checking if a file is a PDF.
 3
 4[Documentation](http://docs.mrseanryan.cornsnake.s3-website-eu-west-1.amazonaws.com/cornsnake/util_pdf.html)
 5"""
 6
 7
 8def extract_text_from_pdf(
 9    filepath: str, start_page: int = -1, end_page: int = -1
10) -> str:
11    """
12    Function to extract text from a PDF file.
13
14    Args:
15    filepath (str): The path to the PDF file.
16    start_page (int, optional): The page to start extracting from (1-indexed).
17    end_page (int, optional): The last page to extract (1-indexed).
18
19    Returns:
20    str: The extracted text from the PDF file.
21    """
22    import fitz  # try avoid forcing install of PyMuPDF unless actually used
23
24    doc = fitz.open(filepath)
25    text = []
26
27    # Clamp page numbers to valid range
28    start = max(start_page, 1)
29    end = min(end_page, doc.page_count)
30    if end == -1:
31        end = doc.page_count
32
33    for page_num in range(start - 1, end):  # zero-based index for fitz
34        page = doc.load_page(page_num)
35        text.append(page.get_text())
36
37    doc.close()
38    return "\n".join(text)
39
40
def is_pdf(filepath: str) -> bool:
    """
    Check whether a file path has a PDF extension.

    Only the path suffix is examined; the file is not opened or read.
    The comparison is case-insensitive, so "REPORT.PDF" also counts.

    Args:
    filepath (str): The path to the file.

    Returns:
    bool: True if the path ends with ".pdf" (in any letter case), False otherwise.
    """
    return filepath.lower().endswith(".pdf")
def extract_text_from_pdf(filepath: str, start_page: int = -1, end_page: int = -1) -> str:
 9def extract_text_from_pdf(
10    filepath: str, start_page: int = -1, end_page: int = -1
11) -> str:
12    """
13    Function to extract text from a PDF file.
14
15    Args:
16    filepath (str): The path to the PDF file.
17    start_page (int, optional): The page to start extracting from (1-indexed).
18    end_page (int, optional): The last page to extract (1-indexed).
19
20    Returns:
21    str: The extracted text from the PDF file.
22    """
23    import fitz  # try avoid forcing install of PyMuPDF unless actually used
24
25    doc = fitz.open(filepath)
26    text = []
27
28    # Clamp page numbers to valid range
29    start = max(start_page, 1)
30    end = min(end_page, doc.page_count)
31    if end == -1:
32        end = doc.page_count
33
34    for page_num in range(start - 1, end):  # zero-based index for fitz
35        page = doc.load_page(page_num)
36        text.append(page.get_text())
37
38    doc.close()
39    return "\n".join(text)

Function to extract text from a PDF file.

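Args: filepath (str): The path to the PDF file. start_page (int, optional): The first page to extract (1-indexed); values below 1, including the default -1, start at the first page. end_page (int, optional): The last page to extract (1-indexed, inclusive); the default -1 extracts through the last page.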

Returns: str: The extracted text from the PDF file.

def is_pdf(filepath: str) -> bool:
def is_pdf(filepath: str) -> bool:
    """
    Check whether a file path has a PDF extension.

    Only the path suffix is examined; the file is not opened or read.
    The comparison is case-insensitive, so "REPORT.PDF" also counts.

    Args:
    filepath (str): The path to the file.

    Returns:
    bool: True if the path ends with ".pdf" (in any letter case), False otherwise.
    """
    return filepath.lower().endswith(".pdf")

Function to check if a file is a PDF, judged by the ".pdf" suffix of its path (the file contents are not inspected).

Args: filepath (str): The path to the file.

Returns: bool: True if the file is a PDF, False otherwise.