cornsnake.util_pdf
Functions for extracting text from a PDF file and checking if a file is a PDF.
"""
Functions for extracting text from a PDF file and checking if a file is a PDF.

[Documentation](http://docs.mrseanryan.cornsnake.s3-website-eu-west-1.amazonaws.com/cornsnake/util_pdf.html)
"""


def extract_text_from_pdf(
    filepath: str, start_page: int = -1, end_page: int = -1
) -> str:
    """
    Extract the text of a range of pages from a PDF file.

    Args:
        filepath (str): The path to the PDF file.
        start_page (int, optional): The first page to extract (1-indexed).
            Any value below 1, including the default of -1, starts at the
            first page.
        end_page (int, optional): The last page to extract (1-indexed,
            inclusive). The default of -1 means "through the last page";
            values beyond the page count are clamped to the last page.

    Returns:
        str: The text of each selected page, joined with newlines. Empty if
        the selected range contains no pages.

    Raises:
        ImportError: If PyMuPDF (the `fitz` module) is not installed.
    """
    import fitz  # imported lazily so PyMuPDF is only required when actually used

    doc = fitz.open(filepath)
    try:
        page_count = doc.page_count

        # Clamp the requested range to the pages that actually exist.
        first = max(start_page, 1)
        last = page_count if end_page == -1 else min(end_page, page_count)

        # fitz pages are zero-based, so 1-indexed [first, last] maps onto
        # range(first - 1, last).
        return "\n".join(
            doc.load_page(index).get_text() for index in range(first - 1, last)
        )
    finally:
        # Release the document even if a page fails to load or extract.
        doc.close()


def is_pdf(filepath: str) -> bool:
    """
    Check whether a file path has a PDF extension.

    Only the path is examined; the file is not opened and its contents are
    not inspected. The comparison ignores case, so both "a.pdf" and "a.PDF"
    are accepted.

    Args:
        filepath (str): The path to the file.

    Returns:
        bool: True if the path ends with ".pdf" (in any letter case),
        False otherwise.
    """
    return filepath.lower().endswith(".pdf")
def
extract_text_from_pdf(filepath: str, start_page: int = -1, end_page: int = -1) -> str:
def extract_text_from_pdf(
    filepath: str, start_page: int = -1, end_page: int = -1
) -> str:
    """
    Extract the text of a range of pages from a PDF file.

    Args:
        filepath (str): The path to the PDF file.
        start_page (int, optional): The first page to extract (1-indexed).
            Any value below 1, including the default of -1, starts at the
            first page.
        end_page (int, optional): The last page to extract (1-indexed,
            inclusive). The default of -1 means "through the last page";
            values beyond the page count are clamped to the last page.

    Returns:
        str: The text of each selected page, joined with newlines. Empty if
        the selected range contains no pages.

    Raises:
        ImportError: If PyMuPDF (the `fitz` module) is not installed.
    """
    import fitz  # imported lazily so PyMuPDF is only required when actually used

    doc = fitz.open(filepath)
    try:
        page_count = doc.page_count

        # Clamp the requested range to the pages that actually exist.
        first = max(start_page, 1)
        last = page_count if end_page == -1 else min(end_page, page_count)

        # fitz pages are zero-based, so 1-indexed [first, last] maps onto
        # range(first - 1, last).
        return "\n".join(
            doc.load_page(index).get_text() for index in range(first - 1, last)
        )
    finally:
        # Release the document even if a page fails to load or extract.
        doc.close()
Function to extract text from a PDF file.
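Args: filepath (str): The path to the PDF file. start_page (int, optional): The page to start extracting from (1-indexed); values below 1, including the default of -1, start at the first page. end_page (int, optional): The last page to extract (1-indexed, inclusive); the default of -1 means the last page of the document, and larger values are clamped to the page count.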
Returns: str: The extracted text from the PDF file.
def
is_pdf(filepath: str) -> bool:
def is_pdf(filepath: str) -> bool:
    """
    Check whether a file path has a PDF extension.

    Only the path is examined; the file is not opened and its contents are
    not inspected. The comparison ignores case, so both "a.pdf" and "a.PDF"
    are accepted.

    Args:
        filepath (str): The path to the file.

    Returns:
        bool: True if the path ends with ".pdf" (in any letter case),
        False otherwise.
    """
    return filepath.lower().endswith(".pdf")
Function to check if a file is a PDF, judged by whether its path ends with the ".pdf" extension (the file contents are not inspected).
Args: filepath (str): The path to the file.
Returns: bool: True if the file is a PDF, False otherwise.