import fitz
import pdfplumber
import base64
import io
import sys
from pprint import pprint

def extract_pdf_llm_ready(file_path) -> dict:
    """Extract text, tables and embedded images from a PDF into one dict.

    Text and tables are read with pdfplumber; images are read with PyMuPDF
    (``fitz``), rendered to PNG and stored as base64 strings.

    Args:
        file_path: Location of the PDF. It is passed unchanged to both
            ``pdfplumber.open`` and ``fitz.open``.

    Returns:
        A dict with four keys:

        * ``"text"``: the text of every page, each followed by a newline.
        * ``"pages"``: one dict per page holding ``"page"`` (1-based page
          number), ``"text"``, ``"images"`` and ``"tables"``.
        * ``"tables"``: one ``{"page", "tables"}`` dict per page.
        * ``"images"``: a flat list of every image as
          ``{"page", "index", "mime_type", "base64"}``.
    """
    result = {
        "text": "",
        "pages": [],
        "tables": [],
        "images": [],
    }

    # -------- TEXT + TABLES ----------
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Fall back to an empty string when no text is returned.
            page_text = page.extract_text() or ""
            tables = page.extract_tables()

            page_texts.append(page_text)
            result["pages"].append({
                "page": page_number,
                "text": page_text,
                "images": [],  # filled in by the image pass below
                "tables": tables,
            })
            result["tables"].append({
                "page": page_number,
                "tables": tables,
            })

    # Build the full text once instead of growing a string page by page.
    result["text"] = "".join(text + "\n" for text in page_texts)

    # -------- IMAGES ----------
    doc = fitz.open(file_path)
    try:
        for page_num in range(len(doc)):
            image_list = doc.get_page_images(page_num, full=True)

            for img_index, img in enumerate(image_list):
                xref = img[0]  # first item of the image entry is its xref
                pix = fitz.Pixmap(doc, xref)

                # PNG output only supports gray and RGB data. Anything with
                # more than three colour components (i.e. CMYK, with or
                # without alpha) must be converted to RGB first.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                img_bytes = pix.tobytes("png")

                # Drop the pixmap reference as soon as the bytes are taken.
                pix = None

                image_obj = {
                    "index": img_index,
                    "mime_type": "image/png",
                    "base64": base64.b64encode(img_bytes).decode("utf-8"),
                }

                # NOTE(review): assumes fitz and pdfplumber report the same
                # pages in the same order, so page_num indexes the entry
                # created in the text pass — confirm for unusual PDFs.
                result["pages"][page_num]["images"].append(image_obj)

                result["images"].append({
                    "page": page_num + 1,
                    **image_obj,
                })
    finally:
        # Always release the document, even if image extraction fails.
        doc.close()

    return result


if __name__ == "__main__":
    # Command-line entry point: extract the PDF named by the first argument
    # and pretty-print the resulting dict.
    if len(sys.argv) < 2:
        # Exit with a usage message (non-zero status) rather than letting
        # a missing argument surface as an IndexError traceback.
        sys.exit("usage: python <script> <pdf_path>")

    pdf_path = sys.argv[1]
    data = extract_pdf_llm_ready(pdf_path)
    pprint(data)
