From c0ca104998899148fd1eb8c08917ea518bf6c1d7 Mon Sep 17 00:00:00 2001
From: zlei9 <zlei9@126.com>
Date: Sun, 22 Mar 2026 11:43:37 +0800
Subject: [PATCH] Initial commit: pdf skill

---
 LICENSE.txt                               |  30 ++
 SKILL.md                                  | 314 +++++++++++
 forms.md                                  | 294 +++++++++++
 reference.md                              | 612 ++++++++++++++++++++++
 scripts/check_bounding_boxes.py           |  65 +++
 scripts/check_fillable_fields.py          |  11 +
 scripts/convert_pdf_to_images.py          |  33 ++
 scripts/create_validation_image.py        |  37 ++
 scripts/extract_form_field_info.py        | 122 +++++
 scripts/extract_form_structure.py         | 115 ++++
 scripts/fill_fillable_fields.py           |  98 ++++
 scripts/fill_pdf_form_with_annotations.py | 107 ++++
 12 files changed, 1838 insertions(+)
 create mode 100644 LICENSE.txt
 create mode 100644 SKILL.md
 create mode 100644 forms.md
 create mode 100644 reference.md
 create mode 100644 scripts/check_bounding_boxes.py
 create mode 100644 scripts/check_fillable_fields.py
 create mode 100644 scripts/convert_pdf_to_images.py
 create mode 100644 scripts/create_validation_image.py
 create mode 100644 scripts/extract_form_field_info.py
 create mode 100644 scripts/extract_form_structure.py
 create mode 100644 scripts/fill_fillable_fields.py
 create mode 100644 scripts/fill_pdf_form_with_annotations.py

diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..c55ab42
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,30 @@
+© 2025 Anthropic, PBC. All rights reserved.
+
+LICENSE: Use of these materials (including all code, prompts, assets, files,
+and other components of this Skill) is governed by your agreement with
+Anthropic regarding use of Anthropic's services. If no separate agreement
+exists, use is governed by Anthropic's Consumer Terms of Service or
+Commercial Terms of Service, as applicable:
+https://www.anthropic.com/legal/consumer-terms
+https://www.anthropic.com/legal/commercial-terms
+Your applicable agreement is referred to as the "Agreement." "Services" are
+as defined in the Agreement.
+
+ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the
+contrary, users may not:
+
+- Extract these materials from the Services or retain copies of these
+  materials outside the Services
+- Reproduce or copy these materials, except for temporary copies created
+  automatically during authorized use of the Services
+- Create derivative works based on these materials
+- Distribute, sublicense, or transfer these materials to any third party
+- Make, offer to sell, sell, or import any inventions embodied in these
+  materials
+- Reverse engineer, decompile, or disassemble these materials
+
+The receipt, viewing, or possession of these materials does not convey or
+imply any license or right beyond those expressly granted above.
+
+Anthropic retains all right, title, and interest in these materials,
+including all copyrights, patents, and other intellectual property rights.
diff --git a/SKILL.md b/SKILL.md
new file mode 100644
index 0000000..d3e046a
--- /dev/null
+++ b/SKILL.md
@@ -0,0 +1,314 @@
+---
+name: pdf
+description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill.
+license: Proprietary. LICENSE.txt has complete terms
+---
+
+# PDF Processing Guide
+
+## Overview
+
+This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. If you need to fill out a PDF form, read forms.md and follow its instructions.
+
+## Quick Start
+
+```python
+from pypdf import PdfReader, PdfWriter
+
+# Read a PDF
+reader = PdfReader("document.pdf")
+print(f"Pages: {len(reader.pages)}")
+
+# Extract text
+text = ""
+for page in reader.pages:
+    text += page.extract_text()
+```
+
+## Python Libraries
+
+### pypdf - Basic Operations
+
+#### Merge PDFs
+```python
+from pypdf import PdfWriter, PdfReader
+
+writer = PdfWriter()
+for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
+    reader = PdfReader(pdf_file)
+    for page in reader.pages:
+        writer.add_page(page)
+
+with open("merged.pdf", "wb") as output:
+    writer.write(output)
+```
+
+#### Split PDF
+```python
+reader = PdfReader("input.pdf")
+for i, page in enumerate(reader.pages):
+    writer = PdfWriter()
+    writer.add_page(page)
+    with open(f"page_{i+1}.pdf", "wb") as output:
+        writer.write(output)
+```
+
+#### Extract Metadata
+```python
+reader = PdfReader("document.pdf")
+meta = reader.metadata
+print(f"Title: {meta.title}")
+print(f"Author: {meta.author}")
+print(f"Subject: {meta.subject}")
+print(f"Creator: {meta.creator}")
+```
+
+#### Rotate Pages
+```python
+reader = PdfReader("input.pdf")
+writer = PdfWriter()
+
+page = reader.pages[0]
+page.rotate(90)  # Rotate 90 degrees clockwise
+writer.add_page(page)
+
+with open("rotated.pdf", "wb") as output:
+    writer.write(output)
+```
+
+### pdfplumber - Text and Table Extraction
+
+#### Extract Text with Layout
+```python
+import pdfplumber
+
+with pdfplumber.open("document.pdf") as pdf:
+    for page in pdf.pages:
+        text = page.extract_text()
+        print(text)
+```
+
+#### Extract Tables
+```python
+with pdfplumber.open("document.pdf") as pdf:
+    for i, page in enumerate(pdf.pages):
+        tables = page.extract_tables()
+        for j, table in enumerate(tables):
+            print(f"Table {j+1} on page {i+1}:")
+            for row in table:
+                print(row)
+```
+
+#### Advanced Table Extraction
+```python
+import pandas as pd
+
+with pdfplumber.open("document.pdf") as pdf:
+    all_tables = []
+    for page in pdf.pages:
+        tables = page.extract_tables()
+        for table in tables:
+            if table:  # Check if table is not empty
+                df = pd.DataFrame(table[1:], columns=table[0])
+                all_tables.append(df)
+
+# Combine all tables
+if all_tables:
+    combined_df = pd.concat(all_tables, ignore_index=True)
+    combined_df.to_excel("extracted_tables.xlsx", index=False)
+```
+
+### reportlab - Create PDFs
+
+#### Basic PDF Creation
+```python
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+c = canvas.Canvas("hello.pdf", pagesize=letter)
+width, height = letter
+
+# Add text
+c.drawString(100, height - 100, "Hello World!")
+c.drawString(100, height - 120, "This is a PDF created with reportlab")
+
+# Add a line
+c.line(100, height - 140, 400, height - 140)
+
+# Save
+c.save()
+```
+
+#### Create PDF with Multiple Pages
+```python
+from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
+from reportlab.lib.styles import getSampleStyleSheet
+
+doc = SimpleDocTemplate("report.pdf", pagesize=letter)
+styles = getSampleStyleSheet()
+story = []
+
+# Add content
+title = Paragraph("Report Title", styles['Title'])
+story.append(title)
+story.append(Spacer(1, 12))
+
+body = Paragraph("This is the body of the report. " * 20, styles['Normal'])
+story.append(body)
+story.append(PageBreak())
+
+# Page 2
+story.append(Paragraph("Page 2", styles['Heading1']))
+story.append(Paragraph("Content for page 2", styles['Normal']))
+
+# Build PDF
+doc.build(story)
+```
+
+#### Subscripts and Superscripts
+
+**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
+
+Instead, use ReportLab's XML markup tags in Paragraph objects:
+```python
+from reportlab.platypus import Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
+
+styles = getSampleStyleSheet()
+
+# Subscripts: use <sub> tag
+chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
+
+# Superscripts: use <super> tag
+squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
+```
+
+For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
+
+## Command-Line Tools
+
+### pdftotext (poppler-utils)
+```bash
+# Extract text
+pdftotext input.pdf output.txt
+
+# Extract text preserving layout
+pdftotext -layout input.pdf output.txt
+
+# Extract specific pages
+pdftotext -f 1 -l 5 input.pdf output.txt  # Pages 1-5
+```
+
+### qpdf
+```bash
+# Merge PDFs
+qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
+
+# Split pages
+qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
+qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
+
+# Rotate pages
+qpdf input.pdf output.pdf --rotate=+90:1  # Rotate page 1 by 90 degrees
+
+# Remove password
+qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
+```
+
+### pdftk (if available)
+```bash
+# Merge
+pdftk file1.pdf file2.pdf cat output merged.pdf
+
+# Split
+pdftk input.pdf burst
+
+# Rotate
+pdftk input.pdf rotate 1east output rotated.pdf
+```
+
+## Common Tasks
+
+### Extract Text from Scanned PDFs
+```python
+# Requires: pip install pytesseract pdf2image
+import pytesseract
+from pdf2image import convert_from_path
+
+# Convert PDF to images
+images = convert_from_path('scanned.pdf')
+
+# OCR each page
+text = ""
+for i, image in enumerate(images):
+    text += f"Page {i+1}:\n"
+    text += pytesseract.image_to_string(image)
+    text += "\n\n"
+
+print(text)
+```
+
+### Add Watermark
+```python
+from pypdf import PdfReader, PdfWriter
+
+# Create watermark (or load existing)
+watermark = PdfReader("watermark.pdf").pages[0]
+
+# Apply to all pages
+reader = PdfReader("document.pdf")
+writer = PdfWriter()
+
+for page in reader.pages:
+    page.merge_page(watermark)
+    writer.add_page(page)
+
+with open("watermarked.pdf", "wb") as output:
+    writer.write(output)
+```
+
+### Extract Images
+```bash
+# Using pdfimages (poppler-utils)
+pdfimages -j input.pdf output_prefix
+
+# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc.
+```
+
+### Password Protection
+```python
+from pypdf import PdfReader, PdfWriter
+
+reader = PdfReader("input.pdf")
+writer = PdfWriter()
+
+for page in reader.pages:
+    writer.add_page(page)
+
+# Add password
+writer.encrypt("userpassword", "ownerpassword")
+
+with open("encrypted.pdf", "wb") as output:
+    writer.write(output)
+```
+
+## Quick Reference
+
+| Task | Best Tool | Command/Code |
+|------|-----------|--------------|
+| Merge PDFs | pypdf | `writer.add_page(page)` |
+| Split PDFs | pypdf | One page per file |
+| Extract text | pdfplumber | `page.extract_text()` |
+| Extract tables | pdfplumber | `page.extract_tables()` |
+| Create PDFs | reportlab | Canvas or Platypus |
+| Command line merge | qpdf | `qpdf --empty --pages ...` |
+| OCR scanned PDFs | pytesseract | Convert to image first |
+| Fill PDF forms | pdf-lib or pypdf (see forms.md) | See forms.md |
+
+## Next Steps
+
+- For advanced pypdfium2 usage, see reference.md
+- For JavaScript libraries (pdf-lib), see reference.md
+- If you need to fill out a PDF form, follow the instructions in forms.md
+- For troubleshooting guides, see reference.md
diff --git a/forms.md b/forms.md
new file mode 100644
index 0000000..6e7e1e0
--- /dev/null
+++ b/forms.md
@@ -0,0 +1,294 @@
+**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.**
+
+If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory:
+ `python scripts/check_fillable_fields.py <file.pdf>`, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" section below and follow its instructions.
+
+# Fillable fields
+If the PDF has fillable form fields:
+- Run this script from this file's directory: `python scripts/extract_form_field_info.py <input.pdf> <field_info.json>`. It will create a JSON file with a list of fields in this format:
+```
+[
+  {
+    "field_id": (unique ID for the field),
+    "page": (page number, 1-based),
+    "rect": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is the bottom of the page),
+    "type": ("text", "checkbox", "radio_group", or "choice"),
+  },
+  // Checkboxes have "checked_value" and "unchecked_value" properties:
+  {
+    "field_id": (unique ID for the field),
+    "page": (page number, 1-based),
+    "type": "checkbox",
+    "checked_value": (Set the field to this value to check the checkbox),
+    "unchecked_value": (Set the field to this value to uncheck the checkbox),
+  },
+  // Radio groups have a "radio_options" list with the possible choices.
+  {
+    "field_id": (unique ID for the field),
+    "page": (page number, 1-based),
+    "type": "radio_group",
+    "radio_options": [
+      {
+        "value": (set the field to this value to select this radio option),
+        "rect": (bounding box for the radio button for this option)
+      },
+      // Other radio options
+    ]
+  },
+  // Multiple choice fields have a "choice_options" list with the possible choices:
+  {
+    "field_id": (unique ID for the field),
+    "page": (page number, 1-based),
+    "type": "choice",
+    "choice_options": [
+      {
+        "value": (set the field to this value to select this option),
+        "text": (display text of the option)
+      },
+      // Other choice options
+    ],
+  }
+]
+```
+- Convert the PDF to PNGs (one image for each page) with this script (run from this file's directory):
+`python scripts/convert_pdf_to_images.py <file.pdf> <output_directory>`
+Then analyze the images to determine the purpose of each form field (make sure to convert the bounding box PDF coordinates to image coordinates).
+- Create a `field_values.json` file in this format with the values to be entered for each field:
+```
+[
+  {
+    "field_id": "last_name", // Must match the field_id from `extract_form_field_info.py`
+    "description": "The user's last name",
+    "page": 1, // Must match the "page" value in field_info.json
+    "value": "Simpson"
+  },
+  {
+    "field_id": "Checkbox12",
+    "description": "Checkbox to be checked if the user is 18 or over",
+    "page": 1,
+    "value": "/On" // If this is a checkbox, use its "checked_value" value to check it. If it's a radio button group, use one of the "value" values in "radio_options".
+  },
+  // more fields
+]
+```
+- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF:
+`python scripts/fill_fillable_fields.py <input pdf> <field_values.json> <output pdf>`
+This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again.
+
+# Non-fillable fields
+If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed.
+
+## Step 1: Try Structure Extraction First
+
+Run this script to extract text labels, lines, and checkboxes with their exact PDF coordinates:
+`python scripts/extract_form_structure.py <input.pdf> form_structure.json`
+
+This creates a JSON file containing:
+- **labels**: Every text element with exact coordinates (x0, top, x1, bottom in PDF points)
+- **lines**: Horizontal lines that define row boundaries
+- **checkboxes**: Small square rectangles that are checkboxes (with center coordinates)
+- **row_boundaries**: Row top/bottom positions calculated from horizontal lines
+
+**Check the results**: If `form_structure.json` has meaningful labels (text elements that correspond to form fields), use **Approach A: Structure-Based Coordinates**. If the PDF is scanned/image-based and has few or no labels, use **Approach B: Visual Estimation**.
+
+---
+
+## Approach A: Structure-Based Coordinates (Preferred)
+
+Use this when `extract_form_structure.py` found text labels in the PDF.
+
+### A.1: Analyze the Structure
+
+Read form_structure.json and identify:
+
+1. **Label groups**: Adjacent text elements that form a single label (e.g., "Last" + "Name")
+2. **Row structure**: Labels with similar `top` values are in the same row
+3. **Field columns**: Entry areas start after label ends (x0 = label.x1 + gap)
+4. **Checkboxes**: Use the checkbox coordinates directly from the structure
+
+**Coordinate system**: PDF coordinates where y=0 is at TOP of page, y increases downward.
+
+### A.2: Check for Missing Elements
+
+The structure extraction may not detect all form elements. Common cases:
+- **Circular checkboxes**: Only square rectangles are detected as checkboxes
+- **Complex graphics**: Decorative elements or non-standard form controls
+- **Faded or light-colored elements**: May not be extracted
+
+If you see form fields in the PDF images that aren't in form_structure.json, you'll need to use **visual analysis** for those specific fields (see "Hybrid Approach" below).
+
+### A.3: Create fields.json with PDF Coordinates
+
+For each field, calculate entry coordinates from the extracted structure:
+
+**Text fields:**
+- entry x0 = label x1 + 5 (small gap after label)
+- entry x1 = next label's x0, or row boundary
+- entry top = same as label top
+- entry bottom = row boundary line below, or label bottom + row_height
+
+**Checkboxes:**
+- Use the checkbox rectangle coordinates directly from form_structure.json
+- entry_bounding_box = [checkbox.x0, checkbox.top, checkbox.x1, checkbox.bottom]
+
+Create fields.json using `pdf_width` and `pdf_height` (signals PDF coordinates):
+```json
+{
+  "pages": [
+    {"page_number": 1, "pdf_width": 612, "pdf_height": 792}
+  ],
+  "form_fields": [
+    {
+      "page_number": 1,
+      "description": "Last name entry field",
+      "field_label": "Last Name",
+      "label_bounding_box": [43, 63, 87, 73],
+      "entry_bounding_box": [92, 63, 260, 79],
+      "entry_text": {"text": "Smith", "font_size": 10}
+    },
+    {
+      "page_number": 1,
+      "description": "US Citizen Yes checkbox",
+      "field_label": "Yes",
+      "label_bounding_box": [260, 200, 280, 210],
+      "entry_bounding_box": [285, 197, 292, 205],
+      "entry_text": {"text": "X"}
+    }
+  ]
+}
+```
+
+**Important**: Use `pdf_width`/`pdf_height` and coordinates directly from form_structure.json.
+
+### A.4: Validate Bounding Boxes
+
+Before filling, check your bounding boxes for errors:
+`python scripts/check_bounding_boxes.py fields.json`
+
+This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling.
+
+---
+
+## Approach B: Visual Estimation (Fallback)
+
+Use this when the PDF is scanned/image-based and structure extraction found no usable text labels (e.g., all text shows as "(cid:X)" patterns).
+
+### B.1: Convert PDF to Images
+
+`python scripts/convert_pdf_to_images.py <input.pdf> <images_dir/>`
+
+### B.2: Initial Field Identification
+
+Examine each page image to identify form sections and get **rough estimates** of field locations:
+- Form field labels and their approximate positions
+- Entry areas (lines, boxes, or blank spaces for text input)
+- Checkboxes and their approximate locations
+
+For each field, note approximate pixel coordinates (they don't need to be precise yet).
+
+### B.3: Zoom Refinement (CRITICAL for accuracy)
+
+For each field, crop a region around the estimated position to refine coordinates precisely.
+
+**Create a zoomed crop using ImageMagick:**
+```bash
+magick <page_image> -crop <width>x<height>+<x>+<y> +repage <crop_output.png>
+```
+
+Where:
+- `<x>, <y>` = top-left corner of crop region (use your rough estimate minus padding)
+- `<width>, <height>` = size of crop region (field area plus ~50px padding on each side)
+
+**Example:** To refine a "Name" field estimated around (100, 150):
+```bash
+magick images_dir/page_1.png -crop 300x80+50+120 +repage crops/name_field.png
+```
+
+(Note: if the `magick` command isn't available, try `convert` with the same arguments).
+
+**Examine the cropped image** to determine precise coordinates:
+1. Identify the exact pixel where the entry area begins (after the label)
+2. Identify where the entry area ends (before next field or edge)
+3. Identify the top and bottom of the entry line/box
+
+**Convert crop coordinates back to full image coordinates:**
+- full_x = crop_x + crop_offset_x
+- full_y = crop_y + crop_offset_y
+
+Example: If the crop started at (50, 120) and the entry box starts at (52, 18) within the crop:
+- entry_x0 = 52 + 50 = 102
+- entry_top = 18 + 120 = 138
+
+**Repeat for each field**, grouping nearby fields into single crops when possible.
+
+### B.4: Create fields.json with Refined Coordinates
+
+Create fields.json using `image_width` and `image_height` (signals image coordinates):
+```json
+{
+  "pages": [
+    {"page_number": 1, "image_width": 1700, "image_height": 2200}
+  ],
+  "form_fields": [
+    {
+      "page_number": 1,
+      "description": "Last name entry field",
+      "field_label": "Last Name",
+      "label_bounding_box": [120, 175, 242, 198],
+      "entry_bounding_box": [255, 175, 720, 218],
+      "entry_text": {"text": "Smith", "font_size": 10}
+    }
+  ]
+}
+```
+
+**Important**: Use `image_width`/`image_height` and the refined pixel coordinates from the zoom analysis.
+
+### B.5: Validate Bounding Boxes
+
+Before filling, check your bounding boxes for errors:
+`python scripts/check_bounding_boxes.py fields.json`
+
+This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling.
+
+---
+
+## Hybrid Approach: Structure + Visual
+
+Use this when structure extraction works for most fields but misses some elements (e.g., circular checkboxes, unusual form controls).
+
+1. **Use Approach A** for fields that were detected in form_structure.json
+2. **Convert PDF to images** for visual analysis of missing fields
+3. **Use zoom refinement** (from Approach B) for the missing fields
+4. **Combine coordinates**: For fields from structure extraction, use `pdf_width`/`pdf_height`. For visually-estimated fields, you must convert image coordinates to PDF coordinates:
+   - pdf_x = image_x * (pdf_width / image_width)
+   - pdf_y = image_y * (pdf_height / image_height)
+5. **Use a single coordinate system** in fields.json - convert all to PDF coordinates with `pdf_width`/`pdf_height`
+
+---
+
+## Step 2: Validate Before Filling
+
+**Always validate bounding boxes before filling:**
+`python scripts/check_bounding_boxes.py fields.json`
+
+This checks for:
+- Intersecting bounding boxes (which would cause overlapping text)
+- Entry boxes that are too small for the specified font size
+
+Fix any reported errors in fields.json before proceeding.
+
+## Step 3: Fill the Form
+
+The fill script auto-detects the coordinate system and handles conversion:
+`python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
+
+## Step 4: Verify Output
+
+Convert the filled PDF to images and verify text placement:
+`python scripts/convert_pdf_to_images.py <output.pdf> <verify_images/>`
+
+If text is mispositioned:
+- **Approach A**: Check that you're using PDF coordinates from form_structure.json with `pdf_width`/`pdf_height`
+- **Approach B**: Check that image dimensions match and coordinates are accurate pixels
+- **Hybrid**: Ensure coordinate conversions are correct for visually-estimated fields
diff --git a/reference.md b/reference.md
new file mode 100644
index 0000000..41400bf
--- /dev/null
+++ b/reference.md
@@ -0,0 +1,612 @@
+# PDF Processing Advanced Reference
+
+This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions.
+
+## pypdfium2 Library (Apache/BSD License)
+
+### Overview
+pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It's excellent for fast PDF rendering and image generation, and it can serve as a PyMuPDF replacement.
+
+### Render PDF to Images
+```python
+import pypdfium2 as pdfium
+from PIL import Image
+
+# Load PDF
+pdf = pdfium.PdfDocument("document.pdf")
+
+# Render page to image
+page = pdf[0]  # First page
+bitmap = page.render(
+    scale=2.0,  # Higher resolution
+    rotation=0  # No rotation
+)
+
+# Convert to PIL Image
+img = bitmap.to_pil()
+img.save("page_1.png", "PNG")
+
+# Process multiple pages
+for i, page in enumerate(pdf):
+    bitmap = page.render(scale=1.5)
+    img = bitmap.to_pil()
+    img.save(f"page_{i+1}.jpg", "JPEG", quality=90)
+```
+
+### Extract Text with pypdfium2
+```python
+import pypdfium2 as pdfium
+
+pdf = pdfium.PdfDocument("document.pdf")
+for i, page in enumerate(pdf):
+    text = page.get_textpage().get_text_range()
+    print(f"Page {i+1} text length: {len(text)} chars")
+```
+
+## JavaScript Libraries
+
+### pdf-lib (MIT License)
+
+pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment.
+
+#### Load and Manipulate Existing PDF
+```javascript
+import { PDFDocument } from 'pdf-lib';
+import fs from 'fs';
+
+async function manipulatePDF() {
+    // Load existing PDF
+    const existingPdfBytes = fs.readFileSync('input.pdf');
+    const pdfDoc = await PDFDocument.load(existingPdfBytes);
+
+    // Get page count
+    const pageCount = pdfDoc.getPageCount();
+    console.log(`Document has ${pageCount} pages`);
+
+    // Add new page
+    const newPage = pdfDoc.addPage([600, 400]);
+    newPage.drawText('Added by pdf-lib', {
+        x: 100,
+        y: 300,
+        size: 16
+    });
+
+    // Save modified PDF
+    const pdfBytes = await pdfDoc.save();
+    fs.writeFileSync('modified.pdf', pdfBytes);
+}
+```
+
+#### Create Complex PDFs from Scratch
+```javascript
+import { PDFDocument, rgb, StandardFonts } from 'pdf-lib';
+import fs from 'fs';
+
+async function createPDF() {
+    const pdfDoc = await PDFDocument.create();
+
+    // Add fonts
+    const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica);
+    const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold);
+
+    // Add page
+    const page = pdfDoc.addPage([595, 842]); // A4 size
+    const { width, height } = page.getSize();
+
+    // Add text with styling
+    page.drawText('Invoice #12345', {
+        x: 50,
+        y: height - 50,
+        size: 18,
+        font: helveticaBold,
+        color: rgb(0.2, 0.2, 0.8)
+    });
+
+    // Add rectangle (header background)
+    page.drawRectangle({
+        x: 40,
+        y: height - 100,
+        width: width - 80,
+        height: 30,
+        color: rgb(0.9, 0.9, 0.9)
+    });
+
+    // Add table-like content
+    const items = [
+        ['Item', 'Qty', 'Price', 'Total'],
+        ['Widget', '2', '$50', '$100'],
+        ['Gadget', '1', '$75', '$75']
+    ];
+
+    let yPos = height - 150;
+    items.forEach(row => {
+        let xPos = 50;
+        row.forEach(cell => {
+            page.drawText(cell, {
+                x: xPos,
+                y: yPos,
+                size: 12,
+                font: helveticaFont
+            });
+            xPos += 120;
+        });
+        yPos -= 25;
+    });
+
+    const pdfBytes = await pdfDoc.save();
+    fs.writeFileSync('created.pdf', pdfBytes);
+}
+```
+
+#### Advanced Merge and Split Operations
+```javascript
+import { PDFDocument } from 'pdf-lib';
+import fs from 'fs';
+
+async function mergePDFs() {
+    // Create new document
+    const mergedPdf = await PDFDocument.create();
+
+    // Load source PDFs
+    const pdf1Bytes = fs.readFileSync('doc1.pdf');
+    const pdf2Bytes = fs.readFileSync('doc2.pdf');
+
+    const pdf1 = await PDFDocument.load(pdf1Bytes);
+    const pdf2 = await PDFDocument.load(pdf2Bytes);
+
+    // Copy pages from first PDF
+    const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices());
+    pdf1Pages.forEach(page => mergedPdf.addPage(page));
+
+    // Copy specific pages from second PDF (pages 0, 2, 4)
+    const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]);
+    pdf2Pages.forEach(page => mergedPdf.addPage(page));
+
+    const mergedPdfBytes = await mergedPdf.save();
+    fs.writeFileSync('merged.pdf', mergedPdfBytes);
+}
+```
+
+### pdfjs-dist (Apache License)
+
+PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser.
+
+#### Basic PDF Loading and Rendering
+```javascript
+import * as pdfjsLib from 'pdfjs-dist';
+
+// Configure worker (important for performance)
+pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js';
+
+async function renderPDF() {
+    // Load PDF
+    const loadingTask = pdfjsLib.getDocument('document.pdf');
+    const pdf = await loadingTask.promise;
+
+    console.log(`Loaded PDF with ${pdf.numPages} pages`);
+
+    // Get first page
+    const page = await pdf.getPage(1);
+    const viewport = page.getViewport({ scale: 1.5 });
+
+    // Render to canvas
+    const canvas = document.createElement('canvas');
+    const context = canvas.getContext('2d');
+    canvas.height = viewport.height;
+    canvas.width = viewport.width;
+
+    const renderContext = {
+        canvasContext: context,
+        viewport: viewport
+    };
+
+    await page.render(renderContext).promise;
+    document.body.appendChild(canvas);
+}
+```
+
+#### Extract Text with Coordinates
+```javascript
+import * as pdfjsLib from 'pdfjs-dist';
+
+async function extractText() {
+    const loadingTask = pdfjsLib.getDocument('document.pdf');
+    const pdf = await loadingTask.promise;
+
+    let fullText = '';
+
+    // Extract text from all pages
+    for (let i = 1; i <= pdf.numPages; i++) {
+        const page = await pdf.getPage(i);
+        const textContent = await page.getTextContent();
+
+        const pageText = textContent.items
+            .map(item => item.str)
+            .join(' ');
+
+        fullText += `\n--- Page ${i} ---\n${pageText}`;
+
+        // Get text with coordinates for advanced processing
+        const textWithCoords = textContent.items.map(item => ({
+            text: item.str,
+            x: item.transform[4],
+            y: item.transform[5],
+            width: item.width,
+            height: item.height
+        }));
+    }
+
+    console.log(fullText);
+    return fullText;
+}
+```
+
+#### Extract Annotations and Forms
+```javascript
+import * as pdfjsLib from 'pdfjs-dist';
+
+async function extractAnnotations() {
+    const loadingTask = pdfjsLib.getDocument('annotated.pdf');
+    const pdf = await loadingTask.promise;
+
+    for (let i = 1; i <= pdf.numPages; i++) {
+        const page = await pdf.getPage(i);
+        const annotations = await page.getAnnotations();
+
+        annotations.forEach(annotation => {
+            console.log(`Annotation type: ${annotation.subtype}`);
+            console.log(`Content: ${annotation.contents}`);
+            console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`);
+        });
+    }
+}
+```
+
+## Advanced Command-Line Operations
+
+### poppler-utils Advanced Features
+
+#### Extract Text with Bounding Box Coordinates
+```bash
+# Extract text with bounding box coordinates (essential for structured data)
+pdftotext -bbox-layout document.pdf output.xml
+
+# The XML output contains precise coordinates for each text element
+```
+
+#### Advanced Image Conversion
+```bash
+# Convert to PNG images with specific resolution
+pdftoppm -png -r 300 document.pdf output_prefix
+
+# Convert specific page range with high resolution
+pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages
+
+# Convert to JPEG with quality setting
+pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output
+```
+
+#### Extract Embedded Images
+```bash
+# Extract embedded images (JPEG data is saved as .jpg; -p adds the page number to each file name)
+pdfimages -j -p document.pdf page_images
+
+# List image info without extracting
+pdfimages -list document.pdf
+
+# Extract images in their original format
+pdfimages -all document.pdf images/img
+```
+
+### qpdf Advanced Features
+
+#### Complex Page Manipulation
+```bash
+# Split PDF into groups of 3 pages (%d is replaced by the zero-padded page range)
+qpdf --split-pages=3 input.pdf output_group_%d.pdf
+
+# Extract specific pages with complex ranges
+qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf
+
+# Merge specific pages from multiple PDFs
+qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf
+```
+
+#### PDF Optimization and Repair
+```bash
+# Optimize PDF for web (linearize for streaming)
+qpdf --linearize input.pdf optimized.pdf
+
+# Compress: pack objects into object streams and recompress existing streams
+qpdf --object-streams=generate --recompress-flate --compression-level=9 input.pdf compressed.pdf
+
+# Check for damage, then rewrite the file (qpdf attempts recovery automatically while reading)
+qpdf --check damaged.pdf
+qpdf damaged.pdf repaired.pdf
+
+# Show the page objects and their content streams for debugging
+qpdf --show-pages input.pdf > structure.txt
+```
+
+#### Advanced Encryption
+```bash
+# Add password protection with specific permissions
+qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf
+
+# Check encryption status
+qpdf --show-encryption encrypted.pdf
+
+# Remove password protection (requires password)
+qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf
+```
+
+## Advanced Python Techniques
+
+### pdfplumber Advanced Features
+
+#### Extract Text with Precise Coordinates
+```python
+import pdfplumber
+
+with pdfplumber.open("document.pdf") as pdf:
+    page = pdf.pages[0]
+    
+    # Extract all text with coordinates
+    chars = page.chars
+    for char in chars[:10]:  # First 10 characters
+        print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}")
+    
+    # Extract text by bounding box (left, top, right, bottom)
+    bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text()
+```
+
+#### Advanced Table Extraction with Custom Settings
+```python
+import pdfplumber
+import pandas as pd
+
+with pdfplumber.open("complex_table.pdf") as pdf:
+    page = pdf.pages[0]
+    
+    # Extract tables with custom settings for complex layouts
+    table_settings = {
+        "vertical_strategy": "lines",
+        "horizontal_strategy": "lines",
+        "snap_tolerance": 3,
+        "intersection_tolerance": 15
+    }
+    tables = page.extract_tables(table_settings)
+    
+    # Visual debugging for table extraction
+    img = page.to_image(resolution=150)
+    img.save("debug_layout.png")
+```
+
+### reportlab Advanced Features
+
+#### Create Professional Reports with Tables
+```python
+from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib import colors
+
+# Sample data
+data = [
+    ['Product', 'Q1', 'Q2', 'Q3', 'Q4'],
+    ['Widgets', '120', '135', '142', '158'],
+    ['Gadgets', '85', '92', '98', '105']
+]
+
+# Create PDF with table
+doc = SimpleDocTemplate("report.pdf")
+elements = []
+
+# Add title
+styles = getSampleStyleSheet()
+title = Paragraph("Quarterly Sales Report", styles['Title'])
+elements.append(title)
+
+# Add table with advanced styling
+table = Table(data)
+table.setStyle(TableStyle([
+    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+    ('FONTSIZE', (0, 0), (-1, 0), 14),
+    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
+    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+    ('GRID', (0, 0), (-1, -1), 1, colors.black)
+]))
+elements.append(table)
+
+doc.build(elements)
+```
+
+## Complex Workflows
+
+### Extract Figures/Images from PDF
+
+#### Method 1: Using pdfimages (fastest)
+```bash
+# Extract all images with original quality
+pdfimages -all document.pdf images/img
+```
+
+#### Method 2: Using pypdfium2 + Image Processing
+```python
+import pypdfium2 as pdfium
+from PIL import Image
+import numpy as np
+
+def extract_figures(pdf_path, output_dir):
+    pdf = pdfium.PdfDocument(pdf_path)
+    
+    for page_num, page in enumerate(pdf):
+        # Render high-resolution page
+        bitmap = page.render(scale=3.0)
+        img = bitmap.to_pil()
+        
+        # Convert to numpy for processing
+        img_array = np.array(img)
+        
+        # Simple figure detection (non-white regions)
+        mask = np.any(img_array != [255, 255, 255], axis=2)
+        
+        # Find contours and extract bounding boxes
+        # (This is simplified - real implementation would need more sophisticated detection)
+        
+        # Save detected figures
+        # ... implementation depends on specific needs
+```
+
+### Batch PDF Processing with Error Handling
+```python
+import os
+import glob
+from pypdf import PdfReader, PdfWriter
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def batch_process_pdfs(input_dir, operation='merge'):
+    pdf_files = glob.glob(os.path.join(input_dir, "*.pdf"))
+    
+    if operation == 'merge':
+        writer = PdfWriter()
+        for pdf_file in pdf_files:
+            try:
+                reader = PdfReader(pdf_file)
+                for page in reader.pages:
+                    writer.add_page(page)
+                logger.info(f"Processed: {pdf_file}")
+            except Exception as e:
+                logger.error(f"Failed to process {pdf_file}: {e}")
+                continue
+        
+        with open("batch_merged.pdf", "wb") as output:
+            writer.write(output)
+    
+    elif operation == 'extract_text':
+        for pdf_file in pdf_files:
+            try:
+                reader = PdfReader(pdf_file)
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text()
+                
+                output_file = pdf_file.replace('.pdf', '.txt')
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    f.write(text)
+                logger.info(f"Extracted text from: {pdf_file}")
+                
+            except Exception as e:
+                logger.error(f"Failed to extract text from {pdf_file}: {e}")
+                continue
+```
+
+### Advanced PDF Cropping
+```python
+from pypdf import PdfWriter, PdfReader
+
+reader = PdfReader("input.pdf")
+writer = PdfWriter()
+
+# Crop page (left, bottom, right, top in points)
+page = reader.pages[0]
+page.mediabox.left = 50
+page.mediabox.bottom = 50
+page.mediabox.right = 550
+page.mediabox.top = 750
+
+writer.add_page(page)
+with open("cropped.pdf", "wb") as output:
+    writer.write(output)
+```
+
+## Performance Optimization Tips
+
+### 1. For Large PDFs
+- Use streaming approaches instead of loading entire PDF in memory
+- Use `qpdf --split-pages` for splitting large files
+- Process pages individually with pypdfium2
+
+### 2. For Text Extraction
+- `pdftotext` (add `-layout` to keep the visual layout) is fastest for plain text extraction; `-bbox-layout` emits coordinate markup instead
+- Use pdfplumber for structured data and tables
+- Avoid pypdf's `page.extract_text()` for very large documents
+
+### 3. For Image Extraction
+- `pdfimages` is much faster than rendering pages
+- Use low resolution for previews, high resolution for final output
+
+### 4. For Form Filling
+- pdf-lib maintains form structure better than most alternatives
+- Pre-validate form fields before processing
+
+### 5. Memory Management
+```python
+# Process PDFs in chunks
+def process_large_pdf(pdf_path, chunk_size=10):
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+    
+    for start_idx in range(0, total_pages, chunk_size):
+        end_idx = min(start_idx + chunk_size, total_pages)
+        writer = PdfWriter()
+        
+        for i in range(start_idx, end_idx):
+            writer.add_page(reader.pages[i])
+        
+        # Process chunk
+        with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output:
+            writer.write(output)
+```
+
+## Troubleshooting Common Issues
+
+### Encrypted PDFs
+```python
+# Handle password-protected PDFs
+from pypdf import PdfReader
+
+try:
+    reader = PdfReader("encrypted.pdf")
+    if reader.is_encrypted:
+        reader.decrypt("password")
+except Exception as e:
+    print(f"Failed to decrypt: {e}")
+```
+
+### Corrupted PDFs
+```bash
+# Use qpdf to repair
+qpdf --check corrupted.pdf
+qpdf --replace-input corrupted.pdf
+```
+
+### Text Extraction Issues
+```python
+# Fallback to OCR for scanned PDFs
+import pytesseract
+from pdf2image import convert_from_path
+
+def extract_text_with_ocr(pdf_path):
+    images = convert_from_path(pdf_path)
+    text = ""
+    for i, image in enumerate(images):
+        text += pytesseract.image_to_string(image)
+    return text
+```
+
+## License Information
+
+- **pypdf**: BSD License
+- **pdfplumber**: MIT License
+- **pypdfium2**: Apache/BSD License
+- **reportlab**: BSD License
+- **poppler-utils**: GPL-2 License
+- **qpdf**: Apache License
+- **pdf-lib**: MIT License
+- **pdfjs-dist**: Apache License
\ No newline at end of file
diff --git a/scripts/check_bounding_boxes.py b/scripts/check_bounding_boxes.py
new file mode 100644
index 0000000..2cc5e34
--- /dev/null
+++ b/scripts/check_bounding_boxes.py
@@ -0,0 +1,65 @@
+from dataclasses import dataclass
+import json
+import sys
+
+
+
+
+@dataclass
+class RectAndField:  # One bounding box paired with the form-field dict it was read from.
+    rect: list[float]  # indexed [0]..[3]; [0]/[2] are compared as the x-extent, [1]/[3] as the y-extent
+    rect_type: str  # "label" or "entry": which of the field's two boxes this is
+    field: dict  # the owning entry of fields["form_fields"]
+
+
+def get_bounding_box_messages(fields_json_stream) -> list[str]:  # returns "Read N fields", then FAILURE lines or one SUCCESS line
+    messages = []
+    fields = json.load(fields_json_stream)
+    messages.append(f"Read {len(fields['form_fields'])} fields")
+    # Two rectangles overlap unless they are separated on at least one axis (touching edges count as separated).
+    def rects_intersect(r1, r2):
+        disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0]
+        disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1]
+        return not (disjoint_horizontal or disjoint_vertical)
+    # Flatten every field into two records (label box, entry box) so all boxes can be compared pairwise.
+    rects_and_fields = []
+    for f in fields["form_fields"]:
+        rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f))
+        rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f))
+    # Compare each box with every later box on the same page, then check entry-box height against font size.
+    has_error = False
+    for i, ri in enumerate(rects_and_fields):
+        for j in range(i + 1, len(rects_and_fields)):
+            rj = rects_and_fields[j]
+            if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
+                has_error = True
+                if ri.field is rj.field:  # same dict object: the field's own label and entry overlap
+                    messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})")
+                else:
+                    messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})")
+                if len(messages) >= 20:  # cap the output; the count includes the initial "Read ..." line
+                    messages.append("Aborting further checks; fix bounding boxes and try again")
+                    return messages
+        if ri.rect_type == "entry":
+            if "entry_text" in ri.field:
+                font_size = ri.field["entry_text"].get("font_size", 14)  # 14 is assumed when no font_size is given
+                entry_height = ri.rect[3] - ri.rect[1]
+                if entry_height < font_size:
+                    has_error = True
+                    messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.")
+                    if len(messages) >= 20:
+                        messages.append("Aborting further checks; fix bounding boxes and try again")
+                        return messages
+
+    if not has_error:
+        messages.append("SUCCESS: All bounding boxes are valid")
+    return messages
+
+if __name__ == "__main__":  # CLI: validate the fields JSON named by the single argument and print each message
+    if len(sys.argv) != 2:
+        print("Usage: check_bounding_boxes.py [fields.json]")
+        sys.exit(1)
+    with open(sys.argv[1]) as f:
+        messages = get_bounding_box_messages(f)
+    for msg in messages:
+        print(msg)
diff --git a/scripts/check_fillable_fields.py b/scripts/check_fillable_fields.py
new file mode 100644
index 0000000..36dfb95
--- /dev/null
+++ b/scripts/check_fillable_fields.py
@@ -0,0 +1,11 @@
+import sys
+from pypdf import PdfReader
+
+# Report whether the PDF named on the command line has fillable form fields.
+if len(sys.argv) != 2:
+    sys.exit("Usage: check_fillable_fields.py [input pdf]")  # message goes to stderr, exit status 1
+reader = PdfReader(sys.argv[1])
+if reader.get_fields():
+    print("This PDF has fillable form fields")
+else:
+    print("This PDF does not have fillable form fields; you will need to visually determine where to enter data")
diff --git a/scripts/convert_pdf_to_images.py b/scripts/convert_pdf_to_images.py
new file mode 100644
index 0000000..7939cef
--- /dev/null
+++ b/scripts/convert_pdf_to_images.py
@@ -0,0 +1,33 @@
+import os
+import sys
+
+from pdf2image import convert_from_path
+
+
+
+
+def convert(pdf_path, output_dir, max_dim=1000):
+    """Render every page of pdf_path at 200 dpi and save it as page_<n>.png in output_dir, scaled down so neither side exceeds max_dim."""
+    images = convert_from_path(pdf_path, dpi=200)
+    # Create the output directory if needed so image.save() does not fail on a missing path.
+    os.makedirs(output_dir or ".", exist_ok=True)
+    for i, image in enumerate(images):
+        width, height = image.size
+        if width > max_dim or height > max_dim:
+            scale_factor = min(max_dim / width, max_dim / height)
+            new_size = (int(width * scale_factor), int(height * scale_factor))
+            image = image.resize(new_size)
+        image_path = os.path.join(output_dir, f"page_{i+1}.png")
+        image.save(image_path)
+        print(f"Saved page {i+1} as {image_path} (size: {image.size})")
+
+    print(f"Converted {len(images)} pages to PNG images")
+
+
+if __name__ == "__main__":  # CLI entry: expects exactly an input PDF path and an output directory
+    if len(sys.argv) != 3:
+        print("Usage: convert_pdf_to_images.py [input pdf] [output directory]")
+        sys.exit(1)
+    pdf_path = sys.argv[1]
+    output_directory = sys.argv[2]
+    convert(pdf_path, output_directory)
diff --git a/scripts/create_validation_image.py b/scripts/create_validation_image.py
new file mode 100644
index 0000000..10eadd8
--- /dev/null
+++ b/scripts/create_validation_image.py
@@ -0,0 +1,37 @@
+import json
+import sys
+
+from PIL import Image, ImageDraw
+
+
+
+
+def create_validation_image(page_number, fields_json_path, input_path, output_path):  # draws one page's field boxes onto the image and saves it
+    with open(fields_json_path, 'r') as f:
+        data = json.load(f)
+        # Everything below still runs inside the `with` block, although only json.load needs the open file.
+        img = Image.open(input_path)
+        draw = ImageDraw.Draw(img)
+        num_boxes = 0
+        # Only fields whose "page_number" equals page_number are drawn: entry boxes in red, label boxes in blue.
+        for field in data["form_fields"]:
+            if field["page_number"] == page_number:
+                entry_box = field['entry_bounding_box']
+                label_box = field['label_bounding_box']
+                draw.rectangle(entry_box, outline='red', width=2)
+                draw.rectangle(label_box, outline='blue', width=2)
+                num_boxes += 2  # two rectangles per matching field
+
+        img.save(output_path)
+        print(f"Created validation image at {output_path} with {num_boxes} bounding boxes")
+
+
+if __name__ == "__main__":  # CLI entry: page number, fields JSON, input image, output image
+    if len(sys.argv) != 5:
+        print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]")
+        sys.exit(1)
+    page_number = int(sys.argv[1])  # a non-integer argument raises ValueError here
+    fields_json_path = sys.argv[2]
+    input_image_path = sys.argv[3]
+    output_image_path = sys.argv[4]
+    create_validation_image(page_number, fields_json_path, input_image_path, output_image_path)
diff --git a/scripts/extract_form_field_info.py b/scripts/extract_form_field_info.py
new file mode 100644
index 0000000..64cd470
--- /dev/null
+++ b/scripts/extract_form_field_info.py
@@ -0,0 +1,122 @@
+import json
+import sys
+
+from pypdf import PdfReader
+
+
+
+
+def get_full_annotation_field_id(annotation):
+    """Join the '/T' names found along the '/Parent' chain with '.', outermost first; None if there are none."""
+    names = []
+    while annotation:
+        part = annotation.get('/T')
+        names += [part] if part else []
+        annotation = annotation.get('/Parent')
+    return ".".join(names[::-1]) if names else None
+
+
+def make_field_dict(field, field_id):
+    """Summarize one form field as a dict with "field_id", "type" and type-specific keys
+    ("checked_value"/"unchecked_value" for two-state buttons, "choice_options" for choice fields)."""
+    field_dict = {"field_id": field_id}
+    ft = field.get('/FT')
+    if ft == "/Tx":
+        field_dict["type"] = "text"
+    elif ft == "/Btn":
+        field_dict["type"] = "checkbox"
+        states = field.get("/_States_", [])
+        if len(states) == 2:
+            if "/Off" in states:
+                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
+                field_dict["unchecked_value"] = "/Off"
+            else:
+                # Fixed: the message used JS-style `${field_id}`, which printed a stray "$" before the id.
+                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
+                field_dict["checked_value"] = states[0]
+                field_dict["unchecked_value"] = states[1]
+    elif ft == "/Ch":
+        field_dict["type"] = "choice"
+        states = field.get("/_States_", [])
+        field_dict["choice_options"] = [{"value": state[0], "text": state[1]} for state in states]
+    else:
+        field_dict["type"] = f"unknown ({ft})"
+    return field_dict
+
+
+def get_field_info(reader: PdfReader):
+    """Return one dict per located form field, sorted by page, then descending rect[1], then rect[0].
+    Fields that no page annotation refers to are reported on stdout and left out of the result.
+    """
+    fields = reader.get_fields() or {}  # get_fields() returns None when the PDF has no form data
+    field_info_by_id = {}
+    possible_radio_names = set()
+
+    for field_id, field in fields.items():
+        if field.get("/Kids"):
+            if field.get("/FT") == "/Btn":
+                possible_radio_names.add(field_id)
+            continue
+        field_info_by_id[field_id] = make_field_dict(field, field_id)
+
+    # Locate fields on pages: direct matches get "page"/"rect"; annotations under a /Btn parent become radio options.
+    radio_fields_by_id = {}
+    for page_index, page in enumerate(reader.pages):
+        annotations = page.get('/Annots', [])
+        for ann in annotations:
+            field_id = get_full_annotation_field_id(ann)
+            if field_id in field_info_by_id:
+                field_info_by_id[field_id]["page"] = page_index + 1
+                field_info_by_id[field_id]["rect"] = ann.get('/Rect')
+            elif field_id in possible_radio_names:
+                try:
+                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
+                except KeyError:
+                    continue
+                if len(on_values) == 1:
+                    rect = ann.get("/Rect")
+                    if field_id not in radio_fields_by_id:
+                        radio_fields_by_id[field_id] = {
+                            "field_id": field_id,
+                            "type": "radio_group",
+                            "page": page_index + 1,
+                            "radio_options": [],
+                        }
+                    radio_fields_by_id[field_id]["radio_options"].append({
+                        "value": on_values[0],
+                        "rect": rect,
+                    })
+
+    fields_with_location = []
+    for field_info in field_info_by_id.values():
+        if "page" in field_info:
+            fields_with_location.append(field_info)
+        else:
+            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
+
+    def sort_key(f):
+        if "radio_options" in f:
+            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
+        else:
+            rect = f.get("rect") or [0, 0, 0, 0]
+        adjusted_position = [-rect[1], rect[0]]
+        return [f.get("page"), adjusted_position]
+
+    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
+    sorted_fields.sort(key=sort_key)
+    return sorted_fields
+
+
+def write_field_info(pdf_path: str, json_output_path: str):
+    """Extract field info from the PDF at pdf_path and dump it as indented JSON to json_output_path."""
+    extracted = get_field_info(PdfReader(pdf_path))
+    with open(json_output_path, "w") as out:
+        json.dump(extracted, out, indent=2)
+    print(f"Wrote {len(extracted)} fields to {json_output_path}")
+
+
+if __name__ == "__main__":  # CLI entry: input PDF path, then output JSON path
+    if len(sys.argv) != 3:
+        print("Usage: extract_form_field_info.py [input pdf] [output json]")
+        sys.exit(1)
+    write_field_info(sys.argv[1], sys.argv[2])
diff --git a/scripts/extract_form_structure.py b/scripts/extract_form_structure.py
new file mode 100644
index 0000000..f219e7d
--- /dev/null
+++ b/scripts/extract_form_structure.py
@@ -0,0 +1,115 @@
+"""
+Extract form structure from a non-fillable PDF.
+
+This script analyzes the PDF to find:
+- Text labels with their exact coordinates
+- Horizontal lines (row boundaries)
+- Checkboxes (small rectangles)
+
+Output: A JSON file with the form structure that can be used to generate
+accurate field coordinates for filling.
+
+Usage: python extract_form_structure.py <input.pdf> <output.json>
+"""
+
+import json
+import sys
+import pdfplumber
+
+
+def extract_form_structure(pdf_path):  # returns a dict with "pages", "labels", "lines", "checkboxes", "row_boundaries"
+    structure = {
+        "pages": [],
+        "labels": [],
+        "lines": [],
+        "checkboxes": [],
+        "row_boundaries": []
+    }
+    # Collect per-page size, words, long lines and small square rectangles; page numbers start at 1.
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, 1):
+            structure["pages"].append({
+                "page_number": page_num,
+                "width": float(page.width),
+                "height": float(page.height)
+            })
+            # Every extracted word is recorded as a label, coordinates rounded to 1 decimal.
+            words = page.extract_words()
+            for word in words:
+                structure["labels"].append({
+                    "page": page_num,
+                    "text": word["text"],
+                    "x0": round(float(word["x0"]), 1),
+                    "top": round(float(word["top"]), 1),
+                    "x1": round(float(word["x1"]), 1),
+                    "bottom": round(float(word["bottom"]), 1)
+                })
+            # Keep only lines whose x-extent exceeds half the page width.
+            for line in page.lines:
+                if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
+                    structure["lines"].append({
+                        "page": page_num,
+                        "y": round(float(line["top"]), 1),
+                        "x0": round(float(line["x0"]), 1),
+                        "x1": round(float(line["x1"]), 1)
+                    })
+            # Rectangles 5-15 units on each side and within 2 units of square are recorded as checkboxes.
+            for rect in page.rects:
+                width = float(rect["x1"]) - float(rect["x0"])
+                height = float(rect["bottom"]) - float(rect["top"])
+                if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
+                    structure["checkboxes"].append({
+                        "page": page_num,
+                        "x0": round(float(rect["x0"]), 1),
+                        "top": round(float(rect["top"]), 1),
+                        "x1": round(float(rect["x1"]), 1),
+                        "bottom": round(float(rect["bottom"]), 1),
+                        "center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1),
+                        "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
+                    })
+    # Group the kept lines' y values by page.
+    lines_by_page = {}
+    for line in structure["lines"]:
+        page = line["page"]
+        if page not in lines_by_page:
+            lines_by_page[page] = []
+        lines_by_page[page].append(line["y"])
+    # Each pair of consecutive distinct y values on a page becomes one row boundary.
+    for page, y_coords in lines_by_page.items():
+        y_coords = sorted(set(y_coords))
+        for i in range(len(y_coords) - 1):
+            structure["row_boundaries"].append({
+                "page": page,
+                "row_top": y_coords[i],
+                "row_bottom": y_coords[i + 1],
+                "row_height": round(y_coords[i + 1] - y_coords[i], 1)
+            })
+
+    return structure
+
+
+def main():  # CLI entry: extract the structure of argv[1] and write it as JSON to argv[2]
+    if len(sys.argv) != 3:
+        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+    output_path = sys.argv[2]
+
+    print(f"Extracting structure from {pdf_path}...")
+    structure = extract_form_structure(pdf_path)
+    # Persist the full structure first, then print a per-category count summary.
+    with open(output_path, "w") as f:
+        json.dump(structure, f, indent=2)
+
+    print(f"Found:")
+    print(f"  - {len(structure['pages'])} pages")
+    print(f"  - {len(structure['labels'])} text labels")
+    print(f"  - {len(structure['lines'])} horizontal lines")
+    print(f"  - {len(structure['checkboxes'])} checkboxes")
+    print(f"  - {len(structure['row_boundaries'])} row boundaries")
+    print(f"Saved to {output_path}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
diff --git a/scripts/fill_fillable_fields.py b/scripts/fill_fillable_fields.py
new file mode 100644
index 0000000..51c2600
--- /dev/null
+++ b/scripts/fill_fillable_fields.py
@@ -0,0 +1,98 @@
+import json
+import sys
+
+from pypdf import PdfReader, PdfWriter
+
+from extract_form_field_info import get_field_info
+
+
+
+
+def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):  # validates the requested values against the PDF, then writes a filled copy
+    with open(fields_json_path) as f:
+        fields = json.load(f)
+    fields_by_page = {}  # page number -> {field_id: value}, only for entries that carry a "value"
+    for field in fields:
+        if "value" in field:
+            field_id = field["field_id"]
+            page = field["page"]
+            if page not in fields_by_page:
+                fields_by_page[page] = {}
+            fields_by_page[page][field_id] = field["value"]
+    # Re-read the actual field definitions from the PDF so the request can be validated against them.
+    reader = PdfReader(input_pdf_path)
+    # Every JSON entry (even one without "value") must name a known field_id on the matching page.
+    has_error = False
+    field_info = get_field_info(reader)
+    fields_by_ids = {f["field_id"]: f for f in field_info}
+    for field in fields:
+        existing_field = fields_by_ids.get(field["field_id"])
+        if not existing_field:
+            has_error = True
+            print(f"ERROR: `{field['field_id']}` is not a valid field ID")
+        elif field["page"] != existing_field["page"]:
+            has_error = True
+            print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})")
+        else:
+            if "value" in field:
+                err = validation_error_for_field_value(existing_field, field["value"])
+                if err:
+                    print(err)
+                    has_error = True
+    if has_error:  # all problems are printed before exiting, and nothing is written
+        sys.exit(1)
+    # All checks passed: clone the document and apply the values page by page (pages are 1-based in the JSON).
+    writer = PdfWriter(clone_from=reader)
+    for page, field_values in fields_by_page.items():
+        writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
+    # NOTE(review): presumably asks PDF viewers to regenerate field appearances, since auto_regenerate=False above - confirm.
+    writer.set_need_appearances_writer(True)
+
+    with open(output_pdf_path, "wb") as f:
+        writer.write(f)
+
+
+def validation_error_for_field_value(field_info, field_value):
+    """Return an "ERROR: ..." message when field_value is not allowed for the field, otherwise None."""
+    kind, name = field_info["type"], field_info["field_id"]
+    if kind == "checkbox":
+        on_val, off_val = field_info["checked_value"], field_info["unchecked_value"]
+        if field_value not in (on_val, off_val):
+            return f'ERROR: Invalid value "{field_value}" for checkbox field "{name}". The checked value is "{on_val}" and the unchecked value is "{off_val}"'
+    if kind == "radio_group":
+        allowed = [option["value"] for option in field_info["radio_options"]]
+        if field_value not in allowed:
+            return f'ERROR: Invalid value "{field_value}" for radio group field "{name}". Valid values are: {allowed}'
+    if kind == "choice":
+        allowed = [option["value"] for option in field_info["choice_options"]]
+        if field_value not in allowed:
+            return f'ERROR: Invalid value "{field_value}" for choice field "{name}". Valid values are: {allowed}'
+    # Text fields, unknown types and accepted values all end up here.
+    return None
+
+
+def monkeypatch_pydpf_method():  # replaces DictionaryObject.get_inherited with a wrapper, for every instance of the class
+    from pypdf.generic import DictionaryObject
+    from pypdf.constants import FieldDictionaryAttributes
+    # Keep a reference to the unpatched method so the wrapper can delegate to it.
+    original_get_inherited = DictionaryObject.get_inherited
+    # For the Opt key only: a list made purely of 2-item lists is reduced to each pair's first item.
+    def patched_get_inherited(self, key: str, default = None):
+        result = original_get_inherited(self, key, default)
+        if key == FieldDictionaryAttributes.Opt:
+            if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result):
+                result = [r[0] for r in result]
+        return result
+    # NOTE(review): presumably works around pypdf's handling of two-item option pairs on choice fields - confirm.
+    DictionaryObject.get_inherited = patched_get_inherited
+
+
+if __name__ == "__main__":  # CLI entry: input PDF, field-values JSON, output PDF
+    if len(sys.argv) != 4:
+        print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]")
+        sys.exit(1)
+    monkeypatch_pydpf_method()  # patch pypdf before any field is read or written
+    input_pdf = sys.argv[1]
+    fields_json = sys.argv[2]
+    output_pdf = sys.argv[3]
+    fill_pdf_fields(input_pdf, fields_json, output_pdf)
diff --git a/scripts/fill_pdf_form_with_annotations.py b/scripts/fill_pdf_form_with_annotations.py
new file mode 100644
index 0000000..b430069
--- /dev/null
+++ b/scripts/fill_pdf_form_with_annotations.py
@@ -0,0 +1,107 @@
+import json
+import sys
+
+from pypdf import PdfReader, PdfWriter
+from pypdf.annotations import FreeText
+
+
+
+
+def transform_from_image_coords(bbox, image_width, image_height, pdf_width, pdf_height):
+    """Scale an image-pixel box to PDF units and flip its vertical axis.
+
+    bbox is read as [left, top, right, bottom]; the result is the tuple
+    (left, bottom, right, top), each y measured back from pdf_height.
+    """
+    sx = pdf_width / image_width
+    sy = pdf_height / image_height
+    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
+    # Subtracting the scaled y from the page height performs the vertical flip.
+    return x0 * sx, pdf_height - (y1 * sy), x1 * sx, pdf_height - (y0 * sy)
+
+
+def transform_from_pdf_coords(bbox, pdf_height):
+    """Flip a [left, top, right, bottom] box vertically against pdf_height.
+
+    The x values pass through unchanged and each y becomes pdf_height minus
+    itself; the result is ordered (left, bottom, right, top).
+    """
+    x_min, y_upper, x_max, y_lower = bbox[0], bbox[1], bbox[2], bbox[3]
+    return x_min, pdf_height - y_lower, x_max, pdf_height - y_upper
+
+
+def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
+    """Copy the input PDF to output_pdf_path with one FreeText annotation per filled field.
+
+    fields_json_path is JSON with a "pages" list and a "form_fields" list. A
+    field's "entry_bounding_box" goes through transform_from_pdf_coords when its
+    page entry has a "pdf_width" key, and is otherwise scaled from that entry's
+    "image_width"/"image_height" by transform_from_image_coords. Fields whose
+    entry_text is missing or has empty "text" are skipped.
+
+    Raises ValueError if a filled field's page_number is absent from "pages" or the PDF.
+    """
+    # JSON text is UTF-8; do not depend on the platform's default encoding.
+    with open(fields_json_path, "r", encoding="utf-8") as f:
+        fields_data = json.load(f)
+
+    reader = PdfReader(input_pdf_path)
+    writer = PdfWriter()
+    writer.append(reader)
+
+    # Keyed by 1-based page number to match "page_number" in the JSON.
+    pdf_dimensions = {}
+    for i, page in enumerate(reader.pages):
+        mediabox = page.mediabox
+        pdf_dimensions[i + 1] = [mediabox.width, mediabox.height]
+
+    annotations = []
+    for field in fields_data["form_fields"]:
+        # Skip unfilled fields before any page or bounding-box lookup.
+        entry_text = field.get("entry_text")
+        if not entry_text or not entry_text.get("text"):
+            continue
+
+        page_num = field["page_number"]
+        page_info = next((p for p in fields_data["pages"] if p["page_number"] == page_num), None)
+        if page_info is None or page_num not in pdf_dimensions:
+            raise ValueError(f"Field refers to unknown page number {page_num!r}")
+        pdf_width, pdf_height = pdf_dimensions[page_num]
+
+        if "pdf_width" in page_info:
+            rect = transform_from_pdf_coords(field["entry_bounding_box"], float(pdf_height))
+        else:
+            rect = transform_from_image_coords(
+                field["entry_bounding_box"],
+                page_info["image_width"], page_info["image_height"],
+                float(pdf_width), float(pdf_height),
+            )
+
+        annotation = FreeText(
+            text=entry_text["text"],
+            rect=rect,
+            font=entry_text.get("font", "Arial"),
+            font_size=str(entry_text.get("font_size", 14)) + "pt",
+            font_color=entry_text.get("font_color", "000000"),
+            border_color=None,
+            background_color=None,
+        )
+        annotations.append(annotation)
+        writer.add_annotation(page_number=page_num - 1, annotation=annotation)
+
+    with open(output_pdf_path, "wb") as output:
+        writer.write(output)
+
+    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
+    print(f"Added {len(annotations)} text annotations")
+
+
+if __name__ == "__main__":
+    # Script entry point: require exactly three arguments after the script name.
+    if len(sys.argv) != 4:
+        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
+        sys.exit(1)
+
+    # Positional order: source PDF, fields JSON, destination PDF.
+    input_pdf, fields_json, output_pdf = sys.argv[1:4]
+    fill_pdf_form(input_pdf, fields_json, output_pdf)