From c0ca104998899148fd1eb8c08917ea518bf6c1d7 Mon Sep 17 00:00:00 2001 From: zlei9 Date: Sun, 22 Mar 2026 11:43:37 +0800 Subject: [PATCH] Initial commit: pdf skill --- LICENSE.txt | 30 ++ SKILL.md | 314 +++++++++++ forms.md | 294 +++++++++++ reference.md | 612 ++++++++++++++++++++++ scripts/check_bounding_boxes.py | 65 +++ scripts/check_fillable_fields.py | 11 + scripts/convert_pdf_to_images.py | 33 ++ scripts/create_validation_image.py | 37 ++ scripts/extract_form_field_info.py | 122 +++++ scripts/extract_form_structure.py | 115 ++++ scripts/fill_fillable_fields.py | 98 ++++ scripts/fill_pdf_form_with_annotations.py | 107 ++++ 12 files changed, 1838 insertions(+) create mode 100644 LICENSE.txt create mode 100644 SKILL.md create mode 100644 forms.md create mode 100644 reference.md create mode 100644 scripts/check_bounding_boxes.py create mode 100644 scripts/check_fillable_fields.py create mode 100644 scripts/convert_pdf_to_images.py create mode 100644 scripts/create_validation_image.py create mode 100644 scripts/extract_form_field_info.py create mode 100644 scripts/extract_form_structure.py create mode 100644 scripts/fill_fillable_fields.py create mode 100644 scripts/fill_pdf_form_with_annotations.py diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..d3e046a --- /dev/null +++ b/SKILL.md @@ -0,0 +1,314 @@ +--- +name: pdf +description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill. +license: Proprietary. LICENSE.txt has complete terms +--- + +# PDF Processing Guide + +## Overview + +This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read FORMS.md and follow its instructions. + +## Quick Start + +```python +from pypdf import PdfReader, PdfWriter + +# Read a PDF +reader = PdfReader("document.pdf") +print(f"Pages: {len(reader.pages)}") + +# Extract text +text = "" +for page in reader.pages: + text += page.extract_text() +``` + +## Python Libraries + +### pypdf - Basic Operations + +#### Merge PDFs +```python +from pypdf import PdfWriter, PdfReader + +writer = PdfWriter() +for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + +with open("merged.pdf", "wb") as output: + writer.write(output) +``` + +#### Split PDF +```python +reader = PdfReader("input.pdf") +for i, page in enumerate(reader.pages): + writer = PdfWriter() + writer.add_page(page) + with open(f"page_{i+1}.pdf", "wb") as output: + writer.write(output) +``` + +#### Extract Metadata +```python +reader = PdfReader("document.pdf") +meta = reader.metadata +print(f"Title: {meta.title}") +print(f"Author: {meta.author}") +print(f"Subject: {meta.subject}") +print(f"Creator: {meta.creator}") +``` + +#### Rotate Pages +```python +reader = PdfReader("input.pdf") +writer = PdfWriter() + +page = reader.pages[0] +page.rotate(90) # Rotate 90 degrees clockwise +writer.add_page(page) + +with open("rotated.pdf", "wb") as output: + writer.write(output) +``` + +### pdfplumber - Text and Table Extraction + +#### Extract Text with Layout +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + for page in pdf.pages: + text = page.extract_text() + print(text) +``` + +#### Extract Tables +```python +with pdfplumber.open("document.pdf") as pdf: + for i, page in enumerate(pdf.pages): + tables = page.extract_tables() + for j, table in enumerate(tables): + print(f"Table {j+1} on page {i+1}:") + for row in table: + print(row) +``` + +#### Advanced Table Extraction +```python +import pandas as pd + +with pdfplumber.open("document.pdf") as pdf: + all_tables = [] + for page in pdf.pages: + tables = page.extract_tables() + for table in tables: + if table: # Check if table is not empty + df = pd.DataFrame(table[1:], columns=table[0]) + all_tables.append(df) + +# Combine all tables +if all_tables: + combined_df = pd.concat(all_tables, ignore_index=True) + combined_df.to_excel("extracted_tables.xlsx", index=False) +``` + +### reportlab - Create PDFs + +#### Basic PDF Creation +```python +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + +c = canvas.Canvas("hello.pdf", pagesize=letter) +width, height = letter + +# Add text +c.drawString(100, height - 100, "Hello World!") +c.drawString(100, height - 120, "This is a PDF created with reportlab") + +# Add a line +c.line(100, height - 140, 400, height - 140) + +# Save +c.save() +``` + +#### Create PDF with Multiple Pages +```python +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak +from reportlab.lib.styles import getSampleStyleSheet + +doc = SimpleDocTemplate("report.pdf", pagesize=letter) +styles = getSampleStyleSheet() +story = [] + +# Add content +title = Paragraph("Report Title", styles['Title']) +story.append(title) +story.append(Spacer(1, 12)) + +body = Paragraph("This is the body of the report. " * 20, styles['Normal']) +story.append(body) +story.append(PageBreak()) + +# Page 2 +story.append(Paragraph("Page 2", styles['Heading1'])) +story.append(Paragraph("Content for page 2", styles['Normal'])) + +# Build PDF +doc.build(story) +``` + +#### Subscripts and Superscripts + +**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes. + +Instead, use ReportLab's XML markup tags in Paragraph objects: +```python +from reportlab.platypus import Paragraph +from reportlab.lib.styles import getSampleStyleSheet + +styles = getSampleStyleSheet() + +# Subscripts: use tag +chemical = Paragraph("H2O", styles['Normal']) + +# Superscripts: use tag +squared = Paragraph("x2 + y2", styles['Normal']) +``` + +For canvas-drawn text (not Paragraph objects), manually adjust font the size and position rather than using Unicode subscripts/superscripts. + +## Command-Line Tools + +### pdftotext (poppler-utils) +```bash +# Extract text +pdftotext input.pdf output.txt + +# Extract text preserving layout +pdftotext -layout input.pdf output.txt + +# Extract specific pages +pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5 +``` + +### qpdf +```bash +# Merge PDFs +qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf + +# Split pages +qpdf input.pdf --pages . 1-5 -- pages1-5.pdf +qpdf input.pdf --pages . 6-10 -- pages6-10.pdf + +# Rotate pages +qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees + +# Remove password +qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf +``` + +### pdftk (if available) +```bash +# Merge +pdftk file1.pdf file2.pdf cat output merged.pdf + +# Split +pdftk input.pdf burst + +# Rotate +pdftk input.pdf rotate 1east output rotated.pdf +``` + +## Common Tasks + +### Extract Text from Scanned PDFs +```python +# Requires: pip install pytesseract pdf2image +import pytesseract +from pdf2image import convert_from_path + +# Convert PDF to images +images = convert_from_path('scanned.pdf') + +# OCR each page +text = "" +for i, image in enumerate(images): + text += f"Page {i+1}:\n" + text += pytesseract.image_to_string(image) + text += "\n\n" + +print(text) +``` + +### Add Watermark +```python +from pypdf import PdfReader, PdfWriter + +# Create watermark (or load existing) +watermark = PdfReader("watermark.pdf").pages[0] + +# Apply to all pages +reader = PdfReader("document.pdf") +writer = PdfWriter() + +for page in reader.pages: + page.merge_page(watermark) + writer.add_page(page) + +with open("watermarked.pdf", "wb") as output: + writer.write(output) +``` + +### Extract Images +```bash +# Using pdfimages (poppler-utils) +pdfimages -j input.pdf output_prefix + +# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc. +``` + +### Password Protection +```python +from pypdf import PdfReader, PdfWriter + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +for page in reader.pages: + writer.add_page(page) + +# Add password +writer.encrypt("userpassword", "ownerpassword") + +with open("encrypted.pdf", "wb") as output: + writer.write(output) +``` + +## Quick Reference + +| Task | Best Tool | Command/Code | +|------|-----------|--------------| +| Merge PDFs | pypdf | `writer.add_page(page)` | +| Split PDFs | pypdf | One page per file | +| Extract text | pdfplumber | `page.extract_text()` | +| Extract tables | pdfplumber | `page.extract_tables()` | +| Create PDFs | reportlab | Canvas or Platypus | +| Command line merge | qpdf | `qpdf --empty --pages ...` | +| OCR scanned PDFs | pytesseract | Convert to image first | +| Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md | + +## Next Steps + +- For advanced pypdfium2 usage, see REFERENCE.md +- For JavaScript libraries (pdf-lib), see REFERENCE.md +- If you need to fill out a PDF form, follow the instructions in FORMS.md +- For troubleshooting guides, see REFERENCE.md diff --git a/forms.md b/forms.md new file mode 100644 index 0000000..6e7e1e0 --- /dev/null +++ b/forms.md @@ -0,0 +1,294 @@ +**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.** + +If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory: + `python scripts/check_fillable_fields `, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" and follow those instructions. + +# Fillable fields +If the PDF has fillable form fields: +- Run this script from this file's directory: `python scripts/extract_form_field_info.py `. It will create a JSON file with a list of fields in this format: +``` +[ + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "rect": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is the bottom of the page), + "type": ("text", "checkbox", "radio_group", or "choice"), + }, + // Checkboxes have "checked_value" and "unchecked_value" properties: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "checkbox", + "checked_value": (Set the field to this value to check the checkbox), + "unchecked_value": (Set the field to this value to uncheck the checkbox), + }, + // Radio groups have a "radio_options" list with the possible choices. + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "radio_group", + "radio_options": [ + { + "value": (set the field to this value to select this radio option), + "rect": (bounding box for the radio button for this option) + }, + // Other radio options + ] + }, + // Multiple choice fields have a "choice_options" list with the possible choices: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "choice", + "choice_options": [ + { + "value": (set the field to this value to select this option), + "text": (display text of the option) + }, + // Other choice options + ], + } +] +``` +- Convert the PDF to PNGs (one image for each page) with this script (run from this file's directory): +`python scripts/convert_pdf_to_images.py ` +Then analyze the images to determine the purpose of each form field (make sure to convert the bounding box PDF coordinates to image coordinates). +- Create a `field_values.json` file in this format with the values to be entered for each field: +``` +[ + { + "field_id": "last_name", // Must match the field_id from `extract_form_field_info.py` + "description": "The user's last name", + "page": 1, // Must match the "page" value in field_info.json + "value": "Simpson" + }, + { + "field_id": "Checkbox12", + "description": "Checkbox to be checked if the user is 18 or over", + "page": 1, + "value": "/On" // If this is a checkbox, use its "checked_value" value to check it. If it's a radio button group, use one of the "value" values in "radio_options". + }, + // more fields +] +``` +- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF: +`python scripts/fill_fillable_fields.py ` +This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again. + +# Non-fillable fields +If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed. + +## Step 1: Try Structure Extraction First + +Run this script to extract text labels, lines, and checkboxes with their exact PDF coordinates: +`python scripts/extract_form_structure.py form_structure.json` + +This creates a JSON file containing: +- **labels**: Every text element with exact coordinates (x0, top, x1, bottom in PDF points) +- **lines**: Horizontal lines that define row boundaries +- **checkboxes**: Small square rectangles that are checkboxes (with center coordinates) +- **row_boundaries**: Row top/bottom positions calculated from horizontal lines + +**Check the results**: If `form_structure.json` has meaningful labels (text elements that correspond to form fields), use **Approach A: Structure-Based Coordinates**. If the PDF is scanned/image-based and has few or no labels, use **Approach B: Visual Estimation**. + +--- + +## Approach A: Structure-Based Coordinates (Preferred) + +Use this when `extract_form_structure.py` found text labels in the PDF. + +### A.1: Analyze the Structure + +Read form_structure.json and identify: + +1. **Label groups**: Adjacent text elements that form a single label (e.g., "Last" + "Name") +2. **Row structure**: Labels with similar `top` values are in the same row +3. **Field columns**: Entry areas start after label ends (x0 = label.x1 + gap) +4. **Checkboxes**: Use the checkbox coordinates directly from the structure + +**Coordinate system**: PDF coordinates where y=0 is at TOP of page, y increases downward. + +### A.2: Check for Missing Elements + +The structure extraction may not detect all form elements. Common cases: +- **Circular checkboxes**: Only square rectangles are detected as checkboxes +- **Complex graphics**: Decorative elements or non-standard form controls +- **Faded or light-colored elements**: May not be extracted + +If you see form fields in the PDF images that aren't in form_structure.json, you'll need to use **visual analysis** for those specific fields (see "Hybrid Approach" below). + +### A.3: Create fields.json with PDF Coordinates + +For each field, calculate entry coordinates from the extracted structure: + +**Text fields:** +- entry x0 = label x1 + 5 (small gap after label) +- entry x1 = next label's x0, or row boundary +- entry top = same as label top +- entry bottom = row boundary line below, or label bottom + row_height + +**Checkboxes:** +- Use the checkbox rectangle coordinates directly from form_structure.json +- entry_bounding_box = [checkbox.x0, checkbox.top, checkbox.x1, checkbox.bottom] + +Create fields.json using `pdf_width` and `pdf_height` (signals PDF coordinates): +```json +{ + "pages": [ + {"page_number": 1, "pdf_width": 612, "pdf_height": 792} + ], + "form_fields": [ + { + "page_number": 1, + "description": "Last name entry field", + "field_label": "Last Name", + "label_bounding_box": [43, 63, 87, 73], + "entry_bounding_box": [92, 63, 260, 79], + "entry_text": {"text": "Smith", "font_size": 10} + }, + { + "page_number": 1, + "description": "US Citizen Yes checkbox", + "field_label": "Yes", + "label_bounding_box": [260, 200, 280, 210], + "entry_bounding_box": [285, 197, 292, 205], + "entry_text": {"text": "X"} + } + ] +} +``` + +**Important**: Use `pdf_width`/`pdf_height` and coordinates directly from form_structure.json. + +### A.4: Validate Bounding Boxes + +Before filling, check your bounding boxes for errors: +`python scripts/check_bounding_boxes.py fields.json` + +This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling. + +--- + +## Approach B: Visual Estimation (Fallback) + +Use this when the PDF is scanned/image-based and structure extraction found no usable text labels (e.g., all text shows as "(cid:X)" patterns). + +### B.1: Convert PDF to Images + +`python scripts/convert_pdf_to_images.py ` + +### B.2: Initial Field Identification + +Examine each page image to identify form sections and get **rough estimates** of field locations: +- Form field labels and their approximate positions +- Entry areas (lines, boxes, or blank spaces for text input) +- Checkboxes and their approximate locations + +For each field, note approximate pixel coordinates (they don't need to be precise yet). + +### B.3: Zoom Refinement (CRITICAL for accuracy) + +For each field, crop a region around the estimated position to refine coordinates precisely. + +**Create a zoomed crop using ImageMagick:** +```bash +magick -crop x++ +repage +``` + +Where: +- `, ` = top-left corner of crop region (use your rough estimate minus padding) +- `, ` = size of crop region (field area plus ~50px padding on each side) + +**Example:** To refine a "Name" field estimated around (100, 150): +```bash +magick images_dir/page_1.png -crop 300x80+50+120 +repage crops/name_field.png +``` + +(Note: if the `magick` command isn't available, try `convert` with the same arguments). + +**Examine the cropped image** to determine precise coordinates: +1. Identify the exact pixel where the entry area begins (after the label) +2. Identify where the entry area ends (before next field or edge) +3. Identify the top and bottom of the entry line/box + +**Convert crop coordinates back to full image coordinates:** +- full_x = crop_x + crop_offset_x +- full_y = crop_y + crop_offset_y + +Example: If the crop started at (50, 120) and the entry box starts at (52, 18) within the crop: +- entry_x0 = 52 + 50 = 102 +- entry_top = 18 + 120 = 138 + +**Repeat for each field**, grouping nearby fields into single crops when possible. + +### B.4: Create fields.json with Refined Coordinates + +Create fields.json using `image_width` and `image_height` (signals image coordinates): +```json +{ + "pages": [ + {"page_number": 1, "image_width": 1700, "image_height": 2200} + ], + "form_fields": [ + { + "page_number": 1, + "description": "Last name entry field", + "field_label": "Last Name", + "label_bounding_box": [120, 175, 242, 198], + "entry_bounding_box": [255, 175, 720, 218], + "entry_text": {"text": "Smith", "font_size": 10} + } + ] +} +``` + +**Important**: Use `image_width`/`image_height` and the refined pixel coordinates from the zoom analysis. + +### B.5: Validate Bounding Boxes + +Before filling, check your bounding boxes for errors: +`python scripts/check_bounding_boxes.py fields.json` + +This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling. + +--- + +## Hybrid Approach: Structure + Visual + +Use this when structure extraction works for most fields but misses some elements (e.g., circular checkboxes, unusual form controls). + +1. **Use Approach A** for fields that were detected in form_structure.json +2. **Convert PDF to images** for visual analysis of missing fields +3. **Use zoom refinement** (from Approach B) for the missing fields +4. **Combine coordinates**: For fields from structure extraction, use `pdf_width`/`pdf_height`. For visually-estimated fields, you must convert image coordinates to PDF coordinates: + - pdf_x = image_x * (pdf_width / image_width) + - pdf_y = image_y * (pdf_height / image_height) +5. **Use a single coordinate system** in fields.json - convert all to PDF coordinates with `pdf_width`/`pdf_height` + +--- + +## Step 2: Validate Before Filling + +**Always validate bounding boxes before filling:** +`python scripts/check_bounding_boxes.py fields.json` + +This checks for: +- Intersecting bounding boxes (which would cause overlapping text) +- Entry boxes that are too small for the specified font size + +Fix any reported errors in fields.json before proceeding. + +## Step 3: Fill the Form + +The fill script auto-detects the coordinate system and handles conversion: +`python scripts/fill_pdf_form_with_annotations.py fields.json ` + +## Step 4: Verify Output + +Convert the filled PDF to images and verify text placement: +`python scripts/convert_pdf_to_images.py ` + +If text is mispositioned: +- **Approach A**: Check that you're using PDF coordinates from form_structure.json with `pdf_width`/`pdf_height` +- **Approach B**: Check that image dimensions match and coordinates are accurate pixels +- **Hybrid**: Ensure coordinate conversions are correct for visually-estimated fields diff --git a/reference.md b/reference.md new file mode 100644 index 0000000..41400bf --- /dev/null +++ b/reference.md @@ -0,0 +1,612 @@ +# PDF Processing Advanced Reference + +This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions. + +## pypdfium2 Library (Apache/BSD License) + +### Overview +pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It's excellent for fast PDF rendering, image generation, and serves as a PyMuPDF replacement. + +### Render PDF to Images +```python +import pypdfium2 as pdfium +from PIL import Image + +# Load PDF +pdf = pdfium.PdfDocument("document.pdf") + +# Render page to image +page = pdf[0] # First page +bitmap = page.render( + scale=2.0, # Higher resolution + rotation=0 # No rotation +) + +# Convert to PIL Image +img = bitmap.to_pil() +img.save("page_1.png", "PNG") + +# Process multiple pages +for i, page in enumerate(pdf): + bitmap = page.render(scale=1.5) + img = bitmap.to_pil() + img.save(f"page_{i+1}.jpg", "JPEG", quality=90) +``` + +### Extract Text with pypdfium2 +```python +import pypdfium2 as pdfium + +pdf = pdfium.PdfDocument("document.pdf") +for i, page in enumerate(pdf): + text = page.get_text() + print(f"Page {i+1} text length: {len(text)} chars") +``` + +## JavaScript Libraries + +### pdf-lib (MIT License) + +pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment. + +#### Load and Manipulate Existing PDF +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function manipulatePDF() { + // Load existing PDF + const existingPdfBytes = fs.readFileSync('input.pdf'); + const pdfDoc = await PDFDocument.load(existingPdfBytes); + + // Get page count + const pageCount = pdfDoc.getPageCount(); + console.log(`Document has ${pageCount} pages`); + + // Add new page + const newPage = pdfDoc.addPage([600, 400]); + newPage.drawText('Added by pdf-lib', { + x: 100, + y: 300, + size: 16 + }); + + // Save modified PDF + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('modified.pdf', pdfBytes); +} +``` + +#### Create Complex PDFs from Scratch +```javascript +import { PDFDocument, rgb, StandardFonts } from 'pdf-lib'; +import fs from 'fs'; + +async function createPDF() { + const pdfDoc = await PDFDocument.create(); + + // Add fonts + const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica); + const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold); + + // Add page + const page = pdfDoc.addPage([595, 842]); // A4 size + const { width, height } = page.getSize(); + + // Add text with styling + page.drawText('Invoice #12345', { + x: 50, + y: height - 50, + size: 18, + font: helveticaBold, + color: rgb(0.2, 0.2, 0.8) + }); + + // Add rectangle (header background) + page.drawRectangle({ + x: 40, + y: height - 100, + width: width - 80, + height: 30, + color: rgb(0.9, 0.9, 0.9) + }); + + // Add table-like content + const items = [ + ['Item', 'Qty', 'Price', 'Total'], + ['Widget', '2', '$50', '$100'], + ['Gadget', '1', '$75', '$75'] + ]; + + let yPos = height - 150; + items.forEach(row => { + let xPos = 50; + row.forEach(cell => { + page.drawText(cell, { + x: xPos, + y: yPos, + size: 12, + font: helveticaFont + }); + xPos += 120; + }); + yPos -= 25; + }); + + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('created.pdf', pdfBytes); +} +``` + +#### Advanced Merge and Split Operations +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function mergePDFs() { + // Create new document + const mergedPdf = await PDFDocument.create(); + + // Load source PDFs + const pdf1Bytes = fs.readFileSync('doc1.pdf'); + const pdf2Bytes = fs.readFileSync('doc2.pdf'); + + const pdf1 = await PDFDocument.load(pdf1Bytes); + const pdf2 = await PDFDocument.load(pdf2Bytes); + + // Copy pages from first PDF + const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices()); + pdf1Pages.forEach(page => mergedPdf.addPage(page)); + + // Copy specific pages from second PDF (pages 0, 2, 4) + const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]); + pdf2Pages.forEach(page => mergedPdf.addPage(page)); + + const mergedPdfBytes = await mergedPdf.save(); + fs.writeFileSync('merged.pdf', mergedPdfBytes); +} +``` + +### pdfjs-dist (Apache License) + +PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser. + +#### Basic PDF Loading and Rendering +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +// Configure worker (important for performance) +pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js'; + +async function renderPDF() { + // Load PDF + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + console.log(`Loaded PDF with ${pdf.numPages} pages`); + + // Get first page + const page = await pdf.getPage(1); + const viewport = page.getViewport({ scale: 1.5 }); + + // Render to canvas + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + const renderContext = { + canvasContext: context, + viewport: viewport + }; + + await page.render(renderContext).promise; + document.body.appendChild(canvas); +} +``` + +#### Extract Text with Coordinates +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractText() { + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + let fullText = ''; + + // Extract text from all pages + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + + const pageText = textContent.items + .map(item => item.str) + .join(' '); + + fullText += `\n--- Page ${i} ---\n${pageText}`; + + // Get text with coordinates for advanced processing + const textWithCoords = textContent.items.map(item => ({ + text: item.str, + x: item.transform[4], + y: item.transform[5], + width: item.width, + height: item.height + })); + } + + console.log(fullText); + return fullText; +} +``` + +#### Extract Annotations and Forms +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractAnnotations() { + const loadingTask = pdfjsLib.getDocument('annotated.pdf'); + const pdf = await loadingTask.promise; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const annotations = await page.getAnnotations(); + + annotations.forEach(annotation => { + console.log(`Annotation type: ${annotation.subtype}`); + console.log(`Content: ${annotation.contents}`); + console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`); + }); + } +} +``` + +## Advanced Command-Line Operations + +### poppler-utils Advanced Features + +#### Extract Text with Bounding Box Coordinates +```bash +# Extract text with bounding box coordinates (essential for structured data) +pdftotext -bbox-layout document.pdf output.xml + +# The XML output contains precise coordinates for each text element +``` + +#### Advanced Image Conversion +```bash +# Convert to PNG images with specific resolution +pdftoppm -png -r 300 document.pdf output_prefix + +# Convert specific page range with high resolution +pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages + +# Convert to JPEG with quality setting +pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output +``` + +#### Extract Embedded Images +```bash +# Extract all embedded images with metadata +pdfimages -j -p document.pdf page_images + +# List image info without extracting +pdfimages -list document.pdf + +# Extract images in their original format +pdfimages -all document.pdf images/img +``` + +### qpdf Advanced Features + +#### Complex Page Manipulation +```bash +# Split PDF into groups of pages +qpdf --split-pages=3 input.pdf output_group_%02d.pdf + +# Extract specific pages with complex ranges +qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf + +# Merge specific pages from multiple PDFs +qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf +``` + +#### PDF Optimization and Repair +```bash +# Optimize PDF for web (linearize for streaming) +qpdf --linearize input.pdf optimized.pdf + +# Remove unused objects and compress +qpdf --optimize-level=all input.pdf compressed.pdf + +# Attempt to repair corrupted PDF structure +qpdf --check input.pdf +qpdf --fix-qdf damaged.pdf repaired.pdf + +# Show detailed PDF structure for debugging +qpdf --show-all-pages input.pdf > structure.txt +``` + +#### Advanced Encryption +```bash +# Add password protection with specific permissions +qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf + +# Check encryption status +qpdf --show-encryption encrypted.pdf + +# Remove password protection (requires password) +qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf +``` + +## Advanced Python Techniques + +### pdfplumber Advanced Features + +#### Extract Text with Precise Coordinates +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + page = pdf.pages[0] + + # Extract all text with coordinates + chars = page.chars + for char in chars[:10]: # First 10 characters + print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}") + + # Extract text by bounding box (left, top, right, bottom) + bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text() +``` + +#### Advanced Table Extraction with Custom Settings +```python +import pdfplumber +import pandas as pd + +with pdfplumber.open("complex_table.pdf") as pdf: + page = pdf.pages[0] + + # Extract tables with custom settings for complex layouts + table_settings = { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_tolerance": 3, + "intersection_tolerance": 15 + } + tables = page.extract_tables(table_settings) + + # Visual debugging for table extraction + img = page.to_image(resolution=150) + img.save("debug_layout.png") +``` + +### reportlab Advanced Features + +#### Create Professional Reports with Tables +```python +from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib import colors + +# Sample data +data = [ + ['Product', 'Q1', 'Q2', 'Q3', 'Q4'], + ['Widgets', '120', '135', '142', '158'], + ['Gadgets', '85', '92', '98', '105'] +] + +# Create PDF with table +doc = SimpleDocTemplate("report.pdf") +elements = [] + +# Add title +styles = getSampleStyleSheet() +title = Paragraph("Quarterly Sales Report", styles['Title']) +elements.append(title) + +# Add table with advanced styling +table = Table(data) +table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) +])) +elements.append(table) + +doc.build(elements) +``` + +## Complex Workflows + +### Extract Figures/Images from PDF + +#### Method 1: Using pdfimages (fastest) +```bash +# Extract all images with original quality +pdfimages -all document.pdf images/img +``` + +#### Method 2: Using pypdfium2 + Image Processing +```python +import pypdfium2 as pdfium +from PIL import Image +import numpy as np + +def extract_figures(pdf_path, output_dir): + pdf = pdfium.PdfDocument(pdf_path) + + for page_num, page in enumerate(pdf): + # Render high-resolution page + bitmap = page.render(scale=3.0) + img = bitmap.to_pil() + + # Convert to numpy for processing + img_array = np.array(img) + + # Simple figure detection (non-white regions) + mask = np.any(img_array != [255, 255, 255], axis=2) + + # Find contours and extract bounding boxes + # (This is simplified - real implementation would need more sophisticated detection) + + # Save detected figures + # ... implementation depends on specific needs +``` + +### Batch PDF Processing with Error Handling +```python +import os +import glob +from pypdf import PdfReader, PdfWriter +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def batch_process_pdfs(input_dir, operation='merge'): + pdf_files = glob.glob(os.path.join(input_dir, "*.pdf")) + + if operation == 'merge': + writer = PdfWriter() + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + logger.info(f"Processed: {pdf_file}") + except Exception as e: + logger.error(f"Failed to process {pdf_file}: {e}") + continue + + with open("batch_merged.pdf", "wb") as output: + writer.write(output) + + elif operation == 'extract_text': + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + text = "" + for page in reader.pages: + text += page.extract_text() + + output_file = pdf_file.replace('.pdf', '.txt') + with open(output_file, 'w', encoding='utf-8') as f: + f.write(text) + logger.info(f"Extracted text from: {pdf_file}") + + except Exception as e: + logger.error(f"Failed to extract text from {pdf_file}: {e}") + continue +``` + +### Advanced PDF Cropping +```python +from pypdf import PdfWriter, PdfReader + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +# Crop page (left, bottom, right, top in points) +page = reader.pages[0] +page.mediabox.left = 50 +page.mediabox.bottom = 50 +page.mediabox.right = 550 +page.mediabox.top = 750 + +writer.add_page(page) +with open("cropped.pdf", "wb") as output: + writer.write(output) +``` + +## Performance Optimization Tips + +### 1. For Large PDFs +- Use streaming approaches instead of loading entire PDF in memory +- Use `qpdf --split-pages` for splitting large files +- Process pages individually with pypdfium2 + +### 2. For Text Extraction +- `pdftotext -bbox-layout` is fastest for plain text extraction +- Use pdfplumber for structured data and tables +- Avoid `pypdf.extract_text()` for very large documents + +### 3. For Image Extraction +- `pdfimages` is much faster than rendering pages +- Use low resolution for previews, high resolution for final output + +### 4. For Form Filling +- pdf-lib maintains form structure better than most alternatives +- Pre-validate form fields before processing + +### 5. Memory Management +```python +# Process PDFs in chunks +def process_large_pdf(pdf_path, chunk_size=10): + reader = PdfReader(pdf_path) + total_pages = len(reader.pages) + + for start_idx in range(0, total_pages, chunk_size): + end_idx = min(start_idx + chunk_size, total_pages) + writer = PdfWriter() + + for i in range(start_idx, end_idx): + writer.add_page(reader.pages[i]) + + # Process chunk + with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output: + writer.write(output) +``` + +## Troubleshooting Common Issues + +### Encrypted PDFs +```python +# Handle password-protected PDFs +from pypdf import PdfReader + +try: + reader = PdfReader("encrypted.pdf") + if reader.is_encrypted: + reader.decrypt("password") +except Exception as e: + print(f"Failed to decrypt: {e}") +``` + +### Corrupted PDFs +```bash +# Use qpdf to repair +qpdf --check corrupted.pdf +qpdf --replace-input corrupted.pdf +``` + +### Text Extraction Issues +```python +# Fallback to OCR for scanned PDFs +import pytesseract +from pdf2image import convert_from_path + +def extract_text_with_ocr(pdf_path): + images = convert_from_path(pdf_path) + text = "" + for i, image in enumerate(images): + text += pytesseract.image_to_string(image) + return text +``` + +## License Information + +- **pypdf**: BSD License +- **pdfplumber**: MIT License +- **pypdfium2**: Apache/BSD License +- **reportlab**: BSD License +- **poppler-utils**: GPL-2 License +- **qpdf**: Apache License +- **pdf-lib**: MIT License +- **pdfjs-dist**: Apache License \ No newline at end of file diff --git a/scripts/check_bounding_boxes.py b/scripts/check_bounding_boxes.py new file mode 100644 index 0000000..2cc5e34 --- /dev/null +++ b/scripts/check_bounding_boxes.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass +import json +import sys + + + + +@dataclass +class RectAndField: + rect: list[float] + rect_type: str + field: dict + + +def get_bounding_box_messages(fields_json_stream) -> list[str]: + messages = [] + fields = json.load(fields_json_stream) + messages.append(f"Read {len(fields['form_fields'])} fields") + + def rects_intersect(r1, r2): + disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0] + disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1] + return not (disjoint_horizontal or disjoint_vertical) + + rects_and_fields = [] + for f in fields["form_fields"]: + rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f)) + rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f)) + + has_error = False + for i, ri in enumerate(rects_and_fields): + for j in range(i + 1, len(rects_and_fields)): + rj = rects_and_fields[j] + if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect): + has_error = True + if ri.field is rj.field: + messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})") + else: + messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + if ri.rect_type == "entry": + if "entry_text" in ri.field: + font_size = ri.field["entry_text"].get("font_size", 14) + entry_height = ri.rect[3] - ri.rect[1] + if entry_height < font_size: + has_error = True + messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + + if not has_error: + messages.append("SUCCESS: All bounding boxes are valid") + return messages + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: check_bounding_boxes.py [fields.json]") + sys.exit(1) + with open(sys.argv[1]) as f: + messages = get_bounding_box_messages(f) + for msg in messages: + print(msg) diff --git a/scripts/check_fillable_fields.py b/scripts/check_fillable_fields.py new file mode 100644 index 0000000..36dfb95 --- /dev/null +++ b/scripts/check_fillable_fields.py @@ -0,0 +1,11 @@ +import sys +from pypdf import PdfReader + + + + +reader = PdfReader(sys.argv[1]) +if (reader.get_fields()): + print("This PDF has fillable form fields") +else: + print("This PDF does not have fillable form fields; you will need to visually determine where to enter data") diff --git a/scripts/convert_pdf_to_images.py b/scripts/convert_pdf_to_images.py new file mode 100644 index 0000000..7939cef --- /dev/null +++ b/scripts/convert_pdf_to_images.py @@ -0,0 +1,33 @@ +import os +import sys + +from pdf2image import convert_from_path + + + + +def convert(pdf_path, output_dir, max_dim=1000): + images = convert_from_path(pdf_path, dpi=200) + + for i, image in enumerate(images): + width, height = image.size + if width > max_dim or height > max_dim: + scale_factor = min(max_dim / width, max_dim / height) + new_width = int(width * scale_factor) + new_height = int(height * scale_factor) + image = image.resize((new_width, new_height)) + + image_path = os.path.join(output_dir, f"page_{i+1}.png") + image.save(image_path) + print(f"Saved page {i+1} as {image_path} (size: {image.size})") + + print(f"Converted {len(images)} pages to PNG images") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: convert_pdf_to_images.py [input pdf] [output directory]") + sys.exit(1) + pdf_path = sys.argv[1] + output_directory = sys.argv[2] + convert(pdf_path, output_directory) diff --git a/scripts/create_validation_image.py b/scripts/create_validation_image.py new file mode 100644 index 0000000..10eadd8 --- /dev/null +++ b/scripts/create_validation_image.py @@ -0,0 +1,37 @@ +import json +import sys + +from PIL import Image, ImageDraw + + + + +def create_validation_image(page_number, fields_json_path, input_path, output_path): + with open(fields_json_path, 'r') as f: + data = json.load(f) + + img = Image.open(input_path) + draw = ImageDraw.Draw(img) + num_boxes = 0 + + for field in data["form_fields"]: + if field["page_number"] == page_number: + entry_box = field['entry_bounding_box'] + label_box = field['label_bounding_box'] + draw.rectangle(entry_box, outline='red', width=2) + draw.rectangle(label_box, outline='blue', width=2) + num_boxes += 2 + + img.save(output_path) + print(f"Created validation image at {output_path} with {num_boxes} bounding boxes") + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]") + sys.exit(1) + page_number = int(sys.argv[1]) + fields_json_path = sys.argv[2] + input_image_path = sys.argv[3] + output_image_path = sys.argv[4] + create_validation_image(page_number, fields_json_path, input_image_path, output_image_path) diff --git a/scripts/extract_form_field_info.py b/scripts/extract_form_field_info.py new file mode 100644 index 0000000..64cd470 --- /dev/null +++ b/scripts/extract_form_field_info.py @@ -0,0 +1,122 @@ +import json +import sys + +from pypdf import PdfReader + + + + +def get_full_annotation_field_id(annotation): + components = [] + while annotation: + field_name = annotation.get('/T') + if field_name: + components.append(field_name) + annotation = annotation.get('/Parent') + return ".".join(reversed(components)) if components else None + + +def make_field_dict(field, field_id): + field_dict = {"field_id": field_id} + ft = field.get('/FT') + if ft == "/Tx": + field_dict["type"] = "text" + elif ft == "/Btn": + field_dict["type"] = "checkbox" + states = field.get("/_States_", []) + if len(states) == 2: + if "/Off" in states: + field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1] + field_dict["unchecked_value"] = "/Off" + else: + print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.") + field_dict["checked_value"] = states[0] + field_dict["unchecked_value"] = states[1] + elif ft == "/Ch": + field_dict["type"] = "choice" + states = field.get("/_States_", []) + field_dict["choice_options"] = [{ + "value": state[0], + "text": state[1], + } for state in states] + else: + field_dict["type"] = f"unknown ({ft})" + return field_dict + + +def get_field_info(reader: PdfReader): + fields = reader.get_fields() + + field_info_by_id = {} + possible_radio_names = set() + + for field_id, field in fields.items(): + if field.get("/Kids"): + if field.get("/FT") == "/Btn": + possible_radio_names.add(field_id) + continue + field_info_by_id[field_id] = make_field_dict(field, field_id) + + + radio_fields_by_id = {} + + for page_index, page in enumerate(reader.pages): + annotations = page.get('/Annots', []) + for ann in annotations: + field_id = get_full_annotation_field_id(ann) + if field_id in field_info_by_id: + field_info_by_id[field_id]["page"] = page_index + 1 + field_info_by_id[field_id]["rect"] = ann.get('/Rect') + elif field_id in possible_radio_names: + try: + on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] + except KeyError: + continue + if len(on_values) == 1: + rect = ann.get("/Rect") + if field_id not in radio_fields_by_id: + radio_fields_by_id[field_id] = { + "field_id": field_id, + "type": "radio_group", + "page": page_index + 1, + "radio_options": [], + } + radio_fields_by_id[field_id]["radio_options"].append({ + "value": on_values[0], + "rect": rect, + }) + + fields_with_location = [] + for field_info in field_info_by_id.values(): + if "page" in field_info: + fields_with_location.append(field_info) + else: + print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring") + + def sort_key(f): + if "radio_options" in f: + rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0] + else: + rect = f.get("rect") or [0, 0, 0, 0] + adjusted_position = [-rect[1], rect[0]] + return [f.get("page"), adjusted_position] + + sorted_fields = fields_with_location + list(radio_fields_by_id.values()) + sorted_fields.sort(key=sort_key) + + return sorted_fields + + +def write_field_info(pdf_path: str, json_output_path: str): + reader = PdfReader(pdf_path) + field_info = get_field_info(reader) + with open(json_output_path, "w") as f: + json.dump(field_info, f, indent=2) + print(f"Wrote {len(field_info)} fields to {json_output_path}") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: extract_form_field_info.py [input pdf] [output json]") + sys.exit(1) + write_field_info(sys.argv[1], sys.argv[2]) diff --git a/scripts/extract_form_structure.py b/scripts/extract_form_structure.py new file mode 100644 index 0000000..f219e7d --- /dev/null +++ b/scripts/extract_form_structure.py @@ -0,0 +1,115 @@ +""" +Extract form structure from a non-fillable PDF. + +This script analyzes the PDF to find: +- Text labels with their exact coordinates +- Horizontal lines (row boundaries) +- Checkboxes (small rectangles) + +Output: A JSON file with the form structure that can be used to generate +accurate field coordinates for filling. + +Usage: python extract_form_structure.py +""" + +import json +import sys +import pdfplumber + + +def extract_form_structure(pdf_path): + structure = { + "pages": [], + "labels": [], + "lines": [], + "checkboxes": [], + "row_boundaries": [] + } + + with pdfplumber.open(pdf_path) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + structure["pages"].append({ + "page_number": page_num, + "width": float(page.width), + "height": float(page.height) + }) + + words = page.extract_words() + for word in words: + structure["labels"].append({ + "page": page_num, + "text": word["text"], + "x0": round(float(word["x0"]), 1), + "top": round(float(word["top"]), 1), + "x1": round(float(word["x1"]), 1), + "bottom": round(float(word["bottom"]), 1) + }) + + for line in page.lines: + if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5: + structure["lines"].append({ + "page": page_num, + "y": round(float(line["top"]), 1), + "x0": round(float(line["x0"]), 1), + "x1": round(float(line["x1"]), 1) + }) + + for rect in page.rects: + width = float(rect["x1"]) - float(rect["x0"]) + height = float(rect["bottom"]) - float(rect["top"]) + if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2: + structure["checkboxes"].append({ + "page": page_num, + "x0": round(float(rect["x0"]), 1), + "top": round(float(rect["top"]), 1), + "x1": round(float(rect["x1"]), 1), + "bottom": round(float(rect["bottom"]), 1), + "center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1), + "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1) + }) + + lines_by_page = {} + for line in structure["lines"]: + page = line["page"] + if page not in lines_by_page: + lines_by_page[page] = [] + lines_by_page[page].append(line["y"]) + + for page, y_coords in lines_by_page.items(): + y_coords = sorted(set(y_coords)) + for i in range(len(y_coords) - 1): + structure["row_boundaries"].append({ + "page": page, + "row_top": y_coords[i], + "row_bottom": y_coords[i + 1], + "row_height": round(y_coords[i + 1] - y_coords[i], 1) + }) + + return structure + + +def main(): + if len(sys.argv) != 3: + print("Usage: extract_form_structure.py ") + sys.exit(1) + + pdf_path = sys.argv[1] + output_path = sys.argv[2] + + print(f"Extracting structure from {pdf_path}...") + structure = extract_form_structure(pdf_path) + + with open(output_path, "w") as f: + json.dump(structure, f, indent=2) + + print(f"Found:") + print(f" - {len(structure['pages'])} pages") + print(f" - {len(structure['labels'])} text labels") + print(f" - {len(structure['lines'])} horizontal lines") + print(f" - {len(structure['checkboxes'])} checkboxes") + print(f" - {len(structure['row_boundaries'])} row boundaries") + print(f"Saved to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/fill_fillable_fields.py b/scripts/fill_fillable_fields.py new file mode 100644 index 0000000..51c2600 --- /dev/null +++ b/scripts/fill_fillable_fields.py @@ -0,0 +1,98 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter + +from extract_form_field_info import get_field_info + + + + +def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str): + with open(fields_json_path) as f: + fields = json.load(f) + fields_by_page = {} + for field in fields: + if "value" in field: + field_id = field["field_id"] + page = field["page"] + if page not in fields_by_page: + fields_by_page[page] = {} + fields_by_page[page][field_id] = field["value"] + + reader = PdfReader(input_pdf_path) + + has_error = False + field_info = get_field_info(reader) + fields_by_ids = {f["field_id"]: f for f in field_info} + for field in fields: + existing_field = fields_by_ids.get(field["field_id"]) + if not existing_field: + has_error = True + print(f"ERROR: `{field['field_id']}` is not a valid field ID") + elif field["page"] != existing_field["page"]: + has_error = True + print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})") + else: + if "value" in field: + err = validation_error_for_field_value(existing_field, field["value"]) + if err: + print(err) + has_error = True + if has_error: + sys.exit(1) + + writer = PdfWriter(clone_from=reader) + for page, field_values in fields_by_page.items(): + writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False) + + writer.set_need_appearances_writer(True) + + with open(output_pdf_path, "wb") as f: + writer.write(f) + + +def validation_error_for_field_value(field_info, field_value): + field_type = field_info["type"] + field_id = field_info["field_id"] + if field_type == "checkbox": + checked_val = field_info["checked_value"] + unchecked_val = field_info["unchecked_value"] + if field_value != checked_val and field_value != unchecked_val: + return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"' + elif field_type == "radio_group": + option_values = [opt["value"] for opt in field_info["radio_options"]] + if field_value not in option_values: + return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}' + elif field_type == "choice": + choice_values = [opt["value"] for opt in field_info["choice_options"]] + if field_value not in choice_values: + return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}' + return None + + +def monkeypatch_pydpf_method(): + from pypdf.generic import DictionaryObject + from pypdf.constants import FieldDictionaryAttributes + + original_get_inherited = DictionaryObject.get_inherited + + def patched_get_inherited(self, key: str, default = None): + result = original_get_inherited(self, key, default) + if key == FieldDictionaryAttributes.Opt: + if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result): + result = [r[0] for r in result] + return result + + DictionaryObject.get_inherited = patched_get_inherited + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]") + sys.exit(1) + monkeypatch_pydpf_method() + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + fill_pdf_fields(input_pdf, fields_json, output_pdf) diff --git a/scripts/fill_pdf_form_with_annotations.py b/scripts/fill_pdf_form_with_annotations.py new file mode 100644 index 0000000..b430069 --- /dev/null +++ b/scripts/fill_pdf_form_with_annotations.py @@ -0,0 +1,107 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter +from pypdf.annotations import FreeText + + + + +def transform_from_image_coords(bbox, image_width, image_height, pdf_width, pdf_height): + x_scale = pdf_width / image_width + y_scale = pdf_height / image_height + + left = bbox[0] * x_scale + right = bbox[2] * x_scale + + top = pdf_height - (bbox[1] * y_scale) + bottom = pdf_height - (bbox[3] * y_scale) + + return left, bottom, right, top + + +def transform_from_pdf_coords(bbox, pdf_height): + left = bbox[0] + right = bbox[2] + + pypdf_top = pdf_height - bbox[1] + pypdf_bottom = pdf_height - bbox[3] + + return left, pypdf_bottom, right, pypdf_top + + +def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path): + + with open(fields_json_path, "r") as f: + fields_data = json.load(f) + + reader = PdfReader(input_pdf_path) + writer = PdfWriter() + + writer.append(reader) + + pdf_dimensions = {} + for i, page in enumerate(reader.pages): + mediabox = page.mediabox + pdf_dimensions[i + 1] = [mediabox.width, mediabox.height] + + annotations = [] + for field in fields_data["form_fields"]: + page_num = field["page_number"] + + page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num) + pdf_width, pdf_height = pdf_dimensions[page_num] + + if "pdf_width" in page_info: + transformed_entry_box = transform_from_pdf_coords( + field["entry_bounding_box"], + float(pdf_height) + ) + else: + image_width = page_info["image_width"] + image_height = page_info["image_height"] + transformed_entry_box = transform_from_image_coords( + field["entry_bounding_box"], + image_width, image_height, + float(pdf_width), float(pdf_height) + ) + + if "entry_text" not in field or "text" not in field["entry_text"]: + continue + entry_text = field["entry_text"] + text = entry_text["text"] + if not text: + continue + + font_name = entry_text.get("font", "Arial") + font_size = str(entry_text.get("font_size", 14)) + "pt" + font_color = entry_text.get("font_color", "000000") + + annotation = FreeText( + text=text, + rect=transformed_entry_box, + font=font_name, + font_size=font_size, + font_color=font_color, + border_color=None, + background_color=None, + ) + annotations.append(annotation) + writer.add_annotation(page_number=page_num - 1, annotation=annotation) + + with open(output_pdf_path, "wb") as output: + writer.write(output) + + print(f"Successfully filled PDF form and saved to {output_pdf_path}") + print(f"Added {len(annotations)} text annotations") + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]") + sys.exit(1) + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + + fill_pdf_form(input_pdf, fields_json, output_pdf)