commit 9342c3042188b580b86044e28bf3b0d633a04e39 Author: zlei9 Date: Sun Mar 22 11:47:08 2026 +0800 Initial commit: pptx skill diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..df5000e --- /dev/null +++ b/SKILL.md @@ -0,0 +1,232 @@ +--- +name: pptx +description: "Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions \"deck,\" \"slides,\" \"presentation,\" or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill." +license: Proprietary. LICENSE.txt has complete terms +--- + +# PPTX Skill + +## Quick Reference + +| Task | Guide | +|------|-------| +| Read/analyze content | `python -m markitdown presentation.pptx` | +| Edit or create from template | Read [editing.md](editing.md) | +| Create from scratch | Read [pptxgenjs.md](pptxgenjs.md) | + +--- + +## Reading Content + +```bash +# Text extraction +python -m markitdown presentation.pptx + +# Visual overview +python scripts/thumbnail.py presentation.pptx + +# Raw XML +python scripts/office/unpack.py presentation.pptx unpacked/ +``` + +--- + +## Editing Workflow + +**Read [editing.md](editing.md) for full details.** + +1. Analyze template with `thumbnail.py` +2. Unpack → manipulate slides → edit content → clean → pack + +--- + +## Creating from Scratch + +**Read [pptxgenjs.md](pptxgenjs.md) for full details.** + +Use when no template or reference presentation is available. + +--- + +## Design Ideas + +**Don't create boring slides.** Plain bullets on a white background won't impress anyone. Consider ideas from this list for each slide. + +### Before Starting + +- **Pick a bold, content-informed color palette**: The palette should feel designed for THIS topic. If swapping your colors into a completely different presentation would still "work," you haven't made specific enough choices. +- **Dominance over equality**: One color should dominate (60-70% visual weight), with 1-2 supporting tones and one sharp accent. Never give all colors equal weight. +- **Dark/light contrast**: Dark backgrounds for title + conclusion slides, light for content ("sandwich" structure). Or commit to dark throughout for a premium feel. +- **Commit to a visual motif**: Pick ONE distinctive element and repeat it — rounded image frames, icons in colored circles, thick single-side borders. Carry it across every slide. + +### Color Palettes + +Choose colors that match your topic — don't default to generic blue. Use these palettes as inspiration: + +| Theme | Primary | Secondary | Accent | +|-------|---------|-----------|--------| +| **Midnight Executive** | `1E2761` (navy) | `CADCFC` (ice blue) | `FFFFFF` (white) | +| **Forest & Moss** | `2C5F2D` (forest) | `97BC62` (moss) | `F5F5F5` (cream) | +| **Coral Energy** | `F96167` (coral) | `F9E795` (gold) | `2F3C7E` (navy) | +| **Warm Terracotta** | `B85042` (terracotta) | `E7E8D1` (sand) | `A7BEAE` (sage) | +| **Ocean Gradient** | `065A82` (deep blue) | `1C7293` (teal) | `21295C` (midnight) | +| **Charcoal Minimal** | `36454F` (charcoal) | `F2F2F2` (off-white) | `212121` (black) | +| **Teal Trust** | `028090` (teal) | `00A896` (seafoam) | `02C39A` (mint) | +| **Berry & Cream** | `6D2E46` (berry) | `A26769` (dusty rose) | `ECE2D0` (cream) | +| **Sage Calm** | `84B59F` (sage) | `69A297` (eucalyptus) | `50808E` (slate) | +| **Cherry Bold** | `990011` (cherry) | `FCF6F5` (off-white) | `2F3C7E` (navy) | + +### For Each Slide + +**Every slide needs a visual element** — image, chart, icon, or shape. Text-only slides are forgettable. + +**Layout options:** +- Two-column (text left, illustration on right) +- Icon + text rows (icon in colored circle, bold header, description below) +- 2x2 or 2x3 grid (image on one side, grid of content blocks on other) +- Half-bleed image (full left or right side) with content overlay + +**Data display:** +- Large stat callouts (big numbers 60-72pt with small labels below) +- Comparison columns (before/after, pros/cons, side-by-side options) +- Timeline or process flow (numbered steps, arrows) + +**Visual polish:** +- Icons in small colored circles next to section headers +- Italic accent text for key stats or taglines + +### Typography + +**Choose an interesting font pairing** — don't default to Arial. Pick a header font with personality and pair it with a clean body font. + +| Header Font | Body Font | +|-------------|-----------| +| Georgia | Calibri | +| Arial Black | Arial | +| Calibri | Calibri Light | +| Cambria | Calibri | +| Trebuchet MS | Calibri | +| Impact | Arial | +| Palatino | Garamond | +| Consolas | Calibri | + +| Element | Size | +|---------|------| +| Slide title | 36-44pt bold | +| Section header | 20-24pt bold | +| Body text | 14-16pt | +| Captions | 10-12pt muted | + +### Spacing + +- 0.5" minimum margins +- 0.3-0.5" between content blocks +- Leave breathing room—don't fill every inch + +### Avoid (Common Mistakes) + +- **Don't repeat the same layout** — vary columns, cards, and callouts across slides +- **Don't center body text** — left-align paragraphs and lists; center only titles +- **Don't skimp on size contrast** — titles need 36pt+ to stand out from 14-16pt body +- **Don't default to blue** — pick colors that reflect the specific topic +- **Don't mix spacing randomly** — choose 0.3" or 0.5" gaps and use consistently +- **Don't style one slide and leave the rest plain** — commit fully or keep it simple throughout +- **Don't create text-only slides** — add images, icons, charts, or visual elements; avoid plain title + bullets +- **Don't forget text box padding** — when aligning lines or shapes with text edges, set `margin: 0` on the text box or offset the shape to account for padding +- **Don't use low-contrast elements** — icons AND text need strong contrast against the background; avoid light text on light backgrounds or dark text on dark backgrounds +- **NEVER use accent lines under titles** — these are a hallmark of AI-generated slides; use whitespace or background color instead + +--- + +## QA (Required) + +**Assume there are problems. Your job is to find them.** + +Your first render is almost never correct. Approach QA as a bug hunt, not a confirmation step. If you found zero issues on first inspection, you weren't looking hard enough. + +### Content QA + +```bash +python -m markitdown output.pptx +``` + +Check for missing content, typos, wrong order. + +**When using templates, check for leftover placeholder text:** + +```bash +python -m markitdown output.pptx | grep -iE "xxxx|lorem|ipsum|this.*(page|slide).*layout" +``` + +If grep returns results, fix them before declaring success. + +### Visual QA + +**⚠️ USE SUBAGENTS** — even for 2-3 slides. You've been staring at the code and will see what you expect, not what's there. Subagents have fresh eyes. + +Convert slides to images (see [Converting to Images](#converting-to-images)), then use this prompt: + +``` +Visually inspect these slides. Assume there are issues — find them. + +Look for: +- Overlapping elements (text through shapes, lines through words, stacked elements) +- Text overflow or cut off at edges/box boundaries +- Decorative lines positioned for single-line text but title wrapped to two lines +- Source citations or footers colliding with content above +- Elements too close (< 0.3" gaps) or cards/sections nearly touching +- Uneven gaps (large empty area in one place, cramped in another) +- Insufficient margin from slide edges (< 0.5") +- Columns or similar elements not aligned consistently +- Low-contrast text (e.g., light gray text on cream-colored background) +- Low-contrast icons (e.g., dark icons on dark backgrounds without a contrasting circle) +- Text boxes too narrow causing excessive wrapping +- Leftover placeholder content + +For each slide, list issues or areas of concern, even if minor. + +Read and analyze these images: +1. /path/to/slide-01.jpg (Expected: [brief description]) +2. /path/to/slide-02.jpg (Expected: [brief description]) + +Report ALL issues found, including minor ones. +``` + +### Verification Loop + +1. Generate slides → Convert to images → Inspect +2. **List issues found** (if none found, look again more critically) +3. Fix issues +4. **Re-verify affected slides** — one fix often creates another problem +5. Repeat until a full pass reveals no new issues + +**Do not declare success until you've completed at least one fix-and-verify cycle.** + +--- + +## Converting to Images + +Convert presentations to individual slide images for visual inspection: + +```bash +python scripts/office/soffice.py --headless --convert-to pdf output.pptx +pdftoppm -jpeg -r 150 output.pdf slide +``` + +This creates `slide-01.jpg`, `slide-02.jpg`, etc. + +To re-render specific slides after fixes: + +```bash +pdftoppm -jpeg -r 150 -f N -l N output.pdf slide-fixed +``` + +--- + +## Dependencies + +- `pip install "markitdown[pptx]"` - text extraction +- `pip install Pillow` - thumbnail grids +- `npm install -g pptxgenjs` - creating from scratch +- LibreOffice (`soffice`) - PDF conversion (auto-configured for sandboxed environments via `scripts/office/soffice.py`) +- Poppler (`pdftoppm`) - PDF to images diff --git a/editing.md b/editing.md new file mode 100644 index 0000000..f873e8a --- /dev/null +++ b/editing.md @@ -0,0 +1,205 @@ +# Editing Presentations + +## Template-Based Workflow + +When using an existing presentation as a template: + +1. **Analyze existing slides**: + ```bash + python scripts/thumbnail.py template.pptx + python -m markitdown template.pptx + ``` + Review `thumbnails.jpg` to see layouts, and markitdown output to see placeholder text. + +2. **Plan slide mapping**: For each content section, choose a template slide. + + ⚠️ **USE VARIED LAYOUTS** — monotonous presentations are a common failure mode. Don't default to basic title + bullet slides. Actively seek out: + - Multi-column layouts (2-column, 3-column) + - Image + text combinations + - Full-bleed images with text overlay + - Quote or callout slides + - Section dividers + - Stat/number callouts + - Icon grids or icon + text rows + + **Avoid:** Repeating the same text-heavy layout for every slide. + + Match content type to layout style (e.g., key points → bullet slide, team info → multi-column, testimonials → quote slide). + +3. **Unpack**: `python scripts/office/unpack.py template.pptx unpacked/` + +4. **Build presentation** (do this yourself, not with subagents): + - Delete unwanted slides (remove from ``) + - Duplicate slides you want to reuse (`add_slide.py`) + - Reorder slides in `` + - **Complete all structural changes before step 5** + +5. **Edit content**: Update text in each `slide{N}.xml`. + **Use subagents here if available** — slides are separate XML files, so subagents can edit in parallel. + +6. **Clean**: `python scripts/clean.py unpacked/` + +7. **Pack**: `python scripts/office/pack.py unpacked/ output.pptx --original template.pptx` + +--- + +## Scripts + +| Script | Purpose | +|--------|---------| +| `unpack.py` | Extract and pretty-print PPTX | +| `add_slide.py` | Duplicate slide or create from layout | +| `clean.py` | Remove orphaned files | +| `pack.py` | Repack with validation | +| `thumbnail.py` | Create visual grid of slides | + +### unpack.py + +```bash +python scripts/office/unpack.py input.pptx unpacked/ +``` + +Extracts PPTX, pretty-prints XML, escapes smart quotes. + +### add_slide.py + +```bash +python scripts/add_slide.py unpacked/ slide2.xml # Duplicate slide +python scripts/add_slide.py unpacked/ slideLayout2.xml # From layout +``` + +Prints `` to add to `` at desired position. + +### clean.py + +```bash +python scripts/clean.py unpacked/ +``` + +Removes slides not in ``, unreferenced media, orphaned rels. + +### pack.py + +```bash +python scripts/office/pack.py unpacked/ output.pptx --original input.pptx +``` + +Validates, repairs, condenses XML, re-encodes smart quotes. + +### thumbnail.py + +```bash +python scripts/thumbnail.py input.pptx [output_prefix] [--cols N] +``` + +Creates `thumbnails.jpg` with slide filenames as labels. Default 3 columns, max 12 per grid. + +**Use for template analysis only** (choosing layouts). For visual QA, use `soffice` + `pdftoppm` to create full-resolution individual slide images—see SKILL.md. + +--- + +## Slide Operations + +Slide order is in `ppt/presentation.xml` → ``. + +**Reorder**: Rearrange `` elements. + +**Delete**: Remove ``, then run `clean.py`. + +**Add**: Use `add_slide.py`. Never manually copy slide files—the script handles notes references, Content_Types.xml, and relationship IDs that manual copying misses. + +--- + +## Editing Content + +**Subagents:** If available, use them here (after completing step 4). Each slide is a separate XML file, so subagents can edit in parallel. In your prompt to subagents, include: +- The slide file path(s) to edit +- **"Use the Edit tool for all changes"** +- The formatting rules and common pitfalls below + +For each slide: +1. Read the slide's XML +2. Identify ALL placeholder content—text, images, charts, icons, captions +3. Replace each placeholder with final content + +**Use the Edit tool, not sed or Python scripts.** The Edit tool forces specificity about what to replace and where, yielding better reliability. + +### Formatting Rules + +- **Bold all headers, subheadings, and inline labels**: Use `b="1"` on ``. This includes: + - Slide titles + - Section headers within a slide + - Inline labels like (e.g.: "Status:", "Description:") at the start of a line +- **Never use unicode bullets (•)**: Use proper list formatting with `` or `` +- **Bullet consistency**: Let bullets inherit from the layout. Only specify `` or ``. + +--- + +## Common Pitfalls + +### Template Adaptation + +When source content has fewer items than the template: +- **Remove excess elements entirely** (images, shapes, text boxes), don't just clear text +- Check for orphaned visuals after clearing text content +- Run visual QA to catch mismatched counts + +When replacing text with different length content: +- **Shorter replacements**: Usually safe +- **Longer replacements**: May overflow or wrap unexpectedly +- Test with visual QA after text changes +- Consider truncating or splitting content to fit the template's design constraints + +**Template slots ≠ Source items**: If template has 4 team members but source has 3 users, delete the 4th member's entire group (image + text boxes), not just the text. + +### Multi-Item Content + +If source has multiple items (numbered lists, multiple sections), create separate `` elements for each — **never concatenate into one string**. + +**❌ WRONG** — all items in one paragraph: +```xml + + Step 1: Do the first thing. Step 2: Do the second thing. + +``` + +**✅ CORRECT** — separate paragraphs with bold headers: +```xml + + + Step 1 + + + + Do the first thing. + + + + Step 2 + + +``` + +Copy `` from the original paragraph to preserve line spacing. Use `b="1"` on headers. + +### Smart Quotes + +Handled automatically by unpack/pack. But the Edit tool converts smart quotes to ASCII. + +**When adding new text with quotes, use XML entities:** + +```xml +the “Agreement” +``` + +| Character | Name | Unicode | XML Entity | +|-----------|------|---------|------------| +| `“` | Left double quote | U+201C | `“` | +| `”` | Right double quote | U+201D | `”` | +| `‘` | Left single quote | U+2018 | `‘` | +| `’` | Right single quote | U+2019 | `’` | + +### Other + +- **Whitespace**: Use `xml:space="preserve"` on `` with leading/trailing spaces +- **XML parsing**: Use `defusedxml.minidom`, not `xml.etree.ElementTree` (corrupts namespaces) diff --git a/pptxgenjs.md b/pptxgenjs.md new file mode 100644 index 0000000..6bfed90 --- /dev/null +++ b/pptxgenjs.md @@ -0,0 +1,420 @@ +# PptxGenJS Tutorial + +## Setup & Basic Structure + +```javascript +const pptxgen = require("pptxgenjs"); + +let pres = new pptxgen(); +pres.layout = 'LAYOUT_16x9'; // or 'LAYOUT_16x10', 'LAYOUT_4x3', 'LAYOUT_WIDE' +pres.author = 'Your Name'; +pres.title = 'Presentation Title'; + +let slide = pres.addSlide(); +slide.addText("Hello World!", { x: 0.5, y: 0.5, fontSize: 36, color: "363636" }); + +pres.writeFile({ fileName: "Presentation.pptx" }); +``` + +## Layout Dimensions + +Slide dimensions (coordinates in inches): +- `LAYOUT_16x9`: 10" × 5.625" (default) +- `LAYOUT_16x10`: 10" × 6.25" +- `LAYOUT_4x3`: 10" × 7.5" +- `LAYOUT_WIDE`: 13.3" × 7.5" + +--- + +## Text & Formatting + +```javascript +// Basic text +slide.addText("Simple Text", { + x: 1, y: 1, w: 8, h: 2, fontSize: 24, fontFace: "Arial", + color: "363636", bold: true, align: "center", valign: "middle" +}); + +// Character spacing (use charSpacing, not letterSpacing which is silently ignored) +slide.addText("SPACED TEXT", { x: 1, y: 1, w: 8, h: 1, charSpacing: 6 }); + +// Rich text arrays +slide.addText([ + { text: "Bold ", options: { bold: true } }, + { text: "Italic ", options: { italic: true } } +], { x: 1, y: 3, w: 8, h: 1 }); + +// Multi-line text (requires breakLine: true) +slide.addText([ + { text: "Line 1", options: { breakLine: true } }, + { text: "Line 2", options: { breakLine: true } }, + { text: "Line 3" } // Last item doesn't need breakLine +], { x: 0.5, y: 0.5, w: 8, h: 2 }); + +// Text box margin (internal padding) +slide.addText("Title", { + x: 0.5, y: 0.3, w: 9, h: 0.6, + margin: 0 // Use 0 when aligning text with other elements like shapes or icons +}); +``` + +**Tip:** Text boxes have internal margin by default. Set `margin: 0` when you need text to align precisely with shapes, lines, or icons at the same x-position. + +--- + +## Lists & Bullets + +```javascript +// ✅ CORRECT: Multiple bullets +slide.addText([ + { text: "First item", options: { bullet: true, breakLine: true } }, + { text: "Second item", options: { bullet: true, breakLine: true } }, + { text: "Third item", options: { bullet: true } } +], { x: 0.5, y: 0.5, w: 8, h: 3 }); + +// ❌ WRONG: Never use unicode bullets +slide.addText("• First item", { ... }); // Creates double bullets + +// Sub-items and numbered lists +{ text: "Sub-item", options: { bullet: true, indentLevel: 1 } } +{ text: "First", options: { bullet: { type: "number" }, breakLine: true } } +``` + +--- + +## Shapes + +```javascript +slide.addShape(pres.shapes.RECTANGLE, { + x: 0.5, y: 0.8, w: 1.5, h: 3.0, + fill: { color: "FF0000" }, line: { color: "000000", width: 2 } +}); + +slide.addShape(pres.shapes.OVAL, { x: 4, y: 1, w: 2, h: 2, fill: { color: "0000FF" } }); + +slide.addShape(pres.shapes.LINE, { + x: 1, y: 3, w: 5, h: 0, line: { color: "FF0000", width: 3, dashType: "dash" } +}); + +// With transparency +slide.addShape(pres.shapes.RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "0088CC", transparency: 50 } +}); + +// Rounded rectangle (rectRadius only works with ROUNDED_RECTANGLE, not RECTANGLE) +// ⚠️ Don't pair with rectangular accent overlays — they won't cover rounded corners. Use RECTANGLE instead. +slide.addShape(pres.shapes.ROUNDED_RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "FFFFFF" }, rectRadius: 0.1 +}); + +// With shadow +slide.addShape(pres.shapes.RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "FFFFFF" }, + shadow: { type: "outer", color: "000000", blur: 6, offset: 2, angle: 135, opacity: 0.15 } +}); +``` + +Shadow options: + +| Property | Type | Range | Notes | +|----------|------|-------|-------| +| `type` | string | `"outer"`, `"inner"` | | +| `color` | string | 6-char hex (e.g. `"000000"`) | No `#` prefix, no 8-char hex — see Common Pitfalls | +| `blur` | number | 0-100 pt | | +| `offset` | number | 0-200 pt | **Must be non-negative** — negative values corrupt the file | +| `angle` | number | 0-359 degrees | Direction the shadow falls (135 = bottom-right, 270 = upward) | +| `opacity` | number | 0.0-1.0 | Use this for transparency, never encode in color string | + +To cast a shadow upward (e.g. on a footer bar), use `angle: 270` with a positive offset — do **not** use a negative offset. + +**Note**: Gradient fills are not natively supported. Use a gradient image as a background instead. + +--- + +## Images + +### Image Sources + +```javascript +// From file path +slide.addImage({ path: "images/chart.png", x: 1, y: 1, w: 5, h: 3 }); + +// From URL +slide.addImage({ path: "https://example.com/image.jpg", x: 1, y: 1, w: 5, h: 3 }); + +// From base64 (faster, no file I/O) +slide.addImage({ data: "image/png;base64,iVBORw0KGgo...", x: 1, y: 1, w: 5, h: 3 }); +``` + +### Image Options + +```javascript +slide.addImage({ + path: "image.png", + x: 1, y: 1, w: 5, h: 3, + rotate: 45, // 0-359 degrees + rounding: true, // Circular crop + transparency: 50, // 0-100 + flipH: true, // Horizontal flip + flipV: false, // Vertical flip + altText: "Description", // Accessibility + hyperlink: { url: "https://example.com" } +}); +``` + +### Image Sizing Modes + +```javascript +// Contain - fit inside, preserve ratio +{ sizing: { type: 'contain', w: 4, h: 3 } } + +// Cover - fill area, preserve ratio (may crop) +{ sizing: { type: 'cover', w: 4, h: 3 } } + +// Crop - cut specific portion +{ sizing: { type: 'crop', x: 0.5, y: 0.5, w: 2, h: 2 } } +``` + +### Calculate Dimensions (preserve aspect ratio) + +```javascript +const origWidth = 1978, origHeight = 923, maxHeight = 3.0; +const calcWidth = maxHeight * (origWidth / origHeight); +const centerX = (10 - calcWidth) / 2; + +slide.addImage({ path: "image.png", x: centerX, y: 1.2, w: calcWidth, h: maxHeight }); +``` + +### Supported Formats + +- **Standard**: PNG, JPG, GIF (animated GIFs work in Microsoft 365) +- **SVG**: Works in modern PowerPoint/Microsoft 365 + +--- + +## Icons + +Use react-icons to generate SVG icons, then rasterize to PNG for universal compatibility. + +### Setup + +```javascript +const React = require("react"); +const ReactDOMServer = require("react-dom/server"); +const sharp = require("sharp"); +const { FaCheckCircle, FaChartLine } = require("react-icons/fa"); + +function renderIconSvg(IconComponent, color = "#000000", size = 256) { + return ReactDOMServer.renderToStaticMarkup( + React.createElement(IconComponent, { color, size: String(size) }) + ); +} + +async function iconToBase64Png(IconComponent, color, size = 256) { + const svg = renderIconSvg(IconComponent, color, size); + const pngBuffer = await sharp(Buffer.from(svg)).png().toBuffer(); + return "image/png;base64," + pngBuffer.toString("base64"); +} +``` + +### Add Icon to Slide + +```javascript +const iconData = await iconToBase64Png(FaCheckCircle, "#4472C4", 256); + +slide.addImage({ + data: iconData, + x: 1, y: 1, w: 0.5, h: 0.5 // Size in inches +}); +``` + +**Note**: Use size 256 or higher for crisp icons. The size parameter controls the rasterization resolution, not the display size on the slide (which is set by `w` and `h` in inches). + +### Icon Libraries + +Install: `npm install -g react-icons react react-dom sharp` + +Popular icon sets in react-icons: +- `react-icons/fa` - Font Awesome +- `react-icons/md` - Material Design +- `react-icons/hi` - Heroicons +- `react-icons/bi` - Bootstrap Icons + +--- + +## Slide Backgrounds + +```javascript +// Solid color +slide.background = { color: "F1F1F1" }; + +// Color with transparency +slide.background = { color: "FF3399", transparency: 50 }; + +// Image from URL +slide.background = { path: "https://example.com/bg.jpg" }; + +// Image from base64 +slide.background = { data: "image/png;base64,iVBORw0KGgo..." }; +``` + +--- + +## Tables + +```javascript +slide.addTable([ + ["Header 1", "Header 2"], + ["Cell 1", "Cell 2"] +], { + x: 1, y: 1, w: 8, h: 2, + border: { pt: 1, color: "999999" }, fill: { color: "F1F1F1" } +}); + +// Advanced with merged cells +let tableData = [ + [{ text: "Header", options: { fill: { color: "6699CC" }, color: "FFFFFF", bold: true } }, "Cell"], + [{ text: "Merged", options: { colspan: 2 } }] +]; +slide.addTable(tableData, { x: 1, y: 3.5, w: 8, colW: [4, 4] }); +``` + +--- + +## Charts + +```javascript +// Bar chart +slide.addChart(pres.charts.BAR, [{ + name: "Sales", labels: ["Q1", "Q2", "Q3", "Q4"], values: [4500, 5500, 6200, 7100] +}], { + x: 0.5, y: 0.6, w: 6, h: 3, barDir: 'col', + showTitle: true, title: 'Quarterly Sales' +}); + +// Line chart +slide.addChart(pres.charts.LINE, [{ + name: "Temp", labels: ["Jan", "Feb", "Mar"], values: [32, 35, 42] +}], { x: 0.5, y: 4, w: 6, h: 3, lineSize: 3, lineSmooth: true }); + +// Pie chart +slide.addChart(pres.charts.PIE, [{ + name: "Share", labels: ["A", "B", "Other"], values: [35, 45, 20] +}], { x: 7, y: 1, w: 5, h: 4, showPercent: true }); +``` + +### Better-Looking Charts + +Default charts look dated. Apply these options for a modern, clean appearance: + +```javascript +slide.addChart(pres.charts.BAR, chartData, { + x: 0.5, y: 1, w: 9, h: 4, barDir: "col", + + // Custom colors (match your presentation palette) + chartColors: ["0D9488", "14B8A6", "5EEAD4"], + + // Clean background + chartArea: { fill: { color: "FFFFFF" }, roundedCorners: true }, + + // Muted axis labels + catAxisLabelColor: "64748B", + valAxisLabelColor: "64748B", + + // Subtle grid (value axis only) + valGridLine: { color: "E2E8F0", size: 0.5 }, + catGridLine: { style: "none" }, + + // Data labels on bars + showValue: true, + dataLabelPosition: "outEnd", + dataLabelColor: "1E293B", + + // Hide legend for single series + showLegend: false, +}); +``` + +**Key styling options:** +- `chartColors: [...]` - hex colors for series/segments +- `chartArea: { fill, border, roundedCorners }` - chart background +- `catGridLine/valGridLine: { color, style, size }` - grid lines (`style: "none"` to hide) +- `lineSmooth: true` - curved lines (line charts) +- `legendPos: "r"` - legend position: "b", "t", "l", "r", "tr" + +--- + +## Slide Masters + +```javascript +pres.defineSlideMaster({ + title: 'TITLE_SLIDE', background: { color: '283A5E' }, + objects: [{ + placeholder: { options: { name: 'title', type: 'title', x: 1, y: 2, w: 8, h: 2 } } + }] +}); + +let titleSlide = pres.addSlide({ masterName: "TITLE_SLIDE" }); +titleSlide.addText("My Title", { placeholder: "title" }); +``` + +--- + +## Common Pitfalls + +⚠️ These issues cause file corruption, visual bugs, or broken output. Avoid them. + +1. **NEVER use "#" with hex colors** - causes file corruption + ```javascript + color: "FF0000" // ✅ CORRECT + color: "#FF0000" // ❌ WRONG + ``` + +2. **NEVER encode opacity in hex color strings** - 8-char colors (e.g., `"00000020"`) corrupt the file. Use the `opacity` property instead. + ```javascript + shadow: { type: "outer", blur: 6, offset: 2, color: "00000020" } // ❌ CORRUPTS FILE + shadow: { type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.12 } // ✅ CORRECT + ``` + +3. **Use `bullet: true`** - NEVER unicode symbols like "•" (creates double bullets) + +4. **Use `breakLine: true`** between array items or text runs together + +5. **Avoid `lineSpacing` with bullets** - causes excessive gaps; use `paraSpaceAfter` instead + +6. **Each presentation needs fresh instance** - don't reuse `pptxgen()` objects + +7. **NEVER reuse option objects across calls** - PptxGenJS mutates objects in-place (e.g. converting shadow values to EMU). Sharing one object between multiple calls corrupts the second shape. + ```javascript + const shadow = { type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.15 }; + slide.addShape(pres.shapes.RECTANGLE, { shadow, ... }); // ❌ second call gets already-converted values + slide.addShape(pres.shapes.RECTANGLE, { shadow, ... }); + + const makeShadow = () => ({ type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.15 }); + slide.addShape(pres.shapes.RECTANGLE, { shadow: makeShadow(), ... }); // ✅ fresh object each time + slide.addShape(pres.shapes.RECTANGLE, { shadow: makeShadow(), ... }); + ``` + +8. **Don't use `ROUNDED_RECTANGLE` with accent borders** - rectangular overlay bars won't cover rounded corners. Use `RECTANGLE` instead. + ```javascript + // ❌ WRONG: Accent bar doesn't cover rounded corners + slide.addShape(pres.shapes.ROUNDED_RECTANGLE, { x: 1, y: 1, w: 3, h: 1.5, fill: { color: "FFFFFF" } }); + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 0.08, h: 1.5, fill: { color: "0891B2" } }); + + // ✅ CORRECT: Use RECTANGLE for clean alignment + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 3, h: 1.5, fill: { color: "FFFFFF" } }); + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 0.08, h: 1.5, fill: { color: "0891B2" } }); + ``` + +--- + +## Quick Reference + +- **Shapes**: RECTANGLE, OVAL, LINE, ROUNDED_RECTANGLE +- **Charts**: BAR, LINE, PIE, DOUGHNUT, SCATTER, BUBBLE, RADAR +- **Layouts**: LAYOUT_16x9 (10"×5.625"), LAYOUT_16x10, LAYOUT_4x3, LAYOUT_WIDE +- **Alignment**: "left", "center", "right" +- **Chart data labels**: "outEnd", "inEnd", "center" diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/add_slide.py b/scripts/add_slide.py new file mode 100644 index 0000000..13700df --- /dev/null +++ b/scripts/add_slide.py @@ -0,0 +1,195 @@ +"""Add a new slide to an unpacked PPTX directory. + +Usage: python add_slide.py + +The source can be: + - A slide file (e.g., slide2.xml) - duplicates the slide + - A layout file (e.g., slideLayout2.xml) - creates from layout + +Examples: + python add_slide.py unpacked/ slide2.xml + # Duplicates slide2, creates slide5.xml + + python add_slide.py unpacked/ slideLayout2.xml + # Creates slide5.xml from slideLayout2.xml + +To see available layouts: ls unpacked/ppt/slideLayouts/ + +Prints the element to add to presentation.xml. +""" + +import re +import shutil +import sys +from pathlib import Path + + +def get_next_slide_number(slides_dir: Path) -> int: + existing = [int(m.group(1)) for f in slides_dir.glob("slide*.xml") + if (m := re.match(r"slide(\d+)\.xml", f.name))] + return max(existing) + 1 if existing else 1 + + +def create_slide_from_layout(unpacked_dir: Path, layout_file: str) -> None: + slides_dir = unpacked_dir / "ppt" / "slides" + rels_dir = slides_dir / "_rels" + layouts_dir = unpacked_dir / "ppt" / "slideLayouts" + + layout_path = layouts_dir / layout_file + if not layout_path.exists(): + print(f"Error: {layout_path} not found", file=sys.stderr) + sys.exit(1) + + next_num = get_next_slide_number(slides_dir) + dest = f"slide{next_num}.xml" + dest_slide = slides_dir / dest + dest_rels = rels_dir / f"{dest}.rels" + + slide_xml = ''' + + + + + + + + + + + + + + + + + + + + + +''' + dest_slide.write_text(slide_xml, encoding="utf-8") + + rels_dir.mkdir(exist_ok=True) + rels_xml = f''' + + +''' + dest_rels.write_text(rels_xml, encoding="utf-8") + + _add_to_content_types(unpacked_dir, dest) + + rid = _add_to_presentation_rels(unpacked_dir, dest) + + next_slide_id = _get_next_slide_id(unpacked_dir) + + print(f"Created {dest} from {layout_file}") + print(f'Add to presentation.xml : ') + + +def duplicate_slide(unpacked_dir: Path, source: str) -> None: + slides_dir = unpacked_dir / "ppt" / "slides" + rels_dir = slides_dir / "_rels" + + source_slide = slides_dir / source + + if not source_slide.exists(): + print(f"Error: {source_slide} not found", file=sys.stderr) + sys.exit(1) + + next_num = get_next_slide_number(slides_dir) + dest = f"slide{next_num}.xml" + dest_slide = slides_dir / dest + + source_rels = rels_dir / f"{source}.rels" + dest_rels = rels_dir / f"{dest}.rels" + + shutil.copy2(source_slide, dest_slide) + + if source_rels.exists(): + shutil.copy2(source_rels, dest_rels) + + rels_content = dest_rels.read_text(encoding="utf-8") + rels_content = re.sub( + r'\s*]*Type="[^"]*notesSlide"[^>]*/>\s*', + "\n", + rels_content, + ) + dest_rels.write_text(rels_content, encoding="utf-8") + + _add_to_content_types(unpacked_dir, dest) + + rid = _add_to_presentation_rels(unpacked_dir, dest) + + next_slide_id = _get_next_slide_id(unpacked_dir) + + print(f"Created {dest} from {source}") + print(f'Add to presentation.xml : ') + + +def _add_to_content_types(unpacked_dir: Path, dest: str) -> None: + content_types_path = unpacked_dir / "[Content_Types].xml" + content_types = content_types_path.read_text(encoding="utf-8") + + new_override = f'' + + if f"/ppt/slides/{dest}" not in content_types: + content_types = content_types.replace("", f" {new_override}\n") + content_types_path.write_text(content_types, encoding="utf-8") + + +def _add_to_presentation_rels(unpacked_dir: Path, dest: str) -> str: + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + pres_rels = pres_rels_path.read_text(encoding="utf-8") + + rids = [int(m) for m in re.findall(r'Id="rId(\d+)"', pres_rels)] + next_rid = max(rids) + 1 if rids else 1 + rid = f"rId{next_rid}" + + new_rel = f'' + + if f"slides/{dest}" not in pres_rels: + pres_rels = pres_rels.replace("", f" {new_rel}\n") + pres_rels_path.write_text(pres_rels, encoding="utf-8") + + return rid + + +def _get_next_slide_id(unpacked_dir: Path) -> int: + pres_path = unpacked_dir / "ppt" / "presentation.xml" + pres_content = pres_path.read_text(encoding="utf-8") + slide_ids = [int(m) for m in re.findall(r']*id="(\d+)"', pres_content)] + return max(slide_ids) + 1 if slide_ids else 256 + + +def parse_source(source: str) -> tuple[str, str | None]: + if source.startswith("slideLayout") and source.endswith(".xml"): + return ("layout", source) + + return ("slide", None) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python add_slide.py ", file=sys.stderr) + print("", file=sys.stderr) + print("Source can be:", file=sys.stderr) + print(" slide2.xml - duplicate an existing slide", file=sys.stderr) + print(" slideLayout2.xml - create from a layout template", file=sys.stderr) + print("", file=sys.stderr) + print("To see available layouts: ls /ppt/slideLayouts/", file=sys.stderr) + sys.exit(1) + + unpacked_dir = Path(sys.argv[1]) + source = sys.argv[2] + + if not unpacked_dir.exists(): + print(f"Error: {unpacked_dir} not found", file=sys.stderr) + sys.exit(1) + + source_type, layout_file = parse_source(source) + + if source_type == "layout" and layout_file is not None: + create_slide_from_layout(unpacked_dir, layout_file) + else: + duplicate_slide(unpacked_dir, source) diff --git a/scripts/clean.py b/scripts/clean.py new file mode 100644 index 0000000..3d13994 --- /dev/null +++ b/scripts/clean.py @@ -0,0 +1,286 @@ +"""Remove unreferenced files from an unpacked PPTX directory. + +Usage: python clean.py + +Example: + python clean.py unpacked/ + +This script removes: +- Orphaned slides (not in sldIdLst) and their relationships +- [trash] directory (unreferenced files) +- Orphaned .rels files for deleted resources +- Unreferenced media, embeddings, charts, diagrams, drawings, ink files +- Unreferenced theme files +- Unreferenced notes slides +- Content-Type overrides for deleted files +""" + +import sys +from pathlib import Path + +import defusedxml.minidom + + +import re + + +def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]: + pres_path = unpacked_dir / "ppt" / "presentation.xml" + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + + if not pres_path.exists() or not pres_rels_path.exists(): + return set() + + rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) + rid_to_slide = {} + for rel in rels_dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + target = rel.getAttribute("Target") + rel_type = rel.getAttribute("Type") + if "slide" in rel_type and target.startswith("slides/"): + rid_to_slide[rid] = target.replace("slides/", "") + + pres_content = pres_path.read_text(encoding="utf-8") + referenced_rids = set(re.findall(r']*r:id="([^"]+)"', pres_content)) + + return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide} + + +def remove_orphaned_slides(unpacked_dir: Path) -> list[str]: + slides_dir = unpacked_dir / "ppt" / "slides" + slides_rels_dir = slides_dir / "_rels" + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + + if not slides_dir.exists(): + return [] + + referenced_slides = get_slides_in_sldidlst(unpacked_dir) + removed = [] + + for slide_file in slides_dir.glob("slide*.xml"): + if slide_file.name not in referenced_slides: + rel_path = slide_file.relative_to(unpacked_dir) + slide_file.unlink() + removed.append(str(rel_path)) + + rels_file = slides_rels_dir / f"{slide_file.name}.rels" + if rels_file.exists(): + rels_file.unlink() + removed.append(str(rels_file.relative_to(unpacked_dir))) + + if removed and pres_rels_path.exists(): + rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) + changed = False + + for rel in list(rels_dom.getElementsByTagName("Relationship")): + target = rel.getAttribute("Target") + if target.startswith("slides/"): + slide_name = target.replace("slides/", "") + if slide_name not in referenced_slides: + if rel.parentNode: + rel.parentNode.removeChild(rel) + changed = True + + if changed: + with open(pres_rels_path, "wb") as f: + f.write(rels_dom.toxml(encoding="utf-8")) + + return removed + + +def remove_trash_directory(unpacked_dir: Path) -> list[str]: + trash_dir = unpacked_dir / "[trash]" + removed = [] + + if trash_dir.exists() and trash_dir.is_dir(): + for file_path in trash_dir.iterdir(): + if file_path.is_file(): + rel_path = file_path.relative_to(unpacked_dir) + removed.append(str(rel_path)) + file_path.unlink() + trash_dir.rmdir() + + return removed + + +def get_slide_referenced_files(unpacked_dir: Path) -> set: + referenced = set() + slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels" + + if not slides_rels_dir.exists(): + return referenced + + for rels_file in slides_rels_dir.glob("*.rels"): + dom = defusedxml.minidom.parse(str(rels_file)) + for rel in dom.getElementsByTagName("Relationship"): + target = rel.getAttribute("Target") + if not target: + continue + target_path = (rels_file.parent.parent / target).resolve() + try: + referenced.add(target_path.relative_to(unpacked_dir.resolve())) + except ValueError: + pass + + return referenced + + +def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]: + resource_dirs = ["charts", "diagrams", "drawings"] + removed = [] + slide_referenced = get_slide_referenced_files(unpacked_dir) + + for dir_name in resource_dirs: + rels_dir = unpacked_dir / "ppt" / dir_name / "_rels" + if not rels_dir.exists(): + continue + + for rels_file in rels_dir.glob("*.rels"): + resource_file = rels_dir.parent / rels_file.name.replace(".rels", "") + try: + resource_rel_path = resource_file.resolve().relative_to(unpacked_dir.resolve()) + except ValueError: + continue + + if not resource_file.exists() or resource_rel_path not in slide_referenced: + rels_file.unlink() + rel_path = rels_file.relative_to(unpacked_dir) + removed.append(str(rel_path)) + + return removed + + +def get_referenced_files(unpacked_dir: Path) -> set: + referenced = set() + + for rels_file in unpacked_dir.rglob("*.rels"): + dom = defusedxml.minidom.parse(str(rels_file)) + for rel in dom.getElementsByTagName("Relationship"): + target = rel.getAttribute("Target") + if not target: + continue + target_path = (rels_file.parent.parent / target).resolve() + try: + referenced.add(target_path.relative_to(unpacked_dir.resolve())) + except ValueError: + pass + + return referenced + + +def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]: + resource_dirs = ["media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"] + removed = [] + + for dir_name in resource_dirs: + dir_path = unpacked_dir / "ppt" / dir_name + if not dir_path.exists(): + continue + + for file_path in dir_path.glob("*"): + if not file_path.is_file(): + continue + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + + theme_dir = unpacked_dir / "ppt" / "theme" + if theme_dir.exists(): + for file_path in theme_dir.glob("theme*.xml"): + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + theme_rels = theme_dir / "_rels" / f"{file_path.name}.rels" + if theme_rels.exists(): + theme_rels.unlink() + removed.append(str(theme_rels.relative_to(unpacked_dir))) + + notes_dir = unpacked_dir / "ppt" / "notesSlides" + if notes_dir.exists(): + for file_path in notes_dir.glob("*.xml"): + if not file_path.is_file(): + continue + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + + notes_rels_dir = notes_dir / "_rels" + if notes_rels_dir.exists(): + for file_path in notes_rels_dir.glob("*.rels"): + notes_file = notes_dir / file_path.name.replace(".rels", "") + if not notes_file.exists(): + file_path.unlink() + removed.append(str(file_path.relative_to(unpacked_dir))) + + return removed + + +def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None: + ct_path = unpacked_dir / "[Content_Types].xml" + if not ct_path.exists(): + return + + dom = defusedxml.minidom.parse(str(ct_path)) + changed = False + + for override in list(dom.getElementsByTagName("Override")): + part_name = override.getAttribute("PartName").lstrip("/") + if part_name in removed_files: + if override.parentNode: + override.parentNode.removeChild(override) + changed = True + + if changed: + with open(ct_path, "wb") as f: + f.write(dom.toxml(encoding="utf-8")) + + +def clean_unused_files(unpacked_dir: Path) -> list[str]: + all_removed = [] + + slides_removed = remove_orphaned_slides(unpacked_dir) + all_removed.extend(slides_removed) + + trash_removed = remove_trash_directory(unpacked_dir) + all_removed.extend(trash_removed) + + while True: + removed_rels = remove_orphaned_rels_files(unpacked_dir) + referenced = get_referenced_files(unpacked_dir) + removed_files = remove_orphaned_files(unpacked_dir, referenced) + + total_removed = removed_rels + removed_files + if not total_removed: + break + + all_removed.extend(total_removed) + + if all_removed: + update_content_types(unpacked_dir, all_removed) + + return all_removed + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python clean.py ", file=sys.stderr) + print("Example: python clean.py unpacked/", file=sys.stderr) + sys.exit(1) + + unpacked_dir = Path(sys.argv[1]) + + if not unpacked_dir.exists(): + print(f"Error: {unpacked_dir} not found", file=sys.stderr) + sys.exit(1) + + removed = clean_unused_files(unpacked_dir) + + if removed: + print(f"Removed {len(removed)} unreferenced files:") + for f in removed: + print(f" {f}") + else: + print("No unreferenced files found") diff --git a/scripts/office/helpers/__init__.py b/scripts/office/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/office/helpers/merge_runs.py b/scripts/office/helpers/merge_runs.py new file mode 100644 index 0000000..ad7c25e --- /dev/null +++ b/scripts/office/helpers/merge_runs.py @@ -0,0 +1,199 @@ +"""Merge adjacent runs with identical formatting in DOCX. + +Merges adjacent elements that have identical properties. +Works on runs in paragraphs and inside tracked changes (, ). + +Also: +- Removes rsid attributes from runs (revision metadata that doesn't affect rendering) +- Removes proofErr elements (spell/grammar markers that block merging) +""" + +from pathlib import Path + +import defusedxml.minidom + + +def merge_runs(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + _remove_elements(root, "proofErr") + _strip_run_rsid_attrs(root) + + containers = {run.parentNode for run in _find_elements(root, "r")} + + merge_count = 0 + for container in containers: + merge_count += _merge_runs_in(container) + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Merged {merge_count} runs" + + except Exception as e: + return 0, f"Error: {e}" + + + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def _get_child(parent, tag: str): + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + return child + return None + + +def _get_children(parent, tag: str) -> list: + results = [] + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(child) + return results + + +def _is_adjacent(elem1, elem2) -> bool: + node = elem1.nextSibling + while node: + if node == elem2: + return True + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + return False + + + + +def _remove_elements(root, tag: str): + for elem in _find_elements(root, tag): + if elem.parentNode: + elem.parentNode.removeChild(elem) + + +def _strip_run_rsid_attrs(root): + for run in _find_elements(root, "r"): + for attr in list(run.attributes.values()): + if "rsid" in attr.name.lower(): + run.removeAttribute(attr.name) + + + + +def _merge_runs_in(container) -> int: + merge_count = 0 + run = _first_child_run(container) + + while run: + while True: + next_elem = _next_element_sibling(run) + if next_elem and _is_run(next_elem) and _can_merge(run, next_elem): + _merge_run_content(run, next_elem) + container.removeChild(next_elem) + merge_count += 1 + else: + break + + _consolidate_text(run) + run = _next_sibling_run(run) + + return merge_count + + +def _first_child_run(container): + for child in container.childNodes: + if child.nodeType == child.ELEMENT_NODE and _is_run(child): + return child + return None + + +def _next_element_sibling(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + return sibling + sibling = sibling.nextSibling + return None + + +def _next_sibling_run(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + if _is_run(sibling): + return sibling + sibling = sibling.nextSibling + return None + + +def _is_run(node) -> bool: + name = node.localName or node.tagName + return name == "r" or name.endswith(":r") + + +def _can_merge(run1, run2) -> bool: + rpr1 = _get_child(run1, "rPr") + rpr2 = _get_child(run2, "rPr") + + if (rpr1 is None) != (rpr2 is None): + return False + if rpr1 is None: + return True + return rpr1.toxml() == rpr2.toxml() + + +def _merge_run_content(target, source): + for child in list(source.childNodes): + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name != "rPr" and not name.endswith(":rPr"): + target.appendChild(child) + + +def _consolidate_text(run): + t_elements = _get_children(run, "t") + + for i in range(len(t_elements) - 1, 0, -1): + curr, prev = t_elements[i], t_elements[i - 1] + + if _is_adjacent(prev, curr): + prev_text = prev.firstChild.data if prev.firstChild else "" + curr_text = curr.firstChild.data if curr.firstChild else "" + merged = prev_text + curr_text + + if prev.firstChild: + prev.firstChild.data = merged + else: + prev.appendChild(run.ownerDocument.createTextNode(merged)) + + if merged.startswith(" ") or merged.endswith(" "): + prev.setAttribute("xml:space", "preserve") + elif prev.hasAttribute("xml:space"): + prev.removeAttribute("xml:space") + + run.removeChild(curr) diff --git a/scripts/office/helpers/simplify_redlines.py b/scripts/office/helpers/simplify_redlines.py new file mode 100644 index 0000000..db963bb --- /dev/null +++ b/scripts/office/helpers/simplify_redlines.py @@ -0,0 +1,197 @@ +"""Simplify tracked changes by merging adjacent w:ins or w:del elements. + +Merges adjacent elements from the same author into a single element. +Same for elements. This makes heavily-redlined documents easier to +work with by reducing the number of tracked change wrappers. + +Rules: +- Only merges w:ins with w:ins, w:del with w:del (same element type) +- Only merges if same author (ignores timestamp differences) +- Only merges if truly adjacent (only whitespace between them) +""" + +import xml.etree.ElementTree as ET +import zipfile +from pathlib import Path + +import defusedxml.minidom + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def simplify_redlines(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + merge_count = 0 + + containers = _find_elements(root, "p") + _find_elements(root, "tc") + + for container in containers: + merge_count += _merge_tracked_changes_in(container, "ins") + merge_count += _merge_tracked_changes_in(container, "del") + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Simplified {merge_count} tracked changes" + + except Exception as e: + return 0, f"Error: {e}" + + +def _merge_tracked_changes_in(container, tag: str) -> int: + merge_count = 0 + + tracked = [ + child + for child in container.childNodes + if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag) + ] + + if len(tracked) < 2: + return 0 + + i = 0 + while i < len(tracked) - 1: + curr = tracked[i] + next_elem = tracked[i + 1] + + if _can_merge_tracked(curr, next_elem): + _merge_tracked_content(curr, next_elem) + container.removeChild(next_elem) + tracked.pop(i + 1) + merge_count += 1 + else: + i += 1 + + return merge_count + + +def _is_element(node, tag: str) -> bool: + name = node.localName or node.tagName + return name == tag or name.endswith(f":{tag}") + + +def _get_author(elem) -> str: + author = elem.getAttribute("w:author") + if not author: + for attr in elem.attributes.values(): + if attr.localName == "author" or attr.name.endswith(":author"): + return attr.value + return author + + +def _can_merge_tracked(elem1, elem2) -> bool: + if _get_author(elem1) != _get_author(elem2): + return False + + node = elem1.nextSibling + while node and node != elem2: + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + + return True + + +def _merge_tracked_content(target, source): + while source.firstChild: + child = source.firstChild + source.removeChild(child) + target.appendChild(child) + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]: + if not doc_xml_path.exists(): + return {} + + try: + tree = ET.parse(doc_xml_path) + root = tree.getroot() + except ET.ParseError: + return {} + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + + return authors + + +def _get_authors_from_docx(docx_path: Path) -> dict[str, int]: + try: + with zipfile.ZipFile(docx_path, "r") as zf: + if "word/document.xml" not in zf.namelist(): + return {} + with zf.open("word/document.xml") as f: + tree = ET.parse(f) + root = tree.getroot() + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + return authors + except (zipfile.BadZipFile, ET.ParseError): + return {} + + +def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str: + modified_xml = modified_dir / "word" / "document.xml" + modified_authors = get_tracked_change_authors(modified_xml) + + if not modified_authors: + return default + + original_authors = _get_authors_from_docx(original_docx) + + new_changes: dict[str, int] = {} + for author, count in modified_authors.items(): + original_count = original_authors.get(author, 0) + diff = count - original_count + if diff > 0: + new_changes[author] = diff + + if not new_changes: + return default + + if len(new_changes) == 1: + return next(iter(new_changes)) + + raise ValueError( + f"Multiple authors added new changes: {new_changes}. " + "Cannot infer which author to validate." + ) diff --git a/scripts/office/pack.py b/scripts/office/pack.py new file mode 100644 index 0000000..db29ed8 --- /dev/null +++ b/scripts/office/pack.py @@ -0,0 +1,159 @@ +"""Pack a directory into a DOCX, PPTX, or XLSX file. + +Validates with auto-repair, condenses XML formatting, and creates the Office file. + +Usage: + python pack.py [--original ] [--validate true|false] + +Examples: + python pack.py unpacked/ output.docx --original input.docx + python pack.py unpacked/ output.pptx --validate false +""" + +import argparse +import sys +import shutil +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + +def pack( + input_directory: str, + output_file: str, + original_file: str | None = None, + validate: bool = True, + infer_author_func=None, +) -> tuple[None, str]: + input_dir = Path(input_directory) + output_path = Path(output_file) + suffix = output_path.suffix.lower() + + if not input_dir.is_dir(): + return None, f"Error: {input_dir} is not a directory" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file" + + if validate and original_file: + original_path = Path(original_file) + if original_path.exists(): + success, output = _run_validation( + input_dir, original_path, suffix, infer_author_func + ) + if output: + print(output) + if not success: + return None, f"Error: Validation failed for {input_dir}" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + _condense_xml(xml_file) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + return None, f"Successfully packed {input_dir} to {output_file}" + + +def _run_validation( + unpacked_dir: Path, + original_file: Path, + suffix: str, + infer_author_func=None, +) -> tuple[bool, str | None]: + output_lines = [] + validators = [] + + if suffix == ".docx": + author = "Claude" + if infer_author_func: + try: + author = infer_author_func(unpacked_dir, original_file) + except ValueError as e: + print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr) + + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file), + RedliningValidator(unpacked_dir, original_file, author=author), + ] + elif suffix == ".pptx": + validators = [PPTXSchemaValidator(unpacked_dir, original_file)] + + if not validators: + return True, None + + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + output_lines.append(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + output_lines.append("All validations PASSED!") + + return success, "\n".join(output_lines) if output_lines else None + + +def _condense_xml(xml_file: Path) -> None: + try: + with open(xml_file, encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + for element in dom.getElementsByTagName("*"): + if element.tagName.endswith(":t"): + continue + + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + except Exception as e: + print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr) + raise + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pack a directory into a DOCX, PPTX, or XLSX file" + ) + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument( + "--original", + help="Original file for validation comparison", + ) + parser.add_argument( + "--validate", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Run validation with auto-repair (default: true)", + ) + args = parser.parse_args() + + _, message = pack( + args.input_directory, + args.output_file, + original_file=args.original, + validate=args.validate, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/mce/mc.xsd b/scripts/office/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/scripts/office/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-2010.xsd b/scripts/office/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-2012.xsd b/scripts/office/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-2018.xsd b/scripts/office/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/scripts/office/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/scripts/office/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd b/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/scripts/office/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/scripts/office/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/scripts/office/soffice.py b/scripts/office/soffice.py new file mode 100644 index 0000000..c7f7e32 --- /dev/null +++ b/scripts/office/soffice.py @@ -0,0 +1,183 @@ +""" +Helper for running LibreOffice (soffice) in environments where AF_UNIX +sockets may be blocked (e.g., sandboxed VMs). Detects the restriction +at runtime and applies an LD_PRELOAD shim if needed. + +Usage: + from office.soffice import run_soffice, get_soffice_env + + # Option 1 – run soffice directly + result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"]) + + # Option 2 – get env dict for your own subprocess calls + env = get_soffice_env() + subprocess.run(["soffice", ...], env=env) +""" + +import os +import socket +import subprocess +import tempfile +from pathlib import Path + + +def get_soffice_env() -> dict: + env = os.environ.copy() + env["SAL_USE_VCLPLUGIN"] = "svp" + + if _needs_shim(): + shim = _ensure_shim() + env["LD_PRELOAD"] = str(shim) + + return env + + +def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess: + env = get_soffice_env() + return subprocess.run(["soffice"] + args, env=env, **kwargs) + + + +_SHIM_SO = Path(tempfile.gettempdir()) / "lo_socket_shim.so" + + +def _needs_shim() -> bool: + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.close() + return False + except OSError: + return True + + +def _ensure_shim() -> Path: + if _SHIM_SO.exists(): + return _SHIM_SO + + src = Path(tempfile.gettempdir()) / "lo_socket_shim.c" + src.write_text(_SHIM_SOURCE) + subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", str(_SHIM_SO), str(src), "-ldl"], + check=True, + capture_output=True, + ) + src.unlink() + return _SHIM_SO + + + +_SHIM_SOURCE = r""" +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +static int (*real_socket)(int, int, int); +static int (*real_socketpair)(int, int, int, int[2]); +static int (*real_listen)(int, int); +static int (*real_accept)(int, struct sockaddr *, socklen_t *); +static int (*real_close)(int); +static int (*real_read)(int, void *, size_t); + +/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). */ +static int is_shimmed[1024]; +static int peer_of[1024]; +static int wake_r[1024]; /* accept() blocks reading this */ +static int wake_w[1024]; /* close() writes to this */ +static int listener_fd = -1; /* FD that received listen() */ + +__attribute__((constructor)) +static void init(void) { + real_socket = dlsym(RTLD_NEXT, "socket"); + real_socketpair = dlsym(RTLD_NEXT, "socketpair"); + real_listen = dlsym(RTLD_NEXT, "listen"); + real_accept = dlsym(RTLD_NEXT, "accept"); + real_close = dlsym(RTLD_NEXT, "close"); + real_read = dlsym(RTLD_NEXT, "read"); + for (int i = 0; i < 1024; i++) { + peer_of[i] = -1; + wake_r[i] = -1; + wake_w[i] = -1; + } +} + +/* ---- socket ---------------------------------------------------------- */ +int socket(int domain, int type, int protocol) { + if (domain == AF_UNIX) { + int fd = real_socket(domain, type, protocol); + if (fd >= 0) return fd; + /* socket(AF_UNIX) blocked – fall back to socketpair(). */ + int sv[2]; + if (real_socketpair(domain, type, protocol, sv) == 0) { + if (sv[0] >= 0 && sv[0] < 1024) { + is_shimmed[sv[0]] = 1; + peer_of[sv[0]] = sv[1]; + int wp[2]; + if (pipe(wp) == 0) { + wake_r[sv[0]] = wp[0]; + wake_w[sv[0]] = wp[1]; + } + } + return sv[0]; + } + errno = EPERM; + return -1; + } + return real_socket(domain, type, protocol); +} + +/* ---- listen ---------------------------------------------------------- */ +int listen(int sockfd, int backlog) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + listener_fd = sockfd; + return 0; + } + return real_listen(sockfd, backlog); +} + +/* ---- accept ---------------------------------------------------------- */ +int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + /* Block until close() writes to the wake pipe. */ + if (wake_r[sockfd] >= 0) { + char buf; + real_read(wake_r[sockfd], &buf, 1); + } + errno = ECONNABORTED; + return -1; + } + return real_accept(sockfd, addr, addrlen); +} + +/* ---- close ----------------------------------------------------------- */ +int close(int fd) { + if (fd >= 0 && fd < 1024 && is_shimmed[fd]) { + int was_listener = (fd == listener_fd); + is_shimmed[fd] = 0; + + if (wake_w[fd] >= 0) { /* unblock accept() */ + char c = 0; + write(wake_w[fd], &c, 1); + real_close(wake_w[fd]); + wake_w[fd] = -1; + } + if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; } + if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; } + + if (was_listener) + _exit(0); /* conversion done – exit */ + } + return real_close(fd); +} +""" + + + +if __name__ == "__main__": + import sys + result = run_soffice(sys.argv[1:]) + sys.exit(result.returncode) diff --git a/scripts/office/unpack.py b/scripts/office/unpack.py new file mode 100644 index 0000000..0015253 --- /dev/null +++ b/scripts/office/unpack.py @@ -0,0 +1,132 @@ +"""Unpack Office files (DOCX, PPTX, XLSX) for editing. + +Extracts the ZIP archive, pretty-prints XML files, and optionally: +- Merges adjacent runs with identical formatting (DOCX only) +- Simplifies adjacent tracked changes from same author (DOCX only) + +Usage: + python unpack.py [options] + +Examples: + python unpack.py document.docx unpacked/ + python unpack.py presentation.pptx unpacked/ + python unpack.py document.docx unpacked/ --merge-runs false +""" + +import argparse +import sys +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from helpers.merge_runs import merge_runs as do_merge_runs +from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines + +SMART_QUOTE_REPLACEMENTS = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def unpack( + input_file: str, + output_directory: str, + merge_runs: bool = True, + simplify_redlines: bool = True, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_directory) + suffix = input_path.suffix.lower() + + if not input_path.exists(): + return None, f"Error: {input_file} does not exist" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file" + + try: + output_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(input_path, "r") as zf: + zf.extractall(output_path) + + xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) + for xml_file in xml_files: + _pretty_print_xml(xml_file) + + message = f"Unpacked {input_file} ({len(xml_files)} XML files)" + + if suffix == ".docx": + if simplify_redlines: + simplify_count, _ = do_simplify_redlines(str(output_path)) + message += f", simplified {simplify_count} tracked changes" + + if merge_runs: + merge_count, _ = do_merge_runs(str(output_path)) + message += f", merged {merge_count} runs" + + for xml_file in xml_files: + _escape_smart_quotes(xml_file) + + return None, message + + except zipfile.BadZipFile: + return None, f"Error: {input_file} is not a valid Office file" + except Exception as e: + return None, f"Error unpacking: {e}" + + +def _pretty_print_xml(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8")) + except Exception: + pass + + +def _escape_smart_quotes(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + for char, entity in SMART_QUOTE_REPLACEMENTS.items(): + content = content.replace(char, entity) + xml_file.write_text(content, encoding="utf-8") + except Exception: + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Unpack an Office file (DOCX, PPTX, XLSX) for editing" + ) + parser.add_argument("input_file", help="Office file to unpack") + parser.add_argument("output_directory", help="Output directory") + parser.add_argument( + "--merge-runs", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent runs with identical formatting (DOCX only, default: true)", + ) + parser.add_argument( + "--simplify-redlines", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent tracked changes from same author (DOCX only, default: true)", + ) + args = parser.parse_args() + + _, message = unpack( + args.input_file, + args.output_directory, + merge_runs=args.merge_runs, + simplify_redlines=args.simplify_redlines, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/scripts/office/validate.py b/scripts/office/validate.py new file mode 100644 index 0000000..03b01f6 --- /dev/null +++ b/scripts/office/validate.py @@ -0,0 +1,111 @@ +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py [--original ] [--auto-repair] [--author NAME] + +The first argument can be either: +- An unpacked directory containing the Office document XML files +- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory + +Auto-repair fixes: +- paraId/durableId values that exceed OOXML limits +- Missing xml:space="preserve" on w:t elements with whitespace +""" + +import argparse +import sys +import tempfile +import zipfile +from pathlib import Path + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "path", + help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "--original", + required=False, + default=None, + help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--auto-repair", + action="store_true", + help="Automatically repair common issues (hex IDs, whitespace preservation)", + ) + parser.add_argument( + "--author", + default="Claude", + help="Author name for redlining validation (default: Claude)", + ) + args = parser.parse_args() + + path = Path(args.path) + assert path.exists(), f"Error: {path} does not exist" + + original_file = None + if args.original: + original_file = Path(args.original) + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + file_extension = (original_file or path).suffix.lower() + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: Cannot determine file type from {path}. Use --original or provide a .docx/.pptx/.xlsx file." + ) + + if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]: + temp_dir = tempfile.mkdtemp() + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(temp_dir) + unpacked_dir = Path(temp_dir) + else: + assert path.is_dir(), f"Error: {path} is not a directory or Office file" + unpacked_dir = path + + match file_extension: + case ".docx": + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + if original_file: + validators.append( + RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author) + ) + case ".pptx": + validators = [ + PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + if args.auto_repair: + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + print(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/office/validators/__init__.py b/scripts/office/validators/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/scripts/office/validators/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/scripts/office/validators/base.py b/scripts/office/validators/base.py new file mode 100644 index 0000000..db4a06a --- /dev/null +++ b/scripts/office/validators/base.py @@ -0,0 +1,847 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import defusedxml.minidom +import lxml.etree + + +class BaseSchemaValidator: + + IGNORED_VALIDATION_ERRORS = [ + "hyphenationZone", + "purl.org/dc/terms", + ] + + UNIQUE_ID_REQUIREMENTS = { + "comment": ("id", "file"), + "commentrangestart": ("id", "file"), + "commentrangeend": ("id", "file"), + "bookmarkstart": ("id", "file"), + "bookmarkend": ("id", "file"), + "sldid": ("id", "file"), + "sldmasterid": ("id", "global"), + "sldlayoutid": ("id", "global"), + "cm": ("authorid", "file"), + "sheet": ("sheetid", "file"), + "definedname": ("id", "file"), + "cxnsp": ("id", "file"), + "sp": ("id", "file"), + "pic": ("id", "file"), + "grpsp": ("id", "file"), + } + + EXCLUDED_ID_CONTAINERS = { + "sectionlst", + } + + ELEMENT_RELATIONSHIP_TYPES = {} + + SCHEMA_MAPPINGS = { + "word": "ISO-IEC29500-4_2016/wml.xsd", + "ppt": "ISO-IEC29500-4_2016/pml.xsd", + "xl": "ISO-IEC29500-4_2016/sml.xsd", + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file=None, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) if original_file else None + self.verbose = verbose + + self.schemas_dir = Path(__file__).parent.parent / "schemas" + + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + raise NotImplementedError("Subclasses must implement the validate method") + + def repair(self) -> int: + return self.repair_whitespace_preservation() + + def repair_whitespace_preservation(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if elem.tagName.endswith(":t") and elem.firstChild: + text = elem.firstChild.nodeValue + if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))): + if elem.getAttribute("xml:space") != "preserve": + elem.setAttribute("xml:space", "preserve") + text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text) + print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}") + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + def validate_xml(self): + errors = [] + + for xml_file in self.xml_files: + try: + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + errors = [] + global_ids = {} + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} + + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + for elem in root.iter(): + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + if tag in self.UNIQUE_ID_REQUIREMENTS: + in_excluded_container = any( + ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS + for ancestor in elem.iterancestors() + ) + if in_excluded_container: + continue + + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + errors = [] + + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): + all_files.append(file_path.resolve()) + + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + for rels_file in rels_files: + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + rels_dir = rels_file.parent + + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): + if target.startswith("/"): + target_path = self.unpacked_dir / target.lstrip("/") + elif rels_file.name == ".rels": + target_path = self.unpacked_dir / target + else: + base_dir = rels_dir.parent + target_path = base_dir / target + + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + import lxml.etree + + errors = [] + + for xml_file in self.xml_files: + if xml_file.suffix == ".rels": + continue + + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + if not rels_file.exists(): + continue + + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE + rid_attrs_to_check = ["id", "embed", "link"] + for elem in xml_root.iter(): + for attr_name in rid_attrs_to_check: + rid_attr = elem.get(f"{{{r_ns}}}{attr_name}") + if not rid_attr: + continue + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + elem_lower = element_name.lower() + + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + if elem_lower.endswith("id") and len(elem_lower) > 2: + prefix = elem_lower[:-2] + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + if prefix == "sld": + return "slide" + return prefix.lower() + + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] + return prefix.lower() + + return None + + def validate_content_types(self): + errors = [] + + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", + "document", + "workbook", + "worksheet", + "theme", + } + + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue + + for file_path in all_files: + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() + elif is_valid: + return True, set() + + original_errors = self._get_original_file_errors(xml_file) + + assert current_errors is not None + new_errors = current_errors - original_errors + + new_errors = { + e for e in new_errors + if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS) + } + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + original_error_count += 1 + valid_count += 1 + continue + + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del elem.attrib[attr] + + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + elements_to_remove = [] + + for elem in list(root): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + self._remove_ignorable_elements(elem) + + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + root = xml_doc.getroot() + + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None + + try: + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + if self.original_file is None: + return set() + + import tempfile + import zipfile + + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + return set() + + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + for elem in xml_copy.iter(): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/scripts/office/validators/docx.py b/scripts/office/validators/docx.py new file mode 100644 index 0000000..fec405e --- /dev/null +++ b/scripts/office/validators/docx.py @@ -0,0 +1,446 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import random +import re +import tempfile +import zipfile + +import defusedxml.minidom +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml" + W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid" + + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_whitespace_preservation(): + all_valid = False + + if not self.validate_deletions(): + all_valid = False + + if not self.validate_insertions(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_id_constraints(): + all_valid = False + + if not self.validate_comment_markers(): + all_valid = False + + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + if re.search(r"^[ \t\n\r]", text) or re.search( + r"[ \t\n\r]$", text + ): + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces): + if t_elem.text: + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + for instr_elem in root.xpath( + ".//w:del//w:instrText", namespaces=namespaces + ): + text_preview = ( + repr(instr_elem.text or "")[:50] + "..." + if len(repr(instr_elem.text or "")) > 50 + else repr(instr_elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {instr_elem.sourceline}: found within (use ): {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + count = 0 + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + original = self.original_file + if original is None: + return 0 + + count = 0 + + try: + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(original, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + def _parse_id_value(self, val: str, base: int = 16) -> int: + return int(val, base) + + def validate_id_constraints(self): + errors = [] + para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId" + durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId" + + for xml_file in self.xml_files: + try: + for elem in lxml.etree.parse(str(xml_file)).iter(): + if val := elem.get(para_id_attr): + if self._parse_id_value(val, base=16) >= 0x80000000: + errors.append( + f" {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000" + ) + + if val := elem.get(durable_id_attr): + if xml_file.name == "numbering.xml": + try: + if self._parse_id_value(val, base=10) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except ValueError: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} must be decimal in numbering.xml" + ) + else: + if self._parse_id_value(val, base=16) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except Exception: + pass + + if errors: + print(f"FAILED - {len(errors)} ID constraint violations:") + for e in errors: + print(e) + elif self.verbose: + print("PASSED - All paraId/durableId values within constraints") + return not errors + + def validate_comment_markers(self): + errors = [] + + document_xml = None + comments_xml = None + for xml_file in self.xml_files: + if xml_file.name == "document.xml" and "word" in str(xml_file): + document_xml = xml_file + elif xml_file.name == "comments.xml": + comments_xml = xml_file + + if not document_xml: + if self.verbose: + print("PASSED - No document.xml found (skipping comment validation)") + return True + + try: + doc_root = lxml.etree.parse(str(document_xml)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + range_starts = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeStart", namespaces=namespaces + ) + } + range_ends = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeEnd", namespaces=namespaces + ) + } + references = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentReference", namespaces=namespaces + ) + } + + orphaned_ends = range_ends - range_starts + for comment_id in sorted( + orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart' + ) + + orphaned_starts = range_starts - range_ends + for comment_id in sorted( + orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd' + ) + + comment_ids = set() + if comments_xml and comments_xml.exists(): + comments_root = lxml.etree.parse(str(comments_xml)).getroot() + comment_ids = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in comments_root.xpath( + ".//w:comment", namespaces=namespaces + ) + } + + marker_ids = range_starts | range_ends | references + invalid_refs = marker_ids - comment_ids + for comment_id in sorted( + invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + if comment_id: + errors.append( + f' document.xml: marker id="{comment_id}" references non-existent comment' + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append(f" Error parsing XML: {e}") + + if errors: + print(f"FAILED - {len(errors)} comment marker violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All comment markers properly paired") + return True + + def repair(self) -> int: + repairs = super().repair() + repairs += self.repair_durableId() + return repairs + + def repair_durableId(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if not elem.hasAttribute("w16cid:durableId"): + continue + + durable_id = elem.getAttribute("w16cid:durableId") + needs_repair = False + + if xml_file.name == "numbering.xml": + try: + needs_repair = ( + self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + else: + try: + needs_repair = ( + self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + + if needs_repair: + value = random.randint(1, 0x7FFFFFFE) + if xml_file.name == "numbering.xml": + new_id = str(value) + else: + new_id = f"{value:08X}" + + elem.setAttribute("w16cid:durableId", new_id) + print( + f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + ) + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/scripts/office/validators/pptx.py b/scripts/office/validators/pptx.py new file mode 100644 index 0000000..09842aa --- /dev/null +++ b/scripts/office/validators/pptx.py @@ -0,0 +1,275 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_uuid_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_slide_layout_ids(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_notes_slide_references(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + import lxml.etree + + errors = [] + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(): + for attr, value in elem.attrib.items(): + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + if self._looks_like_uuid(value): + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + clean_value = value.strip("{}()").replace("-", "") + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + import lxml.etree + + errors = [] + + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + root = lxml.etree.parse(str(slide_master)).getroot() + + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + import lxml.etree + + errors = [] + notes_slide_references = {} + + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + normalized_target = target.replace("../", "") + + slide_name = rels_file.stem.replace( + ".xml", "" + ) + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/scripts/office/validators/redlining.py b/scripts/office/validators/redlining.py new file mode 100644 index 0000000..71c81b6 --- /dev/null +++ b/scripts/office/validators/redlining.py @@ -0,0 +1,247 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + + def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.author = author + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def repair(self) -> int: + return 0 + + def validate(self): + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + author_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + author_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + + if not author_del_elements and not author_ins_elements: + if self.verbose: + print(f"PASSED - No tracked changes by {self.author} found.") + return True + + except Exception: + pass + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + self._remove_author_tracked_changes(original_root) + self._remove_author_tracked_changes(modified_root) + + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print(f"PASSED - All changes by {self.author} are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + error_parts = [ + f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + pass + + return None + + def _remove_author_tracked_changes(self, root): + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == self.author: + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == self.author: + to_process.append((child, list(parent).index(child))) + + for del_elem, del_index in reversed(to_process): + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/scripts/thumbnail.py b/scripts/thumbnail.py new file mode 100644 index 0000000..edcbdc0 --- /dev/null +++ b/scripts/thumbnail.py @@ -0,0 +1,289 @@ +"""Create thumbnail grids from PowerPoint presentation slides. + +Creates a grid layout of slide thumbnails for quick visual analysis. +Labels each thumbnail with its XML filename (e.g., slide1.xml). +Hidden slides are shown with a placeholder pattern. + +Usage: + python thumbnail.py input.pptx [output_prefix] [--cols N] + +Examples: + python thumbnail.py presentation.pptx + # Creates: thumbnails.jpg + + python thumbnail.py template.pptx grid --cols 4 + # Creates: grid.jpg (or grid-1.jpg, grid-2.jpg for large decks) +""" + +import argparse +import subprocess +import sys +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom +from office.soffice import get_soffice_env +from PIL import Image, ImageDraw, ImageFont + +THUMBNAIL_WIDTH = 300 +CONVERSION_DPI = 100 +MAX_COLS = 6 +DEFAULT_COLS = 3 +JPEG_QUALITY = 95 +GRID_PADDING = 20 +BORDER_WIDTH = 2 +FONT_SIZE_RATIO = 0.10 +LABEL_PADDING_RATIO = 0.4 + + +def main(): + parser = argparse.ArgumentParser( + description="Create thumbnail grids from PowerPoint slides." + ) + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument( + "output_prefix", + nargs="?", + default="thumbnails", + help="Output prefix for image files (default: thumbnails)", + ) + parser.add_argument( + "--cols", + type=int, + default=DEFAULT_COLS, + help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})", + ) + + args = parser.parse_args() + + cols = min(args.cols, MAX_COLS) + if args.cols > MAX_COLS: + print(f"Warning: Columns limited to {MAX_COLS}") + + input_path = Path(args.input) + if not input_path.exists() or input_path.suffix.lower() != ".pptx": + print(f"Error: Invalid PowerPoint file: {args.input}", file=sys.stderr) + sys.exit(1) + + output_path = Path(f"{args.output_prefix}.jpg") + + try: + slide_info = get_slide_info(input_path) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + visible_images = convert_to_images(input_path, temp_path) + + if not visible_images and not any(s["hidden"] for s in slide_info): + print("Error: No slides found", file=sys.stderr) + sys.exit(1) + + slides = build_slide_list(slide_info, visible_images, temp_path) + + grid_files = create_grids(slides, cols, THUMBNAIL_WIDTH, output_path) + + print(f"Created {len(grid_files)} grid(s):") + for grid_file in grid_files: + print(f" {grid_file}") + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +def get_slide_info(pptx_path: Path) -> list[dict]: + with zipfile.ZipFile(pptx_path, "r") as zf: + rels_content = zf.read("ppt/_rels/presentation.xml.rels").decode("utf-8") + rels_dom = defusedxml.minidom.parseString(rels_content) + + rid_to_slide = {} + for rel in rels_dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + target = rel.getAttribute("Target") + rel_type = rel.getAttribute("Type") + if "slide" in rel_type and target.startswith("slides/"): + rid_to_slide[rid] = target.replace("slides/", "") + + pres_content = zf.read("ppt/presentation.xml").decode("utf-8") + pres_dom = defusedxml.minidom.parseString(pres_content) + + slides = [] + for sld_id in pres_dom.getElementsByTagName("p:sldId"): + rid = sld_id.getAttribute("r:id") + if rid in rid_to_slide: + hidden = sld_id.getAttribute("show") == "0" + slides.append({"name": rid_to_slide[rid], "hidden": hidden}) + + return slides + + +def build_slide_list( + slide_info: list[dict], + visible_images: list[Path], + temp_dir: Path, +) -> list[tuple[Path, str]]: + if visible_images: + with Image.open(visible_images[0]) as img: + placeholder_size = img.size + else: + placeholder_size = (1920, 1080) + + slides = [] + visible_idx = 0 + + for info in slide_info: + if info["hidden"]: + placeholder_path = temp_dir / f"hidden-{info['name']}.jpg" + placeholder_img = create_hidden_placeholder(placeholder_size) + placeholder_img.save(placeholder_path, "JPEG") + slides.append((placeholder_path, f"{info['name']} (hidden)")) + else: + if visible_idx < len(visible_images): + slides.append((visible_images[visible_idx], info["name"])) + visible_idx += 1 + + return slides + + +def create_hidden_placeholder(size: tuple[int, int]) -> Image.Image: + img = Image.new("RGB", size, color="#F0F0F0") + draw = ImageDraw.Draw(img) + line_width = max(5, min(size) // 100) + draw.line([(0, 0), size], fill="#CCCCCC", width=line_width) + draw.line([(size[0], 0), (0, size[1])], fill="#CCCCCC", width=line_width) + return img + + +def convert_to_images(pptx_path: Path, temp_dir: Path) -> list[Path]: + pdf_path = temp_dir / f"{pptx_path.stem}.pdf" + + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(temp_dir), + str(pptx_path), + ], + capture_output=True, + text=True, + env=get_soffice_env(), + ) + if result.returncode != 0 or not pdf_path.exists(): + raise RuntimeError("PDF conversion failed") + + result = subprocess.run( + [ + "pdftoppm", + "-jpeg", + "-r", + str(CONVERSION_DPI), + str(pdf_path), + str(temp_dir / "slide"), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError("Image conversion failed") + + return sorted(temp_dir.glob("slide-*.jpg")) + + +def create_grids( + slides: list[tuple[Path, str]], + cols: int, + width: int, + output_path: Path, +) -> list[str]: + max_per_grid = cols * (cols + 1) + grid_files = [] + + for chunk_idx, start_idx in enumerate(range(0, len(slides), max_per_grid)): + end_idx = min(start_idx + max_per_grid, len(slides)) + chunk_slides = slides[start_idx:end_idx] + + grid = create_grid(chunk_slides, cols, width) + + if len(slides) <= max_per_grid: + grid_filename = output_path + else: + stem = output_path.stem + suffix = output_path.suffix + grid_filename = output_path.parent / f"{stem}-{chunk_idx + 1}{suffix}" + + grid_filename.parent.mkdir(parents=True, exist_ok=True) + grid.save(str(grid_filename), quality=JPEG_QUALITY) + grid_files.append(str(grid_filename)) + + return grid_files + + +def create_grid( + slides: list[tuple[Path, str]], + cols: int, + width: int, +) -> Image.Image: + font_size = int(width * FONT_SIZE_RATIO) + label_padding = int(font_size * LABEL_PADDING_RATIO) + + with Image.open(slides[0][0]) as img: + aspect = img.height / img.width + height = int(width * aspect) + + rows = (len(slides) + cols - 1) // cols + grid_w = cols * width + (cols + 1) * GRID_PADDING + grid_h = rows * (height + font_size + label_padding * 2) + (rows + 1) * GRID_PADDING + + grid = Image.new("RGB", (grid_w, grid_h), "white") + draw = ImageDraw.Draw(grid) + + try: + font = ImageFont.load_default(size=font_size) + except Exception: + font = ImageFont.load_default() + + for i, (img_path, slide_name) in enumerate(slides): + row, col = i // cols, i % cols + x = col * width + (col + 1) * GRID_PADDING + y_base = ( + row * (height + font_size + label_padding * 2) + (row + 1) * GRID_PADDING + ) + + label = slide_name + bbox = draw.textbbox((0, 0), label, font=font) + text_w = bbox[2] - bbox[0] + draw.text( + (x + (width - text_w) // 2, y_base + label_padding), + label, + fill="black", + font=font, + ) + + y_thumbnail = y_base + label_padding + font_size + label_padding + + with Image.open(img_path) as img: + img.thumbnail((width, height), Image.Resampling.LANCZOS) + w, h = img.size + tx = x + (width - w) // 2 + ty = y_thumbnail + (height - h) // 2 + grid.paste(img, (tx, ty)) + + if BORDER_WIDTH > 0: + draw.rectangle( + [ + (tx - BORDER_WIDTH, ty - BORDER_WIDTH), + (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1), + ], + outline="gray", + width=BORDER_WIDTH, + ) + + return grid + + +if __name__ == "__main__": + main()