PDF Manipulation Reference

Complete command-line toolkit for PDF operations.

╔══════════════════════════════════════════════════════════════╗
║                    PDF CLI TOOLKIT                           ║
╠══════════════════════════════════════════════════════════════╣
║                                                              ║
║  Tool Stack:                                                 ║
║  ├─ poppler-utils  → View, extract, convert (pdftoppm, etc) ║
║  ├─ qpdf           → Manipulate, encrypt, optimize, repair  ║
║  ├─ ghostscript    → Compress, convert, render              ║
║  ├─ pdftk          → Merge, split, rotate, watermark        ║
║  ├─ ocrmypdf       → Add OCR text layer                     ║
║  ├─ img2pdf        → Images to PDF (lossless)               ║
║  ├─ pdfgrep        → Search text in PDFs                    ║
║  └─ exiftool       → View/edit metadata                     ║
║                                                              ║
╚══════════════════════════════════════════════════════════════╝

1. Installation

  • Arch

sudo pacman -S poppler qpdf ghostscript pdftk img2pdf pdfgrep perl-image-exiftool
yay -S ocrmypdf  # AUR

  • RHEL/Fedora

sudo dnf install poppler-utils qpdf ghostscript pdftk img2pdf pdfgrep perl-Image-ExifTool ocrmypdf

  • Debian/Ubuntu

sudo apt install poppler-utils qpdf ghostscript pdftk-java img2pdf pdfgrep libimage-exiftool-perl ocrmypdf

1.1. Verify Installation

# Report, one line per tool, whether each required binary is on PATH
for cmd in pdfinfo qpdf gs pdftk ocrmypdf img2pdf pdfgrep exiftool; do
  if command -v "$cmd" >/dev/null 2>&1; then
    echo "✓ $cmd"
  else
    echo "✗ $cmd (missing)"
  fi
done

2. Quick Reference Card

# ═══════════════════════════════════════════════════════════════
# VIEWING & INFO
# ═══════════════════════════════════════════════════════════════
pdfinfo doc.pdf
pdfinfo doc.pdf | grep Pages
qpdf --show-npages doc.pdf
pdffonts doc.pdf
# ═══════════════════════════════════════════════════════════════
# SPLITTING
# ═══════════════════════════════════════════════════════════════
pdfseparate doc.pdf page-%02d.pdf
qpdf doc.pdf --pages . 1-10 -- first10.pdf
pdftk doc.pdf cat 5-10 output ch2.pdf
# ═══════════════════════════════════════════════════════════════
# MERGING
# ═══════════════════════════════════════════════════════════════
pdfunite a.pdf b.pdf c.pdf merged.pdf
qpdf --empty --pages a.pdf b.pdf -- out.pdf
pdftk a.pdf b.pdf cat output merged.pdf
# ═══════════════════════════════════════════════════════════════
# CONVERTING
# ═══════════════════════════════════════════════════════════════
pdftoppm -png -r 300 doc.pdf page
pdftotext doc.pdf output.txt
pdftohtml doc.pdf output.html
img2pdf *.png -o output.pdf
# ═══════════════════════════════════════════════════════════════
# COMPRESSION
# ═══════════════════════════════════════════════════════════════
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook \
   -dNOPAUSE -dBATCH -sOutputFile=small.pdf large.pdf
# ═══════════════════════════════════════════════════════════════
# ENCRYPTION
# ═══════════════════════════════════════════════════════════════
qpdf --encrypt user owner 256 -- in.pdf encrypted.pdf
qpdf --decrypt --password=pass encrypted.pdf decrypted.pdf
# ═══════════════════════════════════════════════════════════════
# OCR
# ═══════════════════════════════════════════════════════════════
ocrmypdf scanned.pdf searchable.pdf
ocrmypdf -l eng+deu scan.pdf out.pdf
# ═══════════════════════════════════════════════════════════════
# REPAIR
# ═══════════════════════════════════════════════════════════════
qpdf --linearize damaged.pdf repaired.pdf
gs -o repaired.pdf -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress damaged.pdf
# ═══════════════════════════════════════════════════════════════
# SEARCH
# ═══════════════════════════════════════════════════════════════
pdfgrep -n "pattern" doc.pdf
pdfgrep -r "pattern" ./pdfs/

3. Poppler Utils

The poppler-utils package provides lightweight, fast PDF tools.

3.1. pdfinfo - PDF Metadata

# Full info
pdfinfo document.pdf
# Example output:
# Title:          My Document
# Author:         John Doe
# Pages:          42
# Page size:      612 x 792 pts (letter)
# File size:      1234567 bytes
# PDF version:    1.4
# Just page count
pdfinfo doc.pdf | grep -oP 'Pages:\s+\K\d+'
# Check if encrypted
pdfinfo doc.pdf | grep Encrypted

3.2. pdfseparate - Split Pages

# Split into individual pages
pdfseparate input.pdf "output-%02d.pdf"
# Creates: output-01.pdf, output-02.pdf, ...
# Numbering formats
pdfseparate doc.pdf "page-%d.pdf"
pdfseparate doc.pdf "page-%02d.pdf"
pdfseparate doc.pdf "page-%03d.pdf"
# Extract specific page range
pdfseparate -f 5 -l 10 doc.pdf "chapter-%d.pdf"
# -f = first page, -l = last page

3.3. pdfunite - Merge PDFs

# Merge multiple PDFs
pdfunite file1.pdf file2.pdf file3.pdf merged.pdf
# Merge all PDFs in directory (natural/version sort, like `ls -v`).
# The file list is NUL-delimited so names with spaces survive, and
# combined.pdf is excluded so an earlier result is never fed back in as input.
# Requires bash 4.4+ (mapfile -d) and GNU sort (-z -V).
mapfile -d '' -t pdfs < <(find . -maxdepth 1 -name '*.pdf' ! -name combined.pdf -print0 | sort -zV)
pdfunite "${pdfs[@]}" combined.pdf
# Merge with glob (expands in lexical order: chapter-10 sorts before chapter-2,
# so zero-pad the numbers or use the sorted form above)
pdfunite chapter-*.pdf book.pdf

3.4. pdftoppm - PDF to Images

# PNG at 300 DPI (print quality)
pdftoppm -png -r 300 document.pdf output-prefix
# Creates: output-prefix-1.png, output-prefix-2.png, ...
# JPEG at 150 DPI (screen quality)
pdftoppm -jpeg -r 150 document.pdf output
# Single page only (page 5)
pdftoppm -png -r 300 -f 5 -l 5 document.pdf page5
# Page range
pdftoppm -png -r 200 -f 10 -l 20 document.pdf pages
# Grayscale
pdftoppm -gray -png -r 300 document.pdf output
# TIFF format (for archival)
pdftoppm -tiff -r 300 document.pdf output
Flag          Purpose
-png          Output PNG format
-jpeg         Output JPEG format
-tiff         Output TIFF format
-r N          Resolution in DPI
-f N          First page
-l N          Last page
-gray         Grayscale output
-mono         Monochrome (1-bit)
-scale-to N   Scale to N pixels

3.5. pdftotext - Extract Text

# Extract all text
pdftotext document.pdf output.txt
# Maintain layout
pdftotext -layout document.pdf output.txt
# Extract to stdout
pdftotext document.pdf -
# Specific pages
pdftotext -f 1 -l 10 document.pdf first10.txt
# Wrap the extracted text in a minimal HTML file with the document's meta
# information in the header (plain text inside <pre>; use pdftohtml for real HTML)
pdftotext -htmlmeta document.pdf output.html

3.6. pdfimages - Extract Embedded Images

# Extract all images as PNG
pdfimages -png document.pdf image-prefix
# Extract as original format
pdfimages -all document.pdf images
# List images without extracting
pdfimages -list document.pdf
# Extract from specific pages
pdfimages -f 5 -l 10 -png document.pdf images

3.7. pdftohtml - PDF to HTML

# Basic conversion
pdftohtml document.pdf output
# Single HTML file
pdftohtml -s document.pdf output.html
# With images extracted
pdftohtml -fmt png document.pdf output
# Ignore images
pdftohtml -i document.pdf output.html

3.8. pdffonts - List Fonts

# List all fonts in PDF
pdffonts document.pdf
# Example output:
# name                  type         encoding   emb sub uni
# BAAAAA+Arial          TrueType     WinAnsi    yes yes yes
# CAAAAA+TimesNewRoman  TrueType     WinAnsi    yes yes yes

4. QPDF

QPDF is powerful for structural PDF manipulation, encryption, and repair.

4.1. Basic Operations

# Linearize (optimize for web)
qpdf --linearize input.pdf output.pdf
# Check PDF structure
qpdf --check input.pdf
# Show page count
qpdf --show-npages input.pdf
# Show detailed structure
qpdf --show-object=1 input.pdf

4.2. Page Manipulation

# Extract pages 1-10
qpdf input.pdf --pages . 1-10 -- output.pdf
# Extract specific pages
qpdf input.pdf --pages . 1,3,5,7-10 -- output.pdf
# Remove pages (keep all except 5-10)
qpdf input.pdf --pages . 1-4,11-z -- output.pdf
# Reverse page order
qpdf input.pdf --pages . z-1 -- reversed.pdf
# Extract even pages only (":even" picks every second page of the range,
# so this works for any page count instead of a hard-coded list)
qpdf input.pdf --pages . 1-z:even -- evens.pdf
# Collate two documents (interleave)
qpdf --collate a.pdf --pages a.pdf b.pdf -- interleaved.pdf

4.3. Merging with QPDF

# Merge entire PDFs
qpdf --empty --pages file1.pdf file2.pdf file3.pdf -- merged.pdf
# Merge with page selections
qpdf --empty --pages \
  doc1.pdf 1-5 \
  doc2.pdf 10-20 \
  doc3.pdf 1-z \
  -- combined.pdf
# Insert pages from one PDF into another
qpdf original.pdf --pages original.pdf 1-5 insert.pdf original.pdf 6-z -- output.pdf

4.4. Rotation

# Rotate all pages 90° clockwise
qpdf input.pdf --rotate=+90 -- output.pdf
# Rotate all pages 90° counter-clockwise
qpdf input.pdf --rotate=-90 -- output.pdf
# Rotate specific pages
qpdf input.pdf --rotate=+90:1,3,5 -- output.pdf
# Rotate page range
qpdf input.pdf --rotate=+180:10-20 -- output.pdf

4.5. Split into Chunks

# Split into single pages
qpdf input.pdf --split-pages output-%d.pdf
# Split into N-page chunks
qpdf input.pdf --split-pages=10 chunk-%d.pdf

4.6. Flatten Forms & Annotations

# Flatten form fields and other annotations into page content (make uneditable).
# --generate-appearances ensures filled-in field values have something to flatten.
qpdf --generate-appearances --flatten-annotations=all input.pdf flattened.pdf
# Flatten only annotations that are visible when printing
# (valid values are all, print and screen — there is no "forms" value)
qpdf --flatten-annotations=print input.pdf output.pdf
# Generate appearance streams (fix display issues)
qpdf --generate-appearances input.pdf output.pdf

5. Ghostscript

Ghostscript (gs) is the Swiss Army knife for PDF conversion and compression.

5.1. Compression Presets

# Screen quality (72 dpi) - smallest file
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
   -dNOPAUSE -dQUIET -dBATCH -sOutputFile=screen.pdf input.pdf
# Ebook quality (150 dpi) - good for tablets
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook \
   -dNOPAUSE -dQUIET -dBATCH -sOutputFile=ebook.pdf input.pdf
# Printer quality (300 dpi)
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/printer \
   -dNOPAUSE -dQUIET -dBATCH -sOutputFile=printer.pdf input.pdf
# Prepress quality (300 dpi, color preserving)
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/prepress \
   -dNOPAUSE -dQUIET -dBATCH -sOutputFile=prepress.pdf input.pdf
# Default (no downsampling)
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default \
   -dNOPAUSE -dQUIET -dBATCH -sOutputFile=default.pdf input.pdf
Preset      DPI   Best For
/screen     72    Web viewing, email
/ebook      150   Tablets, e-readers
/printer    300   Home printing
/prepress   300   Professional printing
/default    -     General use

5.2. Custom Compression

# Fine-tuned compression
gs -sDEVICE=pdfwrite \
   -dCompatibilityLevel=1.4 \
   -dDownsampleColorImages=true \
   -dColorImageResolution=150 \
   -dDownsampleGrayImages=true \
   -dGrayImageResolution=150 \
   -dDownsampleMonoImages=true \
   -dMonoImageResolution=150 \
   -dNOPAUSE -dBATCH -dQUIET \
   -sOutputFile=custom.pdf input.pdf

5.3. Convert to PDF/A (Archival)

gs -dPDFA=2 -dBATCH -dNOPAUSE -dNOOUTERSAVE \
   -sColorConversionStrategy=UseDeviceIndependentColor \
   -sDEVICE=pdfwrite \
   -sOutputFile=archive.pdf input.pdf

5.4. Grayscale Conversion

gs -sDEVICE=pdfwrite \
   -sProcessColorModel=DeviceGray \
   -sColorConversionStrategy=Gray \
   -dOverrideICC \
   -dNOPAUSE -dBATCH -dQUIET \
   -sOutputFile=grayscale.pdf input.pdf

5.5. PDF to PostScript and Back

# PDF to PostScript
# (-o sets the output file and implies -dBATCH -dNOPAUSE, so gs exits when
# done instead of dropping to its interactive GS> prompt)
gs -o output.ps -sDEVICE=ps2write input.pdf
# PostScript to PDF
gs -o output.pdf -sDEVICE=pdfwrite input.ps

5.6. Extract Page Range

gs -sDEVICE=pdfwrite \
   -dFirstPage=5 -dLastPage=15 \
   -dNOPAUSE -dBATCH -dQUIET \
   -sOutputFile=pages5-15.pdf input.pdf

5.7. Repair Damaged PDF

gs -o repaired.pdf -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress damaged.pdf

6. PDFtk

PDFtk (PDF Toolkit) excels at merging, splitting, and form operations.

6.1. Merge

# Merge files
pdftk file1.pdf file2.pdf file3.pdf cat output merged.pdf
# Merge with handles (for complex operations)
pdftk A=file1.pdf B=file2.pdf cat A B output merged.pdf
# Merge specific pages
pdftk A=file1.pdf B=file2.pdf cat A1-5 B10-20 output combined.pdf

6.2. Split

# Split into single pages
pdftk input.pdf burst output page_%02d.pdf
# Extract page range
pdftk input.pdf cat 5-10 output chapter.pdf
# Extract specific pages
pdftk input.pdf cat 1 3 5 7 output odds.pdf
# Extract all except certain pages
pdftk input.pdf cat 1-4 11-end output without5-10.pdf

6.3. Rotate

# Rotate all pages
pdftk input.pdf cat 1-endright output rotated.pdf
# Rotation options: north, south, east, west, left, right, down
# Rotate specific pages
pdftk input.pdf cat 1-4 5right 6-end output mixed.pdf

6.4. Watermark & Stamp

# Background (behind content)
pdftk input.pdf background watermark.pdf output watermarked.pdf
# Stamp (on top of content)
pdftk input.pdf stamp logo.pdf output stamped.pdf
# Multi-page stamp (different stamp per page)
pdftk input.pdf multistamp stamps.pdf output output.pdf

6.5. Form Operations

# Extract form field data
pdftk form.pdf dump_data_fields > fields.txt
# Fill form from FDF file
pdftk form.pdf fill_form data.fdf output filled.pdf
# Generate FDF from filled form
pdftk filled.pdf generate_fdf output data.fdf
# Flatten form (make fields uneditable)
pdftk form.pdf output flat.pdf flatten

6.6. Metadata

# Dump metadata
pdftk input.pdf dump_data output metadata.txt
# Update metadata
pdftk input.pdf update_info metadata.txt output updated.pdf
# Example metadata.txt:
# InfoBegin
# InfoKey: Title
# InfoValue: My Document
# InfoBegin
# InfoKey: Author
# InfoValue: John Doe

6.7. Repair

# Repair PDF structure
pdftk damaged.pdf output repaired.pdf

6.8. Attachments

# Attach file to PDF
pdftk input.pdf attach_files attachment.zip output with_attachment.pdf
# Unpack attachments
pdftk input.pdf unpack_files output ./attachments/

7. OCR with ocrmypdf

Add searchable text layer to scanned PDFs.

7.1. Basic OCR

# Add text layer (English)
ocrmypdf input.pdf output.pdf
# Specify language
ocrmypdf -l eng input.pdf output.pdf
# Multiple languages
ocrmypdf -l eng+deu+fra input.pdf output.pdf
# List available languages
tesseract --list-langs

7.2. OCR Options

# Skip pages that already have text
ocrmypdf --skip-text input.pdf output.pdf
# Force OCR on all pages (even with existing text)
ocrmypdf --force-ocr input.pdf output.pdf
# Redo OCR (replace existing text)
ocrmypdf --redo-ocr input.pdf output.pdf
# Deskew pages (straighten)
ocrmypdf --deskew input.pdf output.pdf
# Clean pages (remove noise)
ocrmypdf --clean input.pdf output.pdf
# Rotate pages to correct orientation
ocrmypdf --rotate-pages input.pdf output.pdf

7.3. Output Options

# Create PDF/A for archival
ocrmypdf --output-type pdfa input.pdf output.pdf
# Optimize file size
ocrmypdf --optimize 3 input.pdf output.pdf
# 0 = no optimization
# 1 = lossless
# 2 = lossy (JPEG quality 75)
# 3 = aggressive lossy
# Keep original quality (no recompression)
ocrmypdf --output-type pdf input.pdf output.pdf

7.4. Processing Options

# Use all CPU cores
ocrmypdf -j 0 input.pdf output.pdf
# Limit to 4 cores
ocrmypdf -j 4 input.pdf output.pdf
# Show progress
ocrmypdf -v 1 input.pdf output.pdf
# Very verbose
ocrmypdf -v 2 input.pdf output.pdf

7.5. Batch OCR

# OCR all PDFs in directory
for pdf in *.pdf; do
  ocrmypdf --skip-text "$pdf" "ocr_$pdf"
done
# Parallel batch OCR
# -maxdepth 1 keeps find out of ocr/ (so earlier results are not re-processed);
# {/} is the input's basename, so every output lands directly in ocr/;
# -print0 / -0 keep unusual filenames intact.
mkdir -p ocr
find . -maxdepth 1 -name '*.pdf' -print0 | parallel -0 ocrmypdf --skip-text {} ocr/{/}

8. Image to PDF

8.1. img2pdf - Lossless Conversion

# Single image
img2pdf image.png -o output.pdf
# Multiple images
img2pdf page1.png page2.png page3.png -o document.pdf
# All images in directory
img2pdf *.png -o combined.pdf
# All images sorted (natural/version order, like `ls -v`, but safe for
# filenames with spaces; needs bash 4.4+ and GNU sort)
mapfile -d '' -t images < <(printf '%s\0' *.png | sort -zV)
img2pdf "${images[@]}" -o sorted.pdf
# Specify page size
img2pdf --pagesize A4 *.jpg -o a4.pdf
# Specify DPI
img2pdf --dpi 300 *.png -o output.pdf
# Auto-orient
img2pdf --auto-orient *.jpg -o output.pdf

8.2. Using ImageMagick

# Convert images to PDF
convert *.png output.pdf
# With specific quality
convert -quality 90 *.jpg output.pdf
# Resize while converting
convert -resize 50% *.png output.pdf

9. Search PDFs

9.1. pdfgrep

# Basic search
pdfgrep "pattern" document.pdf
# With page numbers
pdfgrep -n "pattern" document.pdf
# Case insensitive
pdfgrep -i "pattern" document.pdf
# Recursive search
pdfgrep -r "pattern" ./documents/
# Count matches
pdfgrep -c "pattern" document.pdf
# Show context (lines before/after)
pdfgrep -C 2 "pattern" document.pdf
# Regular expression
pdfgrep -P "\d{3}-\d{4}" document.pdf
# Search multiple files
pdfgrep "pattern" *.pdf
# Only show filenames
pdfgrep -l "pattern" *.pdf

9.2. Search with pdftotext + grep

# Extract and search
pdftotext document.pdf - | grep "pattern"
# With line numbers
pdftotext document.pdf - | grep -n "pattern"
# Search all PDFs in directory
for pdf in *.pdf; do
  echo "=== $pdf ===" && pdftotext "$pdf" - | grep -i "pattern"
done

10. Metadata

10.1. View Metadata

# Using pdfinfo
pdfinfo document.pdf
# Using exiftool (more detailed)
exiftool document.pdf
# Using qpdf
qpdf --show-object=trailer document.pdf

10.2. Edit Metadata with exiftool

# Set title
exiftool -Title="My Document" document.pdf
# Set author
exiftool -Author="John Doe" document.pdf
# Set multiple fields
exiftool -Title="Doc" -Author="Me" -Subject="Test" document.pdf
# Clear all metadata
# NOTE: ExifTool edits PDFs by appending an incremental update, so the old
# metadata is still recoverable from the file. To drop it for good, rewrite
# the result afterwards, e.g.: qpdf --linearize document.pdf clean.pdf
exiftool -all= document.pdf
# Copy metadata from one PDF to another
exiftool -TagsFromFile source.pdf dest.pdf

10.3. Edit Metadata with qpdf

# Create metadata JSON
cat > meta.json << 'EOF'
{
  "/Title": "My Document",
  "/Author": "John Doe",
  "/Subject": "Documentation",
  "/Keywords": "pdf, manual",
  "/Creator": "Custom Tool"
}
EOF
# Apply metadata (via replacement)
# NOTE(review): --replace-object does not appear among qpdf's documented
# command-line options, and object 1 is not guaranteed to be the Info
# dictionary — verify against `qpdf --help=all` before relying on this.
# qpdf 11+ documents --update-from-json (qpdf JSON v2 input) for object edits;
# exiftool (section 10.2) is the simpler route for plain metadata changes.
qpdf input.pdf --replace-object 1 meta.json -- output.pdf

11. Compression Workflows

11.1. Compare Compression Results

#!/bin/bash
# compare-compression.sh
# Usage: compare-compression.sh INPUT.pdf
# Re-encodes INPUT.pdf with each Ghostscript preset and prints the resulting
# file sizes next to the original's.
input="${1:?usage: compare-compression.sh INPUT.pdf}"
if [[ ! -f "$input" ]]; then
  printf 'error: not a file: %s\n' "$input" >&2
  exit 1
fi

# Private scratch directory instead of predictable /tmp/<preset>.pdf names;
# the trap removes it on every exit path.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT

echo "Original: $(du -h "$input" | cut -f1)"

for preset in screen ebook printer prepress; do
  out="$workdir/$preset.pdf"
  if gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 "-dPDFSETTINGS=/$preset" \
        -dNOPAUSE -dQUIET -dBATCH -sOutputFile="$out" "$input"; then
    echo "$preset: $(du -h "$out" | cut -f1)"
  else
    # Report the failure instead of printing the size of a missing/partial file
    printf '%s: ghostscript failed\n' "$preset" >&2
  fi
done

11.2. Aggressive Compression

# Maximum compression
gs -sDEVICE=pdfwrite \
   -dCompatibilityLevel=1.4 \
   -dPDFSETTINGS=/screen \
   -dDownsampleColorImages=true \
   -dColorImageResolution=72 \
   -dDownsampleGrayImages=true \
   -dGrayImageResolution=72 \
   -dDownsampleMonoImages=true \
   -dMonoImageResolution=72 \
   -dColorImageDownsampleType=/Bicubic \
   -dGrayImageDownsampleType=/Bicubic \
   -dMonoImageDownsampleType=/Bicubic \
   -dNOPAUSE -dBATCH -dQUIET \
   -sOutputFile=tiny.pdf large.pdf

11.3. Compression Function

# Add to ~/.bashrc (bash syntax; fish needs its own `function ... end` version)
#######################################
# Compress a PDF with one of Ghostscript's quality presets.
# The result is written next to the input as <name>-<preset>.pdf.
# Arguments:
#   $1 - input PDF
#   $2 - preset: screen | ebook | printer | prepress | default (default: ebook)
# Outputs:
#   Original and compressed sizes on stdout; diagnostics on stderr
# Returns:
#   0 on success, 2 on bad arguments, Ghostscript's status if gs fails
#######################################
pdf-compress() {
  local input="${1:-}"
  local quality="${2:-ebook}"  # default: ebook
  local output

  if [[ -z "$input" ]]; then
    printf 'usage: pdf-compress INPUT.pdf [screen|ebook|printer|prepress|default]\n' >&2
    return 2
  fi
  if [[ ! -f "$input" ]]; then
    printf 'pdf-compress: not a file: %s\n' "$input" >&2
    return 2
  fi
  # Reject unknown presets up front rather than letting gs fail mid-run
  case "$quality" in
    screen|ebook|printer|prepress|default) ;;
    *)
      printf 'pdf-compress: unknown preset: %s\n' "$quality" >&2
      return 2
      ;;
  esac

  output="${input%.pdf}-${quality}.pdf"

  # Propagate gs's exit status; don't report sizes for a failed conversion
  gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 "-dPDFSETTINGS=/$quality" \
     -dNOPAUSE -dQUIET -dBATCH -sOutputFile="$output" "$input" || return

  echo "Original: $(du -h "$input" | cut -f1)"
  echo "Compressed: $(du -h "$output" | cut -f1)"
}

12. Encryption & Security

12.1. Encrypt with QPDF

# 256-bit AES encryption
qpdf --encrypt user-password owner-password 256 -- input.pdf encrypted.pdf
# With permissions
qpdf --encrypt user-pass owner-pass 256 \
  --print=none \
  --modify=none \
  --extract=n \
  -- input.pdf protected.pdf
# Allow printing only
qpdf --encrypt "" owner-pass 256 \
  --print=full \
  --modify=none \
  -- input.pdf printable.pdf

12.2. Permission Options

Option       Values
--print      none, low, full
--modify     none, assembly, form, annotate, all
--extract    y, n
--annotate   y, n
--form       y, n

12.3. Decrypt

# Decrypt with password
qpdf --decrypt --password=secret encrypted.pdf decrypted.pdf
# Check if encrypted
qpdf --is-encrypted input.pdf && echo "Encrypted" || echo "Not encrypted"
# Show encryption details
qpdf --show-encryption input.pdf

12.4. Encrypt with pdftk

# Set passwords
pdftk input.pdf output protected.pdf \
  user_pw userpass owner_pw ownerpass
# Set permissions
pdftk input.pdf output protected.pdf \
  owner_pw secret \
  allow printing

13. Repair Corrupt PDFs

13.1. Method 1: QPDF

# Linearize (often fixes issues)
qpdf --linearize damaged.pdf repaired.pdf
# QDF mode (normalized structure)
qpdf --qdf damaged.pdf repaired.pdf
# Rewrite the file in place (qpdf attempts recovery of damaged files by
# default; --replace-input only means "overwrite the input". If qpdf issued
# warnings, the original is kept as damaged.pdf.~qpdf-orig)
qpdf --replace-input damaged.pdf

13.2. Method 2: Ghostscript

# Re-render PDF
gs -o repaired.pdf -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress damaged.pdf

13.3. Method 3: pdftk

# Simple repair
pdftk damaged.pdf output repaired.pdf

13.4. Method 4: MuPDF

# Clean and repair
mutool clean damaged.pdf repaired.pdf

13.5. Validation

# Check PDF validity
qpdf --check file.pdf
# Detailed warnings
qpdf --check --verbose file.pdf

14. Batch Operations

14.1. Compress All PDFs

#!/bin/bash
# compress-all.sh
# Compress every PDF in the current directory into ./compressed/ using the
# Ghostscript /ebook preset, then print a size comparison.

# Without nullglob an empty directory would run gs on a literal "*.pdf"
shopt -s nullglob

mkdir -p compressed || exit 1
failed=0
for pdf in *.pdf; do
  echo "Compressing: $pdf"
  if ! gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook \
          -dNOPAUSE -dQUIET -dBATCH \
          -sOutputFile="compressed/$pdf" "$pdf"; then
    printf 'Failed: %s\n' "$pdf" >&2
    rm -f -- "compressed/$pdf"  # don't leave a truncated output behind
    failed=1
  fi
done

echo "Done. Size comparison:"
du -sh . compressed/
# Final status is non-zero if any file failed to compress
[[ "$failed" -eq 0 ]]

14.2. OCR All Scanned PDFs

#!/bin/bash
# ocr-all.sh
# OCR every PDF in the current directory into ./ocr/, leaving pages that
# already carry text untouched (--skip-text).

# Without nullglob an empty directory would run ocrmypdf on a literal "*.pdf"
shopt -s nullglob

mkdir -p ocr || exit 1
failed=0
for pdf in *.pdf; do
  echo "OCR: $pdf"
  if ! ocrmypdf --skip-text -j 4 "$pdf" "ocr/$pdf"; then
    printf 'Failed: %s\n' "$pdf" >&2
    failed=1
  fi
done
# Final status is non-zero if any file failed
[[ "$failed" -eq 0 ]]

14.3. Convert All to Images

#!/bin/bash
# pdf-to-images.sh
# Render every PDF in the current directory to 300 DPI PNGs, one
# sub-directory per document (doc.pdf -> doc/page-N.png).

# Without nullglob an empty directory would create a directory named "*"
shopt -s nullglob

for pdf in *.pdf; do
  dir="${pdf%.pdf}"
  mkdir -p -- "$dir" || continue
  # Only announce success when pdftoppm actually succeeded
  if pdftoppm -png -r 300 "$pdf" "$dir/page"; then
    echo "Converted: $pdf → $dir/"
  else
    printf 'Failed: %s\n' "$pdf" >&2
  fi
done

14.4. Merge All PDFs in Directory

# Both variants build a NUL-delimited list (safe for names with spaces) and
# exclude combined.pdf so an earlier result is never merged into itself.
# Requires bash 4.4+ (mapfile -d) and GNU find/sort/cut.
# Sorted by name (natural/version order, like `ls -v`)
mapfile -d '' -t pdfs < <(find . -maxdepth 1 -name '*.pdf' ! -name combined.pdf -print0 | sort -zV)
pdfunite "${pdfs[@]}" combined.pdf
# Sorted by date (newest first, like `ls -t`)
mapfile -d '' -t pdfs < <(find . -maxdepth 1 -name '*.pdf' ! -name combined.pdf -printf '%T@\t%p\0' | sort -zrn | cut -z -f2-)
pdfunite "${pdfs[@]}" combined.pdf

14.5. Parallel Processing

# Using GNU parallel
# -maxdepth 1 keeps find out of ocr/ (so earlier results are not re-processed);
# {/} is the input's basename, so every output lands directly in ocr/;
# -print0 / -0 keep unusual filenames intact.
mkdir -p ocr
find . -maxdepth 1 -name '*.pdf' -print0 | parallel -0 -j 4 ocrmypdf --skip-text {} ocr/{/}
# Compress in parallel
# The same -maxdepth 1 guard matters more here: without it, a file already in
# compressed/ would be read and overwritten by the same gs process.
# parallel shell-quotes {} and {/} itself, so they must not be wrapped in
# additional quotes (that would corrupt names containing spaces).
mkdir -p compressed
find . -maxdepth 1 -name '*.pdf' -print0 | parallel -0 -j 4 \
  gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dNOPAUSE -dBATCH -dQUIET \
     -sOutputFile=compressed/{/} {}

15. Common Workflows

15.1. Scanned Document to Searchable PDF

#!/bin/bash
# scan-to-searchable.sh
# Usage: scan-to-searchable.sh INPUT.pdf
# Writes INPUT-searchable.pdf next to the input.

input="${1:?usage: scan-to-searchable.sh INPUT.pdf}"
output="${input%.pdf}-searchable.pdf"

# One ocrmypdf pass does all three steps:
# 1. Deskew and clean
# 2. OCR with English
# 3. Optimize size
if ! ocrmypdf --deskew --clean --optimize 2 -l eng "$input" "$output"; then
  status=$?
  printf 'error: ocrmypdf failed for %s\n' "$input" >&2
  exit "$status"
fi

# Reached only when ocrmypdf succeeded
echo "Created: $output"

15.2. Book Scan Processing

#!/bin/bash
# process-book.sh
# Usage: process-book.sh INPUT.pdf
# OCRs and cleans a scanned book, then recompresses it for e-readers.
# The result is written as processed-<name> beside the input.

input="${1:?usage: process-book.sh INPUT.pdf}"

# Prefix the file name, not the whole path: "scans/book.pdf" must become
# "scans/processed-book.pdf", not "processed-scans/book.pdf".
case "$input" in
  */*) output="${input%/*}/processed-${input##*/}" ;;
  *)   output="processed-${input}" ;;
esac

# Private scratch directory (no predictable /tmp/ocr.pdf that concurrent runs
# would share); the trap removes it on every exit path, including failures.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT
ocr_pdf="$workdir/ocr.pdf"

# 1. OCR
# 2. Deskew
# 3. Clean
# 4. Optimize for ebook
ocrmypdf \
  --deskew \
  --clean \
  --rotate-pages \
  --optimize 2 \
  -l eng \
  "$input" "$ocr_pdf" || exit

# 5. Further compress (skipped if the OCR step failed)
gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook \
   -dNOPAUSE -dQUIET -dBATCH \
   -sOutputFile="$output" "$ocr_pdf" || exit

echo "Processed: $output ($(du -h "$output" | cut -f1))"

15.3. Extract Chapter from Book

# Extract pages 50-100 from book
qpdf book.pdf --pages . 50-100 -- chapter3.pdf

15.4. Create Handout (Multiple Pages Per Sheet)

# Using pdfnup (from pdfjam)
pdfnup --nup 2x2 presentation.pdf --outfile handout.pdf

15.5. Combine Images into Booklet

# 1. Convert images to PDF (natural/version order, like `ls -v`, but safe for
#    filenames with spaces; needs bash 4.4+ and GNU sort)
mapfile -d '' -t pages < <(printf '%s\0' *.jpg | sort -zV)
img2pdf --auto-orient "${pages[@]}" -o temp.pdf
# 2. OCR if needed
ocrmypdf --skip-text temp.pdf final.pdf
# 3. Cleanup
rm temp.pdf

16. Tool Comparison

Task           Poppler       QPDF                    GS              PDFtk
View info      pdfinfo       --show-npages           -               dump_data
Split pages    pdfseparate   --split-pages           -               burst
Merge          pdfunite      --pages                 -               cat
Rotate         -             --rotate                -               cat Xright
Compress       -             -                       -dPDFSETTINGS   -
Encrypt        -             --encrypt               -               user_pw
Repair         -             --linearize             (rerender)      (reprocess)
Extract text   pdftotext     -                       -               -
To images      pdftoppm      -                       -               -
Forms          -             --flatten-annotations   -               fill_form