Git Filter-Repo
Repository surgery with git-filter-repo: remove secrets, extract projects, rewrite authors.
Filter-repo Fundamentals
# INSTALL
pip install git-filter-repo # Via pip
pacman -S git-filter-repo # Arch Linux
brew install git-filter-repo # macOS
# VERIFY INSTALLATION
git filter-repo --version
# BASIC SYNTAX
git filter-repo [options]
# KEY OPTIONS
# --invert-paths Exclude matching paths (delete them)
# --path PATH Include/exclude specific paths
# --subdirectory-filter DIR Keep only subdirectory
# --path-rename OLD:NEW Rename paths
# --message-callback Modify commit messages
# --email-callback Modify emails
# --name-callback Modify names
# --force Run without fresh clone check
# ALWAYS START WITH FRESH CLONE
git clone --mirror git@github.com:user/repo.git
cd repo.git
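# Run filter-repo commands
# filter-repo removes the 'origin' remote after rewriting; re-add it before pushing
git remote add origin git@github.com:user/repo.git
git push --force --all
git push --force --tags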
Remove Secrets from History
# SCENARIO: Accidentally committed .env file
# 1. Clone fresh (mirror clone for complete history)
git clone --mirror git@github.com:user/repo.git
cd repo.git
# 2. Remove the sensitive file from ALL history
git filter-repo --invert-paths --path .env
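# 3. Force push (rewrites remote history)
# filter-repo removed the 'origin' remote, so restore it first
git remote add origin git@github.com:user/repo.git
git push --force --all
git push --force --tags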
# 4. All collaborators must re-clone!
# Their local repos are now incompatible
# REMOVE MULTIPLE SENSITIVE FILES
git filter-repo \
--invert-paths \
--path .env \
--path secrets.yaml \
--path credentials.json \
--path config/api_keys.txt
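# REMOVE BY PATTERN
# --path matches exact paths and directory prefixes only; for patterns use
# --path-glob (shell-style globs) or --path-regex (regular expressions):
git filter-repo --invert-paths --path-glob '*.key'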
git filter-repo --invert-paths --path-regex '.*\.key$'
git filter-repo --invert-paths --path-regex 'secrets/.*'
# REMOVE DIRECTORY
git filter-repo --invert-paths --path secrets/
git filter-repo --invert-paths --path config/private/
# AFTER REMOVING SECRETS
# 1. Rotate all exposed credentials
# 2. Revoke tokens/keys
# 3. Update .gitignore
# 4. Consider secrets exposure in logs/backups
# VERIFY REMOVAL
git log --all --full-history -- .env
# Should show no results
Extract Subdirectory to New Repository
# SCENARIO: Want to make src/myapp its own repository
# 1. Clone the repository
git clone git@github.com:user/monorepo.git
cd monorepo
# 2. Extract subdirectory (becomes root)
git filter-repo --subdirectory-filter src/myapp
# 3. Result
# Before: monorepo/
# ├── src/
# │   ├── myapp/
# │   │   ├── main.py
# │   │   └── utils.py
# │   └── other/
# └── docs/
#
# After: monorepo/
# ├── main.py
# └── utils.py
# 4. Point at the new remote and push
# (filter-repo already removed 'origin', so there is nothing to remove first)
git remote add origin git@github.com:user/myapp.git
git push -u origin main
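# KEEP MULTIPLE SUBDIRECTORIES
# --subdirectory-filter re-roots the repo on ONE directory; to keep several
# directories (at their original paths), list them with --path instead:
git filter-repo \
--path src/myapp/ \
--path src/shared/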
# EXTRACT AND PRESERVE PATH
# If you want to keep the path structure:
git filter-repo --path src/myapp/
# Result: src/myapp/* stays as src/myapp/*
# EXTRACT WITH RENAME
git filter-repo \
--subdirectory-filter src/myapp \
--path-rename '':app/
# Moves files into app/ subdirectory
Rename and Reorganize Paths
# RENAME DIRECTORY IN HISTORY
git filter-repo --path-rename old-name/:new-name/
# EXAMPLE: lib/ → src/
git filter-repo --path-rename lib/:src/
# MULTIPLE RENAMES
git filter-repo \
--path-rename lib/:src/lib/ \
--path-rename tests/:src/tests/ \
--path-rename docs/:documentation/
# FLATTEN DIRECTORY STRUCTURE
# Remove a directory level:
git filter-repo --path-rename src/app/:app/
# NEST INTO SUBDIRECTORY
# Add a directory level:
git filter-repo --path-rename '':project/
# RENAME FILE PATTERN
# Use callback for complex patterns:
git filter-repo --filename-callback '
return filename.replace(b".jsx", b".tsx") if filename.endswith(b".jsx") else filename
'
# CASE CONVERSION
git filter-repo --filename-callback '
return filename.lower()
'
Rewrite Author Information
# FIX AUTHOR EMAIL
git filter-repo --email-callback '
return email.replace(b"old@example.com", b"new@example.com")
'
# FIX MULTIPLE EMAILS
git filter-repo --email-callback '
email_map = {
b"old1@example.com": b"correct@example.com",
b"old2@work.com": b"correct@example.com",
b"typo@exmaple.com": b"correct@example.com",
}
return email_map.get(email, email)
'
# FIX AUTHOR NAME
git filter-repo --name-callback '
return name.replace(b"oldname", b"New Name")
'
# FIX BOTH NAME AND EMAIL
# Use mailmap file approach:
# Create .mailmap file:
# New Name <new@email.com> Old Name <old@email.com>
# Another <another@email.com> <wrong@email.com>
git filter-repo --mailmap .mailmap
# EXAMPLE .mailmap
# Evan Rosado <evan@example.com> <evanusmodestus@localhost>
# Evan Rosado <evan@example.com> <evan@old-domain.com>
# Evan Rosado <evan@example.com> root <root@localhost>
# NORMALIZE ALL EMAILS TO LOWERCASE
git filter-repo --email-callback '
return email.lower()
'
# REPLACE DOMAIN IN ALL EMAILS
git filter-repo --email-callback '
return email.replace(b"@old-company.com", b"@new-company.com")
'
Rewrite Commit Messages
# ADD PREFIX TO ALL MESSAGES
git filter-repo --message-callback '
return b"[MIGRATED] " + message
'
# REPLACE TEXT IN MESSAGES
git filter-repo --message-callback '
return message.replace(b"old-project", b"new-project")
'
# FIX TICKET REFERENCES
git filter-repo --message-callback '
return message.replace(b"JIRA-", b"PROJ-")
'
# REMOVE SENSITIVE CONTENT FROM MESSAGES
git filter-repo --message-callback '
import re
# Remove API keys from messages
message = re.sub(rb"api[_-]?key[=:]\s*\S+", b"api_key=REDACTED", message, flags=re.I)
return message
'
# ADD SIGN-OFF TO ALL COMMITS
git filter-repo --message-callback '
if b"Signed-off-by:" not in message:
    message = message.rstrip() + b"\n\nSigned-off-by: Name <email@example.com>\n"
return message
'
# STANDARDIZE MESSAGE FORMAT
git filter-repo --message-callback '
# Ensure message ends with single newline
return message.rstrip() + b"\n"
'
# REMOVE CO-AUTHORED-BY
git filter-repo --message-callback '
lines = message.split(b"\n")
# Trailer keys are matched case-insensitively; the common spelling is "Co-authored-by:"
lines = [l for l in lines if not l.lower().startswith(b"co-authored-by:")]
return b"\n".join(lines)
'
Advanced Operations
# ANALYZE REPOSITORY FIRST
git filter-repo --analyze
# Creates .git/filter-repo/analysis/ with:
# - path-all-sizes.txt (all files ever)
# - blob-shas-and-paths.txt
# - directories-all-sizes.txt
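# FIND LARGEST FILES
# (in a bare/mirror clone the directory is filter-repo/analysis/, without .git/)
head -n 20 "$(git rev-parse --git-dir)/filter-repo/analysis/path-all-sizes.txt"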
# REMOVE LARGE FILES
git filter-repo --invert-paths --path large-file.zip
git filter-repo --strip-blobs-bigger-than 50M
# REMOVE FILES BY EXTENSION
git filter-repo --invert-paths --path-regex '.*\.(zip|tar\.gz|exe)$'
# COMBINE MULTIPLE OPERATIONS
git filter-repo \
--invert-paths --path secrets/ \
--invert-paths --path-regex '.*\.key$' \
--path-rename lib/:src/lib/ \
--strip-blobs-bigger-than 10M
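# KEEP COMMIT-HASH REFERENCES IN MESSAGES UNCHANGED
git filter-repo --preserve-commit-hashes --force
# NOTE: This does NOT keep commit IDs stable. It only stops filter-repo from
# updating old commit hashes mentioned in commit messages to their new values.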
# LIMIT TO SPECIFIC REFS
git filter-repo --refs refs/heads/main --invert-paths --path secrets/
# Only rewrite main branch
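# DRY RUN
# --dry-run leaves the repository unchanged and saves the original and
# filtered fast-export streams under filter-repo/ in the git dir for comparison:
git filter-repo --dry-run [options]
# For a real run, keep a backup to compare against:
git clone --mirror git@github.com:user/repo.git backup-repo.git # Backup first
git clone --mirror git@github.com:user/repo.git
cd repo.git
git filter-repo [options]
# Compare with backup if needed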
# CALLBACKS WITH COMPLEX LOGIC
git filter-repo --commit-callback '
# Skip commits from bots
if b"[bot]" in commit.author_email:
    commit.skip()
'
# REMOVE EMPTY COMMITS
git filter-repo --prune-empty=always
# 'always' drops every empty commit, including ones that were already empty
# before filtering; the default ('auto') only drops commits that become empty
Infrastructure Repository Cleanup
# SCENARIO: Old domus-* repo has sensitive data in history
# 1. Backup first!
cp -r ~/atelier/_bibliotheca/domus-infra-ops ~/backup-domus-infra-ops
# 2. Clone fresh
git clone --mirror git@github.com:EvanusModestus/domus-infra-ops.git
cd domus-infra-ops.git
# 3. Analyze what's in history
git filter-repo --analyze
head -n 30 filter-repo/analysis/path-all-sizes.txt # mirror clone is bare: no .git/ prefix
# 4. Remove sensitive patterns
git filter-repo \
--invert-paths --path '.env' \
--invert-paths --path 'credentials.json' \
--invert-paths --path-regex '.*password.*\.txt$'
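# 5. Force push to remotes
# filter-repo removed the 'origin' remote, so restore it first
git remote add origin git@github.com:EvanusModestus/domus-infra-ops.git
SSH_AUTH_SOCK=/run/user/1000/ssh-agent.socket git push --force --all origin
SSH_AUTH_SOCK=/run/user/1000/ssh-agent.socket git push --force --tags origin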
# 6. Re-clone on all machines
rm -rf ~/atelier/_bibliotheca/domus-infra-ops
git clone git@github.com:EvanusModestus/domus-infra-ops.git ~/atelier/_bibliotheca/domus-infra-ops
# RENAME REPO PATHS IN HISTORY
# Example: Moving from old structure to Antora structure
git filter-repo --path-rename 'docs/':docs/asciidoc/modules/ROOT/
# CONSOLIDATE REPOS
# Merge repo2 into repo1 preserving history:
# (In repo1)
git remote add repo2 ../repo2
git fetch repo2
git merge repo2/main --allow-unrelated-histories
git remote remove repo2
# Then use filter-repo to reorganize paths if needed
# CLEAN ANTORA BUILD ARTIFACTS FROM HISTORY
# If build/ was accidentally committed:
git filter-repo --invert-paths --path build/
# REMOVE OLD VAULT TOKENS FROM COMMIT MESSAGES
# (--message-callback does not touch file contents; use --replace-text for those)
git filter-repo --message-callback '
import re
message = re.sub(rb"hvs\.[a-zA-Z0-9]+", b"hvs.REDACTED", message)
return message
'
Filter-repo Gotchas
# WRONG: Running on existing checkout
git filter-repo --invert-paths --path .env
# Error: Refusing to run on non-fresh clone
# CORRECT: Use fresh clone or --force
git clone --mirror origin repo.git
cd repo.git
git filter-repo --invert-paths --path .env
# OR (dangerous):
git filter-repo --force --invert-paths --path .env
# WRONG: Forgetting to push tags
git filter-repo --invert-paths --path secrets/
git push --force --all
# Tags still contain old history!
# CORRECT: Push tags too
git push --force --tags
# WRONG: Expecting collaborators to auto-sync
git filter-repo --invert-paths --path .env
git push --force --all
# Others' repos now broken!
# CORRECT: Coordinate with team
# 1. Notify everyone
# 2. Run filter-repo
# 3. Everyone must re-clone
# 4. Old branches/stashes have old history
# WRONG: Not backing up first
git filter-repo --strip-blobs-bigger-than 1M
# Oops, removed important files!
# CORRECT: Always backup
cp -r repo repo-backup
# OR: git clone --mirror original backup.git
# WRONG: Running analyze after filter
git filter-repo --invert-paths --path file
git filter-repo --analyze
# Analysis shows filtered state, not original
# CORRECT: Analyze first
git filter-repo --analyze # See what exists
# Then filter
# WRONG: Expecting immediate effect on GitHub/GitLab
# Servers may cache old refs temporarily
# CORRECT: Wait or request GC
# GitHub: Contact support for repository GC
# GitLab: Admin can trigger manual GC
# WRONG: Using with submodules
git filter-repo --subdirectory-filter src/
# Submodule references may break
# CORRECT: Handle submodules separately
# Remove submodule refs or update them manually
Quick Reference
# SETUP (always fresh clone)
git clone --mirror git@github.com:user/repo.git
cd repo.git
# ANALYZE FIRST
git filter-repo --analyze
cat filter-repo/analysis/path-all-sizes.txt # bare mirror clone: no .git/ prefix
# REMOVE PATHS
git filter-repo --invert-paths --path .env
git filter-repo --invert-paths --path secrets/
git filter-repo --invert-paths --path-regex '.*\.key$'
# EXTRACT SUBDIRECTORY
git filter-repo --subdirectory-filter src/app
# RENAME PATHS
git filter-repo --path-rename old/:new/
# FIX AUTHORS
git filter-repo --mailmap .mailmap
git filter-repo --email-callback 'return email.replace(b"old", b"new")'
# FIX MESSAGES
git filter-repo --message-callback 'return b"prefix: " + message'
# REMOVE LARGE FILES
git filter-repo --strip-blobs-bigger-than 50M
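# FORCE PUSH (REQUIRED AFTER ALL OPERATIONS)
# filter-repo removes the 'origin' remote; re-add it first
git remote add origin git@github.com:user/repo.git
git push --force --all origin
git push --force --tags origin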
# TEAM ACTIONS AFTER
# 1. All collaborators re-clone
# 2. Old branches/stashes are invalid
# 3. Old PRs may break
# 4. CI/CD may need updates