Linux Process Management

Process management, signals, and resource control.

Process Inspection

# PS - PROCESS STATUS
ps aux                                   # All processes, user format
ps auxf                                  # With tree (forest)
ps -ef                                   # POSIX format
ps -eo pid,ppid,user,%cpu,%mem,cmd      # Custom columns

# USEFUL PS COLUMN COMBOS
ps -eo pid,ppid,user,stat,%cpu,%mem,vsz,rss,tty,start,time,cmd --sort=-%mem | head -20
# stat: process state
# vsz: virtual memory
# rss: resident (actual) memory
# time: cumulative CPU time

# PROCESS STATES:
# R = Running
# S = Sleeping (waiting for event)
# D = Uninterruptible sleep (usually I/O)
# Z = Zombie (terminated, waiting for parent)
# T = Stopped (suspended)

# FIND SPECIFIC PROCESSES
ps aux | grep -v grep | grep nginx      # Filter by name
pgrep -a nginx                          # Better: shows command
pgrep -f "python app.py"                # Match full command line
pidof nginx                             # Just PIDs

# PSTREE - Process hierarchy
pstree                                   # Full tree
pstree -p                                # With PIDs
pstree -p -a                             # With arguments
pstree -u user                           # Only user's process trees (-u also marks UID transitions)

# TOP/HTOP - Interactive monitoring
top                                      # Basic
htop                                     # Better (colors, mouse)

# TOP KEYBOARD SHORTCUTS:
# M = Sort by memory
# P = Sort by CPU
# k = Kill process
# 1 = Show individual CPUs
# c = Show full command

# HTOP ADDITIONS:
# F5 = Tree view
# F6 = Sort by column
# F9 = Kill (signal selection)
# / = Search

# PROCESS INFO FROM /proc
ls /proc/1234/                          # Process directory
cat /proc/1234/cmdline | tr '\0' ' '    # Command line
cat /proc/1234/environ | tr '\0' '\n'   # Environment
cat /proc/1234/status                   # Detailed status
cat /proc/1234/limits                   # Resource limits
ls -l /proc/1234/fd                     # Open files
ls -l /proc/1234/cwd                    # Working directory

# MEMORY DETAILS
cat /proc/1234/maps                     # Memory mappings
cat /proc/1234/smaps | grep -E '^(Size|Rss|Pss):'  # Memory stats

# INFRASTRUCTURE: Find memory-hungry processes
ps -eo pid,user,%mem,rss,cmd --sort=-%mem | head -10 | \
    awk 'NR>1 {printf "%5s %10s %5s%% %10.1f MB  %s\n", $1, $2, $3, $4/1024, $5}'

Process Control

# SIGNALS
# Common signals:
# 1  SIGHUP   - Hangup (reload config for many daemons)
# 2  SIGINT   - Interrupt (Ctrl+C)
# 9  SIGKILL  - Kill immediately (cannot be caught)
# 15 SIGTERM  - Terminate gracefully (default)
# 18 SIGCONT  - Continue (resume stopped process)
# 19 SIGSTOP  - Stop (cannot be caught)

# KILL COMMANDS
kill PID                                 # SIGTERM (graceful)
kill -9 PID                              # SIGKILL (force)
kill -HUP PID                            # SIGHUP (reload)
kill -STOP PID                           # Suspend
kill -CONT PID                           # Resume

# KILL BY NAME
pkill nginx                              # SIGTERM to all nginx
pkill -9 -f "python app.py"             # SIGKILL by command
killall nginx                            # Kill all with exact name

# KILL WITH CONFIRMATION
pgrep -a nginx                           # See what will be killed
pkill nginx                              # Then kill

# BACKGROUND/FOREGROUND
command &                                # Run in background
jobs                                     # List background jobs
fg %1                                    # Bring job 1 to foreground
bg %1                                    # Continue job 1 in background
Ctrl+Z                                   # Suspend current process

# DISOWN - Survive logout
command &                                # Start in background
disown %1                                # Detach from shell
# Or
nohup command &                          # Redirect output, ignore hangup
nohup command > /tmp/out.log 2>&1 &      # Explicit redirect

# WAIT FOR PROCESSES
wait $PID                                # Wait for specific PID
wait                                     # Wait for all background jobs

# TIMEOUT
timeout 30 command                       # Kill after 30 seconds
timeout -s KILL 30 command              # Use SIGKILL
timeout --preserve-status 30 command    # On timeout, exit with command's status instead of 124

# INFRASTRUCTURE: Kill runaway processes
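# Find processes using >50% CPU that have been running for >1 minute
# (etimes = elapsed time in seconds; etime's [[DD-]hh:]mm:ss text is not reliably filterable)
ps -eo pid,user,%cpu,etimes,cmd --sort=-%cpu | \
    awk 'NR>1 && $3>50 && $4>60 {print "RUNAWAY:", $0}'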

# Kill zombie processes (must kill parent!)
ps -eo pid,ppid,stat,comm | awk '$3 ~ /^Z/ {print $1, $2, $4}'   # PID, parent PID, name
# Signal or restart the parent (PPID) so it reaps the zombie

Process Priority

# NICE - CPU priority
# Range: -20 (highest) to 19 (lowest), default 0

nice -n 10 command                       # Start with low priority
nice -n -5 command                       # High priority (needs root)
renice -n 10 -p 1234                     # Change running process
renice -n 5 -u username                  # All processes for user

# VIEW NICE VALUES
ps -eo pid,ni,cmd | head -20             # ni = nice value
top                                       # NI column

# IONICE - I/O priority
# Classes: 1=realtime, 2=best-effort, 3=idle

ionice -c 3 command                      # Idle I/O (won't block others)
ionice -c 2 -n 7 command                 # Best-effort, low priority
ionice -c 2 -n 0 command                 # Best-effort, high priority
ionice -p 1234                           # Show current for PID

# INFRASTRUCTURE: Low-priority backup
nice -n 19 ionice -c 3 rsync -av /data /backup

# CPULIMIT - Limit CPU percentage
# Install: pacman -S cpulimit
cpulimit -p 1234 -l 50                   # Limit PID to 50% CPU
cpulimit -e nginx -l 30                  # Limit by name

# PROCESSOR AFFINITY (pin to CPUs)
taskset -c 0,1 command                   # Run on CPUs 0 and 1
taskset -pc 0,1 1234                     # Change running process
taskset -p 1234                          # Show current affinity

# CHRT - Real-time priority
# DANGEROUS: Can hang system if misused
chrt -f 1 command                        # FIFO scheduler, priority 1
chrt -r 1 command                        # Round-robin scheduler
chrt -p 1234                             # Show current

# RESOURCE LIMITS (ulimit)
ulimit -a                                # Show all limits
ulimit -n                                # Open files limit
ulimit -n 65536                          # Increase (current shell)

# PERMANENT LIMITS: /etc/security/limits.conf
# evanusmodestus  soft  nofile  65536
# evanusmodestus  hard  nofile  65536
# *               soft  nproc   4096

# CGROUPS v2 (systemd)
# View resource usage
systemctl status sshd                    # Shows cgroup
systemd-cgtop                            # Top for cgroups

# Set limits via systemd
systemctl set-property nginx.service CPUQuota=50%
systemctl set-property nginx.service MemoryMax=512M

Process Debugging

# STRACE - System call tracer
strace command                           # Trace new process
strace -p 1234                           # Attach to running process
strace -f command                        # Follow forks

# Useful strace options
strace -e trace=open,openat command      # Only file opens (modern libc uses openat)
strace -e network command                # Network calls
strace -e trace=file command             # File-related calls
strace -e trace=process command          # Process calls (fork, exec)

strace -c command                        # Statistics summary
strace -t command                        # Timestamps
strace -T command                        # Time per call

# COMMON STRACE PATTERNS
# What files is it opening?
strace -e openat,open command 2>&1 | grep -v ENOENT

# What network connections?
strace -e connect,accept command

# Why is it slow?
strace -c -p 1234                        # Attach and get stats
# Look for high count on slow calls (read, write, poll)

# What config is it reading?
strace -e openat cat /etc/passwd 2>&1 | head -10

# LTRACE - Library call tracer
ltrace command                           # Trace library calls
ltrace -e malloc command                 # Specific function

# LSOF - List open files
lsof                                     # All open files (huge!)
lsof -p 1234                             # Files for specific PID
lsof -u username                         # Files for user
lsof -i                                  # Network connections
lsof -i :80                              # Who's using port 80
lsof +D /var/log                         # Files in directory
lsof /path/to/file                       # Who has file open

# FUSER - Find processes using files
fuser -v /var/log/messages               # Who's using this file
fuser -km /mnt/usb                       # Kill processes using mount (-m = whole filesystem)
fuser -v 80/tcp                          # Who's using port 80

# PROC FILESYSTEM DEBUG
# CPU info
cat /proc/cpuinfo | grep -E '^(processor|model name)'

# Memory info
cat /proc/meminfo | grep -E '^(MemTotal|MemFree|MemAvailable|Cached)'

# Kernel info
cat /proc/version
cat /proc/cmdline                        # Boot parameters

# Process-specific
cat /proc/1234/stack                     # Kernel stack (needs root)
cat /proc/1234/syscall                   # Current syscall

# INFRASTRUCTURE: Debug hanging process
# 1. Find what it's waiting on
cat /proc/1234/wchan                     # Kernel wait channel

# 2. Check file descriptors
ls -l /proc/1234/fd | head -20           # What's it connected to

# 3. Stack trace
cat /proc/1234/stack                     # Where in kernel

# 4. Full strace
strace -p 1234 -o /tmp/strace.log &
sleep 10
kill %1
head -100 /tmp/strace.log

Background Jobs & Daemons

# BASIC BACKGROUND
command &                                # Run in background
command > /tmp/out.log 2>&1 &            # With output redirect

# JOB CONTROL
jobs                                     # List jobs
jobs -l                                  # With PIDs
fg                                       # Bring last to foreground
fg %2                                    # Bring job 2
bg %1                                    # Continue job 1 in background

# SURVIVE LOGOUT
nohup command &                          # Classic method
nohup command > /var/log/myapp.log 2>&1 &

disown %1                                # Detach from shell
disown -a                                # Detach all

# SCREEN/TMUX (better for interactive)
screen -S mysession                      # Start named session
screen -d -r mysession                   # Reattach
Ctrl+A d                                 # Detach

tmux new -s mysession                    # Start named session
tmux attach -t mysession                 # Reattach
Ctrl+B d                                 # Detach

# SETSID - New session
setsid command                           # Run in new session
# No controlling terminal, won't receive signals from shell

# SYSTEMD USER UNITS (proper way for user daemons)
mkdir -p ~/.config/systemd/user/

cat > ~/.config/systemd/user/myapp.service <<'EOF'
[Unit]
Description=My Application

[Service]
ExecStart=/path/to/myapp
Restart=always
RestartSec=5

[Install]
WantedBy=default.target
EOF

systemctl --user daemon-reload
systemctl --user enable --now myapp

# Check status
systemctl --user status myapp
journalctl --user -u myapp -f

# INFRASTRUCTURE: Port forward as service
cat > ~/.config/systemd/user/wazuh-dashboard-pf.service <<'EOF'
[Unit]
Description=Wazuh Dashboard Port Forward
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=/usr/bin/kubectl -n wazuh port-forward service/wazuh-dashboard 8443:443 --address 0.0.0.0
Restart=always
RestartSec=10

[Install]
WantedBy=default.target
EOF

systemctl --user enable --now wazuh-dashboard-pf

Control Groups (cgroups)

# CGROUPS V2 (unified hierarchy, systemd default)
# View current cgroups
cat /proc/cgroups
mount | grep cgroup

# SYSTEMD CGROUP MANAGEMENT
systemd-cgtop                            # Top for cgroups
systemd-cgls                             # Cgroup tree

# VIEW SERVICE CGROUP
systemctl status nginx                   # Shows CGroup line
cat /sys/fs/cgroup/system.slice/nginx.service/cgroup.controllers

# SET RESOURCE LIMITS VIA SYSTEMD (applied immediately and saved to disk; add --runtime for until-reboot only)
# CPU limit (50%)
systemctl set-property nginx.service CPUQuota=50%

# Memory limit (512MB hard, 256MB high water)
systemctl set-property nginx.service MemoryMax=512M
systemctl set-property nginx.service MemoryHigh=256M

# I/O weight (1-10000, default 100)
systemctl set-property nginx.service IOWeight=50

# VIEW CURRENT LIMITS
systemctl show nginx.service | grep -E '^(CPU|Memory|IO)'

# OVERRIDE FILE (persistent)
mkdir -p /etc/systemd/system/nginx.service.d/
cat > /etc/systemd/system/nginx.service.d/limits.conf <<'EOF'
[Service]
CPUQuota=50%
MemoryMax=512M
EOF
systemctl daemon-reload
systemctl restart nginx

# MANUAL CGROUP OPERATIONS (cgroups v2)
# Create cgroup
mkdir /sys/fs/cgroup/mygroup

# Set limits
echo "50000 100000" > /sys/fs/cgroup/mygroup/cpu.max  # 50% CPU
echo "536870912" > /sys/fs/cgroup/mygroup/memory.max  # 512MB

# Add process to cgroup
echo 1234 > /sys/fs/cgroup/mygroup/cgroup.procs

# CGEXEC (cgroup-tools)
# Install: pacman -S libcgroup / apt install cgroup-tools
cgexec -g cpu,memory:/mygroup command

# SLICE HIERARCHY
# systemd organizes:
# -.slice (root)
# ├── system.slice (system services)
# ├── user.slice (user sessions)
# │   └── user-1000.slice (specific user)
# └── machine.slice (VMs/containers)

# VIEW HIERARCHY
systemd-cgls
ls /sys/fs/cgroup/

# INFRASTRUCTURE: Limit Vault memory
mkdir -p /etc/systemd/system/vault.service.d/
cat > /etc/systemd/system/vault.service.d/limits.conf <<'EOF'
[Service]
MemoryMax=2G
MemoryHigh=1536M
CPUQuota=200%
EOF
systemctl daemon-reload
systemctl restart vault

Performance Analysis

# LOAD AVERAGE
uptime                                   # Quick view
cat /proc/loadavg                        # Raw numbers
# 1.00 = one CPU's worth of runnable (or uninterruptible, D-state) tasks
# 4.00 on 4-core = all cores busy; higher means tasks are queuing

# CPU USAGE
mpstat 1 5                               # Per-CPU stats every 1s, 5 times
mpstat -P ALL 1                          # All CPUs

# MEMORY USAGE
free -h                                  # Human readable
vmstat 1 5                               # Virtual memory stats

# vmstat columns:
# r  = processes waiting for CPU
# b  = processes in uninterruptible sleep
# si = swap in (bad if non-zero)
# so = swap out (bad if non-zero)

# I/O STATS
iostat -x 1 5                            # Extended I/O stats
iotop                                    # Top for I/O
iotop -o                                 # Only active I/O

# NETWORK
sar -n DEV 1 5                           # Network interface stats
ss -s                                    # Socket statistics

# PERF (advanced)
perf top                                 # Top functions
perf record command                      # Record profile
perf report                              # View profile

# Process-specific
perf record -p 1234 sleep 30             # Profile PID for 30s
perf report

# FLAMEGRAPHS
perf record -F 99 -g -p 1234 sleep 30
perf script | stackcollapse-perf.pl | flamegraph.pl > flame.svg

# QUICK PERFORMANCE CHECK
echo "=== CPU ===" && mpstat 1 1 | tail -1
echo "=== Memory ===" && free -h | grep Mem
echo "=== I/O ===" && iostat -x 1 1 | tail -2
echo "=== Load ===" && uptime

# INFRASTRUCTURE: Performance across hosts
for host in kvm-01 vault-01 k3s-master-01; do
    echo "=== $host ==="
    ssh "$host" "uptime" 2>/dev/null | awk -F'load average:' '{print "  Load:" $2}'
    ssh "$host" "free -h" 2>/dev/null | awk '/Mem:/ {print "  Mem:", $3, "/", $2}'
done

# FIND RESOURCE HOGS
echo "=== Top CPU ===" && ps -eo pid,user,%cpu,cmd --sort=-%cpu | head -5
echo "=== Top Memory ===" && ps -eo pid,user,%mem,rss,cmd --sort=-%mem | head -5
echo "=== Top I/O ===" && iotop -b -n 1 | head -5

Process Gotchas

# WRONG: Kill with SIGKILL first
kill -9 1234                             # Process can't clean up

# CORRECT: Graceful first, force if needed
kill 1234                                # SIGTERM
sleep 2
kill -0 1234 2>/dev/null && kill -9 1234 # Force if still running

# WRONG: Ignoring zombie processes
# Zombies mean parent isn't reaping children

# CORRECT: Find and fix parent
ps -eo pid,ppid,stat,comm | awk '$3 ~ /^Z/ {print "Zombie:", $1, "Parent:", $2}'
# Kill or fix parent process

# WRONG: nohup without redirect
nohup command &                          # Creates nohup.out in current dir

# CORRECT: Explicit redirect
nohup command > /var/log/myapp.log 2>&1 &

# WRONG: Using kill -9 on D state process
# Uninterruptible sleep (D) does not act on signals until the blocking I/O returns
kill -9 1234                             # Signal stays pending; process won't die yet

# CORRECT: Fix underlying I/O problem
# D state usually means NFS hang, disk issue, or kernel bug
lsof -p 1234                             # What's it waiting on
cat /proc/1234/stack                     # Kernel stack

# WRONG: Nice value confusion
nice -n 10 command                       # LOWER priority
nice -n -10 command                      # HIGHER priority

# CORRECT: Remember negative = higher priority
# -20 = highest, 19 = lowest

# WRONG: Assuming pkill matches exact name
pkill python                             # Unanchored regex on process name: also kills python3, python3.11, ...

# CORRECT: Be specific
pkill -x python                          # Exact match only
pkill -f "python app.py"                 # Match full command

# WRONG: Assuming a background job survives logout
command &                                # May be killed on logout

# CORRECT: Ensure survival
nohup command &
# Or
disown %1
# Or
setsid command

# WRONG: Checking if process exists with ps
if ps aux | grep myprocess; then         # grep matches itself!

# CORRECT: Use pgrep or filter grep
if pgrep -x myprocess > /dev/null; then
    echo "Running"
fi

Quick Reference

# INSPECTION
ps auxf                      # Process tree
pgrep -a pattern             # Find by name
pstree -p                    # Hierarchy with PIDs
top/htop                     # Interactive monitor

# CONTROL
kill PID                     # SIGTERM (graceful)
kill -9 PID                  # SIGKILL (force)
pkill -f "pattern"           # Kill by command
timeout 30 command           # Kill after 30s

# BACKGROUND
command &                    # Background
nohup command > log 2>&1 &   # Survive logout
disown %1                    # Detach job
screen -S name               # Session manager

# PRIORITY
nice -n 19 command           # Lowest priority
ionice -c 3 command          # Idle I/O
renice -n 10 -p PID          # Change running

# LIMITS
ulimit -a                    # Show limits
ulimit -n 65536              # Open files
systemctl set-property svc CPUQuota=50%

# DEBUG
strace -p PID                # System calls
lsof -p PID                  # Open files
lsof -i :80                  # Port usage
/proc/PID/                   # Process info

# PERFORMANCE
uptime                       # Load average
vmstat 1                     # Memory/CPU
iostat -x 1                  # I/O stats