Chapter 16: Downloading Data
Real-world data comes in files. Parse CSV and JSON for visualization.
CSV Format
Comma-separated values. Simple, universal format.
timestamp,server,cpu,memory
2024-01-15 09:00,web-01,45.2,68.5
2024-01-15 09:05,web-01,52.1,70.2
2024-01-15 09:10,web-01,48.8,69.1
Reading CSV with csv Module
# Read a CSV file and inspect its header row to learn the column layout.
from pathlib import Path
import csv

path = Path('metrics.csv')
lines = path.read_text().splitlines()

reader = csv.reader(lines)
header_row = next(reader)  # (1) next() gets first row (headers)
print(header_row)
# ['timestamp', 'server', 'cpu', 'memory']

# Show column indices so we know which index holds which field.
for index, column_header in enumerate(header_row):
    print(index, column_header)
| 1 | next() gets first row (headers) |
Extracting Data
# Extract one numeric column (CPU) from the CSV file.
from pathlib import Path
import csv

path = Path('metrics.csv')
lines = path.read_text().splitlines()

reader = csv.reader(lines)
header_row = next(reader)

# Extract CPU values.
cpu_values = []
for row in reader:
    cpu = float(row[2])  # (1) Column index from header inspection
    cpu_values.append(cpu)
print(cpu_values)
| 1 | Column index from header inspection |
Plotting CSV Data
# Plot CPU usage over time, parsing the timestamp column into datetimes.
from pathlib import Path
import csv
from datetime import datetime

import matplotlib.pyplot as plt

path = Path('metrics.csv')
lines = path.read_text().splitlines()

reader = csv.reader(lines)
header_row = next(reader)

timestamps, cpu_values = [], []
for row in reader:
    timestamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M')  # (1) Parse datetime string
    timestamps.append(timestamp)
    cpu_values.append(float(row[2]))

# Plot
plt.style.use('seaborn-v0_8')
fig, ax = plt.subplots()
ax.plot(timestamps, cpu_values, color='red')
ax.set_title("Server CPU Usage")
ax.set_xlabel("")
ax.set_ylabel("CPU %")
fig.autofmt_xdate()  # (2) Rotate and align date labels
plt.savefig('cpu_usage.png', bbox_inches='tight')
plt.show()
| 1 | Parse datetime string |
| 2 | Rotate and align date labels |
Date Parsing
| Code | Meaning | Example |
|---|---|---|
| %Y | 4-digit year | 2024 |
| %m | 2-digit month | 01-12 |
| %d | 2-digit day | 01-31 |
| %H | Hour (24h) | 00-23 |
| %M | Minute | 00-59 |
| %S | Second | 00-59 |
| %A | Day name | Monday |
| %B | Month name | January |
# Common strptime formats, each shown on a sample date string.
iso_date = datetime.strptime('2024-01-15', '%Y-%m-%d')
us_date = datetime.strptime('01/15/2024', '%m/%d/%Y')
full_timestamp = datetime.strptime('2024-01-15 09:30:00', '%Y-%m-%d %H:%M:%S')
abbreviated = datetime.strptime('Jan 15, 2024', '%b %d, %Y')
Handling Missing Data
Real data has gaps. Handle gracefully:
# Re-read the data, skipping rows whose CPU value is missing or invalid.
reader = csv.reader(lines)
header_row = next(reader)

timestamps, cpu_values = [], []
for row in reader:
    # NOTE(review): the timestamp parse is not inside the try block, so a
    # malformed timestamp still raises ValueError — only the CPU field is
    # guarded here. Confirm whether that is intended.
    timestamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M')
    try:
        cpu = float(row[2])
    except ValueError:
        print(f"Missing data for {timestamp}")
    else:
        # Append only when both fields parsed, keeping the lists in sync.
        timestamps.append(timestamp)
        cpu_values.append(cpu)
Multiple Data Series
# Plot two data series (CPU and memory) on the same axes.
from pathlib import Path
import csv
from datetime import datetime

import matplotlib.pyplot as plt

path = Path('metrics.csv')
lines = path.read_text().splitlines()

reader = csv.reader(lines)
header_row = next(reader)

timestamps, cpu_values, memory_values = [], [], []
for row in reader:
    try:
        timestamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M')
        cpu = float(row[2])
        memory = float(row[3])
    except ValueError:
        # Skip any row with a malformed timestamp or numeric field.
        continue
    else:
        timestamps.append(timestamp)
        cpu_values.append(cpu)
        memory_values.append(memory)

plt.style.use('seaborn-v0_8')
fig, ax = plt.subplots()
ax.plot(timestamps, cpu_values, color='red', alpha=0.7, label='CPU')
ax.plot(timestamps, memory_values, color='blue', alpha=0.7, label='Memory')
ax.set_title("Server Metrics")
ax.set_ylabel("Percentage")
ax.legend()
ax.fill_between(timestamps, cpu_values, alpha=0.2, color='red')  # (1) Shaded area under line
fig.autofmt_xdate()
plt.savefig('server_metrics.png', bbox_inches='tight')
| 1 | Shaded area under line |
JSON Data
JSON is more complex than CSV. Supports nested structures.
{
"servers": [
{
"hostname": "web-01",
"ip": "10.0.1.10",
"metrics": {
"cpu": 45.2,
"memory": 68.5
}
}
]
}
Reading JSON
# Read a JSON file and walk its nested structure.
from pathlib import Path
import json

path = Path('servers.json')
contents = path.read_text()
data = json.loads(contents)

# Navigate structure: each server dict carries a nested 'metrics' dict.
for server in data['servers']:
    hostname = server['hostname']
    cpu = server['metrics']['cpu']
    print(f"{hostname}: {cpu}% CPU")
GeoJSON Example
Earthquake data from USGS:
# Explore the structure of a USGS GeoJSON earthquake feed before parsing.
from pathlib import Path
import json
import plotly.express as px
path = Path('eq_data_30_day_m1.geojson')
contents = path.read_text()
all_eq_data = json.loads(contents)
# Examine structure
print(all_eq_data.keys())
# dict_keys(['type', 'metadata', 'features'])
# Each element of 'features' is one earthquake record.
features = all_eq_data['features']
print(len(features)) # Number of earthquakes
# Examine first earthquake
first_eq = features[0]
print(first_eq.keys())
# dict_keys(['type', 'properties', 'geometry', 'id'])
print(first_eq['properties']['mag']) # Magnitude
print(first_eq['properties']['place']) # Location
print(first_eq['geometry']['coordinates']) # [lon, lat, depth]
Extracting GeoJSON Data
# Extract all earthquake data into parallel lists for plotting.
mags, lons, lats, titles = [], [], [], []
for eq in features:
    mag = eq['properties']['mag']
    # GeoJSON orders coordinates as [longitude, latitude, depth].
    lon = eq['geometry']['coordinates'][0]
    lat = eq['geometry']['coordinates'][1]
    title = eq['properties']['title']
    mags.append(mag)
    lons.append(lon)
    lats.append(lat)
    titles.append(title)
Mapping Earthquakes with Plotly
# Draw an interactive world map of the extracted earthquake data.
import plotly.express as px

title = "Global Earthquakes"
fig = px.scatter_geo(
    lat=lats,
    lon=lons,
    size=mags,   # (1) Point size by magnitude
    color=mags,  # (2) Color by magnitude
    color_continuous_scale='Viridis',
    labels={'color': 'Magnitude'},
    projection='natural earth',
    title=title,
    hover_name=titles,  # (3) Hover text shows earthquake title
)
fig.write_html('global_earthquakes.html')
fig.show()
| 1 | Point size by magnitude |
| 2 | Color by magnitude |
| 3 | Hover text shows earthquake title |
Examining JSON Structure
When exploring new JSON data:
# Inspect unfamiliar JSON by pretty-printing it.
import json

# Pretty print the first item to see its structure.
print(json.dumps(data[0], indent=2))

# Save a human-readable copy of the whole document.
pretty_text = json.dumps(data, indent=2)
Path('readable.json').write_text(pretty_text)
Infrastructure Example: Network Device Data
# Plot in/out traffic per switch port as a grouped bar chart.
from pathlib import Path
import json
import matplotlib.pyplot as plt

# Sample switch port data
data = {
    "switches": [
        {
            "hostname": "sw-core-01",
            "ports": [
                {"port": "Gi1/0/1", "in_mbps": 450, "out_mbps": 380},
                {"port": "Gi1/0/2", "in_mbps": 220, "out_mbps": 195}
            ]
        }
    ]
}

# Extract and plot
switch = data['switches'][0]
ports, in_traffic, out_traffic = [], [], []
for port_info in switch['ports']:
    ports.append(port_info['port'])
    in_traffic.append(port_info['in_mbps'])
    out_traffic.append(port_info['out_mbps'])

positions = range(len(ports))
bar_width = 0.35
fig, ax = plt.subplots()
# Offset the two bar groups so In/Out bars sit side by side per port.
ax.bar([pos - bar_width / 2 for pos in positions], in_traffic, bar_width, label='In')
ax.bar([pos + bar_width / 2 for pos in positions], out_traffic, bar_width, label='Out')
ax.set_ylabel('Mbps')
ax.set_title(f"Port Utilization - {switch['hostname']}")
ax.set_xticks(positions)
ax.set_xticklabels(ports)
ax.legend()
plt.savefig('port_utilization.png', bbox_inches='tight')
Quick Reference
| CSV Operation | Code |
|---|---|
| Read file | csv.reader(lines) |
| Get header | next(reader) |
| Parse date | datetime.strptime(text, fmt) |
| Skip errors | try/except ValueError |

| JSON Operation | Code |
|---|---|
| Parse | json.loads(string) |
| Dump | json.dumps(obj) |
| Pretty print | json.dumps(obj, indent=2) |
| Navigate | data['servers'][0]['metrics'] |
Exercises
16-1. System Logs
Parse a CSV log file with timestamp, level, message. Count errors per hour.
16-2. Network Comparison
Plot traffic data for multiple switches on same chart.
16-3. Automated Title
Extract title from JSON metadata for chart title.
16-4. Fire Data
Download USGS fire data. Map active fires.
16-5. Traffic Patterns
Create CSV of network traffic by hour. Find peak usage times.
Summary
- CSV: csv.reader() parses rows, next() gets the header
- Datetime: strptime() parses strings, strftime() formats output
- JSON: loads() parses strings, dumps() creates strings
- Missing data: wrap in try/except to skip bad records
- GeoJSON: standard format for geographic data
- Plotly: px.scatter_geo() creates interactive maps
- Always examine JSON structure before parsing
Next: Working with APIs.