from pathlib import Path

# Root folder of the iSCAPE air-quality dataset used throughout this notebook.
DATA = Path('/srv/data/iscape-air-quality')

# Walk the dataset folder recursively and print every regular file with its
# path relative to DATA and its size in megabytes (10^6 bytes).
for f in sorted(DATA.rglob('*')):
    if not f.is_file():
        continue  # skip directories and anything else that is not a file
    size_mb = f.stat().st_size / 1e6
    print(f"{f.relative_to(DATA)}  ({size_mb:,.1f} MB)")

10361.csv  (0.0 MB)
10362.csv  (0.0 MB)
5048.csv  (0.5 MB)
5087.csv  (0.0 MB)
5101.csv  (2.0 MB)
5104.csv  (2.2 MB)
5106.csv  (0.7 MB)
5107.csv  (0.0 MB)
5109.csv  (1.2 MB)
5110.csv  (2.2 MB)
5111.csv  (0.1 MB)
5117.csv  (2.2 MB)
5120.csv  (0.2 MB)
5121.csv  (8.3 MB)
5122.csv  (3.7 MB)
5124.csv  (2.2 MB)
5125.csv  (0.0 MB)
5126.csv  (0.0 MB)
5128.csv  (0.0 MB)
5129.csv  (40.3 MB)
5130.csv  (0.0 MB)
5131.csv  (0.0 MB)
5137.csv  (0.2 MB)
5138.csv  (0.6 MB)
5146.csv  (0.1 MB)
5164.csv  (2.7 MB)
5165.csv  (1.7 MB)
5169.csv  (0.0 MB)
5170.csv  (0.0 MB)
5171.csv  (0.9 MB)
5215.csv  (0.1 MB)
5221.csv  (0.4 MB)
5227.csv  (0.0 MB)
5242.csv  (0.0 MB)
5253.csv  (0.0 MB)
5254.csv  (0.0 MB)
5255.csv  (0.0 MB)
5256.csv  (0.0 MB)
5257.csv  (0.0 MB)
5258.csv  (1.1 MB)
5271.csv  (9.4 MB)
5276.csv  (0.2 MB)
5277.csv  (0.0 MB)
5278.csv  (0.8 MB)
5298.csv  (0.0 MB)
5299.csv  (0.1 MB)
5304.csv  (0.0 MB)
5305.csv  (0.1 MB)
5306.csv  (0.2 MB)
5309.csv  (0.0 MB)
5310.csv  (0.1 MB)
5311.csv  (0.0 MB)
5312.csv  (0.0 MB)
5313.csv  (0.0 MB)
5315.csv  (0.7 MB)
5317.csv  (0.2 MB)
5332.csv  (0.0 MB)
5333.csv  (0.0 MB)
5334.csv  (0.0 MB)
5335.csv  (0.1 MB)
5336.csv  (0.0 MB)
5339.csv  (0.2 MB)
5340.csv  (0.2 MB)
5341.csv  (0.0 MB)
5342.csv  (0.0 MB)
5343.csv  (0.0 MB)
5344.csv  (0.0 MB)
5345.csv  (0.0 MB)
5346.csv  (0.0 MB)
5347.csv  (0.0 MB)
5348.csv  (0.0 MB)
5349.csv  (0.0 MB)
5350.csv  (0.0 MB)
5351.csv  (0.0 MB)
5356.csv  (0.1 MB)
5358.csv  (0.1 MB)
5376.csv  (0.0 MB)
5377.csv  (0.7 MB)
5382.csv  (0.1 MB)
5390.csv  (0.9 MB)
5391.csv  (2.7 MB)
5397.csv  (1.6 MB)
5398.csv  (0.1 MB)
5434.csv  (5.3 MB)
5436.csv  (0.3 MB)
5464.csv  (31.5 MB)
5529.csv  (0.6 MB)
5630.csv  (2.3 MB)
8700.csv  (0.1 MB)
8701.csv  (0.1 MB)
9561.csv  (14.1 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_BOLOGNA.yaml  (0.0 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_BOTTROP.yaml  (0.0 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_DUBLIN.yaml  (0.0 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_HASSELT.yaml  (0.0 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_SURREY.yaml  (0.0 MB)
test_description_2019-12_EXT_CITIZEN_WORKSHOP_VANTAA.yaml  (0.0 MB)

import pandas as pd

# Candidate data files, in priority order: plain CSVs, then gzipped CSVs, then
# any other gzip file. The '*.gz' pattern also matches every '*.csv.gz' file,
# so de-duplicate (dict.fromkeys keeps first-seen order) to list each path once.
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
))
if not csvs:
    # Fail with an explicit message instead of a bare IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, **kw):
    """Read a CSV into a DataFrame, sniffing the separator and the encoding.

    By default the separator is auto-detected (``sep=None`` with the Python
    parsing engine) and the file is decoded as utf-8, falling back to latin-1
    when utf-8 decoding fails.

    Args:
        path: File path (or anything else ``pandas.read_csv`` accepts).
        **kw: Extra keyword arguments forwarded to ``pandas.read_csv``.
            ``sep``, ``engine`` and ``encoding`` may be passed to override
            the defaults; an explicit ``encoding`` disables the fallback.

    Returns:
        Whatever ``pandas.read_csv`` returns for the given arguments
        (a DataFrame unless e.g. ``chunksize`` is passed).

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with any of the
            encodings that were tried.
    """
    # Defaults only: a caller-supplied sep/engine wins instead of colliding
    # with a hard-coded keyword (which would raise TypeError).
    kw.setdefault('sep', None)
    kw.setdefault('engine', 'python')

    explicit = kw.pop('encoding', None)
    encodings = ('utf-8', 'latin-1') if explicit is None else (explicit,)

    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **kw)
        except UnicodeDecodeError as err:
            last_error = err  # remember it and try the next encoding
    # Every candidate failed: surface the error rather than returning None.
    raise last_error

# Load at most the first 100,000 rows of the first data file, then preview
# the top five rows.
df = load_csv(path=csvs[0], nrows=100_000)
df.head(n=5)

Using: 10361.csv

# Column names, non-null counts and dtypes.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed so
# each column of df becomes a row; show at most the first 20.
df.describe(include='all').transpose().head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Time       95 non-null     object 
 1   BATT       95 non-null     float64
 2   EXT_PM_1   91 non-null     float64
 3   EXT_PM_10  95 non-null     float64
 4   EXT_PM_25  95 non-null     float64
 5   HUM        95 non-null     float64
 6   LIGHT      95 non-null     float64
 7   NOISE_A    95 non-null     float64
 8   PRESS      95 non-null     float64
 9   TEMP       95 non-null     float64
dtypes: float64(9), object(1)
memory usage: 7.5+ KB

import matplotlib.pyplot as plt

# Histogram of the first numeric column of df, if there is one.
num = df.select_dtypes(include='number')
if num.columns.empty:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot(kind='hist', bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count the rows of a CSV without loading it whole: read_csv with `chunksize`
# yields DataFrames of up to 1,000,000 rows each, one at a time.
# `file` was never defined in this notebook (NameError); point it at the first
# data file found earlier. Change it to any other path you want to count.
file = csvs[0]
total = 0
for chunk in pd.read_csv(file, chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- iscape-air-quality.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	Time	BATT	EXT_PM_1	EXT_PM_10	EXT_PM_25	HUM	LIGHT	NOISE_A	PRESS	TEMP
0	2019-10-19 20:46:00+01:00	98.0	10.0	21.0	17.0	81.37	43.0	48.84	100.47	18.83
1	2019-10-19 20:47:00+01:00	98.0	6.0	12.0	11.0	79.68	51.0	54.19	100.47	18.90
2	2019-10-20 12:46:00+01:00	98.0	2.0	4.0	4.0	68.39	75.0	51.72	101.39	21.09
3	2019-10-20 12:49:00+01:00	98.0	1.0	2.0	2.0	61.43	72.0	48.45	101.41	21.12
4	2019-10-20 12:50:00+01:00	98.0	1.0	4.0	2.0	60.74	82.0	50.48	101.41	21.13

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Time	95	95	2019-10-19 20:46:00+01:00	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
BATT	95.0	NaN	NaN	NaN	97.2	0.766312	96.0	97.0	97.0	98.0	98.0
EXT_PM_1	91.0	NaN	NaN	NaN	3.131868	2.486447	1.0	1.0	2.0	5.0	12.0
EXT_PM_10	95.0	NaN	NaN	NaN	6.621053	4.720212	1.0	3.0	5.0	8.5	26.0
EXT_PM_25	95.0	NaN	NaN	NaN	5.410526	3.824598	1.0	3.0	5.0	7.0	20.0
HUM	95.0	NaN	NaN	NaN	70.574316	6.912956	53.51	65.425	71.39	75.485	81.37
LIGHT	95.0	NaN	NaN	NaN	803.031579	684.068888	30.0	117.5	882.0	1280.0	2880.0
NOISE_A	95.0	NaN	NaN	NaN	62.528526	6.556418	47.63	59.095	63.65	67.125	73.94
PRESS	95.0	NaN	NaN	NaN	102.006947	0.407131	100.47	101.75	102.23	102.285	102.32
TEMP	95.0	NaN	NaN	NaN	12.270737	4.480108	8.22	9.465	10.23	12.46	21.19

iSCAPE Citizen Science Workshops Air Quality Data¶

What's in the dataset¶

Load the data¶

First look¶

A first chart¶

Working with data larger than memory¶

Your turn¶