from pathlib import Path

# Root folder that holds the downloaded dataset files.
DATA = Path('/srv/data/xaire-no2-barcelona')

# Walk the dataset folder recursively (in sorted order) and report each regular
# file with its path relative to DATA and its size in megabytes.
regular_files = (entry for entry in sorted(DATA.rglob('*')) if entry.is_file())
for entry in regular_files:
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

xaire_datainbrief.csv  (0.1 MB)

import pandas as pd

# Candidate data files in priority order: plain CSVs, then gzipped CSVs, then any
# other gzip file. '*.gz' also matches every '*.csv.gz' file, so dict.fromkeys is
# used to drop those duplicates while preserving the priority order.
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv')) + sorted(DATA.rglob('*.csv.gz')) + sorted(DATA.rglob('*.gz'))
))
if not csvs:
    # Fail with an explanatory message instead of a bare IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, sniffing the separator and trying several encodings.

    Args:
        path: File to read; passed straight to ``pandas.read_csv``.
        encodings: Encodings to try, in order. The first one that decodes the
            file without a ``UnicodeDecodeError`` wins.
        **kw: Extra keyword arguments forwarded to ``pandas.read_csv``
            (for example ``nrows``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file; the
            error from the last attempt is re-raised.
    """
    if not encodings:
        raise ValueError('encodings must contain at least one encoding name')

    last_error = None
    for enc in encodings:
        try:
            # sep=None with the python engine lets pandas detect the delimiter.
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError as err:
            last_error = err
    # Every encoding failed: surface the problem instead of silently returning None.
    raise last_error

# Load at most the first 100,000 rows of the first file found, then preview the
# top rows as the cell's displayed result.
first_file = csvs[0]
df = load_csv(first_file, nrows=100_000)
df.head()

Using: xaire_datainbrief.csv

# Print column dtypes and non-null counts, then show summary statistics with one
# row per column (limited to the first 20 columns).
df.info()
summary = df.describe(include='all')
summary.transpose().head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tube_code     725 non-null    int64  
 1   school        725 non-null    object 
 2   lat           725 non-null    float64
 3   long          725 non-null    float64
 4   address       725 non-null    object 
 5   type          725 non-null    object 
 6   no2_raw       725 non-null    float64
 7   no2_unbiased  725 non-null    int64  
 8   no2_2017      725 non-null    int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 51.1+ KB

import matplotlib.pyplot as plt

# Histogram of one numeric column as a first look at the data.
num = df.select_dtypes('number')
if num.shape[1]:
    # Prefer an NO2 measurement column: the first numeric column in this dataset
    # is tube_code, an identifier whose distribution says nothing about air quality.
    # Fall back to the first numeric column when no column name starts with 'no2'.
    no2_cols = [name for name in num.columns if str(name).lower().startswith('no2')]
    col = no2_cols[0] if no2_cols else num.columns[0]
    num[col].plot.hist(bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()
else:
    print('No direct numeric columns: explore df on your own.')

# Count rows without loading the whole file: read_csv with chunksize yields
# DataFrames of up to 1,000,000 rows each, so only one chunk is in memory at a time.
# `file` was previously undefined (NameError); point it at the dataset file found
# earlier. Replace it with the path of any larger file you want to scan.
file = csvs[0]
total = 0
for chunk in pd.read_csv(file, chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- xaire-no2-barcelona.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	tube_code	school	lat	long	address	type	no2_raw	no2_unbiased	no2_2017
0	1003091	xAire	41.397720	2.149740	carrer denia, 2-4	traffic	38.290447	41	39
1	1003092	xAire	41.398180	2.148380	plaça cardona, 1-2	background	42.617421	46	44
2	1003093	xAire	41.399155	2.145611	carrer aribau 265	traffic	26.454660	28	27
3	1003094	xAire	41.401366	2.148259	via augusta, 114	traffic	60.009157	64	62
4	1003095	xAire	41.403742	2.142602	carrer balmes 350-352	traffic	98.931474	106	102

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
tube_code	725.0	NaN	NaN	NaN	1089424.386207	10902.337857	1003091.0	1090624.0	1090825.0	1091029.0	1091241.0
school	725	19	xAire	77	NaN	NaN	NaN	NaN	NaN	NaN	NaN
lat	725.0	NaN	NaN	NaN	41.403482	0.020399	41.356773	41.386892	41.40235	41.42319	41.445149
long	725.0	NaN	NaN	NaN	2.164603	0.023831	2.1105	2.148791	2.168509	2.1812	2.210294
address	725	673	traffic	13	NaN	NaN	NaN	NaN	NaN	NaN	NaN
type	725	4	traffic	482	NaN	NaN	NaN	NaN	NaN	NaN	NaN
no2_raw	725.0	NaN	NaN	NaN	46.528329	15.156252	11.724384	37.373208	43.114328	52.212016	122.537611
no2_unbiased	725.0	NaN	NaN	NaN	49.77931	16.22633	13.0	40.0	46.0	56.0	131.0
no2_2017	725.0	NaN	NaN	NaN	47.875862	15.629994	13.0	38.0	44.0	54.0	126.0

xAire: High-Resolution NO₂ Citizen Science Dataset (Barcelona)

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn