from pathlib import Path

DATA = Path('/srv/data/pecbms-birds')

# Recursively walk DATA in sorted path order and print one line per regular
# file: its path relative to DATA and its size in MB (bytes / 1e6).
for f in sorted(DATA.rglob('*')):
    if not f.is_file():
        # Directories (and anything else that is not a file) are skipped.
        continue
    print(f"{f.relative_to(DATA)}  ({f.stat().st_size/1e6:,.1f} MB)")

indices2017.csv  (0.2 MB)
monitoring_schemes.xlsx  (0.0 MB)
national_indices2017.csv  (1.0 MB)
species_country.csv  (0.0 MB)
trends2017.csv  (0.0 MB)
trends_short2017.csv  (0.0 MB)

import pandas as pd

# Candidate data files under DATA, ordered: plain CSVs first, then
# gzip-compressed CSVs, then any other .gz file.
# '*.gz' also matches every '*.csv.gz', so dict.fromkeys() is used to drop
# the repeated paths while keeping the first-seen order (csvs[0] is unchanged).
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
))
if not csvs:
    # Fail with a readable message instead of an IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, sniffing the separator and trying encodings.

    The separator is detected by pandas (``sep=None`` with the Python
    engine). Each encoding in ``encodings`` is tried in order and the first
    one that decodes the file wins.

    Args:
        path: File to read; anything ``pd.read_csv`` accepts.
        encodings: Encodings to try, in order. Defaults to UTF-8, then
            Latin-1.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``nrows``). Do not pass ``sep``, ``engine`` or
            ``encoding``: they are set here.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file
            (the error from the last attempt is raised).
    """
    if not encodings:
        raise ValueError('encodings must contain at least one encoding name')

    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next encoding.
            last_error = err
    # Every encoding failed: surface the error rather than returning None.
    raise last_error

# Read at most SAMPLE_ROWS rows of the first discovered file (nrows is
# forwarded to pd.read_csv by load_csv), then preview the first rows.
SAMPLE_ROWS = 100_000
df = load_csv(csvs[0], nrows=SAMPLE_ROWS)
df.head()

Using: indices2017.csv

# Column names, non-null counts and dtypes.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed
# so each row describes one column; show at most the first 20 columns.
summary = df.describe(include='all').T
summary.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5628 entries, 0 to 5627
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   species      5628 non-null   object 
 1   euring_code  5628 non-null   int64  
 2   year         5628 non-null   int64  
 3   index        5628 non-null   int64  
 4   se           5628 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 220.0+ KB

import matplotlib.pyplot as plt

# Quick look: 50-bin histogram of the first numeric column of df, if any.
# NOTE(review): for indices2017.csv the first numeric column listed by
# df.info() is euring_code, which looks like an identifier rather than a
# measurement; a column such as `index` may be more informative. Confirm.
num = df.select_dtypes('number')
if num.shape[1] == 0:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot(kind='hist', bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count the rows of the selected file without loading it all at once:
# with chunksize set, pd.read_csv yields DataFrames of at most 1,000,000
# rows each, so the whole file never has to fit in memory.
# csvs[0] is the same file the loading cell reports as "Using: ...".
total = 0
for chunk in pd.read_csv(csvs[0], chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- pecbms-birds.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	species	euring_code	year	index	se
0	Tachybaptus ruficollis	70	1990	100	0.0
1	Tachybaptus ruficollis	70	1991	77	18.0
2	Tachybaptus ruficollis	70	1992	106	26.0
3	Tachybaptus ruficollis	70	1993	103	21.0
4	Tachybaptus ruficollis	70	1994	128	27.0

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
species	5628	170	Emberiza calandra	38	NaN	NaN	NaN	NaN	NaN	NaN	NaN
euring_code	5628.0	NaN	NaN	NaN	10893.665956	4699.543011	70.0	7240.0	11870.0	14640.0	18820.0
year	5628.0	NaN	NaN	NaN	2000.059701	10.680107	1980.0	1991.0	2001.0	2009.0	2017.0
index	5628.0	NaN	NaN	NaN	111.312011	104.486863	1.0	68.0	97.0	123.0	2175.0
se	5628.0	NaN	NaN	NaN	30.746722	95.895804	-1.040816	7.0	12.0	22.0	2103.0

Pan-European Common Bird Monitoring Scheme (PECBMS)

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn