from pathlib import Path

# Root folder that holds the dataset files.
DATA = Path('/srv/data/uk-butterflies')

# Walk the tree in sorted order and report every regular file with its size
# in (decimal) megabytes, relative to the dataset root.
regular_files = (p for p in sorted(DATA.rglob('*')) if p.is_file())
for entry in regular_files:
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

bfly_garden_collated_indices.csv  (0.0 MB)
bfly_garden_trends.csv  (0.0 MB)
bfly_species_list.csv  (0.0 MB)
readme.txt  (0.0 MB)

import pandas as pd

# Candidate data files in priority order: plain CSVs, then gzipped CSVs, then
# any other gzip archive. '*.gz' also matches every '*.csv.gz' file, so the
# combined list is de-duplicated while keeping the first occurrence of each path.
patterns = ('*.csv', '*.csv.gz', '*.gz')
csvs = list(dict.fromkeys(
    path for pattern in patterns for path in sorted(DATA.rglob(pattern))
))
# Fail with an explicit message instead of a bare IndexError on csvs[0].
if not csvs:
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, sniffing the separator and trying encodings.

    Args:
        path: File to read; passed straight to ``pd.read_csv``.
        encodings: Encodings to try, in order. Ignored when the caller passes
            an explicit ``encoding=`` keyword.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``. ``sep`` and
            ``engine`` default to ``None`` / ``'python'`` (separator sniffing)
            but may be overridden here.

    Returns:
        The DataFrame produced by the first encoding that decodes cleanly.

    Raises:
        ValueError: If no encoding is available to try.
        UnicodeDecodeError: If every candidate encoding fails to decode.
    """
    # Caller-supplied options win over the defaults; previously passing e.g.
    # sep=';' raised "got multiple values for keyword argument".
    options = {'sep': None, 'engine': 'python', **kw}
    if 'encoding' in options:
        candidates = (options.pop('encoding'),)
    else:
        candidates = tuple(encodings)
    if not candidates:
        raise ValueError('load_csv needs at least one encoding to try')

    last_error = None
    for enc in candidates:
        try:
            return pd.read_csv(path, encoding=enc, **options)
        except UnicodeDecodeError as err:
            last_error = err
    # Surface the failure instead of silently returning None.
    raise last_error

# Load at most the first 100,000 rows of the first file found, then preview
# the top of the table (the last expression is rendered by the notebook).
first_csv = csvs[0]
df = load_csv(first_csv, nrows=100_000)
df.head()

Using: bfly_garden_collated_indices.csv

# Column names, dtypes and non-null counts.
df.info()
# Summary statistics for every column (numeric or not), one row per column,
# limited to the first 20 columns.
summary = df.describe(include='all').transpose()
summary.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   species         308 non-null    object 
 1   year            308 non-null    int64  
 2   nsite           308 non-null    int64  
 3   nsite_obs       308 non-null    int64  
 4   col_index       308 non-null    float64
 5   lci             308 non-null    float64
 6   l95conf         308 non-null    float64
 7   u95conf         308 non-null    float64
 8   fitted_lci      308 non-null    float64
 9   fitted_l95conf  308 non-null    float64
 10  fitted_u95conf  308 non-null    float64
dtypes: float64(7), int64(3), object(1)
memory usage: 26.6+ KB

import matplotlib.pyplot as plt

# Histogram of the first numeric column, if the frame has any numeric columns.
num = df.select_dtypes('number')
if len(num.columns) == 0:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    first_numeric = num[col]
    first_numeric.plot.hist(bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count rows without loading the whole file: read it in chunks of up to one
# million rows and accumulate the chunk lengths.
# `file` was never defined in this notebook (NameError); point it at the file
# selected earlier. NOTE(review): confirm csvs[0] is the intended target.
file = csvs[0]
total = 0
for chunk in pd.read_csv(file, chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- uk-butterflies.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	species	year	nsite	nsite_obs	col_index	lci	l95conf	u95conf	fitted_lci	fitted_l95conf	fitted_u95conf
0	XBRIM	2007	749	323	1.437071	2.000000	1.959294	2.042456	1.956572	1.925235	1.988029
1	XBRIM	2008	1180	394	1.068299	1.871215	1.822448	1.917814	1.982491	1.953426	2.011819
2	XBRIM	2009	1593	659	1.736926	2.082303	2.048810	2.114600	2.008410	1.981473	2.035889
3	XBRIM	2010	1612	708	1.580160	2.041223	2.011267	2.074272	2.034329	2.008725	2.059815
4	XBRIM	2011	1756	792	1.552585	2.033577	2.003766	2.061689	2.060248	2.036203	2.084098

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
species	308	22	XBRIM	14	NaN	NaN	NaN	NaN	NaN	NaN	NaN
year	308.0	NaN	NaN	NaN	2013.5	4.037689	2007.0	2010.0	2013.5	2017.0	2020.0
nsite	308.0	NaN	NaN	NaN	2078.272727	954.187546	749.0	1612.0	2043.0	2219.0	5113.0
nsite_obs	308.0	NaN	NaN	NaN	782.811688	715.066447	11.0	151.75	654.5	1203.0	3857.0
col_index	308.0	NaN	NaN	NaN	2.953842	3.724978	0.0231	0.180571	1.803224	3.711155	18.86507
lci	308.0	NaN	NaN	NaN	2.124361	0.213594	1.438771	2.0	2.118742	2.248717	2.990015
l95conf	308.0	NaN	NaN	NaN	2.043395	0.242366	1.337598	1.919155	2.065491	2.199789	2.925809
u95conf	308.0	NaN	NaN	NaN	2.19818	0.208954	1.528086	2.070657	2.183854	2.307915	3.058669
fitted_lci	308.0	NaN	NaN	NaN	2.124361	0.144486	1.700082	2.034238	2.121252	2.209837	2.595916
fitted_l95conf	308.0	NaN	NaN	NaN	2.065514	0.161379	1.484046	1.963813	2.085061	2.164837	2.516123
fitted_u95conf	308.0	NaN	NaN	NaN	2.179648	0.142184	1.854062	2.084191	2.160999	2.253874	2.682628

Trends in Butterfly Populations in UK Gardens

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn