from pathlib import Path

# Root directory of the dataset on disk.
DATA = Path('/srv/data/tropical-rainforest-birds')

# Walk everything below DATA in sorted path order and report each regular
# file as "<path relative to DATA>  (<size in decimal megabytes> MB)".
for f in sorted(DATA.rglob('*')):
    if not f.is_file():
        # Directories and other non-file entries are not listed.
        continue
    rel_path = f.relative_to(DATA)
    size_mb = f.stat().st_size / 1e6
    print(f"{rel_path}  ({size_mb:,.1f} MB)")

analysis.R  (0.1 MB)
calc_er_summary_metrics.R  (0.0 MB)
calc_occupancy_metrics.R  (0.0 MB)
data-raw.zip  (16.2 MB)
data.zip  (39.2 MB)
det_prob_test.R  (0.0 MB)
environmental-variables_checklists.csv  (17.6 MB)
gis.zip  (1,552.8 MB)
logistic_cicra.R  (0.0 MB)
logistic_samples_testset.zip  (0.1 MB)
logit_cicra.R  (0.0 MB)
logit_postprocessing_results.zip  (0.7 MB)
main_cicra_only.R  (0.1 MB)
main_cicra_only_audio.R  (0.1 MB)
occ_model_selection_audio.R  (0.0 MB)
occ_model_selection_eBird.R  (0.0 MB)
occ_model_selection_pooled.R  (0.0 MB)
run_occ_temp.R  (0.0 MB)

import pandas as pd

# Candidate data files: plain CSVs first, then gzip-compressed ones.
# The glob patterns overlap ('*.csv.gz' is also matched by '*.gz'), so
# duplicates are dropped while keeping first-seen order; csvs[0] is unaffected.
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
))
if not csvs:
    # Fail with a clear message instead of an IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, **kw):
    """Read a delimited text file whose separator and encoding are unknown.

    By default the separator is sniffed (``sep=None`` with the python
    engine) and the file is decoded as utf-8, falling back to latin-1 when
    utf-8 decoding fails.

    Args:
        path: File to read; handed to ``pd.read_csv`` unchanged.
        **kw: Extra ``pd.read_csv`` keyword arguments (e.g. ``nrows``).
            ``sep``, ``engine`` and ``encoding`` may be supplied to override
            the defaults; an explicit ``encoding`` disables the fallback.

    Returns:
        The result of ``pd.read_csv`` for the first encoding that decodes.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file (previously this case silently returned ``None``).
    """
    # Caller-supplied values win over the defaults, so passing e.g. ``sep``
    # no longer collides with the hard-coded keyword.
    options = {'sep': None, 'engine': 'python', **kw}
    explicit = options.pop('encoding', None)
    encodings = (explicit,) if explicit is not None else ('utf-8', 'latin-1')

    failure = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **options)
        except UnicodeDecodeError as err:
            failure = err
    # Every candidate failed: surface the last decode error to the caller.
    raise failure

# Load at most the first 100,000 rows of the first discovered file.
df = load_csv(csvs[0], nrows=100_000)
# Trailing expression: in a notebook cell this renders the first rows of df.
df.head()

Using: environmental-variables_checklists.csv

# Print the per-column summary: dtype, non-null count and memory usage.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed so
# each column of df becomes a row; only the first 20 of those rows are shown.
df.describe(include='all').T.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30580 entries, 0 to 30579
Data columns (total 69 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   checklist_id                       30580 non-null  object 
 1   elevation_mean_300m                30580 non-null  float64
 2   elevation_sd_300m                  30580 non-null  float64
 3   HAND_mean_300m                     30580 non-null  float64
 4   HAND_sd_300m                       30580 non-null  float64
 5   canopy_mean_300m                   30580 non-null  float64
 6   canopy_sd_300m                     30580 non-null  float64
 7   ed_c01_open_water_300m             30580 non-null  float64
 8   pland_c01_open_water_300m          30580 non-null  float64
 9   ed_c02_floodplain_300m             30580 non-null  float64
 10  pland_c02_floodplain_300m          30580 non-null  float64
 11  ed_c03_transition_forest_300m      30580 non-null  float64
 12  pland_c03_transition_forest_300m   30580 non-null  float64
 13  ed_c04_terra_firme_300m            30580 non-null  float64
 14  pland_c04_terra_firme_300m         30580 non-null  float64
 15  ed_c05_premontane_forest_300m      30580 non-null  int64  
 16  pland_c05_premontane_forest_300m   30580 non-null  int64  
 17  ed_c06_montane_forest_300m         30580 non-null  int64  
 18  pland_c06_montane_forest_300m      30580 non-null  int64  
 19  ed_c07_grass_300m                  30580 non-null  int64  
 20  pland_c07_grass_300m               30580 non-null  int64  
 21  ed_c08_flooded_vegetation_300m     30580 non-null  float64
 22  pland_c08_flooded_vegetation_300m  30580 non-null  float64
 23  ed_c09_crops_300m                  30580 non-null  float64
 24  pland_c09_crops_300m               30580 non-null  float64
 25  ed_c10_scrub_shrub_300m            30580 non-null  int64  
 26  pland_c10_scrub_shrub_300m         30580 non-null  int64  
 27  ed_c11_built_area_300m             30580 non-null  float64
 28  pland_c11_built_area_300m          30580 non-null  float64
 29  ed_c12_bare_ground_300m            30580 non-null  float64
 30  pland_c12_bare_ground_300m         30580 non-null  float64
 31  ed_c13_snow_ice_300m               30580 non-null  int64  
 32  pland_c13_snow_ice_300m            30580 non-null  int64  
 33  ed_c14_sparsely_vegetated_300m     30580 non-null  float64
 34  pland_c14_sparsely_vegetated_300m  30580 non-null  float64
 35  elevation_mean_1k                  30580 non-null  float64
 36  elevation_sd_1k                    30580 non-null  float64
 37  HAND_mean_1k                       30580 non-null  float64
 38  HAND_sd_1k                         30580 non-null  float64
 39  canopy_mean_1k                     30580 non-null  float64
 40  canopy_sd_1k                       30580 non-null  float64
 41  ed_c01_open_water_1k               30580 non-null  float64
 42  pland_c01_open_water_1k            30580 non-null  float64
 43  ed_c02_floodplain_1k               30580 non-null  float64
 44  pland_c02_floodplain_1k            30580 non-null  float64
 45  ed_c03_transition_forest_1k        30580 non-null  float64
 46  pland_c03_transition_forest_1k     30580 non-null  float64
 47  ed_c04_terra_firme_1k              30580 non-null  float64
 48  pland_c04_terra_firme_1k           30580 non-null  float64
 49  ed_c05_premontane_forest_1k        30580 non-null  float64
 50  pland_c05_premontane_forest_1k     30580 non-null  float64
 51  ed_c06_montane_forest_1k           30580 non-null  int64  
 52  pland_c06_montane_forest_1k        30580 non-null  int64  
 53  ed_c07_grass_1k                    30580 non-null  int64  
 54  pland_c07_grass_1k                 30580 non-null  int64  
 55  ed_c08_flooded_vegetation_1k       30580 non-null  float64
 56  pland_c08_flooded_vegetation_1k    30580 non-null  float64
 57  ed_c09_crops_1k                    30580 non-null  float64
 58  pland_c09_crops_1k                 30580 non-null  float64
 59  ed_c10_scrub_shrub_1k              30580 non-null  int64  
 60  pland_c10_scrub_shrub_1k           30580 non-null  int64  
 61  ed_c11_built_area_1k               30580 non-null  float64
 62  pland_c11_built_area_1k            30580 non-null  float64
 63  ed_c12_bare_ground_1k              30580 non-null  float64
 64  pland_c12_bare_ground_1k           30580 non-null  float64
 65  ed_c13_snow_ice_1k                 30580 non-null  int64  
 66  pland_c13_snow_ice_1k              30580 non-null  int64  
 67  ed_c14_sparsely_vegetated_1k       30580 non-null  float64
 68  pland_c14_sparsely_vegetated_1k    30580 non-null  float64
dtypes: float64(50), int64(18), object(1)
memory usage: 16.1+ MB

import matplotlib.pyplot as plt

# Histogram of the first numeric column of df, when there is one.
num = df.select_dtypes(include='number')
if num.shape[1] == 0:
    # Nothing numeric to plot; leave exploration to the reader.
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot.hist(
        bins=50,
        figsize=(8, 4),
        title=col,
    )
    plt.tight_layout()

# Count rows without loading the whole file: read_csv with `chunksize`
# yields DataFrames of up to 1,000,000 rows each.
# The original iterated over `file`, which is never defined in this notebook;
# the first discovered data file (csvs[0]) is used instead.
total = 0
for chunk in pd.read_csv(csvs[0], chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- tropical-rainforest-birds.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	checklist_id	elevation_mean_300m	elevation_sd_300m	HAND_mean_300m	HAND_sd_300m	canopy_mean_300m	canopy_sd_300m	ed_c01_open_water_300m	pland_c01_open_water_300m	ed_c02_floodplain_300m	...	ed_c12_bare_ground_1k	pland_c12_bare_ground_1k
0	G7641541	207.530899	15.869239	22.530880	15.869239	16.383030	11.567419	28.112308	29.545455	46.249281	...	3.908009	0.780696
1	G7641542	207.530899	15.869239	22.530880	15.869239	16.383030	11.567419	28.112308	29.545455	46.249281	...	3.908009	0.780696
2	G7641547	207.530899	15.869239	22.530880	15.869239	16.383030	11.567419	28.112308	29.545455	46.249281	...	3.908009	0.780696
3	G7678829	207.530899	15.869239	22.530880	15.869239	16.383030	11.567419	28.112308	29.545455	46.249281	...	3.908009	0.780696
4	G7206243	289.131989	14.737938	19.887278	14.575183	21.488491	13.826680	32.564318	28.967254	18.995852	...	0.000000	0.000000

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
checklist_id	30580	30510	G11200839	8	NaN	NaN	NaN	NaN	NaN	NaN	NaN
elevation_mean_300m	30580.0	NaN	NaN	NaN	259.416624	23.507895	169.163605	247.568634	257.509918	269.676544	502.928345
elevation_sd_300m	30580.0	NaN	NaN	NaN	10.588889	6.652001	0.0	4.082106	10.058463	17.622892	26.432089
HAND_mean_300m	30580.0	NaN	NaN	NaN	28.07559	16.539799	0.0	14.94241	25.06822	40.058731	119.920013
HAND_sd_300m	30580.0	NaN	NaN	NaN	10.834183	6.624898	0.0	4.206368	10.202206	18.314869	25.497375
canopy_mean_300m	30580.0	NaN	NaN	NaN	28.349423	3.712381	0.0	27.086662	28.927591	30.048012	35.838051
canopy_sd_300m	30580.0	NaN	NaN	NaN	3.827301	2.691151	0.0	1.868741	2.890475	4.805346	15.560221
ed_c01_open_water_300m	30580.0	NaN	NaN	NaN	8.084925	15.617845	0.0	0.0	0.0	7.058714	118.79025
pland_c01_open_water_300m	30580.0	NaN	NaN	NaN	3.337372	7.59246	0.0	0.0	0.0	0.737101	100.0
ed_c02_floodplain_300m	30580.0	NaN	NaN	NaN	27.647269	21.84691	0.0	6.161236	27.352516	40.603123	130.178123
pland_c02_floodplain_300m	30580.0	NaN	NaN	NaN	39.331729	34.715929	0.0	4.914005	37.009804	76.22549	100.0
ed_c03_transition_forest_300m	30580.0	NaN	NaN	NaN	37.924055	26.702645	0.0	11.700393	45.88164	55.94208	138.604656
pland_c03_transition_forest_300m	30580.0	NaN	NaN	NaN	13.678431	15.670454	0.0	2.70936	9.1133	19.65602	94.472362
ed_c04_terra_firme_300m	30580.0	NaN	NaN	NaN	18.169868	16.422677	0.0	0.0	22.004416	27.419887	102.994069
pland_c04_terra_firme_300m	30580.0	NaN	NaN	NaN	42.357551	36.071288	0.0	2.0	45.652174	70.515971	100.0
ed_c05_premontane_forest_300m	30580.0	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0	0.0
pland_c05_premontane_forest_300m	30580.0	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0	0.0
ed_c06_montane_forest_300m	30580.0	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0	0.0
pland_c06_montane_forest_300m	30580.0	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0	0.0
ed_c07_grass_300m	30580.0	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0	0.0

Tropical Rainforest Birds: Acoustic + eBird Distribution Models

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn