from pathlib import Path

# Root folder holding the dataset files used throughout this notebook.
DATA = Path('/srv/data/ebird-id-accuracy')

# List every regular file under DATA (recursively, in sorted order) together
# with its size in megabytes; directories are skipped.
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

Appendix_S1.rar  (1.8 MB)
Appendix_S2.xlsx  (0.0 MB)
Appendix_S3.html  (0.9 MB)

import pandas as pd

# Find the Excel workbooks under DATA (searched recursively). Sorting makes
# the choice of the "first" workbook deterministic across runs.
xlsx = sorted(DATA.rglob('*.xlsx'))
if not xlsx:
    # Fail with an actionable message rather than an opaque IndexError
    # from xlsx[0] when the data directory is missing or has no workbook.
    raise FileNotFoundError(f'No .xlsx files found under {DATA}')
print('Using:', xlsx[0].name)
# Load the first workbook with pandas' default read_excel settings.
df = pd.read_excel(xlsx[0])
df.head()  # last expression of the cell: displays the first rows

Using: Appendix_S2.xlsx

# Print column dtypes and non-null counts for the loaded frame.
df.info()
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so each column of df becomes a row; show at most 20 of them.
(
    df.describe(include='all')
    .transpose()
    .head(20)
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377 entries, 0 to 376
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   rank                     377 non-null    int64  
 1   quality_group            377 non-null    object 
 2   scientific_name          377 non-null    object 
 3   hard_to_id               377 non-null    bool   
 4   samples                  377 non-null    int64  
 5   true_positives           377 non-null    int64  
 6   false_positives          377 non-null    int64  
 7   false_negatives          377 non-null    int64  
 8   precision                377 non-null    float64
 9   recall                   377 non-null    float64
 10  min (precision, recall)  377 non-null    float64
dtypes: bool(1), float64(3), int64(5), object(2)
memory usage: 29.9+ KB

import matplotlib.pyplot as plt

# Plot a 50-bin histogram of the first numeric column of df, if there is one.
num = df.select_dtypes('number')
if num.shape[1] == 0:
    # Nothing numeric to plot; leave the exploration to the reader.
    print('No direct numeric columns: explore df on your own.')
else:
    first_numeric = num.columns[0]
    num[first_numeric].plot.hist(
        bins=50,
        figsize=(8, 4),
        title=first_numeric,
    )
    plt.tight_layout()

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- ebird-id-accuracy.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	rank	quality_group	scientific_name	hard_to_id	samples	true_positives	precision	recall	min (precision, recall)
0	1	high-quality	Polioptila dumicola	False	773	773	100.0	100.0	100.0
1	2	high-quality	Turdus chiguanco	False	458	458	100.0	100.0	100.0
2	3	high-quality	Amblyramphus holosericeus	False	443	443	100.0	100.0	100.0
3	4	high-quality	Coryphistera alaudina	False	385	385	100.0	100.0	100.0
4	5	high-quality	Tachuris rubrigastra	False	287	287	100.0	100.0	100.0

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
rank	377.0	NaN	NaN	NaN	189.0	108.974768	1.0	95.0	189.0	283.0	377.0
quality_group	377	3	high-quality	291	NaN	NaN	NaN	NaN	NaN	NaN	NaN
scientific_name	377	377	Polioptila dumicola	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
hard_to_id	377	2	False	292	NaN	NaN	NaN	NaN	NaN	NaN	NaN
samples	377.0	NaN	NaN	NaN	183.297082	277.64094	6.0	32.0	78.0	210.0	1878.0
true_positives	377.0	NaN	NaN	NaN	180.639257	276.008571	5.0	30.0	75.0	204.0	1874.0
false_positives	377.0	NaN	NaN	NaN	2.657825	5.104457	0.0	0.0	1.0	3.0	46.0
false_negatives	377.0	NaN	NaN	NaN	2.38992	4.489847	0.0	0.0	0.0	3.0	30.0
precision	377.0	NaN	NaN	NaN	97.498183	4.768119	64.285714	97.368421	99.5671	100.0	100.0
recall	377.0	NaN	NaN	NaN	97.976756	3.921286	63.636364	97.619048	100.0	100.0	100.0
min (precision, recall)	377.0	NaN	NaN	NaN	96.567405	5.439289	63.636364	95.614035	98.908297	100.0	100.0

Bird Identification Accuracy in eBird Citizen Science Data (Argentina)

What's in the dataset

Load the data

First look

A first chart

Your turn