from pathlib import Path

# Root directory that holds the dataset files.
DATA = Path('/srv/data/meadowatch-phenology')

# Print every regular file below DATA (recursively, in sorted path order)
# together with its size in megabytes (1 MB = 1e6 bytes here).
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue  # skip directories and anything else that is not a file
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

MW_PhenoDat_2013_2019_anonymized.csv  (6.8 MB)
MW_Phenocurves.csv  (0.1 MB)
MW_SDDall.csv  (0.0 MB)
MW_SiteInfo_2013_2020.csv  (0.0 MB)
MW_Volunteer_info_2013_2019_anonymized.csv  (0.0 MB)
MW_metadata.xlsx  (0.0 MB)

import pandas as pd

# Collect candidate data files: plain CSVs first, then gzip-compressed ones.
# The '*.gz' pattern also matches every '*.csv.gz' file, so the concatenated
# globs list those files twice; dict.fromkeys drops the duplicates while
# keeping the original ordering (and therefore the same csvs[0]).
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
))
if not csvs:
    # Fail with an explicit message instead of an opaque IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, detecting the separator and trying several encodings.

    The file is parsed with ``sep=None`` and ``engine='python'``, which lets
    pandas detect the delimiter itself. Each encoding in *encodings* is tried
    in order; the first one that decodes the file wins.

    Args:
        path: File to read; passed straight to ``pd.read_csv``.
        encodings: Encoding names to try, in order. With the default, the
            final ``latin-1`` attempt cannot raise a decode error because
            latin-1 maps every byte value to a character.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``nrows``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If *encodings* is empty.
        UnicodeDecodeError: If none of the encodings can decode the file
            (the error from the last attempt is re-raised).
    """
    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next encoding.
            last_error = err
    if last_error is None:
        raise ValueError('encodings must contain at least one encoding name')
    # Every encoding failed: surface the error rather than returning None.
    raise last_error

# Read at most the first 100,000 rows of the first discovered file,
# then preview the first five rows.
df = load_csv(path=csvs[0], nrows=100_000)
df.head(5)

Using: MW_PhenoDat_2013_2019_anonymized.csv

# Print column dtypes and non-null counts, then build summary statistics
# for all columns (numeric and non-numeric), transposed so that each row
# describes one column; keep the first 20 of those rows.
df.info()
df.describe(include='all').transpose().head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66452 entries, 0 to 66451
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Transect                66452 non-null  object 
 1   Date                    66452 non-null  object 
 2   Year                    66452 non-null  int64  
 3   Month                   66452 non-null  int64  
 4   Day                     66451 non-null  float64
 5   Observer 1              65994 non-null  object 
 6   Observer 2              22136 non-null  object 
 7   Observer 3              3671 non-null   object 
 8   Observer 4              884 non-null    object 
 9   Observer 5              69 non-null     object 
 10  Observer 6              0 non-null      float64
 11  Observer_group          66452 non-null  object 
 12  Scientist_or_volunteer  66452 non-null  object 
 13  Site_Code               66452 non-null  object 
 14  Species                 66452 non-null  object 
 15  Snow                    54848 non-null  float64
 16  Bud                     58993 non-null  float64
 17  Bud_rank                18348 non-null  float64
 18  Flower                  58791 non-null  float64
 19  Flower_rank             18273 non-null  float64
 20  Fruit                   58519 non-null  float64
 21  Fruit_rank              18281 non-null  float64
 22  Disperse                58324 non-null  float64
 23  Disperse_rank           18356 non-null  float64
 24  Herb                    1827 non-null   float64
dtypes: float64(12), int64(2), object(11)
memory usage: 12.7+ MB

import matplotlib.pyplot as plt

# Histogram of the first numeric column of df, if there is one.
num = df.select_dtypes('number')
if num.shape[1] == 0:
    # No numeric columns at all: nothing to plot.
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    series = num[col]
    series.plot.hist(bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count rows without holding the whole file in memory: with chunksize set,
# pd.read_csv yields DataFrames of at most 1,000,000 rows each.
# The original looped over an undefined name `file` (NameError); read the
# file this notebook already selected as csvs[0] instead.
total = 0
for chunk in pd.read_csv(csvs[0], chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- meadowatch-phenology.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Transect	66452	2	Reflection Lakes	41478	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Date	66452	513	7/25/17	497	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Year	66452.0	NaN	NaN	NaN	2016.497306	1.830398	2013.0	2015.0	2017.0	2018.0	2019.0
Month	66452.0	NaN	NaN	NaN	7.576055	0.807843	5.0	7.0	8.0	8.0	10.0
Day	66451.0	NaN	NaN	NaN	15.490151	8.637501	1.0	8.0	15.0	23.0	31.0
Observer 1	65994	277	d2	2878	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer 2	22136	180	j14	778	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer 3	3671	44	s13	287	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer 4	884	14	t18	73	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer 5	69	1	e15	69	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer 6	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Observer_group	66452	4	single	43858	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Scientist_or_volunteer	66452	2	Volunteer	57936	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Site_Code	66452	32	RL6	5707	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Species	66452	17	LUAR	10515	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Snow	54848.0	NaN	NaN	NaN	0.048055	0.208459	0.0	0.0	0.0	0.0	1.0
Bud	58993.0	NaN	NaN	NaN	0.224993	0.417581	0.0	0.0	0.0	0.0	1.0
Bud_rank	18348.0	NaN	NaN	NaN	0.251581	0.62517	0.0	0.0	0.0	0.0	4.0
Flower	58791.0	NaN	NaN	NaN	0.25162	0.433948	0.0	0.0	0.0	1.0	1.0
Flower_rank	18273.0	NaN	NaN	NaN	0.322279	0.658415	0.0	0.0	0.0	0.0	4.0

MeadoWatch: Wildflower Phenology in Mount Rainier National Park

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn

	Transect	Date	Year	Month	Day	Observer 1	Observer 2	Observer 3	Observer 4	Observer 5	...	Herb
0	Reflection Lakes	7/12/13	2013	7	12.0	j1	a8	NaN	NaN	NaN	...	NaN
1	Reflection Lakes	7/12/13	2013	7	12.0	j1	a8	NaN	NaN	NaN	...	NaN
2	Reflection Lakes	7/12/13	2013	7	12.0	j1	a8	NaN	NaN	NaN	...	NaN
3	Reflection Lakes	7/12/13	2013	7	12.0	j1	a8	NaN	NaN	NaN	...	NaN
4	Reflection Lakes	7/12/13	2013	7	12.0	j1	a8	NaN	NaN	NaN	...	NaN