from pathlib import Path

DATA = Path('/srv/data/tess-light-pollution')

# Print every regular file under DATA (recursively, in sorted path order)
# together with its size in megabytes (bytes / 1e6).
files_only = (entry for entry in sorted(DATA.rglob('*')) if entry.is_file())
for entry in files_only:
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

ro-crate-metadata.json  (0.0 MB)
ro-crate-preview.html  (0.0 MB)
tess-network-analysis-results.csv  (0.0 MB)
tess-network-analysis-script.R  (0.0 MB)
tess-network-procedure.ttl  (0.1 MB)
tess-network-results.csv  (0.5 MB)
tess-network-results.ttl  (2.3 MB)
tess-network-survey.ttl  (2.4 MB)

import pandas as pd

# Collect candidate data files: plain CSVs first, then gzip-compressed ones,
# each group in sorted path order.
found = []
for pattern in ('*.csv', '*.csv.gz', '*.gz'):
    found.extend(sorted(DATA.rglob(pattern)))

# '*.gz' also matches every '*.csv.gz' file, so drop repeated paths while
# keeping the first occurrence (dict preserves insertion order).
csvs = list(dict.fromkeys(found))

# Fail with an explicit message instead of an IndexError on csvs[0].
if not csvs:
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')

print('Using:', csvs[0].name)

def load_csv(path, **kw):
    """Read a CSV into a DataFrame, guessing the separator and the encoding.

    By default the separator is sniffed (``sep=None`` with the python engine)
    and the file is decoded as utf-8, falling back to latin-1 when utf-8
    decoding fails.

    Args:
        path: File to read; passed straight to ``pd.read_csv``.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``. ``sep``
            and ``engine`` override the defaults above; an explicit
            ``encoding`` disables the utf-8/latin-1 fallback.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with any of the
            attempted encodings.
    """
    # Use setdefault so a caller-supplied sep/engine no longer collides with
    # the hard-coded keywords ("got multiple values for keyword argument").
    kw.setdefault('sep', None)
    kw.setdefault('engine', 'python')

    # The caller chose an encoding: honour it and skip the fallback loop.
    if 'encoding' in kw:
        return pd.read_csv(path, **kw)

    last_error = None
    for enc in ('utf-8', 'latin-1'):
        try:
            return pd.read_csv(path, encoding=enc, **kw)
        except UnicodeDecodeError as err:
            last_error = err

    # latin-1 maps every byte to a character, so this is only a safety net;
    # it replaces the previous silent ``return None``.
    raise last_error

# Load at most the first 100,000 rows of the first discovered file.
df = load_csv(csvs[0], nrows=100_000)
# Return the first rows of the frame (rendered as the cell output in a notebook).
df.head()

Using: tess-network-analysis-results.csv

# Print the column names, non-null counts and dtypes.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed so
# each column becomes a row; keep at most the first 20 rows.
df.describe(include='all').T.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   variable      10 non-null     object 
 1   mean          10 non-null     float64
 2   stdev         10 non-null     float64
 3   correlation   10 non-null     float64
 4   pvalue        10 non-null     float64
 5   significance  8 non-null      object 
dtypes: float64(4), object(2)
memory usage: 608.0+ bytes

import matplotlib.pyplot as plt

# Histogram of the first numeric column, if the frame has any.
num = df.select_dtypes(include='number')
if num.columns.empty:
    # Nothing numeric to plot: leave the exploration to the reader.
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot(kind='hist', bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count rows without loading the whole file into memory: with `chunksize`,
# read_csv yields DataFrames of at most 1,000,000 rows each.
# `file` must be bound before the loop; use the file selected for loading
# (swap in any other path to count a different file).
file = csvs[0]
total = 0
for chunk in pd.read_csv(file, chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- tess-light-pollution.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	variable	mean	stdev	correlation	pvalue	significance
0	achievement	4.160256	0.815808	0.423953	1.097718e-04	***
1	belongingness	3.730769	0.710278	0.456155	2.702843e-05	***
2	benevolence	4.461538	0.596359	0.619622	1.463416e-09	***
3	conformity	2.307692	0.957644	0.075494	5.112308e-01	NaN
4	hedonism	4.205128	0.811588	0.588373	1.471785e-08	***

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
variable	10	10	achievement	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
mean	10.0	NaN	NaN	NaN	3.776923	0.7744	2.307692	3.230769	4.169872	4.330128	4.474359
stdev	10.0	NaN	NaN	NaN	0.805613	0.144445	0.596359	0.721084	0.794575	0.815291	1.117624
correlation	10.0	NaN	NaN	NaN	0.417705	0.196864	0.075494	0.309438	0.440054	0.564128	0.672226
pvalue	10.0	NaN	NaN	NaN	0.070149	0.164198	0.0	0.000001	0.000068	0.012106	0.511231
significance	8	2	***	7	NaN	NaN	NaN	NaN	NaN	NaN	NaN

TESS Network Motivation Survey (Light Pollution Citizen Science)

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn