from pathlib import Path

# Root directory that holds the downloaded catalogue files.
DATA = Path('/srv/data/galaxy-zoo-desi')

# Walk the whole tree in sorted order and report each regular file with its
# size in decimal megabytes (bytes / 1e6); directories are skipped.
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue
    shown_name = entry.relative_to(DATA)
    size_mb = entry.stat().st_size / 1e6
    print(f"{shown_name}  ({size_mb:,.1f} MB)")

external_catalog.parquet  (1,616.7 MB)
gz_desi_deep_learning_catalog_advanced.parquet  (7,558.1 MB)
gz_desi_deep_learning_catalog_friendly.csv  (1,612.4 MB)
gz_desi_deep_learning_catalog_friendly.parquet  (658.8 MB)
gz_desi_gzd8_volunteer_core_catalog.parquet  (6.4 MB)
gz_desi_gzd8_volunteer_extended_catalog.parquet  (5.5 MB)

import pandas as pd

# Candidate text catalogues, in order of preference: plain CSV first, then
# gzipped CSV, then any other gzip file. '*.gz' also matches every '*.csv.gz',
# so dict.fromkeys is used to drop those repeats while keeping the order.
# NOTE(review): '*.gz' can also pick up gzip files that are not CSV — confirm
# that is intended for this data directory.
csvs = list(dict.fromkeys(
    found
    for pattern in ('*.csv', '*.csv.gz', '*.gz')
    for found in sorted(DATA.rglob(pattern))
))
if not csvs:
    # Fail with a readable message instead of an IndexError on csvs[0].
    raise FileNotFoundError(f'No CSV or gzip files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, **kw):
    """Read a delimited text file without knowing its separator or encoding.

    ``sep=None`` together with the python engine lets pandas sniff the
    delimiter. The file is decoded as UTF-8 first; if that raises
    ``UnicodeDecodeError`` the read is retried as Latin-1.

    Args:
        path: Location of the file, handed unchanged to ``pd.read_csv``.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``nrows``).

    Returns:
        The parsed DataFrame, or ``None`` if both encodings fail to decode.
    """
    reader_options = dict(sep=None, engine='python', **kw)

    try:
        return pd.read_csv(path, encoding='utf-8', **reader_options)
    except UnicodeDecodeError:
        pass  # not valid UTF-8: fall through to the Latin-1 attempt

    try:
        return pd.read_csv(path, encoding='latin-1', **reader_options)
    except UnicodeDecodeError:
        return None

# Load only the first 100,000 rows of the chosen CSV as a working sample,
# then preview the first five rows.
df = load_csv(path=csvs[0], nrows=100_000)
df.head(5)

Using: gz_desi_deep_learning_catalog_friendly.csv

# Column names, non-null counts and dtypes of the sample.
df.info()
# Summary statistics for every column (numeric and non-numeric), one row per
# column, limited to the first 20 columns.
df.describe(include='all').transpose().head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 41 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   dr8_id                                        100000 non-null  object 
 1   ra                                            100000 non-null  float64
 2   dec                                           100000 non-null  float64
 3   brickid                                       100000 non-null  int64  
 4   objid                                         100000 non-null  int64  
 5   smooth-or-featured_smooth_fraction            100000 non-null  float64
 6   smooth-or-featured_featured-or-disk_fraction  100000 non-null  float64
 7   smooth-or-featured_artifact_fraction          100000 non-null  float64
 8   disk-edge-on_yes_fraction                     11948 non-null   float64
 9   disk-edge-on_no_fraction                      11948 non-null   float64
 10  has-spiral-arms_yes_fraction                  7351 non-null    float64
 11  has-spiral-arms_no_fraction                   7351 non-null    float64
 12  bar_strong_fraction                           7351 non-null    float64
 13  bar_weak_fraction                             7351 non-null    float64
 14  bar_no_fraction                               7351 non-null    float64
 15  bulge-size_dominant_fraction                  7351 non-null    float64
 16  bulge-size_large_fraction                     7351 non-null    float64
 17  bulge-size_moderate_fraction                  7351 non-null    float64
 18  bulge-size_small_fraction                     7351 non-null    float64
 19  bulge-size_none_fraction                      7351 non-null    float64
 20  how-rounded_round_fraction                    81439 non-null   float64
 21  how-rounded_in-between_fraction               81439 non-null   float64
 22  how-rounded_cigar-shaped_fraction             81439 non-null   float64
 23  edge-on-bulge_boxy_fraction                   2460 non-null    float64
 24  edge-on-bulge_none_fraction                   2460 non-null    float64
 25  edge-on-bulge_rounded_fraction                2460 non-null    float64
 26  spiral-winding_tight_fraction                 4685 non-null    float64
 27  spiral-winding_medium_fraction                4685 non-null    float64
 28  spiral-winding_loose_fraction                 4685 non-null    float64
 29  spiral-arm-count_1_fraction                   4685 non-null    float64
 30  spiral-arm-count_2_fraction                   4685 non-null    float64
 31  spiral-arm-count_3_fraction                   4685 non-null    float64
 32  spiral-arm-count_4_fraction                   4685 non-null    float64
 33  spiral-arm-count_more-than-4_fraction         4685 non-null    float64
 34  spiral-arm-count_cant-tell_fraction           4685 non-null    float64
 35  merging_none_fraction                         100000 non-null  float64
 36  merging_minor-disturbance_fraction            100000 non-null  float64
 37  merging_major-disturbance_fraction            100000 non-null  float64
 38  merging_merger_fraction                       100000 non-null  float64
 39  catalog_version                               100000 non-null  object 
 40  legacy_survey_data_release                    100000 non-null  object 
dtypes: float64(36), int64(2), object(3)
memory usage: 31.3+ MB

import matplotlib.pyplot as plt

# Keep only the numeric columns and draw a 50-bin histogram of the first one;
# if the frame has no numeric columns, print a hint instead.
num = df.select_dtypes(include='number')
if len(num.columns) == 0:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot.hist(bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count the rows of the full CSV without loading it all at once: pandas
# yields one DataFrame of up to 1,000,000 rows per iteration, so only a
# single chunk is held in memory at a time.
# The original cell read from an undefined name `file`; csvs[0] is the CSV
# selected earlier in this notebook.
total = 0
for chunk in pd.read_csv(csvs[0], chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- galaxy-zoo-desi.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	dr8_id	ra	dec	brickid	objid	smooth-or-featured_smooth_fraction	smooth-or-featured_featured-or-disk_fraction	smooth-or-featured_artifact_fraction	disk-edge-on_yes_fraction	disk-edge-on_no_fraction	...	spiral-arm-count_3_fraction	spiral-arm-count_4_fraction	spiral-arm-count_more-than-4_fraction	spiral-arm-count_cant-tell_fraction	merging_none_fraction	merging_minor-disturbance_fraction	merging_major-disturbance_fraction	merging_merger_fraction	catalog_version	legacy_survey_data_release
0	100000_1081	32.084931	-44.311422	100000	1081	0.69	0.25	0.06	NaN	NaN	...	NaN	NaN	NaN	NaN	0.84	0.12	0.02	0.01	1.0.0	DR8
1	100000_1401	32.140085	-44.293668	100000	1401	0.77	0.12	0.11	NaN	NaN	...	NaN	NaN	NaN	NaN	0.60	0.15	0.05	0.21	1.0.0	DR8
2	100000_1483	32.275015	-44.288957	100000	1483	0.81	0.10	0.08	NaN	NaN	...	NaN	NaN	NaN	NaN	0.59	0.17	0.04	0.19	1.0.0	DR8
3	100000_1509	32.045648	-44.287172	100000	1509	0.64	0.27	0.09	NaN	NaN	...	NaN	NaN	NaN	NaN	0.17	0.07	0.05	0.71	1.0.0	DR8
4	100000_1869	32.170627	-44.267273	100000	1869	0.88	0.05	0.07	NaN	NaN	...	NaN	NaN	NaN	NaN	0.68	0.25	0.05	0.02	1.0.0	DR8

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
dr8_id	100000	100000	100000_1081	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
ra	100000.0	NaN	NaN	NaN	158.501302	139.958766	0.000383	40.929024	79.160883	323.514389	359.992159
dec	100000.0	NaN	NaN	NaN	-43.343537	0.581483	-44.374993	-43.84433	-43.34548	-42.841351	-42.125242
brickid	100000.0	NaN	NaN	NaN	104152.80792	2432.567319	100000.0	102041.0	104129.0	106249.0	108350.0
objid	100000.0	NaN	NaN	NaN	2608.71779	1685.785361	0.0	1214.0	2448.0	3796.0	8488.0
smooth-or-featured_smooth_fraction	100000.0	NaN	NaN	NaN	0.676405	0.214371	0.02	0.6	0.77	0.83	0.91
smooth-or-featured_featured-or-disk_fraction	100000.0	NaN	NaN	NaN	0.213579	0.208996	0.02	0.07	0.12	0.28	0.95
smooth-or-featured_artifact_fraction	100000.0	NaN	NaN	NaN	0.110018	0.095357	0.03	0.07	0.09	0.11	0.91
disk-edge-on_yes_fraction	11948.0	NaN	NaN	NaN	0.309472	0.387722	0.01	0.03	0.06	0.77	0.99
disk-edge-on_no_fraction	11948.0	NaN	NaN	NaN	0.690528	0.387722	0.01	0.23	0.94	0.97	0.99
has-spiral-arms_yes_fraction	7351.0	NaN	NaN	NaN	0.815897	0.177276	0.05	0.75	0.88	0.94	0.99
has-spiral-arms_no_fraction	7351.0	NaN	NaN	NaN	0.184103	0.177276	0.01	0.06	0.12	0.25	0.95
bar_strong_fraction	7351.0	NaN	NaN	NaN	0.163595	0.14871	0.02	0.06	0.11	0.22	0.87
bar_weak_fraction	7351.0	NaN	NaN	NaN	0.32223	0.104038	0.04	0.24	0.32	0.4	0.61
bar_no_fraction	7351.0	NaN	NaN	NaN	0.514122	0.20164	0.04	0.36	0.54	0.68	0.94
bulge-size_dominant_fraction	7351.0	NaN	NaN	NaN	0.014595	0.012033	0.01	0.01	0.01	0.02	0.24
bulge-size_large_fraction	7351.0	NaN	NaN	NaN	0.059431	0.067475	0.01	0.02	0.03	0.07	0.56
bulge-size_moderate_fraction	7351.0	NaN	NaN	NaN	0.41169	0.171707	0.04	0.27	0.41	0.55	0.8
bulge-size_small_fraction	7351.0	NaN	NaN	NaN	0.432016	0.188136	0.03	0.29	0.43	0.57	0.91
bulge-size_none_fraction	7351.0	NaN	NaN	NaN	0.0821	0.122954	0.01	0.01	0.03	0.09	0.84

Galaxy Zoo DESI: Detailed Morphology Classifications

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn