from pathlib import Path

# Root directory of the dataset on disk.
DATA = Path('/srv/data/sonyc-urban-sound')

# Walk the whole tree in sorted order and print every regular file with its
# path relative to DATA and its size in megabytes (1 MB = 1e6 bytes).
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue  # directories are only traversed, never listed
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

README.md  (0.0 MB)
annotations.csv  (2.6 MB)
audio.tar.gz  (1,892.5 MB)
dcase-ust-taxonomy.yaml  (0.0 MB)

import pandas as pd

# Candidate table files in priority order: plain CSVs first, then gzipped
# CSVs, then any other gzip file.
_candidates = (
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
)
# Every '*.csv.gz' file also matches '*.gz'; dict.fromkeys drops those
# duplicates while preserving the priority order above.
csvs = list(dict.fromkeys(_candidates))
if not csvs:
    # Fail with an explicit message instead of an IndexError on csvs[0].
    raise FileNotFoundError(f'No .csv or .gz file found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, detecting the separator and the encoding.

    The separator is sniffed by pandas (``sep=None`` with the python engine).
    Each encoding in ``encodings`` is tried in order and the first one that
    decodes the file wins.

    Args:
        path: File to read; anything ``pd.read_csv`` accepts.
        encodings: Encoding names to try, in order.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``nrows``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file
            (the error from the last attempt is re-raised).
    """
    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError as err:
            # Remember the failure and fall back to the next encoding.
            last_error = err
    if last_error is None:
        raise ValueError('encodings must contain at least one encoding name')
    # Surface the failure instead of silently returning None.
    raise last_error

# Load at most the first 100,000 rows of the selected file, then display
# the top of the table.
df = load_csv(
    csvs[0],
    nrows=100_000,
)
df.head()

Using: annotations.csv

# Print column names, non-null counts and dtypes.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed
# so each column of df becomes a row; only the first 20 are displayed.
df.describe(include='all').transpose().head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10349 entries, 0 to 10348
Data columns (total 70 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   split                                         10349 non-null  object 
 1   sensor_id                                     10349 non-null  int64  
 2   audio_filename                                10349 non-null  object 
 3   annotator_id                                  10349 non-null  int64  
 4   1-1_small-sounding-engine_presence            10349 non-null  float64
 5   1-2_medium-sounding-engine_presence           10349 non-null  float64
 6   1-3_large-sounding-engine_presence            10349 non-null  float64
 7   1-X_engine-of-uncertain-size_presence         10349 non-null  float64
 8   2-1_rock-drill_presence                       10349 non-null  float64
 9   2-2_jackhammer_presence                       10349 non-null  float64
 10  2-3_hoe-ram_presence                          10349 non-null  float64
 11  2-4_pile-driver_presence                      10349 non-null  float64
 12  2-X_other-unknown-impact-machinery_presence   10349 non-null  float64
 13  3-1_non-machinery-impact_presence             10349 non-null  float64
 14  4-1_chainsaw_presence                         10349 non-null  float64
 15  4-2_small-medium-rotating-saw_presence        10349 non-null  float64
 16  4-3_large-rotating-saw_presence               10349 non-null  float64
 17  4-X_other-unknown-powered-saw_presence        10349 non-null  float64
 18  5-1_car-horn_presence                         10349 non-null  float64
 19  5-2_car-alarm_presence                        10349 non-null  float64
 20  5-3_siren_presence                            10349 non-null  float64
 21  5-4_reverse-beeper_presence                   10349 non-null  float64
 22  5-X_other-unknown-alert-signal_presence       10349 non-null  float64
 23  6-1_stationary-music_presence                 10349 non-null  float64
 24  6-2_mobile-music_presence                     10349 non-null  float64
 25  6-3_ice-cream-truck_presence                  10349 non-null  float64
 26  6-X_music-from-uncertain-source_presence      10349 non-null  float64
 27  7-1_person-or-small-group-talking_presence    10349 non-null  float64
 28  7-2_person-or-small-group-shouting_presence   10349 non-null  float64
 29  7-3_large-crowd_presence                      10349 non-null  float64
 30  7-4_amplified-speech_presence                 10349 non-null  float64
 31  7-X_other-unknown-human-voice_presence        10349 non-null  float64
 32  8-1_dog-barking-whining_presence              10349 non-null  float64
 33  1-1_small-sounding-engine_proximity           10349 non-null  object 
 34  1-2_medium-sounding-engine_proximity          10349 non-null  object 
 35  1-3_large-sounding-engine_proximity           10349 non-null  object 
 36  1-X_engine-of-uncertain-size_proximity        10349 non-null  object 
 37  2-1_rock-drill_proximity                      10349 non-null  object 
 38  2-2_jackhammer_proximity                      10349 non-null  object 
 39  2-3_hoe-ram_proximity                         10349 non-null  object 
 40  2-4_pile-driver_proximity                     10349 non-null  object 
 41  2-X_other-unknown-impact-machinery_proximity  10349 non-null  object 
 42  3-1_non-machinery-impact_proximity            10349 non-null  object 
 43  4-1_chainsaw_proximity                        10349 non-null  object 
 44  4-2_small-medium-rotating-saw_proximity       10349 non-null  object 
 45  4-3_large-rotating-saw_proximity              10349 non-null  object 
 46  4-X_other-unknown-powered-saw_proximity       10349 non-null  object 
 47  5-1_car-horn_proximity                        10349 non-null  object 
 48  5-2_car-alarm_proximity                       10349 non-null  object 
 49  5-3_siren_proximity                           10349 non-null  object 
 50  5-4_reverse-beeper_proximity                  10349 non-null  object 
 51  5-X_other-unknown-alert-signal_proximity      10349 non-null  object 
 52  6-1_stationary-music_proximity                10349 non-null  object 
 53  6-2_mobile-music_proximity                    10349 non-null  object 
 54  6-3_ice-cream-truck_proximity                 10349 non-null  object 
 55  6-X_music-from-uncertain-source_proximity     10349 non-null  object 
 56  7-1_person-or-small-group-talking_proximity   10349 non-null  object 
 57  7-2_person-or-small-group-shouting_proximity  10349 non-null  object 
 58  7-3_large-crowd_proximity                     10349 non-null  object 
 59  7-4_amplified-speech_proximity                10349 non-null  object 
 60  7-X_other-unknown-human-voice_proximity       10349 non-null  object 
 61  8-1_dog-barking-whining_proximity             10349 non-null  object 
 62  1_engine_presence                             10349 non-null  int64  
 63  2_machinery-impact_presence                   10349 non-null  int64  
 64  3_non-machinery-impact_presence               10349 non-null  float64
 65  4_powered-saw_presence                        10349 non-null  int64  
 66  5_alert-signal_presence                       10349 non-null  int64  
 67  6_music_presence                              10349 non-null  int64  
 68  7_human-voice_presence                        10349 non-null  int64  
 69  8_dog_presence                                10349 non-null  int64  
dtypes: float64(30), int64(9), object(31)
memory usage: 5.5+ MB

import matplotlib.pyplot as plt

# Histogram (50 bins) of the first numeric column, when df has one.
num = df.select_dtypes('number')
if num.shape[1] == 0:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot.hist(
        bins=50,
        figsize=(8, 4),
        title=col,
    )
    plt.tight_layout()

# Count rows without holding the whole file in memory: with chunksize set,
# read_csv yields DataFrames of at most 1,000,000 rows each.
# csvs[0] is the file selected in the loading step; the name `file` used here
# previously was never defined, so this loop raised NameError.
total = 0
for chunk in pd.read_csv(csvs[0], chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- sonyc-urban-sound.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
split	10349	2	train	7053	NaN	NaN	NaN	NaN	NaN	NaN	NaN
sensor_id	10349.0	NaN	NaN	NaN	20.042516	15.039104	0.0	5.0	19.0	37.0	42.0
audio_filename	10349	2794	00_000066.wav	22	NaN	NaN	NaN	NaN	NaN	NaN	NaN
annotator_id	10349.0	NaN	NaN	NaN	338.44584	429.420583	-5.0	57.0	158.0	427.0	1846.0
1-1_small-sounding-engine_presence	10349.0	NaN	NaN	NaN	-0.076433	0.350126	-1.0	0.0	0.0	0.0	1.0
1-2_medium-sounding-engine_presence	10349.0	NaN	NaN	NaN	-0.000193	0.452413	-1.0	0.0	0.0	0.0	1.0
1-3_large-sounding-engine_presence	10349.0	NaN	NaN	NaN	0.035559	0.489048	-1.0	0.0	0.0	0.0	1.0
1-X_engine-of-uncertain-size_presence	10349.0	NaN	NaN	NaN	-0.064741	0.36869	-1.0	0.0	0.0	0.0	1.0
2-1_rock-drill_presence	10349.0	NaN	NaN	NaN	-0.09827	0.385154	-1.0	0.0	0.0	0.0	1.0
2-2_jackhammer_presence	10349.0	NaN	NaN	NaN	-0.10001	0.382438	-1.0	0.0	0.0	0.0	1.0
2-3_hoe-ram_presence	10349.0	NaN	NaN	NaN	-0.117982	0.352656	-1.0	0.0	0.0	0.0	1.0
2-4_pile-driver_presence	10349.0	NaN	NaN	NaN	-0.121268	0.346835	-1.0	0.0	0.0	0.0	1.0
2-X_other-unknown-impact-machinery_presence	10349.0	NaN	NaN	NaN	-0.097787	0.385903	-1.0	0.0	0.0	0.0	1.0
3-1_non-machinery-impact_presence	10349.0	NaN	NaN	NaN	-0.080008	0.412146	-1.0	0.0	0.0	0.0	1.0
4-1_chainsaw_presence	10349.0	NaN	NaN	NaN	-0.127162	0.373123	-1.0	0.0	0.0	0.0	1.0
4-2_small-medium-rotating-saw_presence	10349.0	NaN	NaN	NaN	-0.128418	0.371003	-1.0	0.0	0.0	0.0	1.0
4-3_large-rotating-saw_presence	10349.0	NaN	NaN	NaN	-0.129288	0.369526	-1.0	0.0	0.0	0.0	1.0
4-X_other-unknown-powered-saw_presence	10349.0	NaN	NaN	NaN	-0.131124	0.366381	-1.0	0.0	0.0	0.0	1.0
5-1_car-horn_presence	10349.0	NaN	NaN	NaN	-0.072471	0.424379	-1.0	0.0	0.0	0.0	1.0
5-2_car-alarm_presence	10349.0	NaN	NaN	NaN	-0.124456	0.343319	-1.0	0.0	0.0	0.0	1.0

SONYC Urban Sound Tagging (SONYC-UST)¶

What's in the dataset¶

Load the data¶

First look¶

A first chart¶

Working with data larger than memory¶

Your turn¶

	split	audio_filename	annotator_id	1-1_small-sounding-engine_presence	1-2_medium-sounding-engine_presence	1-3_large-sounding-engine_presence	1-X_engine-of-uncertain-size_presence	2-1_rock-drill_presence	2-2_jackhammer_presence	...	7-X_other-unknown-human-voice_proximity	8-1_dog-barking-whining_proximity	1_engine_presence	2_machinery-impact_presence	3_non-machinery-impact_presence	4_powered-saw_presence	5_alert-signal_presence	6_music_presence	7_human-voice_presence	8_dog_presence
0	validate	00_000066.wav	95	1.0	1.0	1.0	1.0	1.0	1.0	...	far	far	1	1	1.0	1	1	1	1	1
1	validate	00_000066.wav	108	0.0	0.0	1.0	0.0	0.0	0.0	...	-1	-1	1	0	0.0	0	0	0	0	0
2	validate	00_000066.wav	127	1.0	0.0	0.0	0.0	0.0	0.0	...	-1	-1	1	0	0.0	0	0	0	0	0
3	validate	00_000118.wav	45	0.0	0.0	0.0	0.0	0.0	0.0	...	-1	-1	0	0	0.0	0	0	0	1	0
4	validate	00_000118.wav	58	0.0	1.0	0.0	0.0	0.0	0.0	...	-1	-1	1	0	0.0	0	0	0	1	0