from pathlib import Path

# Root directory of the dataset on the shared data volume.
DATA = Path('/srv/data/fruit-plants-brazil')

# List every regular file under DATA (recursively, in sorted order) together
# with its size in megabytes (1 MB = 1e6 bytes).
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue  # directories and other non-file entries are not listed
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

observations_22-08-2023.csv  (6.0 MB)

import pandas as pd

# Candidate data files, in priority order: plain CSVs, then gzip-compressed
# CSVs, then any other .gz file as a last resort.
# The '*.gz' pattern also matches every '*.csv.gz' file, so the concatenated
# list is de-duplicated with dict.fromkeys, which keeps first-seen order.
csvs = list(dict.fromkeys(
    sorted(DATA.rglob('*.csv'))
    + sorted(DATA.rglob('*.csv.gz'))
    + sorted(DATA.rglob('*.gz'))
))
# Fail with an explicit message instead of a bare IndexError on csvs[0].
if not csvs:
    raise FileNotFoundError(f'No .csv or .gz files found under {DATA}')
print('Using:', csvs[0].name)

def load_csv(path, **kw):
    """Robust reader: detects the separator and tries UTF-8 then Latin-1.

    The separator is sniffed by pandas (``sep=None`` with the Python engine).
    UTF-8 is decoded with the ``utf-8-sig`` codec so that a leading byte-order
    mark, if present, is dropped instead of ending up glued to the first
    column name; files without a BOM decode exactly as with plain UTF-8.

    Args:
        path: File path (or anything else ``pd.read_csv`` accepts) to read.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (e.g. ``nrows``).

    Returns:
        Whatever ``pd.read_csv`` returns for the first encoding that decodes
        without error (a DataFrame unless ``kw`` requests otherwise).

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
    """
    encodings = ('utf-8-sig', 'latin-1')
    for enc in encodings:
        try:
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError:
            continue
    # Fail loudly rather than silently returning None to the caller.
    raise ValueError(f'Could not decode {path!r} with any of {encodings}')

# Read at most 100,000 rows of the first discovered file so the preview stays fast.
df = load_csv(csvs[0], nrows=100_000)
# Show the first five rows (rendered as the cell output in a notebook).
df.head()

Using: observations_22-08-2023.csv

# Print the per-column dtype and non-null count summary.
df.info()
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each row describes one column of df; only the first 20 are shown.
df.describe(include='all').T.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10942 entries, 0 to 10941
Data columns (total 47 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ﻿id                               10942 non-null  int64  
 1   observed_on_string                10839 non-null  object 
 2   observed_on                       10837 non-null  object 
 3   time_observed_at                  10150 non-null  object 
 4   time_zone                         10940 non-null  object 
 5   user_id                           10942 non-null  int64  
 6   user_login                        10942 non-null  object 
 7   user_name                         9037 non-null   object 
 8   created_at                        10942 non-null  object 
 9   updated_at                        10942 non-null  object 
 10  quality_grade                     10942 non-null  object 
 11  license                           6170 non-null   object 
 12  url                               10942 non-null  object 
 13  image_url                         10942 non-null  object 
 14  tag_list                          329 non-null    object 
 15  description                       1338 non-null   object 
 16  num_identification_agreements     10942 non-null  int64  
 17  num_identification_disagreements  10942 non-null  int64  
 18  captive_cultivated                10942 non-null  bool   
 19  oauth_application_id              8222 non-null   float64
 20  place_guess                       10941 non-null  object 
 21  latitude                          10942 non-null  object 
 22  longitude                         10942 non-null  object 
 23  positional_accuracy               8832 non-null   float64
 24  private_place_guess               0 non-null      float64
 25  private_latitude                  0 non-null      float64
 26  private_longitude                 0 non-null      float64
 27  public_positional_accuracy        8899 non-null   float64
 28  geoprivacy                        67 non-null     object 
 29  taxon_geoprivacy                  3613 non-null   object 
 30  coordinates_obscured              10942 non-null  bool   
 31  positioning_method                3445 non-null   object 
 32  positioning_device                3595 non-null   object 
 33  species_guess                     9681 non-null   object 
 34  scientific_name                   10942 non-null  object 
 35  common_name                       8616 non-null   object 
 36  iconic_taxon_name                 10942 non-null  object 
 37  taxon_id                          10942 non-null  int64  
 38  taxon_kingdom_name                10942 non-null  object 
 39  taxon_phylum_name                 10942 non-null  object 
 40  taxon_class_name                  10942 non-null  object 
 41  taxon_order_name                  10942 non-null  object 
 42  taxon_family_name                 10942 non-null  object 
 43  taxon_genus_name                  10936 non-null  object 
 44  taxon_species_name                9702 non-null   object 
 45  taxon_subspecies_name             28 non-null     object 
 46  taxon_variety_name                66 non-null     object 
dtypes: bool(2), float64(6), int64(5), object(34)
memory usage: 3.8+ MB

import matplotlib.pyplot as plt

# Histogram of the first numeric column, if the frame has any.
num = df.select_dtypes('number')
if len(num.columns) == 0:
    # Nothing numeric to plot; leave the exploration to the reader.
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    series = num[col]
    series.plot.hist(bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count rows without loading the whole file: pandas yields DataFrames of at
# most `chunksize` rows each, so only one chunk is held in memory at a time.
# The original iterated over an undefined name (`file`); the file chosen
# earlier (csvs[0]) is used instead, with the same separator detection that
# load_csv applies (sep=None requires the Python engine).
total = 0
for chunk in pd.read_csv(csvs[0], sep=None, engine='python', chunksize=1_000_000):
    total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- fruit-plants-brazil.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	id	observed_on_string	observed_on	time_observed_at	time_zone	user_id	user_login	user_name	created_at	updated_at	...	taxon_id	taxon_kingdom_name	taxon_phylum_name	taxon_class_name	taxon_order_name	taxon_family_name	taxon_genus_name	taxon_species_name	taxon_subspecies_name	taxon_variety_name
0	26052	2011-07-24	24/07/2011	NaN	Brasilia	1370	designonze	Gabriela Castro	2011-07-25 23:51:48 UTC	2020-10-24 03:33:16 UTC	...	85098	Plantae	Tracheophyta	Magnoliopsida	Sapindales	Anacardiaceae	Spondias	Spondias mombin	NaN	NaN
1	122013	2012-08-16	16/08/2012	NaN	Brasilia	9421	mauricio_mercadante	Mauricio Mercadante	2012-09-13 01:36:04 UTC	2022-08-31 20:50:10 UTC	...	84838	Plantae	Tracheophyta	Magnoliopsida	Myrtales	Myrtaceae	Syzygium	Syzygium jambos	NaN	NaN
2	122037	2012-06-26	26/06/2012	NaN	Hawaii	9421	mauricio_mercadante	Mauricio Mercadante	2012-09-13 03:35:07 UTC	2022-08-31 20:50:10 UTC	...	153017	Plantae	Tracheophyta	Magnoliopsida	Fabales	Fabaceae	Dipteryx	Dipteryx alata	NaN	NaN
3	123165	2012-04-22	22/04/2012	NaN	Hawaii	9421	mauricio_mercadante	Mauricio Mercadante	2012-09-15 22:50:10 UTC	2022-08-31 20:50:10 UTC	...	118964	Plantae	Tracheophyta	Magnoliopsida	Malvales	Malvaceae	Sterculia	Sterculia striata	NaN	NaN
4	220970	2013-03-22	22/03/2013	NaN	Santiago	14169	netosevero	Francisco Severo Neto	2013-03-22 14:37:46 UTC	2023-06-28 13:52:28 UTC	...	153017	Plantae	Tracheophyta	Magnoliopsida	Fabales	Fabaceae	Dipteryx	Dipteryx alata	NaN	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
id	10942.0	NaN	NaN	NaN	90096010.014257	49771355.303694	26052.0	47738730.75	86225883.5	137934586.0	179628795.0
observed_on_string	10839	10517	2022-11-19	38	NaN	NaN	NaN	NaN	NaN	NaN	NaN
observed_on	10837	2271	02/05/2021	170	NaN	NaN	NaN	NaN	NaN	NaN	NaN
time_observed_at	10150	9970	2020-10-15 17:56:00 UTC	9	NaN	NaN	NaN	NaN	NaN	NaN	NaN
time_zone	10940	62	Brasilia	8107	NaN	NaN	NaN	NaN	NaN	NaN	NaN
user_id	10942.0	NaN	NaN	NaN	2932202.384939	1932926.099828	1370.0	1347391.0	2581998.0	4187166.0	7269362.0
user_login	10942	2765	ericfischerrempe	566	NaN	NaN	NaN	NaN	NaN	NaN	NaN
user_name	9037	2028	Eric Fischer Rempe	566	NaN	NaN	NaN	NaN	NaN	NaN	NaN
created_at	10942	10834	2020-09-04 13:37:39 UTC	3	NaN	NaN	NaN	NaN	NaN	NaN	NaN
updated_at	10942	10702	2023-07-27 18:27:45 UTC	57	NaN	NaN	NaN	NaN	NaN	NaN	NaN
quality_grade	10942	3	research	4572	NaN	NaN	NaN	NaN	NaN	NaN	NaN
license	6170	6	CC-BY-NC	5284	NaN	NaN	NaN	NaN	NaN	NaN	NaN
url	10942	10942	http://www.inaturalist.org/observations/26052	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
image_url	10942	10941	https://static.inaturalist.org/photos/34889661...	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
tag_list	329	228	UnB, Universidade de Brasília, taxonomy:binomi...	8	NaN	NaN	NaN	NaN	NaN	NaN	NaN
description	1338	1062	Fotografado pela Patrulha Centaurus	19	NaN	NaN	NaN	NaN	NaN	NaN	NaN
num_identification_agreements	10942.0	NaN	NaN	NaN	0.866112	0.853433	0.0	0.0	1.0	1.0	6.0
num_identification_disagreements	10942.0	NaN	NaN	NaN	0.007677	0.090371	0.0	0.0	0.0	0.0	2.0
captive_cultivated	10942	2	False	7736	NaN	NaN	NaN	NaN	NaN	NaN	NaN
oauth_application_id	8222.0	NaN	NaN	NaN	9.53819	48.331838	2.0	2.0	2.0	3.0	524.0

Fruit-Bearing Plant Species Observations in Brazilian Cities (iNaturalist)

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn