from pathlib import Path

# Root folder that holds the dataset files.
DATA = Path('/srv/data/marcsi-marine')

# Walk the folder recursively and print each regular file with its size,
# expressed in megabytes (bytes / 1e6) and shown relative to DATA.
for entry in sorted(DATA.rglob('*')):
    if not entry.is_file():
        continue
    size_mb = entry.stat().st_size / 1e6
    print(f"{entry.relative_to(DATA)}  ({size_mb:,.1f} MB)")

MARCSI database_final.csv  (1.4 MB)
MARCSI database_final_csvw.json  (0.0 MB)

import pandas as pd

# Collect candidate data files: plain CSVs first, then gzipped CSVs, then any
# other gzip file. Each pattern's matches are sorted on their own so the
# priority order between patterns is kept.
_candidates = [
    p
    for pattern in ('*.csv', '*.csv.gz', '*.gz')
    for p in sorted(DATA.rglob(pattern))
    if p.is_file()  # a directory whose name matches a pattern is not data
]
# '*.gz' also matches every '*.csv.gz' file; dict.fromkeys drops the repeats
# while keeping the first occurrence (and therefore the order above).
csvs = list(dict.fromkeys(_candidates))
if not csvs:
    # Fail with a message that names the folder instead of a bare IndexError.
    raise FileNotFoundError(f"No .csv or .gz files found under {DATA}")
print('Using:', csvs[0].name)

def load_csv(path, *, encodings=('utf-8', 'latin-1'), **kw):
    """Read a delimited text file, sniffing the separator and trying encodings in turn.

    The file is parsed with pandas' Python engine and ``sep=None`` so the
    delimiter is detected from the file itself. Each encoding in *encodings*
    is attempted in order and the first one that decodes the file wins.

    Args:
        path: File to read; forwarded to ``pd.read_csv`` unchanged.
        encodings: Encoding names to try, in order. Defaults to UTF-8 followed
            by Latin-1.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``nrows``).

    Returns:
        Whatever ``pd.read_csv`` returns for the first encoding that works.

    Raises:
        ValueError: If *encodings* is empty.
        UnicodeDecodeError: If none of the encodings can decode the file; the
            error from the last attempt is re-raised rather than silently
            returning ``None``.
    """
    encodings = tuple(encodings)
    if not encodings:
        raise ValueError('encodings must contain at least one encoding name')

    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, sep=None, engine='python', encoding=enc, **kw)
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next candidate.
            last_error = err
    raise last_error

# Load at most the first 100,000 rows of the first discovered file, then
# preview the top five rows.
df = load_csv(path=csvs[0], nrows=100_000)
df.head(n=5)

Using: MARCSI database_final.csv

# Print the schema (column names, non-null counts, dtypes), then show summary
# statistics with one row per column, limited to the first 20 columns.
df.info()
df.describe(include='all').transpose().head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267 entries, 0 to 1266
Data columns (total 29 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   Id                                                                 1267 non-null   int64 
 1   Marine citizen science initiative title                            1267 non-null   object
 2   Description / initiative summary                                   1266 non-null   object
 3   Initiative topic / keywords                                        1256 non-null   object
 4   Scientific topic (1)                                               1267 non-null   object
 5   Scientific topic (2)                                               159 non-null    object
 6   Scientific topic (3)                                               12 non-null     object
 7   Start date                                                         1213 non-null   object
 8   End date                                                           1176 non-null   object
 9   Aim / intention / purpose of citizen science initiative / project  1258 non-null   object
 10  Marine focus area                                                  1256 non-null   object
 11  Geographical scale                                                 1257 non-null   object
 12  Geographic location - country/countries                            1234 non-null   object
 13  Initiative documentation                                           1258 non-null   object
 14  Initiative host / coordinator                                      1075 non-null   object
 15  Initiative contact and email                                       1043 non-null   object
 16  Sponsor / funding                                                  1022 non-null   object
 17  Status                                                             1248 non-null   object
 18  Data collection methodology                                        1015 non-null   object
 19  Type of data collected                                             1015 non-null   object
 20  Quantity of data collected                                         1003 non-null   object
 21  Indicators used                                                    1004 non-null   object
 22  Open Access                                                        1267 non-null   object
 23  Findable                                                           690 non-null    object
 24  Accessible                                                         690 non-null    object
 25  Interoperable                                                      690 non-null    object
 26  Reusable                                                           690 non-null    object
 27  Distribution                                                       459 non-null    object
 28  License                                                            41 non-null     object
dtypes: int64(1), object(28)
memory usage: 287.2+ KB

import matplotlib.pyplot as plt

# Draw a 50-bin histogram of the first numeric column; when the frame has no
# numeric columns, print a hint instead.
num = df.select_dtypes(include='number')
if len(num.columns) == 0:
    print('No direct numeric columns: explore df on your own.')
else:
    col = num.columns[0]
    num[col].plot(kind='hist', bins=50, figsize=(8, 4), title=col)
    plt.tight_layout()

# Count rows without holding the whole file in memory: stream it in chunks of
# up to 1,000,000 rows and add up the chunk lengths.
# `file` is the path to stream; here it is the CSV selected in the load step.
# Replace it with any other path to count a different file.
file = csvs[0]
total = 0
# sep=None with the Python engine sniffs the delimiter, matching load_csv.
# Pass encoding=... as well if the file is not UTF-8.
# The context manager closes the file handle even if parsing fails midway.
with pd.read_csv(file, sep=None, engine='python', chunksize=1_000_000) as reader:
    for chunk in reader:
        total += len(chunk)

# ⚠️ RESTORE: this DISCARDS YOUR CHANGES to this notebook and resets it to the original.
# 1. Uncomment the line below (remove the #)   2. Run this cell
# 3. Then: menu File → Reload Notebook from Disk

# !git -C ~/citizen-science-data fetch -q origin && git -C ~/citizen-science-data checkout origin/main -- marcsi-marine.ipynb && echo "Restored. Now: File → Reload Notebook from Disk"

	Id	Marine citizen science initiative title	Description / initiative summary	Initiative topic / keywords	Scientific topic (1)	Scientific topic (2)	Scientific topic (3)	Start date	End date	Aim / intention / purpose of citizen science initiative / project	...	Type of data collected	Quantity of data collected	Indicators used	Open Access	Findable	Accessible	Interoperable	Reusable	Distribution	License
0	1	Tangaroa Blue Foundation Australian Marine D...	Australian national project to quantify and re...	Marine Debris, Partnerships	Ecology (e.g. coastal ecology, the state of ce...	NaN	NaN	2004	Present	Tangaroa Blue Foundation is an Australian-wide...	...	Marine Debris (plastic, glass, ceramic & const...	Not found	See Marine Debris app	Yes (raw data is available)	No	Yes	No	No	https://www.tangaroablue.org/database/	NaN
1	2	Litter Intelligence	Litter Intelligence is Aotearoas first and on...	Pollution, plastic, marine litter	Pollution (e.g. marine litter or the effect of...	NaN	NaN	2018	Present	To collect and input litter data, provide insi...	...	Litter - platics, glass and ceramics, paper ...	Litter density on Beach - 320 Average items pe...	litter density, plastic percentage	Yes (raw data is available)	No	Yes	No	Yes	https://litterintelligence.org/data/	NZ - CC 3 NC
2	3	Citclops	The Citclops project aims to develop systems t...	water quality; water colour;	Environmental variables (e.g. water quality, t...	NaN	NaN	2012	2015	Citclops aims: \n1. To enable citizens partic...	...	Colour and clarity of the water	See the apps	Ocean colour via Forel-Ule index	Yes (raw data is available)	No	Yes	No	No	http://www.citclops.eu/	NaN
3	4	Marine Debris Tracker	Designed to help citizen scientists by contrib...	marine litter, plastic	Pollution (e.g. marine litter or the effect of...	NaN	NaN	2010	Present	The Marine Debris Tracker mobile app, is a joi...	...	plastic, metal, glass, rubber, cloth, paper & ...	6,881,410 debris items collected	Total Debris count, total collection events	Yes (raw data is available)	No	Yes	No	No	https://debristracker.org/data	NaN
4	5	Phytoplankton Monitoring Network (PMN)	Better understanding of harmful algal blooms t...	Phytoplankton, harmful algal blooms	Environmental variables (e.g. water quality, t...	NaN	NaN	2001	Present	The National Phytoplankton Monitoring Network ...	...	date, time of sampling, water and air temperat...	See the on-line data map> https://www.ncei.noa...	Date, species, water conditions (water tempera...	Yes (raw data is available)	No	Yes	No	No	https://www.ncei.noaa.gov/maps/phytoplankton/v...	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Id	1267.0	NaN	NaN	NaN	634.0	365.895705	1.0	317.5	634.0	950.5	1267.0
Marine citizen science initiative title	1267	1266	Marine animals and coral reef monitoring along...	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Description / initiative summary	1266	1249	The objective of the network is to improve kno...	5	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Initiative topic / keywords	1256	847	Species monitoring	66	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Scientific topic (1)	1267	9	Single species (e.g. marine mammals, fish, bir...	494	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Scientific topic (2)	159	7	Biodiversity (e.g. collecting information on m...	42	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Scientific topic (3)	12	6	Pollution (e.g. marine litter or the effect of...	4	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Start date	1213	125	Not found	271	NaN	NaN	NaN	NaN	NaN	NaN	NaN
End date	1176	54	Present	580	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Aim / intention / purpose of citizen science initiative / project	1258	1239	Our aim is to secure the future of Critically ...	5	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Marine focus area	1256	8	On shore	308	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Geographical scale	1257	12	National	383	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Geographic location - country/countries	1234	560	Global	151	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Initiative documentation	1258	1083	Not found	148	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Initiative host / coordinator	1075	732	Not found	123	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Initiative contact and email	1043	655	Not found	289	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Sponsor / funding	1022	353	Not found	590	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Status	1248	7	active	703	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Data collection methodology	1015	188	Not found	438	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Type of data collected	1015	202	Not found	395	NaN	NaN	NaN	NaN	NaN	NaN	NaN

MARCSI: Inventory of Marine Citizen Science Initiatives (Global)

What's in the dataset

Load the data

First look

A first chart

Working with data larger than memory

Your turn