Skip to content

sst.transform

Transform utilities for preparing SST and ENSO time series.

join_on_month(sst, enso, start=None)

Left-join ENSO records onto SST records using their shared monthly date column, so every SST row is kept even when no ENSO value exists for that month.

Parameters:

Name Type Description Default
sst DataFrame

Sea surface temperature observations produced by :func:tidy.

required
enso DataFrame

ENSO index observations produced by :func:tidy.

required
start str

Earliest date to retain after joining (inclusive). Parsed with :func:pandas.to_datetime if provided.

None

Returns:

Type Description
DataFrame

DataFrame containing the merged records, filtered to start when supplied, and indexed consecutively.

Examples:

>>> import pandas as pd
>>> sst = tidy(pd.DataFrame({"date": ["2000-01-01"], "sst_c": [20.0]}), "date", "sst_c")
>>> enso = tidy(pd.DataFrame({"date": ["2000-01-01"], "nino34": [0.5]}), "date", "nino34")
>>> join_on_month(sst, enso).columns.tolist()
['date', 'sst_c', 'sst_c_roll12', 'nino34', 'nino34_roll12']
Source code in src/sst/transform.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def join_on_month(sst: pd.DataFrame, enso: pd.DataFrame, start: str | None = None) -> pd.DataFrame:
    """Join SST and ENSO records on their monthly ``date`` column.

    Parameters
    ----------
    sst : pandas.DataFrame
        Sea surface temperature observations produced by :func:`tidy`.
    enso : pandas.DataFrame
        ENSO index observations produced by :func:`tidy`.
    start : str, optional
        Earliest date to retain after joining (inclusive). Parsed with
        :func:`pandas.to_datetime` if provided.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the merged records, filtered to ``start`` when
        supplied, and indexed consecutively.

    Examples
    --------
    >>> import pandas as pd
    >>> sst = tidy(pd.DataFrame({"date": ["2000-01-01"], "sst_c": [20.0]}), "date", "sst_c")
    >>> enso = tidy(pd.DataFrame({"date": ["2000-01-01"], "nino34": [0.5]}), "date", "nino34")
    >>> join_on_month(sst, enso).columns.tolist()
    ['date', 'sst_c', 'sst_c_roll12', 'nino34', 'nino34_roll12']
    """

    df = pd.merge(sst, enso, on="date", how="left")
    if start:
        df = df[df["date"] >= pd.to_datetime(start)]
    return df.reset_index(drop=True)

metrics(df)

Summarize rolling SST and ENSO time series with key indicators.

Parameters:

Name Type Description Default
df DataFrame

Joined SST and ENSO tidy data that contains a date column along with at least one rolling SST column (sst_c_roll*) and one rolling ENSO column (nino34_roll*).

required

Returns:

Type Description
DataFrame

Single-row DataFrame containing trend, delta, correlation, and record count statistics for the supplied series.

Examples:

>>> import pandas as pd
>>> joined = join_on_month(
...     tidy(pd.DataFrame({"date": ["2000-01-01"], "sst_c": [20.0]}), "date", "sst_c"),
...     tidy(pd.DataFrame({"date": ["2000-01-01"], "nino34": [0.5]}), "date", "nino34"),
... )
>>> metrics(joined).columns.tolist()
['sst_trend_c_per_decade', 'delta_sst_last_yr_c', 'delta_enso_last_yr', 'corr_sst_enso_roll', 'n_months']
Source code in src/sst/transform.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row summary of the rolling SST and ENSO series.

    The 12-period rolling columns (``sst_c_roll12`` and ``nino34_roll12``)
    are used when present; otherwise the first column whose name starts with
    ``sst_c_roll`` / ``nino34_roll`` is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Joined SST and ENSO tidy data with a ``date`` column plus at least
        one rolling SST column (``sst_c_roll*``) and one rolling ENSO column
        (``nino34_roll*``).

    Returns
    -------
    pandas.DataFrame
        Single-row DataFrame with the columns ``sst_trend_c_per_decade``,
        ``delta_sst_last_yr_c``, ``delta_enso_last_yr``,
        ``corr_sst_enso_roll`` and ``n_months``. Numeric statistics are
        rounded to three decimals, and a statistic that comes back missing is
        reported as ``None``. ``n_months`` is the number of rows in ``df``.

    Raises
    ------
    IndexError
        If no column matches the rolling SST or rolling ENSO prefix.

    Examples
    --------
    >>> import pandas as pd
    >>> joined = join_on_month(
    ...     tidy(pd.DataFrame({"date": ["2000-01-01"], "sst_c": [20.0]}), "date", "sst_c"),
    ...     tidy(pd.DataFrame({"date": ["2000-01-01"], "nino34": [0.5]}), "date", "nino34"),
    ... )
    >>> metrics(joined).columns.tolist()
    ['sst_trend_c_per_decade', 'delta_sst_last_yr_c', 'delta_enso_last_yr', 'corr_sst_enso_roll', 'n_months']
    """

    def _pick_column(columns, preferred, prefix):
        # Favor the canonical 12-period column; otherwise take the first
        # prefix match (IndexError when there is none).
        if preferred in columns:
            return preferred
        return [name for name in columns if name.startswith(prefix)][0]

    def _round3(value):
        # Missing statistics are surfaced as None rather than NaN.
        return round(value, 3) if pd.notna(value) else None

    indexed = df.set_index("date")

    sst_name = _pick_column(indexed.columns, "sst_c_roll12", "sst_c_roll")
    enso_name = _pick_column(indexed.columns, "nino34_roll12", "nino34_roll")
    sst_series = indexed[sst_name]
    enso_series = indexed[enso_name]

    # The trend helper receives only the non-missing SST values.
    trend_per_decade = _simple_trend(sst_series.dropna(), per="decade")

    sst_change = _delta_last_year(sst_series)
    enso_change = _delta_last_year(enso_series)

    correlation = sst_series.corr(enso_series)
    if pd.notna(correlation):
        correlation_out = round(float(correlation), 3)
    else:
        correlation_out = None

    summary = {
        "sst_trend_c_per_decade": _round3(trend_per_decade),
        "delta_sst_last_yr_c": _round3(sst_change),
        "delta_enso_last_yr": _round3(enso_change),
        "corr_sst_enso_roll": correlation_out,
        "n_months": len(indexed),
    }
    return pd.DataFrame([summary])

tidy(df, date_col, value_col, roll=12)

Create a tidy, chronologically ordered DataFrame with rolling means.

Parameters:

Name Type Description Default
df DataFrame

Raw input data containing at least the date and value columns.

required
date_col str

Name of the column with dates parsable by :func:pandas.to_datetime.

required
value_col str

Name of the column with the measurement to smooth.

required
roll int

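Rolling window size (number of observations) used to compute the mean. Windows with fewer than roll observations are still averaged, so the earliest rows are not left empty.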

12

Returns:

Type Description
DataFrame

Sorted copy of the original data with a new column containing the rolling mean named "{value_col}_roll{roll}".

Examples:

>>> import pandas as pd
>>> raw = pd.DataFrame({"date": ["2000-01-01", "2000-02-01"], "sst_c": [20.0, 20.1]})
>>> tidy(raw, "date", "sst_c").columns.tolist()
['date', 'sst_c', 'sst_c_roll12']
Source code in src/sst/transform.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def tidy(df: pd.DataFrame, date_col: str, value_col: str, roll: int = 12) -> pd.DataFrame:
    """Return a date-sorted two-column frame with a rolling mean appended.

    Only ``date_col`` and ``value_col`` are carried over. Rows with a missing
    date or value are dropped before the rolling mean is computed, and the
    input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data containing at least the date and value columns.
    date_col : str
        Name of the column with dates parsable by :func:`pandas.to_datetime`.
    value_col : str
        Name of the column with the measurement to smooth.
    roll : int, default=12
        Rolling window size (number of observations). Partial windows are
        averaged as well (``min_periods=1``).

    Returns
    -------
    pandas.DataFrame
        Chronologically sorted copy of the two selected columns plus a
        rolling-mean column named ``"{value_col}_roll{roll}"``. The original
        row labels are kept (the index is not reset).

    Examples
    --------
    >>> import pandas as pd
    >>> raw = pd.DataFrame({"date": ["2000-01-01", "2000-02-01"], "sst_c": [20.0, 20.1]})
    >>> tidy(raw, "date", "sst_c").columns.tolist()
    ['date', 'sst_c', 'sst_c_roll12']
    """

    rolling_name = f"{value_col}_roll{roll}"

    # Work on a copy so the caller's frame is never touched.
    frame = df[[date_col, value_col]].copy()
    frame[date_col] = pd.to_datetime(frame[date_col])

    frame = frame.sort_values(date_col)
    frame = frame.dropna()

    window = frame[value_col].rolling(roll, min_periods=1)
    frame[rolling_name] = window.mean()
    return frame