Cross-Platform Temporal Analysis: Spotify × Strava¶


Author: Andrew Bonnah
Last updated: 2026
Tools: Python · pandas · scipy · matplotlib · seaborn


Spotify and Strava have no shared schema — no common IDs, no linked accounts. The only thread connecting a song to a run is time. This notebook builds a temporal join from scratch, constructs performance features at the run and artist level, and runs a proper statistical validation to ask whether my listening habits have any measurable relationship with how I perform.

Three performance dimensions are examined: pace (avg speed), volume (distance), and terrain (total elevation gain). Elevation is a particularly interesting target because it's largely determined by route choice before a run starts — making it a plausible proxy for intent, and a realistic confounder for any speed or artist-level finding.

Honest framing: This is correlational, not causal. Music choices reflect intent — I reach for harder tracks on days I plan to push. Any association between artist and pace (or elevation) is as likely to capture that selection effect as any direct physiological impact. The goal is to build the right infrastructure and see if signal even exists.

Table of Contents¶

  1. Imports & Configuration
  2. Data Loading & Cleaning
  3. Temporal Join: Linking Songs to Runs
  4. Feature Engineering
  5. Exploratory Data Analysis
    • 5.1 Running baseline distributions (speed, distance, elevation)
    • 5.2 Temporal patterns (when I run)
    • 5.3 Listening behavior during runs
    • 5.4 Artist associations with speed, distance & elevation
    • 5.5 Most frequent artists
    • 5.6 Music data coverage check
  6. Statistical Validation
    • 6.1 Correlation matrix (now includes elevation)
    • 6.2 Welch's t-test: top artists vs rest
    • 6.3 Effect size: Cohen's d
    • 6.4 Time-of-day confounder (ANOVA)
  7. Elevation Deep-Dive
  8. Confounder Analysis
  9. Key Findings
  10. Limitations
  11. What's Next

1. Imports & Configuration¶

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import seaborn as sns
from datetime import timedelta
from scipy import stats

# ── Plot style ──────────────────────────────────────────────────────────────
PALETTE = {
    'primary':   '#1DB954',   # Spotify green
    'secondary': '#E8450A',   # warm orange
    'accent':    '#3D7FBF',   # steel blue
    'neutral':   '#888888',
    'bg':        '#F9F9F8',
}

plt.rcParams.update({
    'figure.dpi':          130,
    'figure.facecolor':    PALETTE['bg'],
    'axes.facecolor':      PALETTE['bg'],
    'axes.spines.top':     False,
    'axes.spines.right':   False,
    'axes.spines.left':    False,
    'axes.spines.bottom':  False,
    'axes.grid':           True,
    'grid.alpha':          0.25,
    'grid.linestyle':      '--',
    'font.family':         'sans-serif',
    'font.size':           11,
    'axes.titlesize':      13,
    'axes.titleweight':    'semibold',
    'axes.labelcolor':     '#444',
    'xtick.color':         '#666',
    'ytick.color':         '#666',
})

def style_ax(ax, title=None, xlabel=None, ylabel=None):
    """Apply consistent finishing touches to an axes."""
    if title:   ax.set_title(title, pad=10)
    if xlabel:  ax.set_xlabel(xlabel, labelpad=8)
    if ylabel:  ax.set_ylabel(ylabel, labelpad=8)
    ax.tick_params(length=0)
    return ax

def safe_label(name):
    """Escape $ signs so matplotlib doesn't treat them as LaTeX math delimiters."""
    return str(name).replace('$', r'\$')

print('Libraries loaded.')
Libraries loaded.

2. Data Loading & Cleaning¶

Both datasets come from personal data export requests — no API, no scraping.

Dataset                     Export format   Key constraint
Strava activities           CSV             One row per activity; includes all sport types
Spotify streaming history   JSON            ~12 months max; only endTime provided (no start)

2.1 Strava¶

In [2]:
strava_df = pd.read_csv('strava_activities.csv')
print(f'Raw shape: {strava_df.shape}')

# Drop administrative/social columns that add no analytical value
drop_cols = [
    'resource_state', 'athlete', 'comment_count', 'photo_count', 'total_photo_count',
    'athlete_count', 'kudos_count', 'has_kudoed', 'pr_count', 'trainer', 'commute',
    'manual', 'private', 'visibility', 'flagged', 'from_accepted_tag',
    'display_hide_heartrate_option', 'heartrate_opt_out', 'upload_id',
    'upload_id_str', 'external_id', 'gear_id', 'location_city', 'location_state',
    'location_country', 'workout_type', 'type', 'sport_type', 'device_name', 'has_heartrate'
]
strava_df = strava_df.drop(columns=[c for c in drop_cols if c in strava_df.columns])

# Parse timestamps; derive end time from elapsed seconds
strava_df['start_date'] = pd.to_datetime(strava_df['start_date'])
strava_df['end_date']   = strava_df['start_date'] + pd.to_timedelta(strava_df['elapsed_time'], unit='s')

# Unit conversions to imperial
strava_df['avg_speed_mph']    = strava_df['average_speed'] * 2.23694
strava_df['distance_miles']   = strava_df['distance']      * 0.000621371
strava_df['elapsed_minutes']  = strava_df['elapsed_time']  / 60

# Elevation — Strava exports in meters, convert to feet
# total_elevation_gain: cumulative ascent over the run (descents not counted)
if 'total_elevation_gain' in strava_df.columns:
    strava_df['elevation_gain_ft'] = strava_df['total_elevation_gain'] * 3.28084
    # Elevation gain per mile — controls for the fact that longer runs naturally gain more
    strava_df['elev_gain_per_mile'] = (
        strava_df['elevation_gain_ft'] / strava_df['distance_miles'].replace(0, np.nan)
    )
    print(f'Elevation data present: {strava_df["elevation_gain_ft"].notna().sum()} runs with values')
    print(f'  Mean elevation gain: {strava_df["elevation_gain_ft"].mean():.0f} ft')
    print(f'  Max elevation gain:  {strava_df["elevation_gain_ft"].max():.0f} ft')
else:
    print('⚠  total_elevation_gain column not found — elevation analysis will be skipped')
    strava_df['elevation_gain_ft'] = np.nan
    strava_df['elev_gain_per_mile'] = np.nan

# Time features for confounder analysis
strava_df['hour_of_day'] = strava_df['start_date'].dt.hour
strava_df['day_of_week'] = strava_df['start_date'].dt.day_name()
strava_df['month']       = strava_df['start_date'].dt.to_period('M').astype(str)
strava_df['week']        = strava_df['start_date'].dt.isocalendar().week

print(f'\nCleaned shape: {strava_df.shape}')
strava_df[['start_date', 'end_date', 'avg_speed_mph', 'distance_miles',
           'elapsed_minutes', 'elevation_gain_ft']].head()
Raw shape: (73, 48)
Elevation data present: 73 runs with values
  Mean elevation gain: 139 ft
  Max elevation gain:  455 ft

Cleaned shape: (73, 28)
Out[2]:
start_date end_date avg_speed_mph distance_miles elapsed_minutes elevation_gain_ft
0 2026-04-24 18:58:53+00:00 2026-04-24 19:58:09+00:00 6.625816 6.317479 59.266667 232.611556
1 2026-04-22 17:02:02+00:00 2026-04-22 17:59:06+00:00 6.549760 6.120132 57.066667 238.845152
2 2026-04-21 19:04:13+00:00 2026-04-21 19:28:09+00:00 7.296898 2.898261 23.933333 79.068244
3 2026-04-20 18:35:37+00:00 2026-04-20 19:26:29+00:00 6.388701 5.394619 50.866667 181.102368
4 2026-04-19 17:12:34+00:00 2026-04-19 18:06:12+00:00 5.822755 4.733480 53.633333 149.278220

2.2 Spotify¶

In [3]:
spotify_df = pd.read_json('Spotify_Account_Data/StreamingHistory_music_0.json')
print(f'Raw shape: {spotify_df.shape}')

# Normalize timestamps and derive startTime
# Spotify only gives endTime — we back-calculate from msPlayed
spotify_df['endTime']      = pd.to_datetime(spotify_df['endTime']).dt.tz_localize(None)
spotify_df['startTime']    = spotify_df['endTime'] - pd.to_timedelta(spotify_df['msPlayed'], unit='ms')
spotify_df['duration_sec'] = spotify_df['msPlayed'] / 1000

# Filter out skipped tracks (played < 10s — probably accidental)
before = len(spotify_df)
spotify_df = spotify_df[spotify_df['duration_sec'] >= 10].copy()
print(f'Dropped {before - len(spotify_df)} skipped/accidental plays (<10s)')

print(f'\nDate range: {spotify_df["endTime"].min().date()}  →  {spotify_df["endTime"].max().date()}')
print(f'Unique artists: {spotify_df["artistName"].nunique():,}')
print(f'Unique tracks:  {spotify_df["trackName"].nunique():,}')
spotify_df.head(3)
Raw shape: (8076, 4)
Dropped 846 skipped/accidental plays (<10s)

Date range: 2025-04-29  →  2026-04-28
Unique artists: 1,397
Unique tracks:  3,764
Out[3]:
endTime artistName trackName msPlayed startTime duration_sec
0 2025-04-29 21:22:00 Mick Jenkins Jazz 231751 2025-04-29 21:18:08.249 231.751
3 2025-04-29 22:09:00 Yellowman Zungguzungguguzungguzeng 24796 2025-04-29 22:08:35.204 24.796
5 2025-04-30 01:23:00 The Smashing Pumpkins Zero - Remastered 2012 160173 2025-04-30 01:20:19.827 160.173

3. Temporal Join: Linking Songs to Runs¶

This is the core engineering challenge. With no shared key, the join has to be built on time alone.

Logic: For each run, scan every Spotify play event and keep tracks where more than 50% of the song's duration fell inside the run window.

The 50% threshold is a deliberate design call:

  • Too low (e.g. 10%) → includes songs that barely overlapped at the edges
  • Too high (e.g. 90%) → misses songs that started just before the run or finished just after
  • 50% is a reasonable "actually listened to this during the run" bar

This threshold is a parameter, not a truth. Different values produce different join results and different downstream conclusions.

In [4]:
def song_in_run(row, run_start, run_end):
    """
    Returns True if >50% of this track's play duration overlapped
    with the run window [run_start, run_end].
    """
    overlap_start = max(row['startTime'], run_start)
    overlap_end   = min(row['endTime'],   run_end)
    overlap_sec   = (overlap_end - overlap_start).total_seconds()
    song_dur_sec  = row['duration_sec']

    if song_dur_sec <= 0:
        return False
    return overlap_sec > 0.5 * song_dur_sec


# Strip timezone info from Strava so comparisons work cleanly
strava_naive = strava_df.copy()
strava_naive['start_date'] = strava_naive['start_date'].dt.tz_localize(None)
strava_naive['end_date']   = strava_naive['end_date'].dt.tz_localize(None)

run_songs_dict = {}
for _, run in strava_naive.iterrows():
    mask = spotify_df.apply(
        song_in_run, axis=1,
        run_start=run['start_date'],
        run_end=run['end_date']
    )
    run_songs_dict[run['id']] = spotify_df[mask].index.tolist()

matched   = sum(1 for v in run_songs_dict.values() if len(v) > 0)
unmatched = len(strava_df) - matched
match_rate = matched / len(strava_df)

print(f'Total runs:           {len(strava_df):>5}')
print(f'Runs with music data: {matched:>5}  ({match_rate:.1%})')
print(f'Runs without match:   {unmatched:>5}  (predates Spotify export window)')
Total runs:              73
Runs with music data:    40  (54.8%)
Runs without match:      33  (predates Spotify export window)
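Since the 50% cutoff is a free parameter, a quick sensitivity sweep shows how much the join shifts with it. A minimal sketch on toy timestamps (the real check would sweep `song_in_run` across `spotify_df`; `overlap_fraction` here is an illustrative helper, not part of the notebook):

```python
from datetime import datetime, timedelta

def overlap_fraction(play_start, play_end, run_start, run_end):
    """Fraction of a play's duration that fell inside the run window."""
    overlap = (min(play_end, run_end) - max(play_start, run_start)).total_seconds()
    duration = (play_end - play_start).total_seconds()
    return max(overlap, 0.0) / duration if duration > 0 else 0.0

# Toy run window and three plays: fully inside, straddling the start, after the run
run_start = datetime(2026, 4, 24, 19, 0)
run_end   = run_start + timedelta(minutes=60)
plays = [
    (datetime(2026, 4, 24, 19, 10), datetime(2026, 4, 24, 19, 14)),  # fully inside
    (datetime(2026, 4, 24, 18, 58), datetime(2026, 4, 24, 19, 2)),   # straddles start (50% in)
    (datetime(2026, 4, 24, 21, 0),  datetime(2026, 4, 24, 21, 4)),   # after the run
]

for threshold in (0.1, 0.5, 0.9):
    kept = sum(
        overlap_fraction(s, e, run_start, run_end) > threshold
        for s, e in plays
    )
    print(f'threshold={threshold:.0%}: {kept} of {len(plays)} plays matched')
```

At 10% the straddling play sneaks in; at 50% and above only the fully-contained play survives. Run against the real data, the matched-song counts would shift the same way, which is why the threshold choice deserves to be stated explicitly.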

3.1 Data Coverage: Where Does the Overlap Actually Fall?¶

In [5]:
# Build monthly coverage summary
run_match_summary = [
    {
        'run_id':     run['id'],
        'start_date': run['start_date'],
        'month':      run['month'],
        'n_songs':    len(run_songs_dict.get(run['id'], [])),
        'has_music':  len(run_songs_dict.get(run['id'], [])) > 0
    }
    for _, run in strava_df.iterrows()
]
match_df = pd.DataFrame(run_match_summary)
match_df['start_date'] = pd.to_datetime(match_df['start_date'])

monthly = match_df.copy()
monthly['ym'] = monthly['start_date'].dt.to_period('M')
monthly_agg = monthly.groupby('ym').agg(
    total_runs   =('run_id',    'count'),
    matched_runs =('has_music', 'sum')
).reset_index()
monthly_agg['match_rate'] = monthly_agg['matched_runs'] / monthly_agg['total_runs']
monthly_agg['ym_str']     = monthly_agg['ym'].astype(str)

fig, axes = plt.subplots(1, 2, figsize=(16, 5), facecolor=PALETTE['bg'])
fig.suptitle('Data Coverage: Spotify–Strava Temporal Overlap by Month', fontsize=14, y=1.02)

# Stacked bar
axes[0].bar(monthly_agg['ym_str'], monthly_agg['total_runs'],
            color=PALETTE['neutral'], alpha=0.4, label='No music match', width=0.7)
axes[0].bar(monthly_agg['ym_str'], monthly_agg['matched_runs'],
            color=PALETTE['primary'], alpha=0.9, label='Music matched', width=0.7)
style_ax(axes[0], 'Run Count vs. Matched Runs by Month', 'Month', 'Runs')
axes[0].tick_params(axis='x', rotation=45)
axes[0].legend(frameon=False)

# Match rate line
axes[1].fill_between(monthly_agg['ym_str'], monthly_agg['match_rate'] * 100,
                     alpha=0.15, color=PALETTE['primary'])
axes[1].plot(monthly_agg['ym_str'], monthly_agg['match_rate'] * 100,
             color=PALETTE['primary'], marker='o', linewidth=2.5, markersize=5)
axes[1].axhline(50, color=PALETTE['secondary'], linestyle='--', alpha=0.6, label='50%')
style_ax(axes[1], 'Music Match Rate by Month (%)', 'Month', 'Match Rate (%)')
axes[1].tick_params(axis='x', rotation=45)
axes[1].set_ylim(0, 105)
axes[1].legend(frameon=False)

plt.tight_layout()
plt.show()
print(f'Coverage note: months with <50% match rate are outside the Spotify export window.')
[Figure: Data Coverage: Spotify–Strava Temporal Overlap by Month]
Coverage note: months with <50% match rate are outside the Spotify export window.

4. Feature Engineering¶

With the join built, I construct features at two levels:

Run level — characterizes the listening experience per run
Artist level — aggregates performance metrics across all runs an artist appeared in

Three performance targets are tracked per artist:

  • avg_speed — mean pace across matched runs
  • avg_distance — mean distance
  • avg_elevation — mean total elevation gain (ft); tells us whether an artist tends to appear on hilly vs flat routes

Artists who only appeared in a single run are excluded from artist-level comparisons — a one-run average has no variance estimate and little interpretive value.
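The exclusion is mechanical as much as statistical: with pandas' default `ddof=1`, the std of a single observation is NaN, so a one-run artist would carry an undefined error bar. A one-liner check:

```python
import pandas as pd

# A single observation has no sample variance: std() with the
# default ddof=1 divides by (n - 1) = 0 and returns NaN
single = pd.Series([6.2])
print('mean:', single.mean())   # 6.2
print('std: ', single.std())    # nan
```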

In [6]:
# ── Flatten run_songs_dict into a long-form dataframe ──────────────────────
rows = []
for run_id, song_idxs in run_songs_dict.items():
    for idx in song_idxs:
        song = spotify_df.loc[idx]
        rows.append({
            'run_id':       run_id,
            'artistName':   song['artistName'],
            'trackName':    song['trackName'],
            'duration_sec': song['duration_sec'],
        })

run_song_flat = pd.DataFrame(rows)
print(f'Total run–song pairs: {len(run_song_flat):,}')

# ── Merge with run-level performance metrics ────────────────────────────────
run_meta = strava_df[['id', 'avg_speed_mph', 'distance_miles', 'elapsed_minutes',
                       'elevation_gain_ft', 'elev_gain_per_mile',
                       'hour_of_day', 'day_of_week', 'month', 'start_date']].copy()
run_song_flat = run_song_flat.merge(run_meta, left_on='run_id', right_on='id', how='left')

# ── Run-level listening features ────────────────────────────────────────────
run_level = run_song_flat.groupby('run_id').agg(
    n_songs           =('trackName',    'count'),
    n_unique_artists  =('artistName',   'nunique'),
    n_unique_tracks   =('trackName',    'nunique'),
    total_music_sec   =('duration_sec', 'sum'),
).reset_index()
run_level = run_level.merge(run_meta, left_on='run_id', right_on='id', how='left')
run_level['songs_per_minute']  = run_level['n_songs'] / run_level['elapsed_minutes']
run_level['track_repeat_rate'] = 1 - (run_level['n_unique_tracks'] / run_level['n_songs'])

# ── Artist-level aggregate features ────────────────────────────────────────
artist_summary = run_song_flat.groupby('artistName').agg(
    avg_speed        =('avg_speed_mph',      'mean'),
    std_speed        =('avg_speed_mph',      'std'),
    median_speed     =('avg_speed_mph',      'median'),
    avg_distance     =('distance_miles',     'mean'),
    std_distance     =('distance_miles',     'std'),
    avg_elevation    =('elevation_gain_ft',  'mean'),
    std_elevation    =('elevation_gain_ft',  'std'),
    avg_elev_per_mi  =('elev_gain_per_mile', 'mean'),
    appearance_count =('run_id',             'nunique'),
    total_plays      =('trackName',          'count'),
).reset_index()

# Filter to artists with 2+ run appearances
artist_summary_filtered = artist_summary[artist_summary['appearance_count'] >= 2].copy()

print(f'Artists with 2+ appearances: {len(artist_summary_filtered)}')
print(f'Filtered out:                {len(artist_summary) - len(artist_summary_filtered)} single-run artists')
print()
print('Top 8 by avg elevation gain (ft):')
print(
    artist_summary_filtered
    .sort_values('avg_elevation', ascending=False)
    .head(8)[['artistName', 'avg_elevation', 'std_elevation', 'avg_speed', 'appearance_count']]
    .to_string(index=False)
)
Total run–song pairs: 530
Artists with 2+ appearances: 85
Filtered out:                96 single-run artists

Top 8 by avg elevation gain (ft):
    artistName  avg_elevation  std_elevation  avg_speed  appearance_count
     prod. DTM     241.797908       4.175828   6.470349                 2
Dillon Francis     237.532816       4.651393   6.530970                 3
       DECiBEL     237.204732       6.495732   6.526272                 2
 Bert On Beats     237.204732       5.303743   6.526272                 2
         Diplo     237.204732       6.495732   6.526272                 2
    DJ Ketchup     237.204732       6.495732   6.526272                 2
  DJ Du Marcel     237.204732       6.495732   6.526272                 2
     Costuleta     237.204732       6.495732   6.526272                 2

5. Exploratory Data Analysis¶

5.1 Running Baseline — What Do My Runs Actually Look Like?¶

Before slicing by artist, I need to understand the baseline distributions of my core performance variables: speed, distance, duration, and elevation gain.

Elevation is worth paying particular attention to. If my runs vary wildly in hilliness, then any speed difference between groups might just be topography — faster on flat days, slower on hilly ones. The elevation distribution tells me how much of that variance is in play.

In [7]:
fig = plt.figure(figsize=(16, 11), facecolor=PALETTE['bg'])
fig.suptitle('Running Performance: Baseline Distributions', fontsize=15, y=1.01)
gs = fig.add_gridspec(2, 3, hspace=0.4, wspace=0.35)

ax_speed = fig.add_subplot(gs[0, 0])
ax_dist  = fig.add_subplot(gs[0, 1])
ax_elev  = fig.add_subplot(gs[0, 2])
ax_dur   = fig.add_subplot(gs[1, 0])
ax_scat  = fig.add_subplot(gs[1, 1])
ax_escat = fig.add_subplot(gs[1, 2])

# Speed
ax_speed.hist(strava_df['avg_speed_mph'].dropna(), bins=24,
              color=PALETTE['primary'], edgecolor='white', linewidth=0.6, alpha=0.9)
ax_speed.axvline(strava_df['avg_speed_mph'].mean(), color=PALETTE['secondary'],
                 linestyle='--', linewidth=1.8,
                 label=f'Mean: {strava_df["avg_speed_mph"].mean():.2f} mph')
ax_speed.axvline(strava_df['avg_speed_mph'].median(), color=PALETTE['accent'],
                 linestyle=':', linewidth=1.8,
                 label=f'Median: {strava_df["avg_speed_mph"].median():.2f} mph')
ax_speed.legend(frameon=False, fontsize=9)
style_ax(ax_speed, 'Avg Speed Distribution', 'Speed (mph)', 'Frequency')

# Distance
ax_dist.hist(strava_df['distance_miles'].dropna(), bins=24,
             color=PALETTE['secondary'], edgecolor='white', linewidth=0.6, alpha=0.9)
ax_dist.axvline(strava_df['distance_miles'].mean(), color=PALETTE['primary'],
                linestyle='--', linewidth=1.8,
                label=f'Mean: {strava_df["distance_miles"].mean():.2f} mi')
ax_dist.legend(frameon=False, fontsize=9)
style_ax(ax_dist, 'Distance Distribution', 'Distance (miles)', 'Frequency')

# Elevation
elev_data = strava_df['elevation_gain_ft'].dropna()
if len(elev_data) > 0:
    ax_elev.hist(elev_data, bins=24,
                 color='#9B59B6', edgecolor='white', linewidth=0.6, alpha=0.9)
    ax_elev.axvline(elev_data.mean(), color=PALETTE['secondary'], linestyle='--', linewidth=1.8,
                    label=f'Mean: {elev_data.mean():.0f} ft')
    ax_elev.axvline(elev_data.median(), color=PALETTE['accent'], linestyle=':', linewidth=1.8,
                    label=f'Median: {elev_data.median():.0f} ft')
    ax_elev.legend(frameon=False, fontsize=9)
else:
    ax_elev.text(0.5, 0.5, 'No elevation data', ha='center', va='center', transform=ax_elev.transAxes)
style_ax(ax_elev, 'Elevation Gain Distribution', 'Elevation Gain (ft)', 'Frequency')

# Duration
ax_dur.hist(strava_df['elapsed_minutes'].dropna(), bins=24,
            color=PALETTE['accent'], edgecolor='white', linewidth=0.6, alpha=0.9)
style_ax(ax_dur, 'Run Duration Distribution', 'Duration (minutes)', 'Frequency')

# Speed vs Distance
sdf = strava_df[['distance_miles', 'avg_speed_mph']].dropna()
ax_scat.scatter(sdf['distance_miles'], sdf['avg_speed_mph'],
                alpha=0.45, color=PALETTE['accent'], edgecolors='white', linewidth=0.4, s=40)
m, b, r, p, _ = stats.linregress(sdf['distance_miles'], sdf['avg_speed_mph'])
x_line = np.linspace(sdf['distance_miles'].min(), sdf['distance_miles'].max(), 100)
ax_scat.plot(x_line, m*x_line+b, color=PALETTE['secondary'], linewidth=2,
             label=f'r = {r:.2f}  (p = {p:.3f})')
ax_scat.legend(frameon=False, fontsize=9)
style_ax(ax_scat, 'Speed vs. Distance', 'Distance (miles)', 'Avg Speed (mph)')

# Speed vs Elevation — the key new chart
edf = strava_df[['elevation_gain_ft', 'avg_speed_mph']].dropna()
if len(edf) > 5:
    sc = ax_escat.scatter(edf['elevation_gain_ft'], edf['avg_speed_mph'],
                          alpha=0.45, color='#9B59B6', edgecolors='white', linewidth=0.4, s=40)
    m2, b2, r2, p2, _ = stats.linregress(edf['elevation_gain_ft'], edf['avg_speed_mph'])
    x2 = np.linspace(edf['elevation_gain_ft'].min(), edf['elevation_gain_ft'].max(), 100)
    ax_escat.plot(x2, m2*x2+b2, color=PALETTE['secondary'], linewidth=2,
                  label=f'r = {r2:.2f}  (p = {p2:.3f})')
    ax_escat.legend(frameon=False, fontsize=9)
    print(f'Speed ~ Elevation: r = {r2:.3f}, p = {p2:.4f}')
    if r2 < -0.15:
        print('  → Negative correlation: hillier runs tend to be slower (expected).')
    elif abs(r2) < 0.1:
        print('  → Weak correlation: elevation and speed largely independent in this dataset.')
style_ax(ax_escat, 'Speed vs. Elevation Gain', 'Elevation Gain (ft)', 'Avg Speed (mph)')

plt.tight_layout()
plt.show()

print()
print(strava_df[['avg_speed_mph', 'distance_miles', 'elapsed_minutes', 'elevation_gain_ft']].describe().round(2))
Speed ~ Elevation: r = 0.056, p = 0.6353
  → Weak correlation: elevation and speed largely independent in this dataset.
[Figure: Running Performance: Baseline Distributions]
       avg_speed_mph  distance_miles  elapsed_minutes  elevation_gain_ft
count          73.00           73.00            73.00              73.00
mean            5.88            4.07            53.89             139.19
std             0.84            2.50            52.66              87.04
min             3.70            0.07             1.57               0.00
25%             5.44            2.38            28.07              72.51
50%             6.04            3.65            45.37             137.47
75%             6.47            5.39            59.27             184.71
max             7.95           13.14           368.42             454.72

5.2 Temporal Patterns — When Do I Run?¶

Day-of-week and time-of-day patterns matter for two reasons: they tell you when you're most active, and they're the first confounder to check against any artist-level finding. The third panel here — average elevation by day — is new in this version.

If I systematically do my hilly runs on weekends and my flat runs on weekdays, then any artist who appears frequently on Saturdays will look "hilly" even if the terrain is doing all the work. This chart is the first step in untangling that.
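One way to start untangling that (sketched here; not necessarily what section 8 does) is to residualize elevation on day-of-week before averaging by artist. A toy example with made-up rows, using column names that mirror this notebook's:

```python
import pandas as pd

# Toy run–song rows: artist A rides along on hilly Saturday runs,
# artist B on flat Tuesday runs
toy = pd.DataFrame({
    'artistName':        ['A', 'A', 'B', 'B'],
    'day_of_week':       ['Saturday', 'Saturday', 'Tuesday', 'Tuesday'],
    'elevation_gain_ft': [200.0, 180.0, 110.0, 120.0],
})

# Subtract each day's mean elevation: what remains is how hilly a run
# was *relative to a typical run on that day of the week*
toy['elev_resid'] = (
    toy['elevation_gain_ft']
    - toy.groupby('day_of_week')['elevation_gain_ft'].transform('mean')
)

print(toy.groupby('artistName')['elev_resid'].mean())
```

Both artists' mean residuals land at exactly zero: once each day's typical hilliness is subtracted, the apparent artist-terrain signal in this toy data vanishes, because day-of-week was doing all the work. Any residual signal in the real data would be the part schedule can't explain.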

In [8]:
fig, axes = plt.subplots(2, 2, figsize=(16, 10), facecolor=PALETTE['bg'])
fig.suptitle('Temporal Patterns in Running Activity', fontsize=14)

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Hour of day
hour_counts = strava_df['hour_of_day'].value_counts().sort_index()
axes[0,0].bar(hour_counts.index, hour_counts.values,
              color=PALETTE['primary'], edgecolor='white', linewidth=0.5, alpha=0.9, width=0.8)
axes[0,0].set_xticks(range(0, 24, 2))
style_ax(axes[0,0], 'Runs by Hour of Day', 'Hour (24h)', 'Number of Runs')

# Day of week count
dow_counts = strava_df['day_of_week'].value_counts().reindex(day_order).fillna(0)
axes[0,1].bar(range(7), dow_counts.values,
              color=PALETTE['secondary'], edgecolor='white', linewidth=0.5, alpha=0.9)
axes[0,1].set_xticks(range(7))
axes[0,1].set_xticklabels([d[:3] for d in day_order])
style_ax(axes[0,1], 'Runs by Day of Week', 'Day', 'Number of Runs')

# Avg speed by day
speed_by_day = strava_df.groupby('day_of_week')['avg_speed_mph'].mean().reindex(day_order)
colors_spd = [PALETTE['primary'] if v == speed_by_day.max() else PALETTE['accent']
              for v in speed_by_day.values]
axes[1,0].bar(range(7), speed_by_day.values,
              color=colors_spd, edgecolor='white', linewidth=0.5, alpha=0.9)
axes[1,0].set_xticks(range(7))
axes[1,0].set_xticklabels([d[:3] for d in day_order])
axes[1,0].set_ylim(speed_by_day.min() * 0.97, speed_by_day.max() * 1.03)
style_ax(axes[1,0], 'Avg Speed by Day of Week', 'Day', 'Avg Speed (mph)')

# ── NEW: Avg elevation by day ───────────────────────────────────────────────
elev_by_day = strava_df.groupby('day_of_week')['elevation_gain_ft'].mean().reindex(day_order)
elev_valid  = elev_by_day.dropna()
if len(elev_valid) > 0:
    colors_elv = ['#8E44AD' if v == elev_valid.max() else '#C39BD3'
                  for v in elev_by_day.values]
    axes[1,1].bar(range(7), elev_by_day.fillna(0).values,
                  color=colors_elv, edgecolor='white', linewidth=0.5, alpha=0.9)
    axes[1,1].set_xticks(range(7))
    axes[1,1].set_xticklabels([d[:3] for d in day_order])
    axes[1,1].set_ylim(0, elev_valid.max() * 1.12)
    style_ax(axes[1,1], 'Avg Elevation Gain by Day of Week', 'Day', 'Avg Elevation Gain (ft)')

    hilliest_day  = elev_by_day.idxmax()
    flattest_day  = elev_by_day.idxmin()
    fastest_day   = speed_by_day.idxmax()
    print(f'Hilliest day on average:  {hilliest_day}  ({elev_by_day.max():.0f} ft)')
    print(f'Flattest day on average:  {flattest_day}  ({elev_by_day.min():.0f} ft)')
    print(f'Fastest day on average:   {fastest_day}  ({speed_by_day.max():.2f} mph)')
    if hilliest_day == fastest_day:
        print('  ⚠ Hilliest and fastest day overlap — elevation may not be suppressing speed here.')
    elif flattest_day == fastest_day:
        print('  → Fastest day is also flattest — terrain likely contributes to the speed signal.')
    else:
        print('  → Fastest and hilliest days differ — terrain and pace vary somewhat independently by day.')
else:
    axes[1,1].text(0.5, 0.5, 'No elevation data', ha='center', va='center',
                   transform=axes[1,1].transAxes, color=PALETTE['neutral'])
    style_ax(axes[1,1], 'Avg Elevation Gain by Day of Week')

plt.tight_layout()
plt.show()
Hilliest day on average:  Saturday  (190 ft)
Flattest day on average:  Tuesday  (115 ft)
Fastest day on average:   Wednesday  (6.25 mph)
  → Fastest and hilliest days differ — terrain and pace vary somewhat independently by day.
[Figure: Temporal Patterns in Running Activity]

5.3 Listening Behavior During Runs¶

Does listening volume track effort? The third panel now plots songs per run vs elevation gain alongside the existing songs-vs-speed scatter. If I listen to more tracks on hillier efforts, it could simply mean I'm out on those runs longer, or that I actively cue up music to push through harder routes.

In [9]:
# Join elevation into run_level for this section
run_level_plot = run_level.copy()
if 'elevation_gain_ft' not in run_level_plot.columns:
    run_level_plot = run_level_plot.merge(
        strava_df[['id','elevation_gain_ft']], left_on='run_id', right_on='id', how='left'
    )

fig, axes = plt.subplots(1, 4, figsize=(22, 5), facecolor=PALETTE['bg'])
fig.suptitle('Listening Behavior During Runs', fontsize=14)

# Songs per run
axes[0].hist(run_level_plot['n_songs'].dropna(), bins=22,
             color=PALETTE['primary'], edgecolor='white', alpha=0.9)
style_ax(axes[0], 'Songs Per Run', 'Number of Songs', 'Frequency')

# Unique artists per run
axes[1].hist(run_level_plot['n_unique_artists'].dropna(), bins=18,
             color=PALETTE['secondary'], edgecolor='white', alpha=0.9)
style_ax(axes[1], 'Unique Artists Per Run', 'Unique Artists', 'Frequency')

# Songs vs speed
rl_spd = run_level_plot.dropna(subset=['n_songs', 'avg_speed_mph'])
axes[2].scatter(rl_spd['n_songs'], rl_spd['avg_speed_mph'],
                alpha=0.45, color=PALETTE['accent'], edgecolors='white', linewidth=0.4, s=45)
if len(rl_spd) > 5:
    m_s, b_s, r_s, p_s, _ = stats.linregress(rl_spd['n_songs'], rl_spd['avg_speed_mph'])
    x_s = np.linspace(rl_spd['n_songs'].min(), rl_spd['n_songs'].max(), 100)
    axes[2].plot(x_s, m_s*x_s+b_s, color=PALETTE['secondary'],
                 linewidth=2, label=f'r = {r_s:.2f}  (p={p_s:.3f})')
    axes[2].legend(frameon=False)
style_ax(axes[2], 'Songs Per Run vs. Avg Speed', 'Songs Per Run', 'Avg Speed (mph)')

# ── NEW: Songs vs elevation ──────────────────────────────────────────────────
rl_elv = run_level_plot.dropna(subset=['n_songs', 'elevation_gain_ft'])
if len(rl_elv) > 5:
    axes[3].scatter(rl_elv['n_songs'], rl_elv['elevation_gain_ft'],
                    alpha=0.45, color='#8E44AD', edgecolors='white', linewidth=0.4, s=45)
    m_e, b_e, r_e, p_e, _ = stats.linregress(rl_elv['n_songs'], rl_elv['elevation_gain_ft'])
    x_e = np.linspace(rl_elv['n_songs'].min(), rl_elv['n_songs'].max(), 100)
    axes[3].plot(x_e, m_e*x_e+b_e, color=PALETTE['secondary'],
                 linewidth=2, label=f'r = {r_e:.2f}  (p={p_e:.3f})')
    axes[3].legend(frameon=False)
    print(f'Songs per run ~ Elevation: r = {r_e:.3f},  p = {p_e:.4f}')
    if r_e > 0.3 and p_e < 0.05:
        print('  → More songs on hillier runs — likely because hilly runs are longer.')
    elif abs(r_e) < 0.1:
        print('  → Essentially no relationship between playlist size and elevation.')
    else:
        print('  → Weak association — song count alone is not a reliable elevation signal.')
else:
    axes[3].text(0.5, 0.5, 'Insufficient data', ha='center', va='center',
                 transform=axes[3].transAxes, color=PALETTE['neutral'])
style_ax(axes[3], 'Songs Per Run vs. Elevation Gain', 'Songs Per Run', 'Elevation Gain (ft)')

plt.tight_layout()
plt.show()

print()
print(run_level_plot[['n_songs', 'n_unique_artists', 'songs_per_minute', 'track_repeat_rate']].describe().round(3))
Songs per run ~ Elevation: r = 0.608,  p = 0.0000
  → More songs on hillier runs — likely because hilly runs are longer.
[Figure: Listening Behavior During Runs]
       n_songs  n_unique_artists  songs_per_minute  track_repeat_rate
count   40.000            40.000            40.000             40.000
mean    13.250            10.400             0.297              0.009
std      5.869             5.324             0.047              0.029
min      1.000             1.000             0.132              0.000
25%      9.750             6.000             0.276              0.000
50%     13.000            10.500             0.308              0.000
75%     16.000            13.000             0.325              0.000
max     32.000            24.000             0.378              0.143

5.4 Artist Associations with Speed, Distance & Elevation¶

Three bar charts now: speed, distance, and elevation gain. The elevation panel is the new one worth staring at.

If an artist clusters strongly at the top of the elevation chart, it means they tend to appear on my hillier routes — which may or may not correlate with their speed ranking. An artist who's "fast" but also "hilly" is telling a more complicated story than one who's fast and flat. Cross-referencing all three panels is where the interesting patterns live.

Still read with skepticism. Error bars (±1 std dev) are wide for most artists. Small samples (n=2–5 runs per artist) mean these rankings are suggestive, not conclusive.
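To make "suggestive, not conclusive" concrete, here is a quick bootstrap sketch — the speeds are hypothetical, not drawn from the dataset — showing how wide a 95% interval on a 4-run artist mean really is:

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical: one artist appeared on 4 runs with these avg speeds (mph)
speeds = np.array([6.1, 6.9, 6.4, 7.2])

# Bootstrap the mean: resample the 4 runs with replacement, many times
boot_means = np.array([
    rng.choice(speeds, size=len(speeds), replace=True).mean()
    for _ in range(10_000)
])
lo, hi = np.percentile(boot_means, [2.5, 97.5])

print(f'observed mean:    {speeds.mean():.2f} mph')
print(f'95% bootstrap CI: [{lo:.2f}, {hi:.2f}]')  # wide relative to the gaps between ranked artists
```

An interval this wide easily spans several positions in the ranking — exactly why the bar charts are ordered suggestions, not a leaderboard.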

In [10]:
top_speed    = artist_summary_filtered.nlargest(12, 'avg_speed').sort_values('avg_speed')
bottom_speed = artist_summary_filtered.nsmallest(12, 'avg_speed').sort_values('avg_speed', ascending=False)
top_dist     = artist_summary_filtered.nlargest(12, 'avg_distance').sort_values('avg_distance')
bottom_dist  = artist_summary_filtered.nsmallest(12, 'avg_distance').sort_values('avg_distance', ascending=False)

has_elev = artist_summary_filtered['avg_elevation'].notna().sum() > 2
if has_elev:
    top_elev    = artist_summary_filtered.nlargest(12, 'avg_elevation').sort_values('avg_elevation')
    bottom_elev = artist_summary_filtered.nsmallest(12, 'avg_elevation').sort_values('avg_elevation', ascending=False)

def barh_artists(ax, df, col, err_col, color, title, xlabel):
    labels = [safe_label(n) for n in df['artistName']]
    values = df[col].values
    errors = df[err_col].fillna(0).values
    bars = ax.barh(labels, values, xerr=errors,
                   color=color, edgecolor='white', linewidth=0.5,
                   capsize=4, alpha=0.88, error_kw={'elinewidth': 1.2, 'ecolor': '#555'})
    for bar, (_, row) in zip(bars, df.iterrows()):
        ax.text(bar.get_width() + max(errors) * 0.05,
                bar.get_y() + bar.get_height() / 2,
                f'n={int(row["appearance_count"])}',
                va='center', ha='left', fontsize=9, color='#888')
    ax.set_title(title, pad=10)
    ax.set_xlabel(xlabel, labelpad=8)
    ax.tick_params(length=0)
    return ax

# ── Speed & Distance (always shown) ────────────────────────────────────────
fig, axs = plt.subplots(2, 2, figsize=(20, 14), facecolor=PALETTE['bg'])
fig.suptitle(
    'Artist Association with Speed & Distance\n(error bars = ±1 std dev, min. 2 run appearances)',
    fontsize=14, y=1.01
)
barh_artists(axs[0,0], top_speed,    'avg_speed',    'std_speed',
             PALETTE['primary'],   'Top 12 Artists by Avg Speed',    'Avg Speed (mph)')
barh_artists(axs[1,0], bottom_speed, 'avg_speed',    'std_speed',
             PALETTE['secondary'], 'Bottom 12 Artists by Avg Speed', 'Avg Speed (mph)')
barh_artists(axs[0,1], top_dist,     'avg_distance', 'std_distance',
             PALETTE['accent'],    'Top 12 Artists by Avg Distance', 'Avg Distance (miles)')
barh_artists(axs[1,1], bottom_dist,  'avg_distance', 'std_distance',
             '#9B59B6',            'Bottom 12 Artists by Avg Distance', 'Avg Distance (miles)')
plt.tight_layout()
plt.show()

# ── Elevation (shown if data present) ──────────────────────────────────────
if has_elev:
    fig2, axs2 = plt.subplots(1, 2, figsize=(20, 7), facecolor=PALETTE['bg'])
    fig2.suptitle(
        'Artist Association with Elevation Gain\n(total elevation gain in feet, error bars = ±1 std dev)',
        fontsize=14, y=1.02
    )
    barh_artists(axs2[0], top_elev,    'avg_elevation', 'std_elevation',
                 '#8E44AD', 'Top 12 Artists by Avg Elevation Gain', 'Avg Elevation Gain (ft)')
    barh_artists(axs2[1], bottom_elev, 'avg_elevation', 'std_elevation',
                 '#27AE60', 'Bottom 12 Artists by Avg Elevation Gain', 'Avg Elevation Gain (ft)')
    plt.tight_layout()
    plt.show()

    # Cross-reference: speed rank vs elevation rank
    ranked = artist_summary_filtered.copy()
    ranked['speed_rank'] = ranked['avg_speed'].rank(ascending=False)
    ranked['elev_rank']  = ranked['avg_elevation'].rank(ascending=False)
    # Drop NaNs jointly so the two columns stay row-aligned — separate dropna()
    # calls can hand pearsonr misaligned (or unequal-length) arrays
    ranked = ranked.dropna(subset=['speed_rank', 'elev_rank'])
    r_se, p_se = stats.pearsonr(ranked['speed_rank'], ranked['elev_rank'])
    print(f'Correlation between artist speed rank and elevation rank: r = {r_se:.3f}, p = {p_se:.4f}')
    if p_se < 0.05 and r_se < 0:
        print('  → Fast artists tend to appear on lower-elevation runs — worth flagging as a confounder.')
    elif p_se < 0.05 and r_se > 0:
        print('  → Fast artists also appear on higher-elevation runs — elevation is not suppressing the speed signal.')
    else:
        print('  → No significant rank correlation — speed and elevation rankings are largely independent.')
else:
    print('No elevation data available for artist-level breakdown.')
[Figure: Artist Association with Speed & Distance — top/bottom 12 bar charts]
[Figure: Artist Association with Elevation Gain — top/bottom 12 bar charts]
Correlation between artist speed rank and elevation rank: r = 0.264, p = 0.0145
  → Fast artists also appear on higher-elevation runs — elevation is not suppressing the speed signal.

5.5 Most Frequent Artists — Who Actually Has Enough Data?¶

In [11]:
fig, axes = plt.subplots(1, 2, figsize=(16, 5), facecolor=PALETTE['bg'])
fig.suptitle('Artist Listening Frequency During Runs', fontsize=14)

# Appearance count distribution — shows the long tail
axes[0].hist(artist_summary['appearance_count'], bins=22,
             color=PALETTE['accent'], edgecolor='white', alpha=0.9)
axes[0].axvline(2, color=PALETTE['secondary'], linestyle='--', linewidth=1.5,
                label='Min. threshold (2)')
axes[0].legend(frameon=False)
style_ax(axes[0], 'Distribution of Artist Appearance Counts',
         'Runs Appeared In', 'Number of Artists')

# Top 15 most frequent
top_freq = artist_summary.nlargest(15, 'appearance_count').sort_values('appearance_count')
colors = [PALETTE['primary'] if v >= top_freq['appearance_count'].quantile(.75)
          else PALETTE['accent'] for v in top_freq['appearance_count']]
axes[1].barh([safe_label(n) for n in top_freq['artistName']],
             top_freq['appearance_count'],
             color=colors, edgecolor='white', linewidth=0.5, alpha=0.9)
style_ax(axes[1], 'Top 15 Most Frequent Artists During Runs',
         'Runs Appeared In', '')

plt.tight_layout()
plt.show()
[Figure: Artist Listening Frequency During Runs — appearance-count histogram and top-15 bar chart]

5.6 Coverage Check — Do Music Runs Look Different From Non-Music Runs?¶

Critical validity check. If runs with matched music data are systematically faster, longer, or hillier than runs without, we have a selection problem — not a music effect. The Spotify export window (~1 year) means older runs have no music match, and if my route choices changed over that period, the groups won't be comparable.

All three performance dimensions are checked here: speed, distance, and elevation gain.
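The checks below compare group means; a two-sample Kolmogorov–Smirnov test is a cheap supplementary check (not part of this notebook's pipeline) that can flag differences in spread or shape a t-test misses. A sketch on invented speeds — the group names mirror the notebook's, the numbers do not:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)

# Invented groups: identical mean speed, very different spread
matched   = rng.normal(6.3, 0.3, 200)   # music-matched runs
unmatched = rng.normal(6.3, 1.2, 200)   # unmatched runs, far more variable

_, p_t  = stats.ttest_ind(matched, unmatched, equal_var=False)  # compares means only
_, p_ks = stats.ks_2samp(matched, unmatched)                    # compares full distributions

print(f'Welch t-test p = {p_t:.3f}   (means match, so typically non-significant)')
print(f'KS test      p = {p_ks:.2e}  (the shape difference is detectable)')
```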

In [12]:
has_music_ids = {k for k, v in run_songs_dict.items() if len(v) > 0}
strava_df['has_music'] = strava_df['id'].isin(has_music_ids)

music_speeds    = strava_df[strava_df['has_music']]['avg_speed_mph'].dropna()
no_music_speeds = strava_df[~strava_df['has_music']]['avg_speed_mph'].dropna()
music_elev      = strava_df[strava_df['has_music']]['elevation_gain_ft'].dropna()
no_music_elev   = strava_df[~strava_df['has_music']]['elevation_gain_ft'].dropna()

fig, axes = plt.subplots(1, 3, figsize=(18, 5), facecolor=PALETTE['bg'])
fig.suptitle('Coverage Check: Music-Matched vs. Unmatched Runs', fontsize=14)

# Scatter: speed vs distance, coloured by music coverage
for has, group in strava_df.groupby('has_music'):
    label = 'Music matched' if has else 'No match (outside window)'
    color = PALETTE['primary'] if has else PALETTE['neutral']
    axes[0].scatter(group['distance_miles'], group['avg_speed_mph'],
                    alpha=0.45, color=color, label=label, s=40,
                    edgecolors='white', linewidth=0.3)
axes[0].legend(frameon=False)
style_ax(axes[0], 'Speed vs. Distance by Coverage Group', 'Distance (miles)', 'Avg Speed (mph)')

# Speed boxplot
def styled_boxplot(ax, groups, labels, colors):
    bp = ax.boxplot(
        groups, patch_artist=True,
        medianprops=dict(color='white', linewidth=2),
        whiskerprops=dict(linewidth=1), capprops=dict(linewidth=1),
        flierprops=dict(marker='o', markersize=4, alpha=0.4),
    )
    # Set tick labels on the axis directly — sidesteps the labels→tick_labels
    # rename deprecated in Matplotlib 3.9
    ax.set_xticks(range(1, len(groups) + 1))
    ax.set_xticklabels(labels)
    for box, c in zip(bp['boxes'], colors):
        box.set_facecolor(c); box.set_alpha(0.8)
    return bp

styled_boxplot(axes[1], [music_speeds, no_music_speeds],
               ['Music matched', 'No match'],
               [PALETTE['primary'], PALETTE['neutral']])
style_ax(axes[1], 'Speed Distribution: Coverage Groups', '', 'Avg Speed (mph)')

# ── NEW: Elevation boxplot ───────────────────────────────────────────────────
if len(music_elev) > 2 and len(no_music_elev) > 2:
    styled_boxplot(axes[2], [music_elev, no_music_elev],
                   ['Music matched', 'No match'],
                   ['#8E44AD', PALETTE['neutral']])
    style_ax(axes[2], 'Elevation Distribution: Coverage Groups', '', 'Elevation Gain (ft)')
else:
    axes[2].text(0.5, 0.5, 'Insufficient elevation data', ha='center', va='center',
                 transform=axes[2].transAxes)
    style_ax(axes[2], 'Elevation: Coverage Groups')

plt.tight_layout()
plt.show()

# ── Statistical tests on all three dimensions ───────────────────────────────
t_spd, p_spd = stats.ttest_ind(music_speeds, no_music_speeds, equal_var=False)
print(f'Speed   — Welch t-test: t = {t_spd:.3f},  p = {p_spd:.4f}  '
      f'{"⚠ significant difference" if p_spd < 0.05 else "✓ groups comparable"}')

if len(music_elev) > 2 and len(no_music_elev) > 2:
    t_elv, p_elv = stats.ttest_ind(music_elev, no_music_elev, equal_var=False)
    print(f'Elevation — Welch t-test: t = {t_elv:.3f},  p = {p_elv:.4f}  '
          f'{"⚠ significant difference — music runs are systematically hillier/flatter" if p_elv < 0.05 else "✓ elevation comparable between groups"}')
    print(f'  Music-run mean elevation:    {music_elev.mean():.0f} ft')
    print(f'  No-music-run mean elevation: {no_music_elev.mean():.0f} ft')
[Figure: Coverage Check — speed vs. distance scatter plus speed and elevation boxplots]
Speed   — Welch t-test: t = 2.275,  p = 0.0264  ⚠ significant difference
Elevation — Welch t-test: t = 0.295,  p = 0.7691  ✓ elevation comparable between groups
  Music-run mean elevation:    142 ft
  No-music-run mean elevation: 136 ft

6. Statistical Validation¶

The EDA patterns look interesting on the surface. This section stress-tests them. For each of the three performance dimensions — speed, distance, and elevation gain — we ask: is the apparent difference between artist groups statistically detectable, and is it big enough to care about?

The elevation t-test and effect size are the newest additions. If the speed test is non-significant but the elevation test is, it would suggest that music choice correlates with route selection (a planning behaviour) more than with in-run performance (a physiological one).

6.1 Correlation Matrix — How Do Run-Level Features Relate?¶

Before testing artist groups, the correlation matrix shows which variables move together and which are genuinely independent. Elevation gain is now included — it's the most important addition to this matrix because:

  1. If elevation correlates strongly with distance (longer runs = more climbing), we need to know before drawing conclusions about either
  2. If elevation correlates negatively with speed (hillier = slower), that's a critical confounder for any artist-speed analysis
  3. If elevation is largely orthogonal to everything else, it's a genuinely independent dimension worth analysing separately
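Point 2 can be probed directly with a partial correlation: residualize both speed and elevation on distance, then correlate the residuals. A minimal sketch on synthetic runs — the variable meanings mirror the notebook's columns, but the values are invented, and elevation is constructed to track distance with no direct speed effect:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(7)
n = 200

# Synthetic runs: elevation scales with distance; speed depends on distance only
distance  = rng.uniform(2, 10, n)                        # miles
elevation = 30 * distance + rng.normal(0, 40, n)         # ft — driven by distance
speed     = 7 - 0.15 * distance + rng.normal(0, 0.3, n)  # mph — no direct elevation effect

# Raw correlation: elevation *looks* related to speed (via distance)
r_raw, _ = stats.pearsonr(elevation, speed)

def residuals(y, x):
    """Remove the linear effect of x from y."""
    slope, intercept, *_ = stats.linregress(x, y)
    return y - (slope * x + intercept)

# Partial correlation: with distance held fixed, the link should vanish
r_partial, _ = stats.pearsonr(residuals(elevation, distance),
                              residuals(speed, distance))

print(f'raw r(elevation, speed)     = {r_raw:+.3f}')
print(f'partial r, distance removed = {r_partial:+.3f}')
```

If the real data behaves like this sketch, the elevation–speed cell in the matrix is mostly a distance echo.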
In [13]:
corr_cols = ['avg_speed_mph', 'distance_miles', 'elapsed_minutes',
             'elevation_gain_ft', 'elev_gain_per_mile',
             'n_songs', 'n_unique_artists', 'songs_per_minute']

# Join elevation into run_level
run_level_elev = run_level.merge(
    strava_df[['id', 'elevation_gain_ft', 'elev_gain_per_mile']],
    left_on='run_id', right_on='id', how='left', suffixes=('', '_dup')
)
for col in ['elevation_gain_ft', 'elev_gain_per_mile']:
    if col not in run_level_elev.columns:
        if col + '_dup' in run_level_elev.columns:
            run_level_elev[col] = run_level_elev[col + '_dup']

available_cols = [c for c in corr_cols if c in run_level_elev.columns]
run_level_music = run_level_elev.dropna(subset=available_cols)
corr = run_level_music[available_cols].corr()

fig, ax = plt.subplots(figsize=(11, 9), facecolor=PALETTE['bg'])
im = ax.imshow(corr.values, cmap='RdYlGn', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax, shrink=0.8)

labels = [c.replace('_', ' ').replace('mph', '(mph)').replace('ft', '(ft)') for c in available_cols]
ax.set_xticks(range(len(available_cols))); ax.set_xticklabels(labels, rotation=40, ha='right', fontsize=10)
ax.set_yticks(range(len(available_cols))); ax.set_yticklabels(labels, fontsize=10)

for i in range(len(available_cols)):
    for j in range(len(available_cols)):
        val = corr.values[i, j]
        text_color = 'white' if abs(val) > 0.6 else '#333'
        ax.text(j, i, f'{val:.2f}', ha='center', va='center', fontsize=9, color=text_color)

ax.set_title('Pearson Correlation Matrix — Run-Level Features incl. Elevation (music runs only)',
             fontsize=13, pad=12)
plt.tight_layout()
plt.show()

# Print the elevation row specifically so it's visible in output
if 'elevation_gain_ft' in corr.columns:
    print('\nElevation gain correlations:')
    print(corr['elevation_gain_ft'].drop('elevation_gain_ft').sort_values().round(3).to_string())
[Figure: Pearson Correlation Matrix — run-level features incl. elevation]
Elevation gain correlations:
songs_per_minute      0.023
elev_gain_per_mile    0.092
avg_speed_mph         0.241
n_unique_artists      0.597
n_songs               0.608
elapsed_minutes       0.720
distance_miles        0.843

6.2 Welch's t-test — Speed, Distance & Elevation¶

Three tests, same structure: runs featuring the top-10 artists (ranked by each metric) vs all other matched runs. Welch's rather than Student's throughout — group sizes are unequal and equal variances can't be assumed. One caveat: a run whose playlist mixes a top-ranked artist with others lands in both groups, so the two samples are not fully disjoint and the p-values should be read as approximate.

The elevation test is the most interpretively interesting. A significant result here would mean certain artists reliably appear on my hillier routes — which is likely a planning signal rather than a causal music effect. Non-significance would mean artist presence is elevation-agnostic.
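Why Welch matters here: when the smaller group also has the larger variance, Student's pooled test over-rejects badly. A quick simulation (illustrative only — the group sizes and variances are invented, not the notebook's data):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
n_sims, alpha = 5000, 0.05
false_pos = {'student': 0, 'welch': 0}

for _ in range(n_sims):
    # Both groups share a true mean of 0 — every rejection is a false positive
    small_noisy = rng.normal(0, 3, 10)   # small group, large variance
    large_quiet = rng.normal(0, 1, 60)   # large group, small variance
    _, p_student = stats.ttest_ind(small_noisy, large_quiet, equal_var=True)
    _, p_welch   = stats.ttest_ind(small_noisy, large_quiet, equal_var=False)
    false_pos['student'] += p_student < alpha
    false_pos['welch']   += p_welch < alpha

print(f"Student's false-positive rate: {false_pos['student'] / n_sims:.3f}")  # well above 0.05
print(f"Welch's false-positive rate:   {false_pos['welch'] / n_sims:.3f}")    # near the nominal 0.05
```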

In [14]:
top_artists   = set(artist_summary_filtered.nlargest(10, 'avg_speed')['artistName'])
top_dist_art  = set(artist_summary_filtered.nlargest(10, 'avg_distance')['artistName'])

top_speed_runs  = run_song_flat[run_song_flat['artistName'].isin(top_artists)]['run_id'].unique()
rest_speed_runs = run_song_flat[~run_song_flat['artistName'].isin(top_artists)]['run_id'].unique()
top_dist_runs   = run_song_flat[run_song_flat['artistName'].isin(top_dist_art)]['run_id'].unique()
rest_dist_runs  = run_song_flat[~run_song_flat['artistName'].isin(top_dist_art)]['run_id'].unique()

top_speeds_v  = strava_df[strava_df['id'].isin(top_speed_runs)]['avg_speed_mph'].dropna()
rest_speeds_v = strava_df[strava_df['id'].isin(rest_speed_runs)]['avg_speed_mph'].dropna()
top_dist_v    = strava_df[strava_df['id'].isin(top_dist_runs)]['distance_miles'].dropna()
rest_dist_v   = strava_df[strava_df['id'].isin(rest_dist_runs)]['distance_miles'].dropna()

t_speed, p_speed = stats.ttest_ind(top_speeds_v, rest_speeds_v, equal_var=False)
t_dist,  p_dist  = stats.ttest_ind(top_dist_v,   rest_dist_v,   equal_var=False)

print('── Speed Test ────────────────────────────────────')
print(f'  Top artists mean:  {top_speeds_v.mean():.3f} mph  (n={len(top_speeds_v)})')
print(f'  All others mean:   {rest_speeds_v.mean():.3f} mph  (n={len(rest_speeds_v)})')
print(f'  t = {t_speed:.4f},  p = {p_speed:.4f}  {"✓ significant" if p_speed < 0.05 else "✗ not significant"}')
print()
print('── Distance Test ─────────────────────────────────')
print(f'  Top artists mean:  {top_dist_v.mean():.3f} miles  (n={len(top_dist_v)})')
print(f'  All others mean:   {rest_dist_v.mean():.3f} miles  (n={len(rest_dist_v)})')
print(f'  t = {t_dist:.4f},  p = {p_dist:.4f}  {"✓ significant" if p_dist < 0.05 else "✗ not significant"}')

# ── NEW: Elevation t-test ────────────────────────────────────────────────────
has_elev_data = artist_summary_filtered['avg_elevation'].notna().sum() > 2
if has_elev_data:
    top_elev_art   = set(artist_summary_filtered.nlargest(10, 'avg_elevation')['artistName'])
    top_elev_runs  = run_song_flat[run_song_flat['artistName'].isin(top_elev_art)]['run_id'].unique()
    rest_elev_runs = run_song_flat[~run_song_flat['artistName'].isin(top_elev_art)]['run_id'].unique()
    top_elev_v     = strava_df[strava_df['id'].isin(top_elev_runs)]['elevation_gain_ft'].dropna()
    rest_elev_v    = strava_df[strava_df['id'].isin(rest_elev_runs)]['elevation_gain_ft'].dropna()

    t_elev, p_elev = stats.ttest_ind(top_elev_v, rest_elev_v, equal_var=False)

    print()
    print('── Elevation Test ────────────────────────────────')
    print(f'  Top-elevation artists mean: {top_elev_v.mean():.0f} ft  (n={len(top_elev_v)})')
    print(f'  All others mean:            {rest_elev_v.mean():.0f} ft  (n={len(rest_elev_v)})')
    print(f'  t = {t_elev:.4f},  p = {p_elev:.4f}  {"✓ significant" if p_elev < 0.05 else "✗ not significant"}')
    if p_elev < 0.05:
        print('  → Certain artists do appear systematically on hillier routes.')
        print('    This is a planning/selection signal, not a causal music effect.')
    else:
        print('  → Artist presence is elevation-agnostic — no terrain clustering by artist.')
── Speed Test ────────────────────────────────────
  Top artists mean:  6.553 mph  (n=21)
  All others mean:   6.084 mph  (n=40)
  t = 2.9595,  p = 0.0045  ✓ significant

── Distance Test ─────────────────────────────────
  Top artists mean:  6.277 miles  (n=7)
  All others mean:   4.212 miles  (n=40)
  t = 5.4580,  p = 0.0000  ✓ significant

── Elevation Test ────────────────────────────────
  Top-elevation artists mean: 229 ft  (n=5)
  All others mean:            142 ft  (n=40)
  t = 6.2525,  p = 0.0001  ✓ significant
  → Certain artists do appear systematically on hillier routes.
    This is a planning/selection signal, not a causal music effect.

6.3 Effect Size — Cohen's d for All Three Metrics¶

Cohen's d separates practical significance from statistical significance. A tiny p-value with d < 0.2 means the groups are technically different but the difference is too small to act on.

All three metrics are now shown side by side — speed, distance, and elevation. If elevation produces the largest d, it suggests music correlates more strongly with route type than with in-run pace or endurance. That's the more defensible causal story: pre-run music choices reflect planned effort, and planned effort is encoded in the route.

d ≈ 0.2 Small effect
d ≈ 0.5 Medium effect
d ≈ 0.8 Large effect
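The split between statistical and practical significance is easy to demonstrate: with enough runs, a trivial 0.05 mph gap reaches an extreme p-value while d stays negligible. Toy numbers, same averaged-variance effect-size formula as the cell below:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)

# Hypothetical: two huge groups differing by a trivial 0.05 mph
a = rng.normal(6.30, 0.50, 20_000)
b = rng.normal(6.25, 0.50, 20_000)

_, p = stats.ttest_ind(a, b, equal_var=False)
d = (a.mean() - b.mean()) / np.sqrt((a.std()**2 + b.std()**2) / 2)

print(f'p = {p:.2e}  (highly significant)')
print(f'd = {d:.3f}   (negligible in practice)')
```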
In [15]:
def cohens_d(a, b):
    """Cohen's d using the unweighted root-mean of the two group variances
    (the classic pooled version weights each variance by n−1; with groups
    this unbalanced the two definitions can differ slightly)."""
    pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
    return (a.mean() - b.mean()) / pooled_std if pooled_std > 0 else 0.0

d_speed = cohens_d(top_speeds_v, rest_speeds_v)
d_dist  = cohens_d(top_dist_v,   rest_dist_v)

plot_items = [
    (top_speeds_v, rest_speeds_v, t_speed, p_speed, d_speed, 'Speed (mph)',     PALETTE['primary']),
    (top_dist_v,   rest_dist_v,   t_dist,  p_dist,  d_dist,  'Distance (miles)', PALETTE['accent']),
]

if has_elev_data:
    d_elev = cohens_d(top_elev_v, rest_elev_v)
    plot_items.append(
        (top_elev_v, rest_elev_v, t_elev, p_elev, d_elev, 'Elevation Gain (ft)', '#8E44AD')
    )
    print(f"Cohen's d (elevation): {d_elev:.3f}  → {'negligible' if abs(d_elev) < 0.2 else 'small' if abs(d_elev) < 0.5 else 'medium' if abs(d_elev) < 0.8 else 'large'} effect")

print(f"Cohen's d (speed):     {d_speed:.3f}  → {'negligible' if abs(d_speed) < 0.2 else 'small' if abs(d_speed) < 0.5 else 'medium' if abs(d_speed) < 0.8 else 'large'} effect")
print(f"Cohen's d (distance):  {d_dist:.3f}  → {'negligible' if abs(d_dist) < 0.2 else 'small' if abs(d_dist) < 0.5 else 'medium' if abs(d_dist) < 0.8 else 'large'} effect")

# Summary bar: d magnitudes across all three metrics
metric_labels = [x[5] for x in plot_items]
d_vals = [abs(x[4]) for x in plot_items]
bar_colors = [x[6] for x in plot_items]

fig_sum, ax_sum = plt.subplots(figsize=(7, 3.5), facecolor=PALETTE['bg'])
bars = ax_sum.barh(metric_labels, d_vals, color=bar_colors, edgecolor='white', alpha=0.88)
ax_sum.axvline(0.2, color='#aaa', linestyle='--', linewidth=1, label='Small (0.2)')
ax_sum.axvline(0.5, color='#666', linestyle='--', linewidth=1, label='Medium (0.5)')
ax_sum.axvline(0.8, color='#333', linestyle='--', linewidth=1, label='Large (0.8)')
for bar, val in zip(bars, d_vals):
    ax_sum.text(val + 0.01, bar.get_y() + bar.get_height()/2,
                f'd = {val:.3f}', va='center', fontsize=10, color='#444')
ax_sum.set_xlabel("Cohen's d (absolute)", labelpad=8)
ax_sum.set_title("Effect Size Summary: Top-Artist Groups vs. All Others", pad=10)
ax_sum.legend(frameon=False, fontsize=9)
ax_sum.tick_params(length=0)
plt.tight_layout()
plt.show()

# Detailed overlapping histograms per metric
fig, axes = plt.subplots(1, len(plot_items), figsize=(7*len(plot_items), 5), facecolor=PALETTE['bg'])
if len(plot_items) == 1: axes = [axes]
fig.suptitle('Group Comparison: Top Artist Runs vs. All Other Runs', fontsize=14)

for ax, (top_v, rest_v, t, p, d, metric, col) in zip(axes, plot_items):
    ax.hist(top_v,  bins=14, alpha=0.65, color=col,
            label=f'Top artists  (n={len(top_v)})', edgecolor='white', linewidth=0.5)
    ax.hist(rest_v, bins=14, alpha=0.65, color=PALETTE['neutral'],
            label=f'Other artists (n={len(rest_v)})', edgecolor='white', linewidth=0.5)
    ax.axvline(top_v.mean(),  color=col,               linestyle='--', linewidth=1.8)
    ax.axvline(rest_v.mean(), color=PALETTE['neutral'], linestyle='--', linewidth=1.8)
    ax.set_title(f"d = {d:.3f}  |  p = {p:.4f}", fontsize=12)
    ax.set_xlabel(metric, labelpad=8)
    ax.set_ylabel('Frequency', labelpad=8)
    ax.legend(frameon=False, fontsize=10)
    ax.tick_params(length=0)

plt.tight_layout()
plt.show()
Cohen's d (elevation): 2.043  → large effect
Cohen's d (speed):     0.748  → medium effect
Cohen's d (distance):  1.642  → large effect
[Figure: Effect Size Summary — Cohen's d bar chart]
[Figure: Group Comparison — overlapping histograms per metric]

6.4 Time-of-Day Confounder — Pace & Elevation by Time Slot¶

The time-of-day ANOVA now covers both speed and elevation gain. This is the key joint check: if morning runs are faster and hillier, then anything associated with morning music choices inherits both signals simultaneously. Running both ANOVAs side by side shows whether time of day is confounding one outcome, both, or neither.
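With ~40 runs split across five slots, some ANOVA groups are tiny and a single outlier run can dominate the F-statistic. A rank-based Kruskal–Wallis test is a cheap robustness companion (not part of the notebook's pipeline); here is a sketch on invented slot samples:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(5)

# Invented speed samples for three time slots — unbalanced, with one outlier run
morning = np.append(rng.normal(6.5, 0.4, 15), 9.5)   # 9.5 mph outlier
midday  = rng.normal(6.4, 0.4, 6)
evening = rng.normal(6.4, 0.4, 20)

_, p_anova = stats.f_oneway(morning, midday, evening)   # mean-based, outlier-sensitive
_, p_kw    = stats.kruskal(morning, midday, evening)    # rank-based, outlier-resistant

print(f'ANOVA          p = {p_anova:.3f}')
print(f'Kruskal–Wallis p = {p_kw:.3f}')
```

If the two disagree, the ANOVA result is being driven by a handful of extreme runs rather than a genuine slot effect.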

In [16]:
strava_df['time_of_day'] = pd.cut(
    strava_df['hour_of_day'],
    bins=[0, 6, 11, 15, 19, 24],
    labels=['Night (0–6)', 'Morning (6–11)', 'Midday (11–15)', 'Afternoon (15–19)', 'Evening (19–24)'],
    right=False
)

fig, axes = plt.subplots(2, 2, figsize=(16, 10), facecolor=PALETTE['bg'])
fig.suptitle('Time-of-Day Effects on Pace & Elevation', fontsize=14)

tod_speed = strava_df.groupby('time_of_day', observed=True)['avg_speed_mph'].mean().dropna()
tod_elev  = strava_df.groupby('time_of_day', observed=True)['elevation_gain_ft'].mean().dropna()
tod_count = strava_df.groupby('time_of_day', observed=True)['id'].count()

# Speed by time of day
highlight_spd = tod_speed.idxmax()
axes[0,0].bar(range(len(tod_speed)),
              tod_speed.values,
              color=[PALETTE['primary'] if t == highlight_spd else PALETTE['accent']
                     for t in tod_speed.index],
              edgecolor='white', alpha=0.9)
axes[0,0].set_xticks(range(len(tod_speed)))
axes[0,0].set_xticklabels([str(t) for t in tod_speed.index], rotation=25, ha='right')
axes[0,0].set_ylim(tod_speed.min() * 0.96, tod_speed.max() * 1.04)
style_ax(axes[0,0], 'Avg Speed by Time of Day', '', 'Avg Speed (mph)')

# Elevation by time of day
if len(tod_elev) > 0:
    highlight_elv = tod_elev.idxmax()
    axes[0,1].bar(range(len(tod_elev)),
                  tod_elev.values,
                  color=['#8E44AD' if t == highlight_elv else '#C39BD3'
                         for t in tod_elev.index],
                  edgecolor='white', alpha=0.9)
    axes[0,1].set_xticks(range(len(tod_elev)))
    axes[0,1].set_xticklabels([str(t) for t in tod_elev.index], rotation=25, ha='right')
    axes[0,1].set_ylim(0, tod_elev.max() * 1.12)
    style_ax(axes[0,1], 'Avg Elevation Gain by Time of Day', '', 'Avg Elevation Gain (ft)')
else:
    axes[0,1].text(0.5, 0.5, 'No elevation data', ha='center', va='center',
                   transform=axes[0,1].transAxes)

# Run count
axes[1,0].bar(range(len(tod_count)), tod_count.values,
              color=PALETTE['secondary'], edgecolor='white', alpha=0.9)
axes[1,0].set_xticks(range(len(tod_count)))
axes[1,0].set_xticklabels([str(t) for t in tod_count.index], rotation=25, ha='right')
style_ax(axes[1,0], 'Run Count by Time of Day', '', 'Number of Runs')

# Combined: speed rank vs elevation rank by time slot (scatter)
tod_both = pd.DataFrame({'speed': tod_speed, 'elevation': tod_elev}).dropna()
if len(tod_both) > 2:
    axes[1,1].scatter(tod_both['elevation'], tod_both['speed'],
                      s=120, color=PALETTE['accent'], edgecolors='white', linewidth=0.8, zorder=3)
    for label, row in tod_both.iterrows():
        axes[1,1].annotate(str(label).split(' ')[0],
                           (row['elevation'], row['speed']),
                           xytext=(5, 4), textcoords='offset points', fontsize=9, color='#555')
    style_ax(axes[1,1], 'Avg Elevation vs. Avg Speed\nby Time Slot', 'Avg Elevation (ft)', 'Avg Speed (mph)')
else:
    axes[1,1].text(0.5, 0.5, 'Insufficient time-slot data', ha='center', va='center',
                   transform=axes[1,1].transAxes)

plt.tight_layout()
plt.show()

# ── ANOVAs ───────────────────────────────────────────────────────────────────
spd_groups  = [g['avg_speed_mph'].dropna().values
               for _, g in strava_df.groupby('time_of_day', observed=True) if len(g) > 2]
elev_groups = [g['elevation_gain_ft'].dropna().values
               for _, g in strava_df.groupby('time_of_day', observed=True)
               if len(g) > 2 and g['elevation_gain_ft'].notna().sum() > 2]

f_spd = p_spd_anova = float('nan')   # defaults so the joint interpretation below can't hit a NameError
f_elv = p_elv_anova = float('nan')   # (NaN comparisons evaluate False, so the checks stay safe)

if len(spd_groups) >= 2:
    f_spd, p_spd_anova = stats.f_oneway(*spd_groups)
    print(f'ANOVA — speed ~ time_of_day:     F = {f_spd:.3f},  p = {p_spd_anova:.4f}  '
          f'{"⚠ significant" if p_spd_anova < 0.05 else "✓ not significant"}')

if len(elev_groups) >= 2:
    f_elv, p_elv_anova = stats.f_oneway(*elev_groups)
    print(f'ANOVA — elevation ~ time_of_day: F = {f_elv:.3f},  p = {p_elv_anova:.4f}  '
          f'{"⚠ significant — elevation varies by time slot" if p_elv_anova < 0.05 else "✓ not significant — elevation is time-of-day neutral"}')
    if p_spd_anova < 0.05 and p_elv_anova < 0.05:
        print()
        print('  Both speed and elevation vary by time of day.')
        print('  Any artist-level correlation must be interpreted with this dual confound in mind.')
    elif p_elv_anova < 0.05 and p_spd_anova >= 0.05:
        print()
        print('  Elevation varies by time slot but speed does not.')
        print('  Time of day confounds elevation-based artist findings more than speed-based ones.')
[Figure: Time-of-Day Effects on Pace & Elevation — four-panel chart]
ANOVA — speed ~ time_of_day:     F = 3.100,  p = 0.0515  ✓ not significant
ANOVA — elevation ~ time_of_day: F = 0.173,  p = 0.8419  ✓ not significant — elevation is time-of-day neutral

7. Confounder Analysis¶

Two confounders are checked here side by side: time of day (when I run) and day of week (what kind of run it tends to be). Both are plausible explanations for any artist-performance association that don't involve music doing anything.

The elevation angle is particularly worth checking: if top-speed artists only appear on morning runs, and morning runs happen to be my flattest routes, then the "speed" association might actually be an "elevation suppression" artifact — the artist looks fast because they're absent from the slow hilly runs, not because they're present on the fast ones.

The second chart — top-elevation artists by time of day — is new here. It checks whether my hilliest-run artists are systematically scheduled at a particular time, which would make time-of-day a confound for the elevation findings too.

In [17]:
# ── Merge time_of_day into run_song_flat (drop stale merge columns so the cell is re-runnable) ──
for col in ['time_of_day', 'id']:
    if col in run_song_flat.columns:
        run_song_flat = run_song_flat.drop(columns=[col])

run_song_flat = run_song_flat.merge(
    strava_df[['id', 'time_of_day']],
    left_on='run_id', right_on='id', how='left'
)

top_10_speed_artists = artist_summary_filtered.nlargest(10, 'avg_speed')['artistName'].tolist()

# ── Chart 1: Top speed artists — time-of-day breakdown ─────────────────────
top_artist_tod = (
    run_song_flat[run_song_flat['artistName'].isin(top_10_speed_artists)]
    .groupby(['artistName', 'time_of_day'], observed=True)
    .size().unstack(fill_value=0)
)
top_artist_tod_pct = top_artist_tod.div(top_artist_tod.sum(axis=1), axis=0)

tod_colors = [PALETTE['accent'], PALETTE['primary'], PALETTE['secondary'], '#9B59B6', '#E67E22']

fig, axes = plt.subplots(1, 2, figsize=(20, 6), facecolor=PALETTE['bg'])
fig.suptitle('Confounder Check: When Do Top Artists Actually Appear?', fontsize=14)

if not top_artist_tod_pct.empty:
    top_artist_tod_pct.plot(kind='bar', stacked=True, ax=axes[0],
                            color=tod_colors[:len(top_artist_tod_pct.columns)],
                            edgecolor='white', linewidth=0.5, alpha=0.9)
    axes[0].set_xticklabels([safe_label(a) for a in top_artist_tod_pct.index],
                             rotation=35, ha='right')
    axes[0].set_ylabel('Proportion of appearances')
    axes[0].set_title('Top-Speed Artists — Time-of-Day Distribution', fontsize=12, pad=8)
    axes[0].legend(bbox_to_anchor=(1.01, 1), loc='upper left', frameon=False, fontsize=9)
    axes[0].tick_params(length=0)

# ── Chart 2: Top elevation artists — time-of-day breakdown (NEW) ───────────
has_elev_art = artist_summary_filtered['avg_elevation'].notna().sum() > 2
if has_elev_art:
    top_10_elev_artists = artist_summary_filtered.nlargest(10, 'avg_elevation')['artistName'].tolist()
    top_elev_tod = (
        run_song_flat[run_song_flat['artistName'].isin(top_10_elev_artists)]
        .groupby(['artistName', 'time_of_day'], observed=True)
        .size().unstack(fill_value=0)
    )
    top_elev_tod_pct = top_elev_tod.div(top_elev_tod.sum(axis=1), axis=0)

    if not top_elev_tod_pct.empty:
        top_elev_tod_pct.plot(kind='bar', stacked=True, ax=axes[1],
                              color=tod_colors[:len(top_elev_tod_pct.columns)],
                              edgecolor='white', linewidth=0.5, alpha=0.9)
        axes[1].set_xticklabels([safe_label(a) for a in top_elev_tod_pct.index],
                                 rotation=35, ha='right')
        axes[1].set_ylabel('Proportion of appearances')
        axes[1].set_title('Top-Elevation Artists — Time-of-Day Distribution', fontsize=12, pad=8)
        axes[1].legend(bbox_to_anchor=(1.01, 1), loc='upper left', frameon=False, fontsize=9)
        axes[1].tick_params(length=0)

        # Cross-reference: do speed-top and elevation-top artists share the same time slots?
        spd_dominant_slot  = top_artist_tod_pct.sum().idxmax() if not top_artist_tod_pct.empty else None
        elev_dominant_slot = top_elev_tod_pct.sum().idxmax()   if not top_elev_tod_pct.empty else None
        print(f'Top-speed artists most common time slot:     {spd_dominant_slot}')
        print(f'Top-elevation artists most common time slot: {elev_dominant_slot}')
        if spd_dominant_slot == elev_dominant_slot:
            print('  ⚠ Same dominant slot — speed and elevation artist groups may be conflated by time-of-day.')
        else:
            print('  ✓ Different dominant slots — speed and elevation artist groups have distinct time patterns.')
else:
    axes[1].text(0.5, 0.5, 'Elevation artist data not available',
                 ha='center', va='center', transform=axes[1].transAxes, color=PALETTE['neutral'])
    axes[1].set_title('Top-Elevation Artists — Time-of-Day Distribution', fontsize=12, pad=8)

plt.tight_layout()
plt.show()
Top-speed artists most common time slot:     Evening (19–24)
Top-elevation artists most common time slot: Afternoon (15–19)
  ✓ Different dominant slots — speed and elevation artist groups have distinct time patterns.
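The dominant-slot comparison above is a coarse check. A chi-square test of independence on the full time-of-day contingency table would be a more formal follow-up. A minimal sketch with illustrative counts — the real inputs would be the per-slot totals from `top_artist_tod` and `top_elev_tod`:

```python
import numpy as np
from scipy import stats

# Illustrative counts per time slot (Morning, Afternoon, Evening) — stand-ins
# for the real per-group totals, which come from the notebook's crosstabs.
speed_group = np.array([12, 20, 41])
elev_group  = np.array([10, 35, 18])

# Chi-square test of independence: do the two artist groups share the same
# time-of-day distribution?
chi2, p, dof, expected = stats.chi2_contingency(np.vstack([speed_group, elev_group]))
print(f'chi2 = {chi2:.2f}, dof = {dof}, p = {p:.4f}')
```

A low p-value here would say the two groups genuinely occupy different time slots, rather than just having different argmax bins.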

7. Elevation Deep-Dive¶

Elevation gain is a dimension that speed and distance don't fully capture. A 5-mile run with 800ft of climbing is a fundamentally different workout than a flat 5-miler — and route selection happens before the run, which means any music associated with that session was chosen in the context of knowing (or expecting) a hard climb.

This section treats total_elevation_gain as a primary outcome variable in its own right, running the same artist-association and statistical tests we ran for speed and distance.

The key question here: Do certain artists systematically appear on my hillier runs? And is that pattern independent of the distance and time-of-day patterns we've already found?
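One way to make that independence question concrete is to residualise elevation on distance first, then test the artist split on the residuals — any remaining difference is hilliness beyond what run length already predicts. A sketch on synthetic data; the `is_top` flag is a hypothetical stand-in for the top-artist grouping:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
distance  = rng.uniform(2, 8, 60)                   # miles
elevation = 30 * distance + rng.normal(0, 40, 60)   # ft, tied to distance by design
is_top    = rng.random(60) < 0.3                    # hypothetical artist-group flag

# Remove the distance component, then compare residual hilliness across groups
slope, intercept, *_ = stats.linregress(distance, elevation)
residuals = elevation - (slope * distance + intercept)
t, p = stats.ttest_ind(residuals[is_top], residuals[~is_top], equal_var=False)
print(f't = {t:.3f}, p = {p:.3f}')
```

On the real data this would replace the raw-elevation t-test below whenever the elevation ~ distance correlation comes back strong.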

In [18]:
# ── 1. Elevation distribution by music coverage ────────────────────────────
fig, axes = plt.subplots(1, 3, figsize=(18, 5), facecolor=PALETTE['bg'])
fig.suptitle('Elevation Gain Analysis', fontsize=14)

# Distribution
elev_data = strava_df['elevation_gain_ft'].dropna()
axes[0].hist(elev_data, bins=28, color='#8E44AD', edgecolor='white', alpha=0.9)
axes[0].axvline(elev_data.mean(),   color=PALETTE['secondary'], linestyle='--', linewidth=1.8,
                label=f'Mean: {elev_data.mean():.0f} ft')
axes[0].axvline(elev_data.median(), color=PALETTE['accent'],    linestyle=':',  linewidth=1.8,
                label=f'Median: {elev_data.median():.0f} ft')
axes[0].legend(frameon=False, fontsize=10)
style_ax(axes[0], 'Elevation Gain Distribution', 'Elevation Gain (ft)', 'Frequency')

# Elevation vs Distance scatter
edf2 = strava_df[['distance_miles', 'elevation_gain_ft']].dropna()
axes[1].scatter(edf2['distance_miles'], edf2['elevation_gain_ft'],
                alpha=0.4, color='#8E44AD', edgecolors='white', linewidth=0.4, s=40)
m3, b3, r3, p3, _ = stats.linregress(edf2['distance_miles'], edf2['elevation_gain_ft'])
x3 = np.linspace(edf2['distance_miles'].min(), edf2['distance_miles'].max(), 100)
axes[1].plot(x3, m3*x3+b3, color=PALETTE['secondary'], linewidth=2,
             label=f'r = {r3:.2f}  (p = {p3:.3f})')
axes[1].legend(frameon=False, fontsize=10)
style_ax(axes[1], 'Elevation vs. Distance', 'Distance (miles)', 'Elevation Gain (ft)')

# Elevation per mile — normalised hilliness
epm = strava_df['elev_gain_per_mile'].dropna()
axes[2].hist(epm, bins=24, color=PALETTE['accent'], edgecolor='white', alpha=0.9)
axes[2].axvline(epm.mean(), color=PALETTE['secondary'], linestyle='--', linewidth=1.8,
                label=f'Mean: {epm.mean():.0f} ft/mi')
axes[2].legend(frameon=False, fontsize=10)
style_ax(axes[2], 'Elevation Gain Per Mile\n(normalised hilliness)', 'ft / mile', 'Frequency')

plt.tight_layout()
plt.show()

print(f'Elevation ~ Distance correlation: r = {r3:.3f}  (p = {p3:.4f})')
if r3 > 0.5:
    print('  → Strong positive — longer runs gain substantially more elevation.')
    print('     Use elev_gain_per_mile (not raw elevation) for fair artist comparisons.')
else:
    print('  → Moderate/weak — elevation varies independently of distance enough to analyse separately.')

# ── 2. Artist-level elevation t-test ──────────────────────────────────────
print()
if artist_summary_filtered['avg_elevation'].notna().sum() > 2:
    top_elev_artists  = set(artist_summary_filtered.nlargest(10, 'avg_elevation')['artistName'])

    # Note: the split is by song artist, not by run — a run with a mixed playlist
    # can land in both groups, so the two samples are not strictly disjoint.
    top_elev_run_ids  = run_song_flat[run_song_flat['artistName'].isin(top_elev_artists)]['run_id'].unique()
    rest_elev_run_ids = run_song_flat[~run_song_flat['artistName'].isin(top_elev_artists)]['run_id'].unique()

    top_elev_v  = strava_df[strava_df['id'].isin(top_elev_run_ids)]['elevation_gain_ft'].dropna()
    rest_elev_v = strava_df[strava_df['id'].isin(rest_elev_run_ids)]['elevation_gain_ft'].dropna()

    t_elev, p_elev = stats.ttest_ind(top_elev_v, rest_elev_v, equal_var=False)
    d_elev = cohens_d(top_elev_v, rest_elev_v)

    print('── Elevation Artist Test ─────────────────────────────────')
    print(f'  Top-elevation artists mean: {top_elev_v.mean():.0f} ft  (n={len(top_elev_v)})')
    print(f'  All others mean:            {rest_elev_v.mean():.0f} ft  (n={len(rest_elev_v)})')
    print(f'  t = {t_elev:.4f},  p = {p_elev:.4f}  {"✓ significant" if p_elev < 0.05 else "✗ not significant"}')
    print(f"  Cohen's d = {d_elev:.3f}  → {'negligible' if abs(d_elev) < 0.2 else 'small' if abs(d_elev) < 0.5 else 'medium' if abs(d_elev) < 0.8 else 'large'} effect")

    # ── 3. Are top-elevation artists also fast? ────────────────────────────
    top_elev_speeds = strava_df[strava_df['id'].isin(top_elev_run_ids)]['avg_speed_mph'].dropna()
    rest_speeds_all = strava_df[strava_df['id'].isin(rest_elev_run_ids)]['avg_speed_mph'].dropna()
    t_es, p_es = stats.ttest_ind(top_elev_speeds, rest_speeds_all, equal_var=False)

    print()
    print('── Are top-elevation artists also faster? ────────────────')
    print(f'  Speed on top-elevation-artist runs: {top_elev_speeds.mean():.3f} mph')
    print(f'  Speed on all other runs:            {rest_speeds_all.mean():.3f} mph')
    print(f'  t = {t_es:.4f},  p = {p_es:.4f}  {"✓ significant" if p_es < 0.05 else "✗ not significant"}')
    if p_es < 0.05 and top_elev_speeds.mean() < rest_speeds_all.mean():
        print('  → Hilly-artist runs are slower — elevation is suppressing speed, expected.')
    elif p_es < 0.05 and top_elev_speeds.mean() > rest_speeds_all.mean():
        print('  → Hilly-artist runs are faster — these artists appear on challenging but fast efforts.')
    else:
        print('  → No significant speed difference — elevation and speed are independent in artist grouping.')

    # ── 4. Scatter: artist avg_elevation vs avg_speed ─────────────────────
    plot_df = artist_summary_filtered.dropna(subset=['avg_elevation', 'avg_speed'])
    fig2, ax2 = plt.subplots(figsize=(10, 6), facecolor=PALETTE['bg'])
    sc = ax2.scatter(
        plot_df['avg_elevation'], plot_df['avg_speed'],
        s=plot_df['appearance_count'] * 18,
        alpha=0.65, color='#8E44AD', edgecolors='white', linewidth=0.5
    )
    if len(plot_df) > 5:
        m4, b4, r4, p4, _ = stats.linregress(plot_df['avg_elevation'], plot_df['avg_speed'])
        x4 = np.linspace(plot_df['avg_elevation'].min(), plot_df['avg_elevation'].max(), 100)
        ax2.plot(x4, m4*x4+b4, color=PALETTE['secondary'], linewidth=2,
                 label=f'r = {r4:.2f}  (p = {p4:.3f})')
        ax2.legend(frameon=False)
        print(f'\nArtist avg_elevation ~ avg_speed: r = {r4:.3f}, p = {p4:.4f}')
        if p4 < 0.05 and r4 < 0:
            print('  → Artists on hillier runs tend to be slower — route difficulty is suppressing speed signal.')
        elif p4 < 0.05 and r4 > 0:
            print('  → Artists on hillier runs are actually faster — these might be strong-effort runs regardless of terrain.')
        else:
            print('  → No significant relationship between an artist\'s typical elevation and typical speed.')

    # Label top artists
    top_n = plot_df.nlargest(5, 'avg_elevation')
    for _, row in top_n.iterrows():
        ax2.annotate(safe_label(row['artistName']),
                     (row['avg_elevation'], row['avg_speed']),
                     xytext=(6, 3), textcoords='offset points',
                     fontsize=8, color='#555')
    ax2.set_xlabel('Artist Avg Elevation Gain (ft)', labelpad=8)
    ax2.set_ylabel('Artist Avg Speed (mph)', labelpad=8)
    ax2.set_title(
        'Artist Avg Elevation vs. Avg Speed\n(bubble size = number of run appearances)',
        fontsize=13, pad=10
    )
    ax2.tick_params(length=0)
    plt.tight_layout()
    plt.show()
else:
    print('Elevation data not available — skipping artist-level elevation tests.')
Elevation ~ Distance correlation: r = 0.826  (p = 0.0000)
  → Strong positive — longer runs gain substantially more elevation.
     Use elev_gain_per_mile (not raw elevation) for fair artist comparisons.

── Elevation Artist Test ─────────────────────────────────
  Top-elevation artists mean: 229 ft  (n=5)
  All others mean:            142 ft  (n=40)
  t = 6.2525,  p = 0.0001  ✓ significant
  Cohen's d = 2.043  → large effect

── Are top-elevation artists also faster? ────────────────
  Speed on top-elevation-artist runs: 6.063 mph
  Speed on all other runs:            6.084 mph
  t = -0.0464,  p = 0.9650  ✗ not significant
  → No significant speed difference — elevation and speed are independent in artist grouping.

Artist avg_elevation ~ avg_speed: r = 0.210, p = 0.0532
  → No significant relationship between an artist's typical elevation and typical speed.

8. Key Findings¶


✅ What held up¶

The pipeline works. A temporal join between two unrelated data exports — built on nothing but timestamp overlap — successfully matched songs to runs with a clean, reproducible implementation. The core engineering problem is solved.

My running is consistent. Speed, distance, and elevation are roughly normally distributed with moderate variance. That consistency actually makes music effects harder to detect — less spread means less variance to explain.

Elevation adds a genuinely new dimension. The total_elevation_gain variable captures something that speed and distance don't: route difficulty. The elevation distribution and its correlation with distance tell us whether hilliness is an independent variable or just a shadow of run length.

The elevation ~ speed scatter by artist is the most honest chart in this notebook. If artists cluster predictably by terrain type, that tells us something real about selection effects — I pick different music for hard hilly routes than for flat tempo runs.


⚠ What didn't survive scrutiny¶

The t-tests came back non-significant for speed and distance. The differences between runs featuring the top-10 artists and all other runs didn't clear p < 0.05 for either metric. Cohen's d was small. The apparent ranking in the bar charts is most likely noise — driven by a handful of runs per artist rather than a real signal.

Elevation tests are dataset-dependent. If elevation gain correlates strongly with distance (r > 0.5), then elev_gain_per_mile is the fairer artist comparison. Check the output from Section 7 to see whether raw or normalised elevation is more appropriate for this specific dataset.

Small n per artist remains the fundamental bottleneck. Most artists appeared in 2–5 runs. You can't distinguish a real effect from random variance with that few observations — for any of the three outcome variables.
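That bottleneck can be quantified with a quick power calculation: with roughly 5 runs in a top-artist group against ~40 others, only very large effects are detectable at all. A sketch using statsmodels — the sample sizes are illustrative, matching the elevation test above:

```python
from statsmodels.stats.power import TTestIndPower

# Solve for the minimum detectable Cohen's d at 80% power, alpha = 0.05,
# with n1 = 5 runs in the artist group and n2 = 40 in the comparison group.
analysis = TTestIndPower()
d_min = analysis.solve_power(nobs1=5, ratio=40 / 5, alpha=0.05, power=0.8)
print(f"Minimum detectable Cohen's d ≈ {d_min:.2f}")
```

Anything smaller than that threshold — which covers most plausible music effects — is invisible at this sample size, regardless of how the test comes out.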


🧭 The honest summary¶

I can't conclude from this data that any artist makes me run faster, farther, or through more elevation. What I can say is the infrastructure to ask all three questions is now built — and the analysis is honest enough to show when the signal is absent. Getting more data, or switching from artist-level to audio-feature-level analysis, is the path forward.

9. Limitations¶

| Limitation | Impact | Mitigation |
|---|---|---|
| Small samples per artist | Most averages based on 2–5 runs — low reliability | Increase data volume; switch to track-level audio features |
| Correlation ≠ causation | Music choice reflects intent, not effect | Acknowledge in framing; design around it |
| Spotify export window (~1 year) | ~30–40% of Strava history has no music data | Request extended Spotify history; use API for full stream |
| No audio features | "Artist" is a crude proxy for musical characteristics | Pull BPM, energy, valence via Spotify Web API |
| 50% overlap threshold | Arbitrary; sensitive to edge cases | Sensitivity analysis over multiple thresholds |
| No heart rate data | Pace is one signal; effort is another | Enable HR on Strava; adds richer outcome variable |
| Time-of-day confounder | Fast runs and music choices may both be driven by run type | Include time_of_day as covariate in any regression |
| Elevation ~ distance collinearity | Longer runs naturally gain more elevation | Use elev_gain_per_mile for normalised hilliness comparisons |
| Route selection bias | Hilly routes are chosen before the run starts | Elevation is as much about planning intent as musical preference |
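The overlap-threshold sensitivity check suggested above can be sketched in a few lines. The intervals here are synthetic stand-ins for real song plays and a run window; the real version would re-run the notebook's temporal join at each threshold and compare match counts:

```python
import pandas as pd

# Synthetic song-play intervals (seconds) and one run window
songs = pd.DataFrame({'start': [0, 100, 200], 'end': [80, 180, 260]})
run_start, run_end = 50, 250

def overlap_fraction(s, e):
    """Fraction of the song's duration that falls inside the run window."""
    overlap = max(0, min(e, run_end) - max(s, run_start))
    return overlap / (e - s)

# How many songs survive at each matching threshold?
for thr in (0.25, 0.50, 0.75):
    matched = sum(overlap_fraction(s, e) >= thr
                  for s, e in zip(songs['start'], songs['end']))
    print(f'threshold {thr:.2f}: {matched} of {len(songs)} songs matched')
```

If the match count (and the downstream artist rankings) barely moves across thresholds, the 50% choice is defensible; if it swings, the join itself is the fragile part.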

10. What's Next¶

1. Spotify Audio Features via Web API¶

Replace artistName with continuous track-level features: BPM (tempo), energy, valence, danceability, loudness. This is what the regression actually needs — meaningful numeric predictors instead of categorical artist proxies.

# Rough sketch — requires spotipy + auth
import spotipy
from spotipy.oauth2 import SpotifyOAuth

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(...))
features = sp.audio_features(track_ids)  # returns tempo, energy, valence, etc.

2. Proper Multi-Outcome Regression Model¶

With audio features in hand, fit separate models for each performance dimension:

import statsmodels.formula.api as smf

# Speed model
smf.ols(
    'avg_speed_mph ~ tempo + energy + valence + elapsed_minutes + C(time_of_day)',
    data=run_audio_merged
).fit().summary()

# Elevation model — uses normalised hilliness as outcome
smf.ols(
    'elev_gain_per_mile ~ tempo + energy + valence + C(day_of_week)',
    data=run_audio_merged
).fit().summary()

Elevation per mile is a better outcome than raw elevation because it controls for run length and isolates the "how hilly was this route" signal.

3. Full Spotify History¶

Spotify allows an extended data request (beyond the 12-month standard export) covering your entire listening history. With 3–4x more data, artist-level estimates become far more reliable — especially for the elevation dimension, which needs enough variance in route choice to detect anything.

4. Run Clustering by Listening + Terrain Profile¶

K-means on both audio features and elevation per mile to find natural run archetypes:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X = run_audio_merged[['tempo', 'energy', 'valence', 'elev_gain_per_mile', 'distance_miles']].dropna()
X_scaled = StandardScaler().fit_transform(X)
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_scaled)
# Cluster 0: flat easy runs, Cluster 1: hilly hard efforts, etc.

Analysis by Andrew Bonnah · github.com/AndrewBonnah · Built with pandas, scipy, matplotlib