import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ── Plot style ───────────────────────────────────────────────────────────────
PALETTE = {
    'primary':   '#C0392B',   # economist red
    'secondary': '#2C3E50',   # dark slate
    'accent':    '#2980B9',   # blue
    'positive':  '#27AE60',   # overvalued (currency too strong)
    'negative':  '#E74C3C',   # undervalued (currency too weak)
    'neutral':   '#95A5A6',
    'bg':        '#F9F9F8',
    'us':        '#C0392B',
}

plt.rcParams.update({
    'figure.dpi':         130,
    'figure.facecolor':   PALETTE['bg'],
    'axes.facecolor':     PALETTE['bg'],
    'axes.spines.top':    False,
    'axes.spines.right':  False,
    'axes.spines.left':   False,
    'axes.spines.bottom': False,
    'axes.grid':          True,
    'grid.alpha':         0.2,
    'grid.linestyle':     '--',
    'font.family':        'sans-serif',
    'font.size':          11,
    'axes.titlesize':     13,
    'axes.titleweight':   'semibold',
    'axes.labelcolor':    '#444',
    'xtick.color':        '#666',
    'ytick.color':        '#666',
})

def style_ax(ax, title=None, xlabel=None, ylabel=None):
    if title:  ax.set_title(title, pad=10)
    if xlabel: ax.set_xlabel(xlabel, labelpad=8)
    if ylabel: ax.set_ylabel(ylabel, labelpad=8)
    ax.tick_params(length=0)
    return ax

# Regional mapping — used throughout
REGION_MAP = {
    'United States':'North America','Canada':'North America','Mexico':'Latin America',
    'Argentina':'Latin America','Brazil':'Latin America','Chile':'Latin America',
    'Colombia':'Latin America','Peru':'Latin America','Uruguay':'Latin America',
    'Venezuela':'Latin America','Costa Rica':'Latin America','Honduras':'Latin America',
    'Guatemala':'Latin America','El Salvador':'Latin America','Nicaragua':'Latin America',
    'United Kingdom':'Western Europe','Euro area':'Western Europe','Switzerland':'Western Europe',
    'Norway':'Western Europe','Sweden':'Western Europe','Denmark':'Western Europe',
    'Czech Republic':'Western Europe','Hungary':'Western Europe','Poland':'Western Europe',
    'Romania':'Western Europe','Ukraine':'Eastern Europe','Russia':'Eastern Europe',
    'Turkey':'Eastern Europe','Israel':'Middle East & Africa','Saudi Arabia':'Middle East & Africa',
    'UAE':'Middle East & Africa','Egypt':'Middle East & Africa','South Africa':'Middle East & Africa',
    'Nigeria':'Middle East & Africa','Kenya':'Middle East & Africa',
    'Japan':'Asia-Pacific','China':'Asia-Pacific','South Korea':'Asia-Pacific',
    'Australia':'Asia-Pacific','New Zealand':'Asia-Pacific','Hong Kong':'Asia-Pacific',
    'Singapore':'Asia-Pacific','Taiwan':'Asia-Pacific','Thailand':'Asia-Pacific',
    'Malaysia':'Asia-Pacific','Indonesia':'Asia-Pacific','Philippines':'Asia-Pacific',
    'Vietnam':'Asia-Pacific','Sri Lanka':'Asia-Pacific','Pakistan':'Asia-Pacific',
    'India':'Asia-Pacific','Bangladesh':'Asia-Pacific',
}

print('Configuration loaded.')

Configuration loaded.

df = pd.read_csv('big_mac.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

# Attach region
df['region'] = df['name'].map(REGION_MAP).fillna('Other')

print(f'Rows: {len(df):,}')
print(f'Countries: {df["name"].nunique()}')
print(f'Date range: {df["date"].min().date()}  →  {df["date"].max().date()}')
print(f'Survey dates: {df["date"].nunique()}')
print()
print('Missing values:')
key_cols = ['dollar_price','USD_raw','USD_adjusted','local_price','dollar_ex']
print(df[key_cols].isna().sum().to_string())
print()
df[key_cols].describe().round(3)

Rows: 1,386
Countries: 57
Date range: 2000-04-01  →  2020-07-01
Survey dates: 33

Missing values:
dollar_price      0
USD_raw           0
USD_adjusted    684
local_price       0
dollar_ex         0

latest_date = df['date'].max()
latest = df[df['date'] == latest_date].dropna(subset=['dollar_price']).copy()
latest = latest.sort_values('dollar_price')

us_price = latest[latest['name'] == 'United States']['dollar_price'].values
us_price = us_price[0] if len(us_price) > 0 else None

fig, ax = plt.subplots(figsize=(10, max(8, len(latest) * 0.28)), facecolor=PALETTE['bg'])

colors = ['#E74C3C' if p > (us_price or 0) else '#3498DB' for p in latest['dollar_price']]
bars = ax.barh(latest['name'], latest['dollar_price'],
               color=colors, edgecolor='white', linewidth=0.4, alpha=0.88)

if us_price:
    ax.axvline(us_price, color=PALETTE['us'], linewidth=2, linestyle='--',
               label=f'US price: ${us_price:.2f}')
    ax.legend(frameon=False, fontsize=10)

ax.set_xlabel('Big Mac Price (USD)', labelpad=8)
ax.set_title(f'Big Mac Prices by Country — {latest_date.strftime("%b %Y")}\n'
             f'Red = more expensive than US · Blue = cheaper than US',
             fontsize=12, pad=10)
ax.tick_params(length=0, labelsize=9)
plt.tight_layout()
plt.show()

print(f'Most expensive: {latest.iloc[-1]["name"]}  ${latest.iloc[-1]["dollar_price"]:.2f}')
print(f'Least expensive: {latest.iloc[0]["name"]}  ${latest.iloc[0]["dollar_price"]:.2f}')
if us_price:
    pct_above = (latest['dollar_price'] > us_price).sum()
    print(f'Countries more expensive than US: {pct_above} of {len(latest)}')

Most expensive: Switzerland  $6.91
Least expensive: South Africa  $1.86
Countries more expensive than US: 3 of 56

us = df[df['name'] == 'United States'].dropna(subset=['dollar_price']).copy()
us['years_since_2000'] = us['date'].dt.year + us['date'].dt.month/12 - 2000

fig, ax = plt.subplots(figsize=(12, 5), facecolor=PALETTE['bg'])

ax.fill_between(us['date'], us['dollar_price'], alpha=0.12, color=PALETTE['primary'])
ax.plot(us['date'], us['dollar_price'],
        color=PALETTE['primary'], linewidth=2.5, marker='o', markersize=4, label='US price')

# Fit and plot trend line
slope, intercept, r, p, se = stats.linregress(us['years_since_2000'], us['dollar_price'])
x_fit = np.linspace(us['years_since_2000'].min(), us['years_since_2000'].max(), 200)
dates_fit = pd.to_datetime('2000-01-01') + pd.to_timedelta(x_fit * 365.25, unit='D')
ax.plot(dates_fit, slope * x_fit + intercept,
        color=PALETTE['secondary'], linewidth=1.5, linestyle='--', alpha=0.7,
        label=f'Trend  (r = {r:.3f}, +${slope:.2f}/yr)')

ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('$%.2f'))
ax.legend(frameon=False, fontsize=10)
style_ax(ax, 'US Big Mac Price Over Time', '', 'Price (USD)')
plt.tight_layout()
plt.show()

start_price = us.iloc[0]['dollar_price']
end_price   = us.iloc[-1]['dollar_price']
years_span  = (us.iloc[-1]['date'] - us.iloc[0]['date']).days / 365.25
cagr = (end_price / start_price) ** (1 / years_span) - 1

print(f'Price in {us.iloc[0]["date"].year}:  ${start_price:.2f}')
print(f'Price in {us.iloc[-1]["date"].year}:  ${end_price:.2f}')
print(f'Total increase:    {(end_price/start_price - 1)*100:.1f}%')
print(f'CAGR:              {cagr*100:.2f}% per year')
print(f'Linear trend:      +${slope:.3f} per year  (r = {r:.3f})')

Price in 2000:  $2.51
Price in 2020:  $5.71
Total increase:    127.5%
CAGR:              4.14% per year
Linear trend:      +$0.177 per year  (r = 0.993)

fig, ax = plt.subplots(figsize=(14, 6), facecolor=PALETTE['bg'])

for name, group in df.groupby('name'):
    g = group.dropna(subset=['dollar_price'])
    if name == 'United States':
        continue
    ax.plot(g['date'], g['dollar_price'],
            color=PALETTE['neutral'], linewidth=0.8, alpha=0.35)

# US on top
us_g = df[df['name'] == 'United States'].dropna(subset=['dollar_price'])
ax.plot(us_g['date'], us_g['dollar_price'],
        color=PALETTE['primary'], linewidth=2.5, label='United States', zorder=5)

ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('$%.2f'))
ax.legend(frameon=False, fontsize=10)
style_ax(ax, 'Big Mac Dollar Price — All Countries (US highlighted)',
         '', 'Price (USD)')
plt.tight_layout()
plt.show()

val_latest = df[df['date'] == latest_date].dropna(subset=['USD_raw']).copy()
# Exclude euro area aggregate to avoid double-counting
val_latest = val_latest[val_latest['name'] != 'Euro area']

top_over  = val_latest.nlargest(15, 'USD_raw')
top_under = val_latest.nsmallest(15, 'USD_raw')
combined  = pd.concat([top_under, top_over]).sort_values('USD_raw')

fig, ax = plt.subplots(figsize=(10, 9), facecolor=PALETTE['bg'])
colors = [PALETTE['positive'] if v > 0 else PALETTE['negative']
          for v in combined['USD_raw']]
bars = ax.barh(combined['name'], combined['USD_raw'] * 100,
               color=colors, edgecolor='white', linewidth=0.4, alpha=0.9)
ax.axvline(0, color=PALETTE['secondary'], linewidth=1.2)
ax.xaxis.set_major_formatter(mticker.PercentFormatter())
style_ax(ax,
         f'Currency Valuation vs USD — Raw PPP ({latest_date.strftime("%b %Y")})\n'
         f'Green = overvalued (Big Mac costs more than US) · Red = undervalued',
         'Implied Over/Undervaluation (%)', '')
ax.tick_params(length=0)
plt.tight_layout()
plt.show()

most_over  = combined.iloc[-1]
most_under = combined.iloc[0]
print(f'Most overvalued:  {most_over["name"]:25s}  {most_over["USD_raw"]*100:+.1f}%')
print(f'Most undervalued: {most_under["name"]:25s}  {most_under["USD_raw"]*100:+.1f}%')

Most overvalued:  Switzerland                +20.9%
Most undervalued: South Africa               -67.4%

# GDP-adjusted index requires supplemental GDP data not present in the base export.
# Instead: compare Raw PPP valuation across two time points — earliest vs most recent —
# to show whether currency misalignment has grown, shrunk, or reversed over the sample period.

earliest_date = df.dropna(subset=['USD_raw'])['date'].min()
val_early = (df[df['date'] == earliest_date]
             .dropna(subset=['USD_raw'])
             .query('name != "Euro area"')
             .copy())
val_early_idx = val_early.set_index('name')['USD_raw']

# Countries present in both periods
shared = set(combined['name']) & set(val_early['name'])
combined_both = combined[combined['name'].isin(shared)].copy()
combined_both['USD_raw_early'] = combined_both['name'].map(val_early_idx)
combined_both['valuation_change'] = combined_both['USD_raw'] - combined_both['USD_raw_early']
combined_both = combined_both.sort_values('USD_raw')

fig, axes = plt.subplots(1, 2, figsize=(18, 8), facecolor=PALETTE['bg'])
fig.suptitle(
    f'Currency Valuation vs USD — Raw PPP\n'
    f'Left: Most Recent ({latest_date.strftime("%b %Y")})  ·  '
    f'Right: Change since {earliest_date.strftime("%b %Y")}',
    fontsize=13
)

# Left: current raw valuation
vals   = combined_both['USD_raw'].dropna()
colors = [PALETTE['positive'] if v > 0 else PALETTE['negative'] for v in vals]
axes[0].barh(combined_both['name'][:len(vals)], vals * 100,
             color=colors, edgecolor='white', linewidth=0.4, alpha=0.9)
axes[0].axvline(0, color=PALETTE['secondary'], linewidth=1.2)
axes[0].xaxis.set_major_formatter(mticker.PercentFormatter())
style_ax(axes[0], f'Raw PPP Valuation — {latest_date.strftime("%b %Y")}',
         'Over/Undervaluation (%)', '')
axes[0].tick_params(length=0, labelsize=9)

# Right: change in valuation over time (positive = became more overvalued)
chg = combined_both['valuation_change'].dropna()
chg_colors = [PALETTE['positive'] if v > 0 else PALETTE['negative'] for v in chg]
axes[1].barh(combined_both['name'][:len(chg)], chg * 100,
             color=chg_colors, edgecolor='white', linewidth=0.4, alpha=0.9)
axes[1].axvline(0, color=PALETTE['secondary'], linewidth=1.2)
axes[1].xaxis.set_major_formatter(mticker.PercentFormatter())
style_ax(axes[1],
         f'Change in Valuation since {earliest_date.strftime("%b %Y")}\n'
         f'Green = more overvalued · Red = more undervalued',
         'Change in PPP Valuation (pp)', '')
axes[1].tick_params(length=0, labelsize=9)

plt.tight_layout()
plt.show()

# Summary table
print(f'Currencies most improved (less undervalued / more overvalued) since {earliest_date.date()}:')
print(combined_both.nlargest(5, 'valuation_change')[['name','USD_raw_early','USD_raw','valuation_change']]
      .round(3).to_string(index=False))
print()
print(f'Currencies most deteriorated since {earliest_date.date()}:')
print(combined_both.nsmallest(5, 'valuation_change')[['name','USD_raw_early','USD_raw','valuation_change']]
      .round(3).to_string(index=False))

Currencies most improved (less undervalued / more overvalued) since 2000-04-01:
         name  USD_raw_early  USD_raw  valuation_change
    Australia         -0.386   -0.198             0.188
     Thailand         -0.423   -0.286             0.138
       Canada         -0.228   -0.111             0.117
  New Zealand         -0.326   -0.238             0.088
United States          0.000    0.000             0.000

Currencies most deteriorated since 2000-04-01:
   name  USD_raw_early  USD_raw  valuation_change
 Israel          0.426   -0.134            -0.560
 Mexico         -0.115   -0.610            -0.495
 Taiwan         -0.089   -0.572            -0.483
Britain          0.196   -0.251            -0.447
Denmark          0.226   -0.198            -0.424

# Find persistently extreme currencies (median absolute valuation across all dates)
median_val = (df.dropna(subset=['USD_raw'])
              .groupby('name')['USD_raw']
              .agg(median_abs=lambda x: x.abs().median(), median=lambda x: x.median())
              .reset_index())

persistent_over  = median_val.nlargest(5, 'median')['name'].tolist()
persistent_under = median_val.nsmallest(5, 'median')['name'].tolist()
highlight = persistent_over + persistent_under

cmap = plt.cm.tab10
color_map = {name: cmap(i) for i, name in enumerate(highlight)}

fig, axes = plt.subplots(1, 2, figsize=(18, 6), facecolor=PALETTE['bg'])
fig.suptitle('Persistent Currency Valuation vs USD — Raw PPP Over Time', fontsize=13)

for ax, names, panel_title in [
    (axes[0], persistent_over,  'Persistently Overvalued Currencies'),
    (axes[1], persistent_under, 'Persistently Undervalued Currencies'),
]:
    for name in names:
        g = df[df['name'] == name].dropna(subset=['USD_raw'])
        ax.plot(g['date'], g['USD_raw'] * 100,
                label=name, linewidth=2, color=color_map[name], marker='o', markersize=3)
    ax.axhline(0, color=PALETTE['secondary'], linewidth=1, linestyle='--', alpha=0.5)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    ax.legend(frameon=False, fontsize=9)
    style_ax(ax, panel_title, '', 'Raw PPP Valuation (%)')

plt.tight_layout()
plt.show()

def fit_country(group):
    g = group.dropna(subset=['dollar_price']).copy()
    if len(g) < 4:
        return pd.Series({'slope': np.nan, 'intercept': np.nan, 'r2': np.nan, 'n': len(g)})
    g['years'] = g['date'].dt.year + g['date'].dt.month/12 - 2000
    X = g['years'].values.reshape(-1, 1)
    y = g['dollar_price'].values
    model = LinearRegression().fit(X, y)
    return pd.Series({
        'slope':     model.coef_[0],
        'intercept': model.intercept_,
        'r2':        model.score(X, y),
        'n':         len(g),
    })

country_params = (df.groupby('name')
                   .apply(fit_country)
                   .reset_index()
                   .dropna(subset=['slope']))

# Merge region back in
region_ref = df[['name','region']].drop_duplicates()
country_params = country_params.merge(region_ref, on='name', how='left')

print(f'Countries with valid fits: {len(country_params)}')
print()
print('Summary of slope distribution:')
print(country_params['slope'].describe().round(4).to_string())
print()
print('US trend:')
us_row = country_params[country_params['name'] == 'United States']
print(us_row[['name','slope','intercept','r2']].round(4).to_string(index=False))

Countries with valid fits: 57

Summary of slope distribution:
count    57.0000
mean      0.1018
std       0.1045
min      -0.0341
25%       0.0411
50%       0.0949
75%       0.1424
max       0.6562

US trend:
         name  slope  intercept     r2
United States 0.1768     2.0952 0.9856

top_gainers = country_params.nlargest(12, 'slope').sort_values('slope')
top_losers  = country_params.nsmallest(12, 'slope').sort_values('slope', ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(18, 7), facecolor=PALETTE['bg'])
fig.suptitle('Big Mac Dollar Price — Annual Rate of Change by Country', fontsize=13)

for ax, df_plot, color, title in [
    (axes[0], top_gainers, PALETTE['positive'],  'Top 12 — Fastest Rising Dollar Price'),
    (axes[1], top_losers,  PALETTE['negative'],  'Top 12 — Fastest Falling Dollar Price'),
]:
    bars = ax.barh(df_plot['name'], df_plot['slope'],
                   color=color, edgecolor='white', linewidth=0.4, alpha=0.88)
    for bar, (_, row) in zip(bars, df_plot.iterrows()):
        ax.text(bar.get_width() + 0.002,
                bar.get_y() + bar.get_height()/2,
                f'r²={row["r2"]:.2f}',
                va='center', fontsize=9, color='#666')
    ax.axvline(0, color=PALETTE['secondary'], linewidth=1)
    style_ax(ax, title, 'Slope ($/year)', '')
    ax.tick_params(length=0)

plt.tight_layout()
plt.show()

us_slope = country_params[country_params['name']=='United States']['slope'].values[0]
faster_than_us = (country_params['slope'] > us_slope).sum()
print(f'US slope: +${us_slope:.3f}/year')
print(f'Countries with faster price growth than US: {faster_than_us} of {len(country_params)}')

US slope: +$0.177/year
Countries with faster price growth than US: 4 of 57

top_r2    = country_params.nlargest(12, 'r2').sort_values('r2')
bottom_r2 = country_params.nsmallest(12, 'r2').sort_values('r2', ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(18, 7), facecolor=PALETTE['bg'])
fig.suptitle('R² Fit Quality — How Predictable Is Each Country\'s Price Trend?', fontsize=13)

for ax, df_plot, color, title in [
    (axes[0], top_r2,    PALETTE['accent'],   'Top 12 — Most Predictable (High R²)'),
    (axes[1], bottom_r2, PALETTE['negative'], 'Bottom 12 — Least Predictable (Low R²)'),
]:
    ax.barh(df_plot['name'], df_plot['r2'],
            color=color, edgecolor='white', linewidth=0.4, alpha=0.88)
    ax.set_xlim(0, 1.05)
    ax.axvline(0.8, color='#aaa', linestyle='--', linewidth=1, label='R²=0.8')
    ax.axvline(0.5, color='#ccc', linestyle=':', linewidth=1, label='R²=0.5')
    ax.legend(frameon=False, fontsize=9)
    style_ax(ax, title, 'R²', '')
    ax.tick_params(length=0)

plt.tight_layout()
plt.show()

print(f'Median R² across all countries: {country_params["r2"].median():.3f}')
print(f'Countries with R² > 0.8: {(country_params["r2"] > 0.8).sum()}')
print(f'Countries with R² < 0.3: {(country_params["r2"] < 0.3).sum()}')

Median R² across all countries: 0.543
Countries with R² > 0.8: 11
Countries with R² < 0.3: 18

region_order = ['North America','Western Europe','Eastern Europe',
                'Latin America','Middle East & Africa','Asia-Pacific','Other']

# Current average price by region
latest_region = (df[df['date'] == latest_date]
                 .dropna(subset=['dollar_price'])
                 .groupby('region')['dollar_price']
                 .agg(['mean','std','count'])
                 .reindex(region_order).dropna())

# Regional slope summary
region_slope = (country_params.groupby('region')['slope']
                .agg(['mean','std','count'])
                .reindex(region_order).dropna())

fig, axes = plt.subplots(2, 2, figsize=(16, 12), facecolor=PALETTE['bg'])
fig.suptitle('Big Mac Index — Regional Analysis', fontsize=14)

reg_colors = plt.cm.Set2(np.linspace(0, 1, len(latest_region)))

# Avg price by region
axes[0,0].barh(latest_region.index, latest_region['mean'],
               xerr=latest_region['std'], color=reg_colors,
               edgecolor='white', capsize=4, alpha=0.88,
               error_kw={'elinewidth':1.2,'ecolor':'#888'})
if us_price:
    axes[0,0].axvline(us_price, color=PALETTE['primary'], linestyle='--',
                      linewidth=1.5, label=f'US: ${us_price:.2f}')
    axes[0,0].legend(frameon=False, fontsize=9)
axes[0,0].xaxis.set_major_formatter(mticker.FormatStrFormatter('$%.2f'))
style_ax(axes[0,0], f'Avg Price by Region ({latest_date.strftime("%b %Y")})',
         'Avg Dollar Price', '')

# Avg slope by region
axes[0,1].barh(region_slope.index, region_slope['mean'],
               xerr=region_slope['std'], color=reg_colors,
               edgecolor='white', capsize=4, alpha=0.88,
               error_kw={'elinewidth':1.2,'ecolor':'#888'})
axes[0,1].axvline(0, color=PALETTE['secondary'], linewidth=1)
style_ax(axes[0,1], 'Avg Annual Price Change by Region',
         'Slope ($/year)', '')

# Boxplot of current prices by region
region_groups = [df[(df['date']==latest_date) & (df['region']==r)]['dollar_price'].dropna().values
                 for r in region_order if r in df['region'].values]
valid_labels  = [r for r in region_order if r in df['region'].values and
                 len(df[(df['date']==latest_date) & (df['region']==r)]['dollar_price'].dropna()) > 0]
if region_groups:
    bp = axes[1,0].boxplot(region_groups, labels=valid_labels,
                           patch_artist=True, vert=True,
                           medianprops=dict(color='white', linewidth=2),
                           flierprops=dict(marker='o', markersize=4, alpha=0.5))
    for patch, color in zip(bp['boxes'], reg_colors):
        patch.set_facecolor(color); patch.set_alpha(0.8)
    axes[1,0].yaxis.set_major_formatter(mticker.FormatStrFormatter('$%.2f'))
    axes[1,0].set_xticklabels(valid_labels, rotation=25, ha='right', fontsize=9)
    style_ax(axes[1,0], 'Price Distribution Within Regions', '', 'Dollar Price')

# Slope vs R² scatter coloured by region
sc_colors = [reg_colors[region_order.index(r)] if r in region_order else '#ccc'
             for r in country_params['region']]
axes[1,1].scatter(country_params['r2'], country_params['slope'],
                  c=sc_colors, s=50, alpha=0.7, edgecolors='white', linewidth=0.5)
axes[1,1].axhline(us_slope, color=PALETTE['primary'], linestyle='--',
                  linewidth=1, alpha=0.6, label='US slope')
axes[1,1].axvline(0.7, color='#aaa', linestyle=':', linewidth=1, alpha=0.6, label='R²=0.7')
# Legend patches for regions
import matplotlib.patches as mpatches
patches = [mpatches.Patch(color=reg_colors[i], label=region_order[i], alpha=0.8)
           for i in range(len(region_order)) if region_order[i] in country_params['region'].values]
axes[1,1].legend(handles=patches, frameon=False, fontsize=8,
                 loc='upper left', ncol=2)
style_ax(axes[1,1], 'Price Growth Predictability vs. Rate\n(each dot = one country)',
         'R² (fit quality)', 'Slope ($/year)')

plt.tight_layout()
plt.show()

us_params = country_params[country_params['name'] == 'United States'].iloc[0]
us_slope_f     = us_params['slope']
us_intercept_f = us_params['intercept']

forecast_years = np.arange(0, 51)   # 2000 → 2050
forecast_dates = pd.to_datetime('2000-01-01') + pd.to_timedelta(forecast_years * 365.25, unit='D')
forecast_price = us_slope_f * forecast_years + us_intercept_f

# Confidence band: ±1 std of residuals
us_obs = df[df['name']=='United States'].dropna(subset=['dollar_price']).copy()
us_obs['years'] = us_obs['date'].dt.year + us_obs['date'].dt.month/12 - 2000
residuals = us_obs['dollar_price'].values - (us_slope_f * us_obs['years'].values + us_intercept_f)
resid_std = residuals.std()

fig, ax = plt.subplots(figsize=(14, 6), facecolor=PALETTE['bg'])

# Forecast band
ax.fill_between(forecast_dates, forecast_price - resid_std, forecast_price + resid_std,
                alpha=0.1, color=PALETTE['primary'])

# Forecast line
split_year = us_obs['years'].max()
hist_mask = forecast_years <= split_year
fore_mask = forecast_years >= split_year

ax.plot(forecast_dates[hist_mask], forecast_price[hist_mask],
        color=PALETTE['primary'], linewidth=2, linestyle='--', alpha=0.6)
ax.plot(forecast_dates[fore_mask], forecast_price[fore_mask],
        color=PALETTE['primary'], linewidth=2.5, linestyle='-',
        label=f'US forecast (+${us_slope_f:.2f}/yr)')

# Actual US data
ax.scatter(us_obs['date'], us_obs['dollar_price'],
           color=PALETTE['primary'], s=25, zorder=5, label='US actual')

# Overlay 2–3 comparison countries
compare_countries = []
if us_price:
    # most expensive and cheapest from latest
    valid_latest = df[df['date']==latest_date].dropna(subset=['dollar_price'])
    compare_countries = ([valid_latest.nlargest(1,'dollar_price').iloc[0]['name']] +
                         [valid_latest.nsmallest(1,'dollar_price').iloc[0]['name']] +
                         (['Switzerland'] if 'Switzerland' in df['name'].values else []))

comp_colors = [PALETTE['accent'], PALETTE['positive'], '#E67E22']
for i, cname in enumerate(compare_countries[:3]):
    cp = country_params[country_params['name']==cname]
    if cp.empty: continue
    cp = cp.iloc[0]
    yrs = np.arange(0, 51)
    dts = pd.to_datetime('2000-01-01') + pd.to_timedelta(yrs*365.25, unit='D')
    ax.plot(dts[fore_mask], cp['slope']*yrs[fore_mask]+cp['intercept'],
            linewidth=1.8, linestyle=':', color=comp_colors[i],
            label=f'{cname} forecast', alpha=0.8)

ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('$%.2f'))
ax.axvline(pd.Timestamp('today'), color='#aaa', linewidth=1, linestyle='--', alpha=0.6)
ax.legend(frameon=False, fontsize=10)
style_ax(ax, 'US Big Mac Price — Historical Trend & Forecast to 2050',
         '', 'Price (USD)')
plt.tight_layout()
plt.show()

price_2030 = us_slope_f * 30 + us_intercept_f
price_2040 = us_slope_f * 40 + us_intercept_f
price_2050 = us_slope_f * 50 + us_intercept_f
print(f'US linear forecast:')
print(f'  2030: ${price_2030:.2f}')
print(f'  2040: ${price_2040:.2f}')
print(f'  2050: ${price_2050:.2f}')
print(f'  (assumes constant +${us_slope_f:.3f}/year — structural breaks not modelled)')

US linear forecast:
  2030: $7.40
  2040: $9.17
  2050: $10.94
  (assumes constant +$0.177/year — structural breaks not modelled)

Column	Description
`dollar_price`	Local Big Mac price converted to USD at market exchange rate
`USD_raw`	Raw PPP implied over/undervaluation vs USD (0 = fairly valued)
`USD_adjusted`	GDP-adjusted valuation — accounts for lower labor costs in poorer countries
`local_price`	Price in domestic currency
`dollar_ex`	Market exchange rate (local per USD) at time of survey

	dollar_price	USD_raw	USD_adjusted	local_price	dollar_ex
count	1386.000	1386.000	702.000	1386.000	1386.000
mean	3.255	-0.231	-0.020	10043.233	3817.912
std	1.261	0.298	0.256	181450.811	69296.091
min	0.640	-0.779	-0.578	1.050	0.302
25%	2.335	-0.446	-0.185	7.250	2.982
50%	3.044	-0.291	-0.032	24.250	7.751
75%	4.013	-0.067	0.097	119.000	47.092
max	8.312	1.273	1.485	4000000.000	1600500.000

Limitation	Impact	Note
Sample selection	Countries are not randomly sampled — richer, more connected economies are overrepresented	Averages and regional aggregates should be interpreted accordingly
Non-tradeable good	Big Macs can't actually be arbitraged across borders — PPP theory applies most cleanly to traded goods	The index is informal for a reason
McDonald's pricing strategy	Local prices reflect franchise decisions, real estate costs, and marketing — not just labor and input costs	Price changes can reflect corporate strategy, not macroeconomics
Survey frequency	Twice-yearly observations smooth over intra-year volatility, particularly for high-inflation economies	Currency crises between surveys are invisible in this data
Linear forecast assumption	Price growth in the US has not been constant — post-2020 acceleration breaks the simple linear model	The forecast in Section 7 is a baseline, not a prediction
Euro area aggregate	The Euro area row is a constructed average — individual member-state prices vary and are partially available	Excluded from valuation charts to avoid double-counting
GDP-adjusted index data availability	`USD_adjusted` is missing for earlier dates and some countries	Valuation comparisons use the most recent complete snapshot

The Big Mac Index: Purchasing Power Parity & Currency Valuation¶

An Empirical Analysis of the Economist's Informal PPP Benchmark, 2000–2024¶

Table of Contents¶

1. Imports & Configuration¶

2. Data Loading & Validation¶

3. Exploratory Overview¶

3.1 Global Price Snapshot — Most Recent Survey¶

3.2 US Big Mac Price Over Time¶

3.3 All-Country Price Trajectories¶

4. Currency Valuation Analysis¶

4.1 Most Over/Undervalued Currencies — Raw PPP (Most Recent)¶

4.2 GDP-Adjusted Valuation¶

4.3 Persistent Overvaluation — Selected Currencies Over Time¶

5. Price Trend Analysis¶

5.1 Linear Trend Fitting Per Country¶

5.2 Top Gainers and Losers — Annual Price Change¶

5.3 R² Fit Quality — Where Is Price Growth Most Predictable?¶

6. Regional Breakdown¶

7. US Price Forecast¶

8. Key Findings¶

Price Level & Valuation¶

Price Trends¶

Predictability¶

Regional Patterns¶

Forecast¶

9. Limitations¶