# Virtual-Tutor-Eval/generate_plots_effects.py

"""
generate_plots_effects.py

Effect-focused analysis for VirTu-Eval experiment data.
Generates plots into Data/plots_effects/ organized by section:

  F. Effect Without Ökologie (vs. With)  – 5 plots
  G. Effect Per Topic                    – 2 plots
  H. All Medium × Topic Combinations     – 1 plot (3×3 grid)
  I. Outlier Influence Analysis          – 3 plots

Usage:
  python generate_plots_effects.py
"""

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import seaborn as sns
from pathlib import Path
from scipy import stats

# =============================================================================
# CONFIG
# =============================================================================
# Root data directory; a relative path, so it resolves against the current
# working directory at import time.
BASE = Path("Data")
# Output folder for the figures written by the plot_* functions.
# NOTE: the directory is created as a side effect of importing this module.
PLOT_DIR = BASE / "plots_effects"
PLOT_DIR.mkdir(parents=True, exist_ok=True)
# Output folder for exported statistics tables (also created at import time;
# BASE already exists at this point because of the mkdir above).
STATS_DIR = BASE / "stats"
STATS_DIR.mkdir(exist_ok=True)

# Values of the 'Zeitpunkt' column, in the order used for the ordered
# categorical 'Phase' column, plus shortened labels for each of them.
PHASE_ORDER  = ['Pre-Reading', 'Post-Reading', 'Pre-Tutoring', 'Post-Tutoring']
PHASE_LABELS = ['Pre-Read', 'Post-Read', 'Pre-Tutor', 'Post-Tutor']
PHASE_SHORT  = dict(zip(PHASE_ORDER, PHASE_LABELS))

# Values of the 'Medium' column in plotting order, with one hex color each.
MEDIUM_ORDER  = ['Chat', 'Video', 'VR']
MEDIUM_COLORS = {'Chat': '#2196F3', 'Video': '#FF9800', 'VR': '#4CAF50'}

# Values of the 'Topic' column in plotting order, with one hex color each.
TOPIC_ORDER  = ['Mendel', 'DNA-Replikation', 'Ökologie']
TOPIC_COLORS = {'Mendel': '#E91E63', 'DNA-Replikation': '#9C27B0', 'Ökologie': '#009688'}

# Topic subset used for the "Excl. Ökologie" condition.
TOPICS_NO_OEK = ['Mendel', 'DNA-Replikation']

# Global plot styling: seaborn whitegrid theme, 150 dpi figures, and tight
# bounding boxes whenever a figure is saved.
sns.set_theme(style="whitegrid", font_scale=1.05)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.bbox'] = 'tight'


# =============================================================================
# HELPERS
# =============================================================================

def cohens_d(pre, post):
    """Return the paired effect size: mean of (post - pre) over its sample SD.

    The SD uses ddof=1. When the SD is not strictly positive (zero, or NaN
    because fewer than two pairs were given) the function returns 0.0.
    """
    delta = post - pre
    spread = delta.std(ddof=1)
    if spread > 0:
        return delta.mean() / spread
    return 0.0


def sig_stars(p):
    """Map a p-value to a significance marker.

    Returns '***' below 0.001, '**' below 0.01, '*' below 0.05 and 'n.s.'
    otherwise. A NaN p-value fails every comparison and so yields 'n.s.'.
    """
    # Thresholds are checked from strictest to loosest; first match wins.
    for cutoff, marker in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p < cutoff:
            return marker
    return 'n.s.'


def compute_effect(sub):
    """Return (mean_gain, sd_gain, sem_gain, d, t, p, n) for a paired subset.

    Parameters
    ----------
    sub : DataFrame with the columns 'Pre_Score', 'Post_Score' and
        'Score_Gain'.

    Returns
    -------
    tuple
        mean, sample SD (ddof=1) and SEM of 'Score_Gain', Cohen's d of the
        paired differences, the paired t statistic, its two-sided p-value,
        and the number of rows. With fewer than two rows no test can be
        run, so SEM, d, t and p are NaN.
    """
    pre = sub['Pre_Score']
    post = sub['Post_Score']
    gain = sub['Score_Gain']
    n = len(sub)
    if n < 2:
        return gain.mean(), gain.std(), np.nan, np.nan, np.nan, np.nan, n
    # ttest_rel(a, b) tests a - b. Passing (post, pre) gives t the same sign
    # as the reported gain and Cohen's d, which are both post - pre; the
    # two-sided p-value does not depend on the argument order.
    t, p = stats.ttest_rel(post, pre)
    d = cohens_d(pre, post)
    return gain.mean(), gain.std(ddof=1), gain.sem(), d, t, p, n


def iqr_outlier_mask(series):
    """Flag values lying more than 1.5×IQR outside the quartiles.

    Returns a boolean Series aligned with `series`: True where the value is
    below Q1 - 1.5×IQR or above Q3 + 1.5×IQR.
    """
    lower_q = series.quantile(0.25)
    upper_q = series.quantile(0.75)
    fence = 1.5 * (upper_q - lower_q)
    too_low = series < lower_q - fence
    too_high = series > upper_q + fence
    return too_low | too_high


# =============================================================================
# DATA LOADING
# =============================================================================

def load_data():
    """Read test_scores_all.csv from BASE and add the derived columns.

    'Zeitpunkt' is whitespace-stripped and the value 'Pre-Tutor' is rewritten
    to 'Pre-Tutoring'. 'Phase' is that column as an ordered categorical over
    PHASE_ORDER, and 'P_Num' is the first run of digits in 'Participant'
    converted to int.
    """
    scores = pd.read_csv(BASE / "test_scores_all.csv", encoding="utf-8-sig")

    timepoint = scores['Zeitpunkt'].str.strip()
    scores['Zeitpunkt'] = timepoint.replace('Pre-Tutor', 'Pre-Tutoring')
    scores['Phase'] = pd.Categorical(scores['Zeitpunkt'],
                                     categories=PHASE_ORDER, ordered=True)

    participant_digits = scores['Participant'].str.extract(r'(\d+)')
    scores['P_Num'] = participant_digits.astype(int)
    return scores


def build_paired_tutoring(df):
    """Pair each Pre-Tutoring row with its Post-Tutoring counterpart.

    Rows are matched on (Participant, Topic, Medium) with an inner merge, so
    combinations that lack either measurement are dropped. The result holds
    Pre_/Post_ score and confidence columns, the Score_Gain and Conf_Gain
    differences (post minus pre), and the numeric participant id 'P_Num'.
    """
    keys = ['Participant', 'Topic', 'Medium']

    def _phase_rows(phase, prefix):
        # Select one measurement phase and prefix its value columns.
        rows = df[df['Zeitpunkt'] == phase][keys + ['Score_Pct', 'Avg_Confidence']].copy()
        rows.columns = keys + [f'{prefix}_Score', f'{prefix}_Conf']
        return rows

    before = _phase_rows('Pre-Tutoring', 'Pre')
    after = _phase_rows('Post-Tutoring', 'Post')

    paired = before.merge(after, on=keys)
    paired['Score_Gain'] = paired['Post_Score'] - paired['Pre_Score']
    paired['Conf_Gain'] = paired['Post_Conf'] - paired['Pre_Conf']
    paired['P_Num'] = paired['Participant'].str.extract(r'(\d+)').astype(int)
    return paired


# =============================================================================
# F. EFFECT WITHOUT ÖKOLOGIE (vs. WITH)
# =============================================================================

def plot_F1_cohens_d_comparison(paired):
    """Grouped bars of Cohen's d per medium: all topics vs. excluding Ökologie.

    Each bar is annotated with d, the mean gain and the significance marker.
    A NaN d is drawn as 0 and a NaN p is treated as 1. The figure is saved
    to PLOT_DIR / 'F1_cohens_d_comparison.png'.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(MEDIUM_ORDER))
    bar_width = 0.35
    bar_colors = [MEDIUM_COLORS[m] for m in MEDIUM_ORDER]

    variants = [
        ('All Topics',     True,  '',   0.85),
        ('Excl. Ökologie', False, '//', 0.45),
    ]
    for slot, (label, use_all, hatch, opacity) in enumerate(variants):
        effect_sizes, gains, p_values = [], [], []
        for medium in MEDIUM_ORDER:
            rows = paired[paired['Medium'] == medium]
            if not use_all:
                rows = rows[rows['Topic'].isin(TOPICS_NO_OEK)]
            gain, _, _, d, _, p, _ = compute_effect(rows)
            effect_sizes.append(0 if np.isnan(d) else d)
            gains.append(gain)
            p_values.append(1 if np.isnan(p) else p)

        bars = ax.bar(positions + slot * bar_width - bar_width / 2,
                      effect_sizes, bar_width, label=label,
                      color=bar_colors, alpha=opacity,
                      hatch=hatch, edgecolor='white', linewidth=1.2)

        for bar, gain, p, d_val in zip(bars, gains, p_values, effect_sizes):
            # Labels sit just above the bar top, or just above zero for
            # negative effect sizes.
            ax.text(bar.get_x() + bar.get_width() / 2,
                    max(d_val, 0) + 0.04,
                    f'd={d_val:.2f}\n{gain:+.1f}%\n{sig_stars(p)}',
                    ha='center', va='bottom', fontsize=8.5, fontweight='bold',
                    color='#333333')

    # Conventional small / medium / large reference lines for Cohen's d.
    for level, line_style, opacity in [(0.2, ':', 0.6), (0.5, '--', 0.6), (0.8, '-', 0.4)]:
        ax.axhline(level, color='gray', lw=1, ls=line_style, alpha=opacity)
    for y_pos, caption in [(0.21, 'small'), (0.51, 'medium'), (0.81, 'large')]:
        ax.text(2.65, y_pos, caption, fontsize=8, color='gray', va='bottom')

    ax.set_xticks(positions)
    ax.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    ax.set_ylabel("Cohen's d (tutoring score gain)", fontsize=12)
    ax.set_ylim(bottom=0)
    ax.legend(fontsize=11)
    ax.set_title("F1 – Effect Sizes by Medium: All Topics vs. Excl. Ökologie",
                 fontsize=13, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'F1_cohens_d_comparison.png')
    plt.close(fig)


def plot_F2_mean_gain_comparison(paired):
    """Grouped bars of mean score gain per medium: all topics vs. excl. Ökologie.

    Error bars are 1.96 × SEM of the gain. The figure is saved to
    PLOT_DIR / 'F2_mean_gain_comparison.png'.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(MEDIUM_ORDER))
    bar_width = 0.35
    bar_colors = [MEDIUM_COLORS[m] for m in MEDIUM_ORDER]

    # keep_topics=None means "no topic filter".
    variants = [
        ('All Topics',     None,          0.80, ''),
        ('Excl. Ökologie', TOPICS_NO_OEK, 0.45, '//'),
    ]
    for slot, (label, keep_topics, opacity, hatch) in enumerate(variants):
        mean_gains, half_widths = [], []
        for medium in MEDIUM_ORDER:
            rows = paired[paired['Medium'] == medium]
            if keep_topics is not None:
                rows = rows[rows['Topic'].isin(keep_topics)]
            gain, _, sem, *_ = compute_effect(rows)
            mean_gains.append(gain)
            half_widths.append(sem * 1.96)

        bars = ax.bar(positions + slot * bar_width - bar_width / 2,
                      mean_gains, bar_width, label=label,
                      color=bar_colors,
                      alpha=opacity, hatch=hatch, edgecolor='white', linewidth=1.2,
                      yerr=half_widths, capsize=5,
                      error_kw=dict(lw=1.5, capthick=1.5))

        for bar, gain in zip(bars, mean_gains):
            # Positive bars: label 5% of the bar height above the top;
            # negative bars: label shifted 2 units below the value.
            offset = bar.get_height() * 0.05 if gain >= 0 else -2
            ax.text(bar.get_x() + bar.get_width() / 2,
                    gain + offset,
                    f'{gain:+.1f}%',
                    ha='center', va='bottom', fontsize=9, fontweight='bold',
                    color='#333333')

    ax.axhline(0, color='gray', lw=1)
    ax.set_xticks(positions)
    ax.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    ax.set_ylabel('Mean Score Gain (%, 95% CI)', fontsize=12)
    ax.legend(fontsize=11)
    ax.set_title('F2 – Mean Score Gain by Medium: All Topics vs. Excl. Ökologie',
                 fontsize=13, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'F2_mean_gain_comparison.png')
    plt.close(fig)


def plot_F3_paired_slopes_comparison(paired):
    """2×3 grid of pre/post slope plots: one row per condition, one column per medium.

    The top row uses all topics, the bottom row excludes Ökologie. Thin lines
    are individual rows colored by topic; the thick line is the medium mean,
    drawn together with a stats box when at least two rows are available.
    Saved to PLOT_DIR / 'F3_paired_slopes_comparison.png'.
    """
    subsets = [
        ('All Topics',     paired),
        ('Excl. Ökologie', paired[paired['Topic'].isin(TOPICS_NO_OEK)]),
    ]

    fig, axes = plt.subplots(2, 3, figsize=(18, 12), sharey=True)

    for row, (cond_label, data) in enumerate(subsets):
        for col, medium in enumerate(MEDIUM_ORDER):
            ax = axes[row][col]
            rows = data[data['Medium'] == medium].sort_values('P_Num')

            # One thin slope line plus a participant label per paired row.
            for pid, topic, pre, post in zip(rows['Participant'], rows['Topic'],
                                             rows['Pre_Score'], rows['Post_Score']):
                ax.plot([0, 1], [pre, post],
                        color=TOPIC_COLORS[topic], alpha=0.55, lw=1.5,
                        marker='o', markersize=5)
                ax.annotate(pid, (1.02, post),
                            fontsize=7, va='center', alpha=0.6)

            if len(rows) >= 2:
                ax.plot([0, 1],
                        [rows['Pre_Score'].mean(), rows['Post_Score'].mean()],
                        color=MEDIUM_COLORS[medium], lw=4, marker='D',
                        markersize=12, zorder=10,
                        markeredgecolor='white', markeredgewidth=2)

                g, sd, sem, d, t, p, n = compute_effect(rows)
                star = sig_stars(p)
                ax.text(0.5, 0.03,
                        f'n={n}  Gain: {g:+.1f}%\nd={d:.2f}  t={t:.2f}  p={p:.3f} {star}',
                        transform=ax.transAxes, ha='center', fontsize=9,
                        bbox=dict(boxstyle='round,pad=0.4', facecolor='lightyellow', alpha=0.9))

            ax.set_xticks([0, 1])
            ax.set_xticklabels(['Pre-Tutoring', 'Post-Tutoring'], fontsize=10)
            ax.set_ylim(-5, 110)

            # The left column carries the condition label, the top row the medium name.
            if col == 0:
                ax.set_ylabel(f'{cond_label}\nTest Score (%)', fontsize=10, fontweight='bold')
            if row == 0:
                ax.set_title(medium, fontsize=13, fontweight='bold',
                             color=MEDIUM_COLORS[medium])

    legend_handles = [
        Line2D([0], [0], color=TOPIC_COLORS[topic], lw=2, marker='o', ms=6, label=topic)
        for topic in TOPIC_ORDER
    ]
    legend_handles.append(
        Line2D([0], [0], color='gray', lw=4, marker='D', ms=8, label='Medium Mean'))
    fig.legend(handles=legend_handles, loc='lower center', ncol=4, fontsize=10,
               bbox_to_anchor=(0.5, 0.01))
    fig.suptitle('F3 – Paired Slopes: All Topics (top) vs. Excl. Ökologie (bottom)',
                 fontsize=14, fontweight='bold')
    fig.tight_layout(rect=[0, 0.05, 1, 0.97])
    fig.savefig(PLOT_DIR / 'F3_paired_slopes_comparison.png')
    plt.close(fig)


def plot_F4_gain_distribution_comparison(paired):
    """Violin + box plots of score gain per medium: all topics vs. excl. Ökologie.

    One panel per medium with two distributions side by side. A distribution
    with fewer than two values is skipped. Each drawn distribution is
    annotated with n, mean gain, Cohen's d and the significance marker.
    Saved to PLOT_DIR / 'F4_gain_distribution_comparison.png'.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 7), sharey=True)
    positions = [0.8, 2.2]

    for col, medium in enumerate(MEDIUM_ORDER):
        ax = axes[col]
        shade = MEDIUM_COLORS[medium]
        rows_all = paired[paired['Medium'] == medium]
        rows_noe = paired[(paired['Medium'] == medium) &
                          (paired['Topic'].isin(TOPICS_NO_OEK))]

        for pos, rows, opacity in [(positions[0], rows_all, 0.75),
                                   (positions[1], rows_noe, 0.40)]:
            gains = rows['Score_Gain'].values
            if len(gains) < 2:
                continue

            violin = ax.violinplot(gains, positions=[pos], widths=0.9,
                                   showmedians=False, showextrema=False)
            for body in violin['bodies']:
                body.set_facecolor(shade)
                body.set_alpha(opacity)

            ax.boxplot(gains, positions=[pos], widths=0.35,
                       patch_artist=True, showmeans=True, notch=False,
                       meanprops=dict(marker='D', markerfacecolor='black',
                                      markeredgecolor='white', markersize=7),
                       medianprops=dict(color='white', lw=2),
                       boxprops=dict(facecolor=shade, alpha=opacity))

            g, sd, sem, d, t, p, n = compute_effect(rows)
            star = sig_stars(p)
            # The annotation is placed 4 units above the largest observed gain.
            ax.text(pos, np.nanmax(gains) + 4,
                    f'n={n}\nM={g:+.1f}%\nd={d:.2f} {star}',
                    ha='center', va='bottom', fontsize=8.5, fontweight='bold')

        ax.axhline(0, color='gray', lw=1, ls='--', alpha=0.6)
        ax.set_xticks(positions)
        ax.set_xticklabels(['All\nTopics', 'Excl.\nÖkologie'], fontsize=10)
        ax.set_title(medium, fontsize=13, fontweight='bold', color=shade)
        if col == 0:
            ax.set_ylabel('Score Gain (%)', fontsize=12)

    fig.suptitle('F4 – Gain Distributions: All Topics vs. Excl. Ökologie',
                 fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'F4_gain_distribution_comparison.png')
    plt.close(fig)


def plot_F5_stats_table(paired):
    """Render a table of N, mean gain, SD, d, t and p per medium × condition.

    NaN statistics are shown as '–'. Rows are tinted by medium, and the
    'Excl. Ökologie' rows get a lower alpha than the 'All Topics' rows.
    Saved to PLOT_DIR / 'F5_stats_table.png'.
    """
    fig, ax = plt.subplots(figsize=(14, 5))
    ax.axis('off')

    def _fmt(value, spec, suffix=''):
        # Format a statistic, or show a dash when it is undefined.
        return '–' if np.isnan(value) else f'{value:{spec}}{suffix}'

    table_rows = []
    for medium in MEDIUM_ORDER:
        in_medium = paired['Medium'] == medium
        variants = [
            ('All Topics',     paired[in_medium]),
            ('Excl. Ökologie', paired[in_medium & paired['Topic'].isin(TOPICS_NO_OEK)]),
        ]
        for cond_label, rows in variants:
            g, sd, sem, d, t, p, n = compute_effect(rows)
            star = '' if np.isnan(p) else sig_stars(p)
            table_rows.append([
                medium, cond_label, str(n),
                f'{g:+.2f}', f'{sd:.2f}',
                _fmt(d, '.3f'),
                _fmt(t, '.3f'),
                _fmt(p, '.3f', star),
            ])

    headers = ['Medium', 'Condition', 'N', 'Mean Gain (%)', 'SD',
               "Cohen's d", 't-stat', 'p-value']
    table = ax.table(cellText=table_rows, colLabels=headers,
                     loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1.0, 2.0)

    n_cols = len(headers)

    # Header row: dark background with bold white text.
    for j in range(n_cols):
        header_cell = table[0, j]
        header_cell.set_facecolor('#37474F')
        header_cell.set_text_props(color='white', fontweight='bold')

    # Body rows: pastel tint per medium, alpha per condition.
    tint_by_medium = {'Chat': '#BBDEFB', 'Video': '#FFE0B2', 'VR': '#C8E6C9'}
    alpha_by_cond = {'All Topics': 0.85, 'Excl. Ökologie': 0.60}
    for i, row in enumerate(table_rows, start=1):
        tint = tint_by_medium[row[0]]
        opacity = alpha_by_cond[row[1]]
        for j in range(n_cols):
            table[i, j].set_facecolor(tint)
            table[i, j].set_alpha(opacity)

    ax.set_title('F5 – Statistical Summary: All Topics vs. Excl. Ökologie',
                 fontsize=13, fontweight='bold', pad=30)
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'F5_stats_table.png')
    plt.close(fig)


# =============================================================================
# G. EFFECT PER TOPIC
# =============================================================================

def plot_G1_effect_per_topic(paired):
    """Two-panel bar chart per topic: mean score gain (left) and Cohen's d (right).

    Error bars on the left panel are 1.96 × SEM of the gain. Saved to
    PLOT_DIR / 'G1_effect_per_topic.png'.
    """
    fig, (ax_gain, ax_d) = plt.subplots(1, 2, figsize=(14, 6))

    topics = TOPIC_ORDER
    topic_colors = [TOPIC_COLORS[name] for name in topics]

    mean_gains, half_widths, effect_sizes, p_values, counts = [], [], [], [], []
    for name in topics:
        g, sd, sem, d, tv, p, n = compute_effect(paired[paired['Topic'] == name])
        mean_gains.append(g)
        half_widths.append(sem * 1.96)
        effect_sizes.append(d)
        p_values.append(p)
        counts.append(n)

    # --- Left panel: mean gain ---
    gain_bars = ax_gain.bar(topics, mean_gains, color=topic_colors,
                            alpha=0.8, yerr=half_widths, capsize=6,
                            edgecolor='white', lw=1.5)
    for bar, g, p, n in zip(gain_bars, mean_gains, p_values, counts):
        offset = bar.get_height() * 0.05 if g >= 0 else -2
        ax_gain.text(bar.get_x() + bar.get_width() / 2,
                     g + offset,
                     f'{g:+.1f}%\nn={n}\n{sig_stars(p)}',
                     ha='center', va='bottom', fontsize=10, fontweight='bold')
    ax_gain.axhline(0, color='gray', lw=1)
    ax_gain.set_ylabel('Mean Score Gain (%, 95% CI)', fontsize=12)
    ax_gain.set_title('Mean Tutoring Gain per Topic', fontsize=12, fontweight='bold')
    ax_gain.set_xticks(range(len(topics)))
    ax_gain.set_xticklabels(topics, fontsize=11)

    # --- Right panel: effect size ---
    d_bars = ax_d.bar(topics, effect_sizes, color=topic_colors,
                      alpha=0.8, edgecolor='white', lw=1.5)
    for bar, d_val, p in zip(d_bars, effect_sizes, p_values):
        ax_d.text(bar.get_x() + bar.get_width() / 2,
                  max(d_val, 0) + 0.03,
                  f"d={d_val:.2f}\n{sig_stars(p)}",
                  ha='center', va='bottom', fontsize=10, fontweight='bold')
    for thresh, caption, line_style in [(0.2, 'small', ':'),
                                        (0.5, 'medium', '--'),
                                        (0.8, 'large', '-')]:
        ax_d.axhline(thresh, color='gray', lw=1, ls=line_style, alpha=0.5)
        ax_d.text(2.55, thresh + 0.02, caption, fontsize=8, color='gray')
    ax_d.set_ylim(bottom=0)
    ax_d.set_ylabel("Cohen's d", fontsize=12)
    ax_d.set_title("Effect Size (Cohen's d) per Topic", fontsize=12, fontweight='bold')
    ax_d.set_xticks(range(len(topics)))
    ax_d.set_xticklabels(topics, fontsize=11)

    fig.suptitle("G1 – Tutoring Effect per Topic", fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'G1_effect_per_topic.png')
    plt.close(fig)


def plot_G2_slopes_per_topic(paired):
    """One pre/post slope panel per topic, with lines colored by medium.

    Thin lines are individual rows. A thick line per medium shows that
    medium's mean within the topic, labelled with its mean change. An
    overall stats box is added when the topic has at least two rows.
    Saved to PLOT_DIR / 'G2_slopes_per_topic.png'.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 7), sharey=True)

    for ax, topic in zip(axes, TOPIC_ORDER):
        rows = paired[paired['Topic'] == topic].sort_values('P_Num')

        # Individual slopes with participant labels.
        for pid, medium, pre, post in zip(rows['Participant'], rows['Medium'],
                                          rows['Pre_Score'], rows['Post_Score']):
            ax.plot([0, 1], [pre, post],
                    color=MEDIUM_COLORS[medium], alpha=0.5, lw=1.5,
                    marker='o', markersize=5)
            ax.annotate(pid, (1.02, post),
                        fontsize=7, va='center', alpha=0.6)

        # Per-medium mean slopes (skipped for media without rows in this topic).
        for medium in MEDIUM_ORDER:
            medium_rows = rows[rows['Medium'] == medium]
            if len(medium_rows) == 0:
                continue
            pm = medium_rows['Pre_Score'].mean()
            qm = medium_rows['Post_Score'].mean()
            ax.plot([0, 1], [pm, qm],
                    color=MEDIUM_COLORS[medium], lw=3.5, marker='D', markersize=10,
                    zorder=10, markeredgecolor='white', markeredgewidth=2,
                    label=f'{medium} ({qm-pm:+.1f}%)')

        if len(rows) >= 2:
            g, sd, sem, d, t, p, n = compute_effect(rows)
            star = sig_stars(p)
            ax.text(0.5, 0.03,
                    f'Overall: {g:+.1f}%  d={d:.2f}\nt={t:.2f}  p={p:.3f} {star}',
                    transform=ax.transAxes, ha='center', fontsize=9,
                    bbox=dict(boxstyle='round,pad=0.4', facecolor='lightyellow', alpha=0.9))

        ax.set_xticks([0, 1])
        ax.set_xticklabels(['Pre-Tutoring', 'Post-Tutoring'], fontsize=11)
        ax.set_title(topic, fontsize=14, fontweight='bold', color=TOPIC_COLORS[topic])
        ax.set_ylim(-5, 110)
        ax.legend(fontsize=9, loc='upper left')

    axes[0].set_ylabel('Test Score (%)', fontsize=12)
    fig.suptitle('G2 – Paired Slopes by Topic (Medium-Colored Lines)',
                 fontsize=14, fontweight='bold')
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.savefig(PLOT_DIR / 'G2_slopes_per_topic.png')
    plt.close(fig)


# =============================================================================
# H. ALL MEDIUM × TOPIC COMBINATIONS (3×3 GRID)
# =============================================================================

def plot_H1_medium_topic_grid(paired):
    """3×3 grid of slope plots: rows are media, columns are topics.

    Cells with at least two rows show the mean slope and a stats box. Cells
    with exactly one row show that single slope emphasised and an
    'n=1 (no stats)' note. Empty cells only get axes.
    Saved to PLOT_DIR / 'H1_medium_topic_grid.png'.
    """
    fig, axes = plt.subplots(3, 3, figsize=(18, 16), sharey=True)

    for r_idx, medium in enumerate(MEDIUM_ORDER):
        medium_color = MEDIUM_COLORS[medium]
        for c_idx, topic in enumerate(TOPIC_ORDER):
            ax = axes[r_idx][c_idx]
            topic_color = TOPIC_COLORS[topic]
            cell = paired[(paired['Medium'] == medium) &
                          (paired['Topic'] == topic)].sort_values('P_Num')
            n_rows = len(cell)

            for pid, pre, post in zip(cell['Participant'],
                                      cell['Pre_Score'], cell['Post_Score']):
                ax.plot([0, 1], [pre, post],
                        color=topic_color, alpha=0.55, lw=1.5,
                        marker='o', markersize=5)
                ax.annotate(pid, (1.02, post),
                            fontsize=7, va='center', alpha=0.6)

            if n_rows >= 2:
                ax.plot([0, 1],
                        [cell['Pre_Score'].mean(), cell['Post_Score'].mean()],
                        color=medium_color, lw=4, marker='D', markersize=11,
                        zorder=10, markeredgecolor='white', markeredgewidth=2)

                g, sd, sem, d, t, p, n = compute_effect(cell)
                star = sig_stars(p)
                ax.text(0.5, 0.03,
                        f'n={n}  {g:+.1f}%\nd={d:.2f}  p={p:.3f} {star}',
                        transform=ax.transAxes, ha='center', fontsize=8.5,
                        bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow', alpha=0.9))
            elif n_rows == 1:
                # A single pair: redraw it in the medium color, without statistics.
                only = cell.iloc[0]
                ax.plot([0, 1], [only['Pre_Score'], only['Post_Score']],
                        color=medium_color, lw=3, marker='D', markersize=10,
                        zorder=10, markeredgecolor='white', markeredgewidth=2)
                ax.text(0.5, 0.03, 'n=1 (no stats)', transform=ax.transAxes,
                        ha='center', fontsize=8.5, color='gray')

            ax.set_xticks([0, 1])
            ax.set_xticklabels(['Pre', 'Post'], fontsize=9)
            ax.set_ylim(-5, 110)

            if c_idx == 0:
                ax.set_ylabel(f'{medium}\nScore (%)', fontsize=10, fontweight='bold',
                              color=medium_color)
            if r_idx == 0:
                ax.set_title(topic, fontsize=12, fontweight='bold',
                             color=topic_color)

    fig.suptitle('H1 – Tutoring Slopes: All Medium × Topic Combinations',
                 fontsize=15, fontweight='bold')
    fig.tight_layout(rect=[0, 0, 0.97, 0.97])
    fig.savefig(PLOT_DIR / 'H1_medium_topic_grid.png')
    plt.close(fig)


# =============================================================================
# I. OUTLIER INFLUENCE ANALYSIS
# =============================================================================

def _flag_outliers(paired):
    """Return a copy of `paired` with a boolean 'Outlier' column.

    The 1.5×IQR rule is applied to 'Score_Gain' separately within each
    medium listed in MEDIUM_ORDER. Rows of any other medium stay False.
    The input frame is not modified.
    """
    flagged = paired.copy()
    flagged['Outlier'] = False
    for medium in MEDIUM_ORDER:
        in_medium = flagged['Medium'] == medium
        gains = flagged.loc[in_medium, 'Score_Gain']
        # .values drops the index so the assignment is purely positional.
        flagged.loc[in_medium, 'Outlier'] = iqr_outlier_mask(gains).values
    return flagged


def plot_I1_outlier_scatter(paired):
    """Jittered scatter of score gains per medium with IQR outliers highlighted.

    Each panel shows the 1.5×IQR fences as dashed red lines. Outliers are
    drawn larger, in red, and annotated with the participant and the first
    six characters of the topic. Saved to PLOT_DIR / 'I1_outlier_scatter.png'.
    """
    flagged = _flag_outliers(paired)
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
    alert = '#E53935'

    for col, medium in enumerate(MEDIUM_ORDER):
        ax = axes[col]
        rows = flagged[flagged['Medium'] == medium]

        gains = rows['Score_Gain']
        q1 = gains.quantile(0.25)
        q3 = gains.quantile(0.75)
        iqr = q3 - q1
        lo = q1 - 1.5 * iqr
        hi = q3 + 1.5 * iqr

        ax.axhline(hi, color=alert, lw=1.5, ls='--', alpha=0.7,
                   label=f'±1.5 IQR ({lo:.1f}–{hi:.1f})')
        ax.axhline(lo, color=alert, lw=1.5, ls='--', alpha=0.7)
        ax.axhline(0, color='gray', lw=1, alpha=0.5)

        # The generator is re-seeded for every panel, so jitter is reproducible.
        rng = np.random.default_rng(42)
        for _, rec in rows.iterrows():
            jitter = rng.uniform(-0.12, 0.12)
            is_outlier = rec['Outlier']
            x_pos = 0.5 + jitter
            point_color = alert if is_outlier else MEDIUM_COLORS[medium]
            marker_size = 10 if is_outlier else 7
            ax.scatter(x_pos, rec['Score_Gain'], color=point_color,
                       s=marker_size**2,
                       alpha=0.8, edgecolors='white', lw=0.5, zorder=5)
            if not is_outlier:
                continue
            caption = f"{rec['Participant']}\n({rec['Topic'][:6]})"
            # The label is pushed away from the panel centre, toward the jitter side.
            ax.annotate(caption, (x_pos, rec['Score_Gain']),
                        fontsize=7.5, ha='center',
                        xytext=(20 if jitter > 0 else -20, 0),
                        textcoords='offset points',
                        arrowprops=dict(arrowstyle='->', color=alert, lw=0.8),
                        color=alert, fontweight='bold')

        n_out = rows['Outlier'].sum()
        ax.set_xlim(0, 1)
        ax.set_xticks([0.5])
        ax.set_xticklabels([medium], fontsize=12)
        ax.set_title(f'{medium}\n({n_out} outlier{"s" if n_out != 1 else ""})',
                     fontsize=12, fontweight='bold', color=MEDIUM_COLORS[medium])
        if col == 0:
            ax.set_ylabel('Score Gain (%)', fontsize=12)
        ax.legend(fontsize=8, loc='upper right')

    fig.suptitle('I1 – Score Gain Scatter with Outlier Flags (IQR Method)',
                 fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'I1_outlier_scatter.png')
    plt.close(fig)


def plot_I2_outlier_effect_comparison(paired):
    """Compare Cohen's d per medium with and without IQR outliers.

    Left panel: grouped bars of d for all data vs. outliers removed (a NaN d
    is drawn as 0). Right panel: the change in d after removing outliers,
    green when non-negative and red otherwise.
    Saved to PLOT_DIR / 'I2_outlier_effect_comparison.png'.
    """
    flagged = _flag_outliers(paired)
    fig, (ax_d, ax_delta) = plt.subplots(1, 2, figsize=(14, 6))

    positions = np.arange(len(MEDIUM_ORDER))
    bar_width = 0.35
    bar_colors = [MEDIUM_COLORS[m] for m in MEDIUM_ORDER]

    # Per-medium statistics for both variants, collected in MEDIUM_ORDER.
    full = {'d': [], 'gain': [], 'p': []}
    trimmed = {'d': [], 'gain': [], 'p': []}
    for medium in MEDIUM_ORDER:
        rows_all = flagged[flagged['Medium'] == medium]
        rows_kept = rows_all[~rows_all['Outlier']]
        for store, rows in ((full, rows_all), (trimmed, rows_kept)):
            gain, _, _, d, _, p, _ = compute_effect(rows)
            store['d'].append(0 if np.isnan(d) else d)
            store['gain'].append(gain)
            store['p'].append(p)

    variants = [
        ('All Data',         full,    0.80, ''),
        ('Outliers Removed', trimmed, 0.45, '//'),
    ]
    for slot, (label, store, opacity, hatch) in enumerate(variants):
        bars = ax_d.bar(positions + slot * bar_width - bar_width / 2,
                        store['d'], bar_width, label=label,
                        color=bar_colors,
                        alpha=opacity, hatch=hatch, edgecolor='white', lw=1.2)
        for bar, d_val, gain, p in zip(bars, store['d'], store['gain'], store['p']):
            star = '' if np.isnan(p) else sig_stars(p)
            ax_d.text(bar.get_x() + bar.get_width() / 2,
                      max(d_val, 0) + 0.03,
                      f'd={d_val:.2f}\n{gain:+.1f}%\n{star}',
                      ha='center', va='bottom', fontsize=8.5, fontweight='bold')

    for thresh, caption, line_style in [(0.2, 'small', ':'),
                                        (0.5, 'medium', '--'),
                                        (0.8, 'large', '-')]:
        ax_d.axhline(thresh, color='gray', lw=1, ls=line_style, alpha=0.5)
        ax_d.text(2.65, thresh + 0.02, caption, fontsize=8, color='gray')
    ax_d.set_xticks(positions)
    ax_d.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    ax_d.set_ylim(bottom=0)
    ax_d.set_ylabel("Cohen's d", fontsize=12)
    ax_d.set_title("Cohen's d: All Data vs. Outliers Removed", fontsize=12, fontweight='bold')
    ax_d.legend(fontsize=10)

    # Right panel: d(outliers removed) minus d(all data).
    delta_d = [without - with_all for with_all, without in zip(full['d'], trimmed['d'])]
    delta_colors = ['#43A047' if dd >= 0 else '#E53935' for dd in delta_d]
    delta_bars = ax_delta.bar(MEDIUM_ORDER, delta_d, color=delta_colors,
                              alpha=0.8, edgecolor='white', lw=1.5)
    for bar, dd in zip(delta_bars, delta_d):
        rising = dd >= 0
        ax_delta.text(bar.get_x() + bar.get_width() / 2,
                      dd + (0.01 if rising else -0.03),
                      f'Δd={dd:+.3f}',
                      ha='center', va='bottom' if rising else 'top',
                      fontsize=10, fontweight='bold')
    ax_delta.axhline(0, color='gray', lw=1)
    ax_delta.set_ylabel('Δ Cohen\'s d (Outliers Removed − All)', fontsize=12)
    ax_delta.set_title('Change in Effect Size After Removing Outliers',
                       fontsize=12, fontweight='bold')

    fig.suptitle('I2 – Outlier Influence on Effect Sizes', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'I2_outlier_effect_comparison.png')
    plt.close(fig)


def plot_I3_outlier_heatmap(paired):
    """Heatmap of score gain by participant × topic, one panel per medium.

    Cells flagged as IQR outliers within their medium get a red border.
    Participant×topic combinations with no data stay blank and are never
    outlined. Saved to PLOT_DIR / 'I3_outlier_heatmap.png'.
    """
    paired_f = _flag_outliers(paired)

    fig, axes = plt.subplots(1, 3, figsize=(18, 8))

    def _participant_key(pid):
        # NOTE(review): assumes IDs are one prefix character followed by
        # digits (e.g. "P12") — confirm against the data.
        return int(pid[1:])

    for col_idx, medium in enumerate(MEDIUM_ORDER):
        ax = axes[col_idx]
        sub = paired_f[paired_f['Medium'] == medium]

        # Rows = participants in numeric order, columns = topics in TOPIC_ORDER.
        pivot = sub.pivot_table(index='Participant', columns='Topic',
                                values='Score_Gain', aggfunc='first')
        pivot = pivot.reindex(columns=TOPIC_ORDER)
        pivot = pivot.reindex(sorted(pivot.index, key=_participant_key))

        # Same layout for the outlier flags. Combinations without data are
        # NaN here, not False.
        outlier_pivot = sub.pivot_table(index='Participant', columns='Topic',
                                        values='Outlier', aggfunc='first')
        outlier_pivot = outlier_pivot.reindex(index=pivot.index,
                                              columns=TOPIC_ORDER)

        sns.heatmap(pivot.astype(float), annot=True, fmt='.1f',
                    cmap='RdYlGn', center=0, vmin=-40, vmax=60,
                    linewidths=0.8, ax=ax, cbar_kws={'label': 'Score Gain %'},
                    annot_kws={'size': 9})

        # Overlay a red border on outlier cells.
        for r_i, pid in enumerate(pivot.index):
            for c_i, topic in enumerate(TOPIC_ORDER):
                flag = outlier_pivot.loc[pid, topic]
                # NaN is truthy in Python, so a plain `if flag:` would also
                # outline every empty cell; require a real, non-missing True.
                if pd.notna(flag) and bool(flag):
                    ax.add_patch(mpatches.Rectangle(
                        (c_i, r_i), 1, 1,
                        fill=False, edgecolor='#E53935', lw=3, zorder=5))

        ax.set_title(f'{medium}', fontsize=13, fontweight='bold',
                     color=MEDIUM_COLORS[medium])
        ax.set_xlabel('Topic', fontsize=10)
        ax.set_ylabel('Participant' if col_idx == 0 else '', fontsize=10)

    fig.suptitle('I3 – Outlier Heatmap: Score Gain by Participant × Topic\n'
                 '(Red border = IQR outlier within that medium)',
                 fontsize=13, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'I3_outlier_heatmap.png')
    plt.close(fig)


# =============================================================================
# STATS EXPORT
# =============================================================================

def _stats_round(value, ndigits):
    """Round ``value`` to ``ndigits`` decimals; a NaN input yields ``np.nan``."""
    return np.nan if np.isnan(value) else round(value, ndigits)


def _stats_effect_columns(sub):
    """Build the shared effect-statistics columns for one subset of rows.

    The 7-tuple returned by ``compute_effect`` is unpacked positionally as
    (mean gain, SD, SEM, Cohen's d, t, p, N). That order mirrors the variable
    names used at the call sites -- confirm against ``compute_effect`` itself.
    The SEM value is not exported.
    """
    gain, sd, _sem, d, t, p, n = compute_effect(sub)
    return {
        'N': n,
        'Mean_Gain': _stats_round(gain, 3),
        'SD_Gain': _stats_round(sd, 3),
        'Cohens_d': _stats_round(d, 3),
        't_stat': _stats_round(t, 3),
        'p_value': _stats_round(p, 4),
    }


def export_stats(paired):
    """Write the effect statistics behind sections F-I as CSV files.

    Four files are written into ``STATS_DIR``:

    * ``effects_by_medium_with_without_oekologie.csv`` -- per medium, once on
      all rows and once restricted to ``TOPICS_NO_OEK``.
    * ``effects_by_topic.csv`` -- per topic.
    * ``effects_by_medium_topic_grid.csv`` -- per medium x topic cell.
    * ``outlier_influence.csv`` -- per medium, with and without the rows
      flagged by ``_flag_outliers``.

    Args:
        paired: DataFrame of paired tutoring entries. This function reads the
            columns ``Medium``, ``Topic``, ``Participant`` and ``Score_Gain``
            directly; ``compute_effect`` and ``_flag_outliers`` may need more.

    Returns:
        None. The result is the CSV files plus one status line on stdout.
    """
    paired_f = _flag_outliers(paired)

    # --- F: effects by medium with/without Ökologie ---
    # The condition list does not depend on the medium, so build it once.
    conditions = [
        ('All Topics',     lambda sub: sub),
        ('Excl_Oekologie', lambda sub: sub[sub['Topic'].isin(TOPICS_NO_OEK)]),
    ]
    rows_f = []
    for m in MEDIUM_ORDER:
        by_medium = paired[paired['Medium'] == m]
        for cond_label, filter_fn in conditions:
            rows_f.append({
                'Medium': m, 'Condition': cond_label,
                **_stats_effect_columns(filter_fn(by_medium)),
            })
    pd.DataFrame(rows_f).to_csv(
        STATS_DIR / 'effects_by_medium_with_without_oekologie.csv', index=False)

    # --- G: effects by topic ---
    rows_g = [
        {'Topic': topic,
         **_stats_effect_columns(paired[paired['Topic'] == topic])}
        for topic in TOPIC_ORDER
    ]
    pd.DataFrame(rows_g).to_csv(STATS_DIR / 'effects_by_topic.csv', index=False)

    # --- H: effects by medium × topic ---
    rows_h = []
    for m in MEDIUM_ORDER:
        for topic in TOPIC_ORDER:
            cell = paired[(paired['Medium'] == m) & (paired['Topic'] == topic)]
            rows_h.append({'Medium': m, 'Topic': topic,
                           **_stats_effect_columns(cell)})
    pd.DataFrame(rows_h).to_csv(STATS_DIR / 'effects_by_medium_topic_grid.csv', index=False)

    # --- I: outlier influence ---
    rows_i = []
    for m in MEDIUM_ORDER:
        sub_all = paired_f[paired_f['Medium'] == m]
        sub_noo = sub_all[~sub_all['Outlier']]
        g1, _, _, d1, _, p1, n1 = compute_effect(sub_all)
        g2, _, _, d2, _, p2, n2 = compute_effect(sub_noo)
        outliers = sub_all[sub_all['Outlier']][['Participant', 'Topic', 'Score_Gain']]
        # One "participant/topic(+gain%)" entry per flagged row.
        out_list = '; '.join(f"{r['Participant']}/{r['Topic']}({r['Score_Gain']:+.1f}%)"
                             for _, r in outliers.iterrows())
        rows_i.append({
            'Medium': m,
            'N_all': n1,
            'Mean_Gain_all': _stats_round(g1, 3),
            'Cohens_d_all': _stats_round(d1, 3),
            'p_all': _stats_round(p1, 4),
            'N_no_outliers': n2,
            'Mean_Gain_no_outliers': _stats_round(g2, 3),
            'Cohens_d_no_outliers': _stats_round(d2, 3),
            'p_no_outliers': _stats_round(p2, 4),
            # Delta is taken on the unrounded d values; a NaN on either side
            # propagates through the subtraction and is exported as NaN.
            'Delta_d': _stats_round(d2 - d1, 3),
            'Outliers': out_list,
        })
    pd.DataFrame(rows_i).to_csv(STATS_DIR / 'outlier_influence.csv', index=False)

    print(f"  Stats exported to: {STATS_DIR}")


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Run the whole pipeline: load data, render every plot, export stats.

    Plots are produced section by section in the order listed in
    ``sections``. After each plot function returns, its code and description
    are printed. Once all plots are done, ``export_stats`` is called.
    """
    print("Loading data...")
    df     = load_data()
    paired = build_paired_tutoring(df)
    print(f"  {len(paired)} paired tutoring entries across "
          f"{paired['Participant'].nunique()} participants\n")

    # (section title, [(plot code, description, plot function), ...]).
    # Every plot function is called with ``paired`` as its only argument.
    sections = [
        ("F. Effect Without Ökologie (vs. With)", [
            ("F1", "Cohen's d comparison by medium",          plot_F1_cohens_d_comparison),
            ("F2", "Mean score gain comparison",              plot_F2_mean_gain_comparison),
            ("F3", "Paired slopes 2×3 grid",                  plot_F3_paired_slopes_comparison),
            ("F4", "Gain distribution comparison",            plot_F4_gain_distribution_comparison),
            ("F5", "Descriptive stats table",                 plot_F5_stats_table),
        ]),
        ("G. Effect Per Topic", [
            ("G1", "Bar chart: gain + d per topic",           plot_G1_effect_per_topic),
            ("G2", "Paired slopes per topic",                 plot_G2_slopes_per_topic),
        ]),
        ("H. Medium × Topic Grid", [
            ("H1", "3×3 grid of slope plots",                 plot_H1_medium_topic_grid),
        ]),
        ("I. Outlier Influence Analysis", [
            ("I1", "Outlier scatter per medium",              plot_I1_outlier_scatter),
            ("I2", "Effect size: all vs. outliers removed",   plot_I2_outlier_effect_comparison),
            ("I3", "Outlier heatmap (participant × topic)",   plot_I3_outlier_heatmap),
        ]),
    ]

    for section_name, plots in sections:
        print(section_name)
        for code, desc, plot_fn in plots:
            plot_fn(paired)
            print(f"  [{code}] {desc}")

    # Derive the count from the table so the message stays correct when
    # plots are added to or removed from ``sections``.
    n_plots = sum(len(plots) for _, plots in sections)
    print(f"\n{n_plots} plots saved to: {PLOT_DIR}")

    print("\nExporting statistics...")
    export_stats(paired)
    print("Done.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()