# Virtual-Tutor-Eval/generate_plots.py

"""
generate_plots.py

Consolidated visualization script for VirTu-Eval experiment data.
Generates all plots into Data/plots/ organized by section:

  A. Overall Learning Trajectory (4 plots)
  B. Tutoring Phase Deep-Dive  (5 plots)
  C. Start-to-Finish Gains     (2 plots)
  D. Confidence Analysis        (3 plots)
  E. Personality Correlations   (2 plots)

Usage:
  python generate_plots.py
"""

import csv
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import seaborn as sns
from pathlib import Path
from io import StringIO
from scipy import stats

# =============================================================================
# CONFIG
# =============================================================================
# All input CSVs are read from, and all outputs written below, the "Data"
# directory that sits next to this script.
BASE = Path(__file__).resolve().parent / "Data"
PLOT_DIR = BASE / "plots"
# NOTE: mkdir is called without parents=True, so BASE itself must already exist.
PLOT_DIR.mkdir(exist_ok=True)
STATS_DIR = BASE / "stats"
STATS_DIR.mkdir(exist_ok=True)

# Measurement time points in chronological order (values of the 'Zeitpunkt'
# column after normalisation in load_data) and their short axis labels.
PHASE_ORDER  = ['Pre-Reading', 'Post-Reading', 'Pre-Tutoring', 'Post-Tutoring']
PHASE_LABELS = ['Pre-Read', 'Post-Read', 'Pre-Tutor', 'Post-Tutor']
# Maps each full phase name to its short label.
PHASE_SHORT  = dict(zip(PHASE_ORDER, PHASE_LABELS))

# Fixed display order and colour for each value of the 'Medium' column.
MEDIUM_ORDER  = ['Chat', 'Video', 'VR']
MEDIUM_COLORS = {'Chat': '#2196F3', 'Video': '#FF9800', 'VR': '#4CAF50'}

# Fixed display order and colour for each value of the 'Topic' column.
TOPIC_ORDER  = ['Mendel', 'DNA-Replikation', 'Ökologie']
TOPIC_COLORS = {'Mendel': '#E91E63', 'DNA-Replikation': '#9C27B0', 'Ökologie': '#009688'}

# Personality trait scoring key used by load_personality():
#   'items'   -> questionnaire column positions (row[i]) averaged into the trait
#   'reverse' -> the subset of those items that is reverse-keyed (scored as 8 - v)
BFI_TRAITS = {
    'Neuroticism':       {'items': [1, 2, 3], 'reverse': [3]},
    'Extraversion':      {'items': [4, 5, 6], 'reverse': [6]},
    'Openness':          {'items': [7, 8, 9], 'reverse': []},
    'Agreeableness':     {'items': [10, 11, 12], 'reverse': [10]},
    'Conscientiousness': {'items': [13, 14, 15], 'reverse': [14]},
}
# Trait display order follows the insertion order of BFI_TRAITS.
TRAIT_ORDER  = list(BFI_TRAITS.keys())
TRAIT_COLORS = {'Neuroticism': '#E53935', 'Extraversion': '#FB8C00',
                'Openness': '#43A047', 'Agreeableness': '#1E88E5',
                'Conscientiousness': '#8E24AA'}

# Global plot styling: applied once at import time and shared by every figure.
sns.set_theme(style="whitegrid", font_scale=1.05)
plt.rcParams['figure.dpi'] = 150
# Every savefig() call crops to the drawn artists unless it overrides bbox_inches.
plt.rcParams['savefig.bbox'] = 'tight'


def cohens_d(pre, post):
    """Return the paired effect size: mean of (post - pre) over its standard deviation.

    Returns 0 when the standard deviation of the differences is not strictly
    positive (this includes a NaN standard deviation, since ``NaN > 0`` is False).
    """
    delta = post - pre
    spread = delta.std()
    if spread > 0:
        return delta.mean() / spread
    return 0


# =============================================================================
# DATA LOADING
# =============================================================================

def load_data():
    """Read ``test_scores_all.csv`` from ``BASE`` and add helper columns.

    Returns:
        The score table with these columns added or normalised:
        ``Zeitpunkt`` (whitespace-stripped, "Pre-Tutor" mapped to "Pre-Tutoring"),
        ``Phase`` and ``Phase_Label`` (ordered categoricals),
        ``Phase_Idx`` (position of the phase in ``PHASE_ORDER``) and
        ``P_Num`` (first run of digits in ``Participant``, as int).
    """
    scores = pd.read_csv(BASE / "test_scores_all.csv", encoding="utf-8-sig")

    # Normalize typo "Pre-Tutor" -> "Pre-Tutoring"
    timepoint = scores['Zeitpunkt'].str.strip().replace('Pre-Tutor', 'Pre-Tutoring')
    scores['Zeitpunkt'] = timepoint

    scores['Phase'] = pd.Categorical(timepoint, categories=PHASE_ORDER, ordered=True)
    scores['Phase_Label'] = pd.Categorical(
        timepoint.map(PHASE_SHORT), categories=PHASE_LABELS, ordered=True)

    phase_position = {phase: pos for pos, phase in enumerate(PHASE_ORDER)}
    scores['Phase_Idx'] = timepoint.map(phase_position)

    # Numeric participant id, used for sorting (astype(int) fails if an id has no digits).
    scores['P_Num'] = scores['Participant'].str.extract(r'(\d+)').astype(int)
    return scores


def build_paired_tutoring(df):
    """Pair each Pre-Tutoring row with its Post-Tutoring row and compute gains.

    Rows are matched on (Participant, Topic, Medium) with an inner merge, so
    only combinations present at both time points appear in the result.

    Returns:
        DataFrame with the key columns, ``Pre_Score``/``Pre_Conf``,
        ``Post_Score``/``Post_Conf``, ``Score_Gain`` and ``Conf_Gain``
        (post minus pre) and ``P_Num`` (digits of ``Participant`` as int).
    """
    keys = ['Participant', 'Topic', 'Medium']
    measures = ['Score_Pct', 'Avg_Confidence']

    def _phase_slice(phase, prefix):
        # Rows of one time point, with the measures renamed to <prefix>_Score / <prefix>_Conf.
        chunk = df.loc[df['Zeitpunkt'] == phase, keys + measures].copy()
        return chunk.rename(columns={'Score_Pct': f'{prefix}_Score',
                                     'Avg_Confidence': f'{prefix}_Conf'})

    before = _phase_slice('Pre-Tutoring', 'Pre')
    after = _phase_slice('Post-Tutoring', 'Post')
    paired = before.merge(after, on=keys)

    for kind in ('Score', 'Conf'):
        paired[f'{kind}_Gain'] = paired[f'Post_{kind}'] - paired[f'Pre_{kind}']
    paired['P_Num'] = paired['Participant'].str.extract(r'(\d+)', expand=False).astype(int)
    return paired


def load_personality():
    """Load ``Final-Questionnaire.csv`` from ``BASE`` and score the traits in ``BFI_TRAITS``.

    The first CSV row is treated as a header. For every following row the
    participant id is taken from the last column (prefixed with "P" if it does
    not already start with one) and columns 1-15 are parsed as integer item
    responses. Each trait is the mean of its answered items, with reverse-keyed
    items scored as ``8 - v`` (presumably a 1-7 response scale - confirm against
    the questionnaire). Blank rows and rows without a participant id are skipped.

    Returns:
        DataFrame with one row per participant and the columns ``Participant``
        plus one column per trait (NaN when none of a trait's items was
        answered). The columns are present even when the file has no data rows.
    """
    path = BASE / "Final-Questionnaire.csv"
    # newline="" is what the csv module expects; it handles line endings itself.
    with open(path, encoding="utf-8-sig", newline="") as f:
        rows = list(csv.reader(f))

    records = []
    for row in rows[1:]:  # rows[0] is the header; slicing is safe on an empty file
        if not row:
            # csv.reader yields [] for a blank line; row[-1] would raise IndexError.
            continue
        pid = row[-1].strip()
        if not pid:
            continue
        pid = pid if pid.startswith('P') else f'P{pid}'

        # Item responses keyed by questionnaire column; unparsable or missing -> NaN.
        items = {}
        for i in range(1, 16):
            try:
                items[i] = int(row[i].strip())
            except (ValueError, IndexError):
                items[i] = np.nan

        rec = {'Participant': pid}
        for trait, info in BFI_TRAITS.items():
            vals = [
                8 - items[it] if it in info['reverse'] else items[it]
                for it in info['items']
                if pd.notna(items.get(it, np.nan))
            ]
            rec[trait] = np.mean(vals) if vals else np.nan
        records.append(rec)

    return pd.DataFrame(records, columns=['Participant', *BFI_TRAITS])


# =============================================================================
# A. OVERALL LEARNING TRAJECTORY
# =============================================================================

def plot_A1_trajectory(df):
    """Plot overall mean score and mean confidence across the four phases.

    Score (left axis) and confidence (right, twinned axis) are drawn with error
    bars of 1.96 * SEM and every point is annotated with its mean. Two
    double-headed arrows below the x-axis mark the reading and tutoring phases.
    Saves ``A1_trajectory.png`` into ``PLOT_DIR``.
    """
    fig, score_ax = plt.subplots(figsize=(10, 6))
    summary = df.groupby('Phase_Label', observed=True).agg(
        S=('Score_Pct', 'mean'),
        S_se=('Score_Pct', 'sem'),
        C=('Avg_Confidence', 'mean'),
        C_se=('Avg_Confidence', 'sem'),
    ).reindex(PHASE_LABELS)
    positions = np.arange(4)
    score_color, conf_color = '#1976D2', '#E65100'

    # Left axis: test score.
    score_ax.errorbar(
        positions, summary['S'], yerr=summary['S_se'] * 1.96, color=score_color,
        marker='o', markersize=10, linewidth=2.5, capsize=5, capthick=2,
        label='Score %', zorder=5,
    )
    score_ax.set_ylabel('Test Score (%)', color=score_color, fontsize=13)
    score_ax.set_ylim(30, 100)
    score_ax.tick_params(axis='y', labelcolor=score_color)

    # Right axis: confidence, sharing the x-axis.
    conf_ax = score_ax.twinx()
    conf_ax.errorbar(
        positions, summary['C'], yerr=summary['C_se'] * 1.96, color=conf_color,
        marker='s', markersize=10, linewidth=2.5, capsize=5, capthick=2,
        linestyle='--', label='Confidence', zorder=5,
    )
    conf_ax.set_ylabel('Avg Confidence (1-7)', color=conf_color, fontsize=13)
    conf_ax.set_ylim(1, 7)
    conf_ax.tick_params(axis='y', labelcolor=conf_color)

    score_ax.set_xticks(positions)
    score_ax.set_xticklabels(PHASE_LABELS, fontsize=12)

    # After the reindex the rows are in PHASE_LABELS order, so the row position
    # is the x coordinate of the phase.
    for pos, (score_mean, conf_mean) in enumerate(zip(summary['S'], summary['C'])):
        score_ax.annotate(
            f'{score_mean:.1f}%', (pos, score_mean), textcoords="offset points",
            xytext=(0, 14), ha='center', fontsize=10, color=score_color, fontweight='bold',
        )
        conf_ax.annotate(
            f'{conf_mean:.2f}', (pos, conf_mean), textcoords="offset points",
            xytext=(0, -18), ha='center', fontsize=10, color=conf_color, fontweight='bold',
        )

    # One combined legend for both axes.
    score_handles, score_labels = score_ax.get_legend_handles_labels()
    conf_handles, conf_labels = conf_ax.get_legend_handles_labels()
    score_ax.legend(score_handles + conf_handles, score_labels + conf_labels,
                    loc='lower right', fontsize=11)

    # Phase brackets below the axis, positioned in axes-fraction coordinates.
    for tail_x, head_x in [(0, 0.32), (0.68, 1)]:
        score_ax.annotate(
            '', xy=(head_x, -0.12), xytext=(tail_x, -0.12),
            arrowprops=dict(arrowstyle='<->', color='gray', lw=1.5),
            annotation_clip=False, xycoords='axes fraction',
        )
    for x_frac, caption in [(0.16, 'Reading Phase'), (0.84, 'Tutoring Phase')]:
        score_ax.text(x_frac, -0.17, caption, transform=score_ax.transAxes,
                      ha='center', fontsize=10, color='gray')

    fig.suptitle('Overall Learning Trajectory', fontsize=15, fontweight='bold')
    fig.savefig(PLOT_DIR / 'A1_trajectory.png', bbox_inches='tight')
    plt.close(fig)


def plot_A2_trajectory_by_medium(df):
    """Plot per-medium score and confidence trajectories across the four phases.

    Left panel: mean ``Score_Pct``; right panel: mean ``Avg_Confidence``. Each
    medium is drawn with 1.96 * SEM error bars and a small horizontal offset so
    the bars do not overlap. Saves ``A2_trajectory_by_medium.png``.
    """
    fig, (score_ax, conf_ax) = plt.subplots(1, 2, figsize=(16, 6))
    positions = np.arange(4)
    shifts = [-0.1, 0, 0.1]

    for shift, medium in zip(shifts, MEDIUM_ORDER):
        by_phase = df[df['Medium'] == medium].groupby('Phase_Label', observed=True)
        colour = MEDIUM_COLORS[medium]

        score_stats = by_phase['Score_Pct'].agg(['mean', 'sem']).reindex(PHASE_LABELS)
        score_ax.errorbar(positions + shift, score_stats['mean'], yerr=score_stats['sem'] * 1.96,
                          color=colour, marker='o', markersize=8, linewidth=2, capsize=4,
                          label=medium)

        conf_stats = by_phase['Avg_Confidence'].agg(['mean', 'sem']).reindex(PHASE_LABELS)
        conf_ax.errorbar(positions + shift, conf_stats['mean'], yerr=conf_stats['sem'] * 1.96,
                         color=colour, marker='s', markersize=8, linewidth=2, capsize=4,
                         linestyle='--', label=medium)

    score_ax.set_xticks(positions)
    score_ax.set_xticklabels(PHASE_LABELS)
    score_ax.set_ylabel('Test Score (%)')
    score_ax.set_ylim(30, 100)
    score_ax.legend(title='Medium')
    score_ax.set_title('Score')

    conf_ax.set_xticks(positions)
    conf_ax.set_xticklabels(PHASE_LABELS)
    conf_ax.set_ylabel('Avg Confidence (1-7)')
    conf_ax.set_ylim(1, 7)
    conf_ax.legend(title='Medium')
    conf_ax.set_title('Confidence')

    fig.suptitle('Learning Trajectories by Medium', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'A2_trajectory_by_medium.png')
    plt.close(fig)


def plot_A3_trajectory_by_topic(df):
    """Plot per-topic score and confidence trajectories across the four phases.

    Left panel: mean ``Score_Pct``; right panel: mean ``Avg_Confidence``. Each
    topic is drawn with 1.96 * SEM error bars and a small horizontal offset so
    the bars do not overlap. Saves ``A3_trajectory_by_topic.png``.
    """
    fig, (score_ax, conf_ax) = plt.subplots(1, 2, figsize=(16, 6))
    positions = np.arange(4)
    shifts = [-0.1, 0, 0.1]

    for shift, topic in zip(shifts, TOPIC_ORDER):
        by_phase = df[df['Topic'] == topic].groupby('Phase_Label', observed=True)
        colour = TOPIC_COLORS[topic]

        score_stats = by_phase['Score_Pct'].agg(['mean', 'sem']).reindex(PHASE_LABELS)
        score_ax.errorbar(positions + shift, score_stats['mean'], yerr=score_stats['sem'] * 1.96,
                          color=colour, marker='o', markersize=8, linewidth=2, capsize=4,
                          label=topic)

        conf_stats = by_phase['Avg_Confidence'].agg(['mean', 'sem']).reindex(PHASE_LABELS)
        conf_ax.errorbar(positions + shift, conf_stats['mean'], yerr=conf_stats['sem'] * 1.96,
                         color=colour, marker='s', markersize=8, linewidth=2, capsize=4,
                         linestyle='--', label=topic)

    score_ax.set_xticks(positions)
    score_ax.set_xticklabels(PHASE_LABELS)
    score_ax.set_ylabel('Test Score (%)')
    score_ax.set_ylim(30, 100)
    score_ax.legend(title='Topic')
    score_ax.set_title('Score')

    conf_ax.set_xticks(positions)
    conf_ax.set_xticklabels(PHASE_LABELS)
    conf_ax.set_ylabel('Avg Confidence (1-7)')
    conf_ax.set_ylim(1, 7)
    conf_ax.legend(title='Topic')
    conf_ax.set_title('Confidence')

    fig.suptitle('Learning Trajectories by Topic', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'A3_trajectory_by_topic.png')
    plt.close(fig)


def plot_A4_heatmap(df):
    """Draw participant x phase heatmaps of mean score and mean confidence.

    Rows are participants sorted by the integer that follows the first character
    of their id; columns are the four phases in ``PHASE_ORDER``, shown with their
    short labels. Saves ``A4_heatmap.png``.
    """
    def _phase_matrix(value_column):
        # Participant x phase table of means, rows in numeric id order.
        matrix = df.pivot_table(index='Participant', columns='Zeitpunkt',
                                values=value_column, aggfunc='mean')
        ordered_ids = sorted(matrix.index, key=lambda pid: int(pid[1:]))
        matrix = matrix.reindex(columns=PHASE_ORDER).reindex(ordered_ids)
        matrix.columns = PHASE_LABELS
        return matrix

    score_matrix = _phase_matrix('Score_Pct')
    conf_matrix = _phase_matrix('Avg_Confidence')

    fig, (score_ax, conf_ax) = plt.subplots(1, 2, figsize=(14, 8))

    sns.heatmap(score_matrix, annot=True, fmt='.0f', cmap='RdYlGn', vmin=20, vmax=100,
                ax=score_ax, linewidths=.5, cbar_kws={'label': 'Score %'})
    score_ax.set_title('Test Scores')
    score_ax.set_ylabel('Participant')

    sns.heatmap(conf_matrix, annot=True, fmt='.1f', cmap='YlOrRd', vmin=1, vmax=7,
                ax=conf_ax, linewidths=.5, cbar_kws={'label': 'Confidence (1-7)'})
    conf_ax.set_title('Confidence')
    conf_ax.set_ylabel('')

    fig.suptitle('Participant-Level Heatmaps', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'A4_heatmap.png')
    plt.close(fig)


# =============================================================================
# B. TUTORING PHASE DEEP-DIVE
# =============================================================================

def plot_B1_tutoring_slopes_by_medium(paired):
    """Draw pre/post tutoring slope charts, one panel per medium.

    Each participant is a thin line coloured by topic and labelled with the
    participant id; the medium mean is a thick diamond-marked line. A text box
    reports the mean gain, Cohen's d, and a paired t-test. Saves
    ``B1_tutoring_slopes_by_medium.png``.

    Args:
        paired: Output of ``build_paired_tutoring`` (needs ``Medium``, ``Topic``,
            ``Participant``, ``P_Num``, ``Pre_Score`` and ``Post_Score``).
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 7), sharey=True)
    for ax, medium in zip(axes, MEDIUM_ORDER):
        sub = paired[paired['Medium'] == medium].sort_values('P_Num')

        # Individual trajectories.
        for _, row in sub.iterrows():
            ax.plot([0, 1], [row['Pre_Score'], row['Post_Score']],
                    color=TOPIC_COLORS[row['Topic']],
                    alpha=0.5, linewidth=1.5, marker='o', markersize=5)
            ax.annotate(row['Participant'], (1.02, row['Post_Score']),
                        fontsize=7, va='center', alpha=0.6)

        # Medium mean.
        pre_m, post_m = sub['Pre_Score'].mean(), sub['Post_Score'].mean()
        ax.plot([0, 1], [pre_m, post_m], color=MEDIUM_COLORS[medium], linewidth=4, marker='D',
                markersize=12, zorder=10, markeredgecolor='white', markeredgewidth=2)

        # ttest_rel(a, b) tests mean(a - b); pass post first so the sign of t
        # matches the displayed gain and Cohen's d (both post - pre). The
        # two-sided p-value is the same either way.
        t, p = stats.ttest_rel(sub['Post_Score'], sub['Pre_Score'])
        d = cohens_d(sub['Pre_Score'], sub['Post_Score'])
        sig = '***' if p < .001 else '**' if p < .01 else '*' if p < .05 else 'n.s.'
        ax.text(0.5, 0.02, f'Gain: {post_m-pre_m:+.1f}%  d={d:.2f}\nt={t:.2f}, p={p:.3f} {sig}',
                transform=ax.transAxes, ha='center', fontsize=10,
                bbox=dict(boxstyle='round,pad=0.4', facecolor='lightyellow', alpha=0.9))

        ax.set_xticks([0, 1])
        ax.set_xticklabels(['Pre-Tutoring', 'Post-Tutoring'], fontsize=11)
        ax.set_title(medium, fontsize=14, fontweight='bold', color=MEDIUM_COLORS[medium])
        ax.set_ylim(-5, 110)

    axes[0].set_ylabel('Test Score (%)', fontsize=12)

    # Shared legend below the panels: one entry per topic plus the mean line.
    legend_el = [Line2D([0], [0], color=TOPIC_COLORS[t], lw=2, marker='o', ms=6, label=t)
                 for t in TOPIC_ORDER]
    legend_el.append(Line2D([0], [0], color='gray', lw=4, marker='D', ms=8, label='Medium Mean'))
    fig.legend(handles=legend_el, loc='upper center', ncol=4, fontsize=10,
               bbox_to_anchor=(0.5, 0.02))

    fig.suptitle('Tutoring: Individual Trajectories by Medium', fontsize=15, fontweight='bold')
    fig.tight_layout(rect=[0, 0.05, 1, 0.96])
    fig.savefig(PLOT_DIR / 'B1_tutoring_slopes_by_medium.png')
    plt.close(fig)


def plot_B2_tutoring_slopes_by_topic(paired):
    """Draw pre/post tutoring slope charts, one panel per topic.

    Each participant is a thin line coloured by medium; every medium with data
    in the topic gets a thick mean line whose legend entry shows its mean gain.
    A text box reports the topic's overall gain, Cohen's d, and the paired
    t-test p-value. Saves ``B2_tutoring_slopes_by_topic.png``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 7), sharey=True)

    for ax, topic in zip(axes, TOPIC_ORDER):
        topic_rows = paired[paired['Topic'] == topic].sort_values('P_Num')

        # Individual trajectories.
        for _, person in topic_rows.iterrows():
            ax.plot([0, 1], [person['Pre_Score'], person['Post_Score']],
                    color=MEDIUM_COLORS[person['Medium']],
                    alpha=0.5, linewidth=1.5, marker='o', markersize=5)
            ax.annotate(person['Participant'], (1.02, person['Post_Score']),
                        fontsize=7, va='center', alpha=0.6)

        # Per-medium mean lines (skipped when the medium has no rows for this topic).
        for medium in MEDIUM_ORDER:
            medium_rows = topic_rows[topic_rows['Medium'] == medium]
            if len(medium_rows) == 0:
                continue
            pre_mean = medium_rows['Pre_Score'].mean()
            post_mean = medium_rows['Post_Score'].mean()
            ax.plot([0, 1], [pre_mean, post_mean], color=MEDIUM_COLORS[medium], linewidth=3.5,
                    marker='D', markersize=10, zorder=10, markeredgecolor='white',
                    markeredgewidth=2, label=f'{medium} ({post_mean-pre_mean:+.1f}%)')

        # Only the p-value of the paired t-test is reported here.
        _, p_value = stats.ttest_rel(topic_rows['Pre_Score'], topic_rows['Post_Score'])
        effect = cohens_d(topic_rows['Pre_Score'], topic_rows['Post_Score'])
        if p_value < .001:
            stars = '***'
        elif p_value < .01:
            stars = '**'
        elif p_value < .05:
            stars = '*'
        else:
            stars = 'n.s.'
        ax.text(0.5, 0.02,
                f'Overall: {topic_rows["Score_Gain"].mean():+.1f}%  d={effect:.2f}\np={p_value:.3f} {stars}',
                transform=ax.transAxes, ha='center', fontsize=10,
                bbox=dict(boxstyle='round,pad=0.4', facecolor='lightyellow', alpha=0.9))

        ax.set_xticks([0, 1])
        ax.set_xticklabels(['Pre-Tutoring', 'Post-Tutoring'], fontsize=11)
        ax.set_title(topic, fontsize=14, fontweight='bold', color=TOPIC_COLORS[topic])
        ax.set_ylim(-5, 110)
        ax.legend(fontsize=9, loc='upper left')

    axes[0].set_ylabel('Test Score (%)', fontsize=12)
    fig.suptitle('Tutoring: Individual Trajectories by Topic', fontsize=15, fontweight='bold')
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.savefig(PLOT_DIR / 'B2_tutoring_slopes_by_topic.png')
    plt.close(fig)


def plot_B3_tutoring_gain_by_medium(paired):
    """Bar charts of mean tutoring gain per medium, for score and for confidence.

    Bars carry 1.96 * SEM error bars, individual gains are overlaid as jittered
    points, and each bar is labelled with the mean gain and Cohen's d. The
    jitter uses a generator seeded with 42, so the figure is reproducible.
    Saves ``B3_tutoring_gain_by_medium.png``.
    """
    fig, (score_ax, conf_ax) = plt.subplots(1, 2, figsize=(14, 6.5))
    rng = np.random.default_rng(42)

    for slot, medium in enumerate(MEDIUM_ORDER):
        rows = paired[paired['Medium'] == medium]
        colour = MEDIUM_COLORS[medium]

        # Score panel.
        score_gain = rows['Score_Gain'].mean()
        score_se = rows['Score_Gain'].sem()
        score_d = cohens_d(rows['Pre_Score'], rows['Post_Score'])
        score_ax.bar(slot, score_gain, color=colour, alpha=0.6, width=0.6,
                     yerr=score_se * 1.96, capsize=6, edgecolor='white', lw=1.5)
        # One jitter draw per medium, shared by both panels.
        jitter = rng.uniform(-0.15, 0.15, len(rows))
        point_x = np.full(len(rows), slot) + jitter
        score_ax.scatter(point_x, rows['Score_Gain'], color=colour, s=40, alpha=0.7,
                         edgecolors='white', lw=0.5, zorder=5)
        score_ax.text(slot, score_gain + score_se * 1.96 + 2,
                      f'{score_gain:+.1f}%\nd={score_d:.2f}',
                      ha='center', fontsize=10, fontweight='bold')

        # Confidence panel.
        conf_gain = rows['Conf_Gain'].mean()
        conf_se = rows['Conf_Gain'].sem()
        conf_d = cohens_d(rows['Pre_Conf'], rows['Post_Conf'])
        conf_ax.bar(slot, conf_gain, color=colour, alpha=0.6, width=0.6,
                    yerr=conf_se * 1.96, capsize=6, edgecolor='white', lw=1.5)
        conf_ax.scatter(point_x, rows['Conf_Gain'], color=colour, s=40, alpha=0.7,
                        edgecolors='white', lw=0.5, zorder=5)
        conf_ax.text(slot, conf_gain + conf_se * 1.96 + 0.15,
                     f'{conf_gain:+.2f}\nd={conf_d:.2f}',
                     ha='center', fontsize=10, fontweight='bold')

    score_ax.axhline(0, color='gray', lw=1)
    score_ax.set_xticks(range(3))
    score_ax.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    score_ax.set_ylabel('Score Gain (%)')
    score_ax.set_title('Score Gain')

    conf_ax.axhline(0, color='gray', lw=1)
    conf_ax.set_xticks(range(3))
    conf_ax.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    conf_ax.set_ylabel('Confidence Gain')
    conf_ax.set_title('Confidence Gain')

    fig.suptitle('Tutoring Gains by Medium (with effect sizes)', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'B3_tutoring_gain_by_medium.png')
    plt.close(fig)


def plot_B4_tutoring_medium_topic(paired):
    """Show the medium x topic interaction of the tutoring score gain.

    Left: heatmap of mean ``Score_Gain`` per (medium, topic) cell. Right:
    grouped bars of the same means with 1.96 * SEM error bars, grouped by
    medium and coloured by topic. Saves ``B4_tutoring_medium_topic.png``.
    """
    fig, (heat_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Heatmap of cell means.
    cell_means = paired.pivot_table(index='Medium', columns='Topic',
                                    values='Score_Gain', aggfunc='mean')
    cell_means = cell_means.reindex(index=MEDIUM_ORDER, columns=TOPIC_ORDER)
    sns.heatmap(cell_means, annot=True, fmt='.1f', cmap='RdYlGn', center=0, ax=heat_ax,
                linewidths=1, vmin=-10, vmax=30, cbar_kws={'label': 'Score Gain %'})
    heat_ax.set_title('Mean Tutoring Score Gain')
    heat_ax.set_ylabel('Medium')

    # Grouped bars: one bar series per topic, shifted by one bar width each.
    centres = np.arange(3)
    bar_width = 0.25
    for slot, topic in enumerate(TOPIC_ORDER):
        cells = [
            paired[(paired['Medium'] == medium) & (paired['Topic'] == topic)]['Score_Gain']
            for medium in MEDIUM_ORDER
        ]
        heights = [cell.mean() for cell in cells]
        half_widths = [cell.sem() * 1.96 for cell in cells]
        bar_ax.bar(centres + slot * bar_width - bar_width, heights, bar_width,
                   yerr=half_widths, capsize=3, color=TOPIC_COLORS[topic], alpha=0.8,
                   label=topic, edgecolor='white')

    bar_ax.axhline(0, color='gray', lw=0.8)
    bar_ax.set_xticks(centres)
    bar_ax.set_xticklabels(MEDIUM_ORDER, fontsize=12)
    bar_ax.set_ylabel('Score Gain (%)')
    bar_ax.legend(title='Topic', fontsize=9)
    bar_ax.set_title('Gain by Medium and Topic')

    fig.suptitle('Medium x Topic Interaction', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'B4_tutoring_medium_topic.png')
    plt.close(fig)


def plot_B5_tutoring_dashboard(paired):
    """Render the six-panel tutoring dashboard (2 x 3 grid) per medium.

    Panels: A) pre/post mean scores, B) pre/post mean confidence, C) mean score
    gains with Cohen's d and significance stars, D) box plots of score gains,
    E) share of participants who improved / stayed the same / declined,
    F) a summary table. Error bars are 1.96 * SEM. Saves
    ``B5_tutoring_dashboard.png``.
    """
    fig = plt.figure(figsize=(18, 10))
    grid = fig.add_gridspec(2, 3, hspace=0.35, wspace=0.3)
    centres = np.arange(3)
    bar_w = 0.35
    medium_palette = [MEDIUM_COLORS[m] for m in MEDIUM_ORDER]

    def _rows_for(medium):
        return paired[paired['Medium'] == medium]

    def _stars(p_value, not_significant):
        # Conventional significance markers; the fallback text differs per panel.
        if p_value < .001:
            return '***'
        if p_value < .01:
            return '**'
        if p_value < .05:
            return '*'
        return not_significant

    def _pre_post_panel(ax, pre_col, post_col, pre_face, pre_edge):
        # Side-by-side pre/post bars per medium for one measure.
        pre_mean = [_rows_for(m)[pre_col].mean() for m in MEDIUM_ORDER]
        post_mean = [_rows_for(m)[post_col].mean() for m in MEDIUM_ORDER]
        pre_ci = [_rows_for(m)[pre_col].sem() * 1.96 for m in MEDIUM_ORDER]
        post_ci = [_rows_for(m)[post_col].sem() * 1.96 for m in MEDIUM_ORDER]
        ax.bar(centres - bar_w / 2, pre_mean, bar_w, yerr=pre_ci, capsize=4,
               color=pre_face, edgecolor=pre_edge, lw=1.5, label='Pre')
        ax.bar(centres + bar_w / 2, post_mean, bar_w, yerr=post_ci, capsize=4,
               color=medium_palette, alpha=0.8, edgecolor='white', lw=1.5, label='Post')
        ax.set_xticks(centres)
        ax.set_xticklabels(MEDIUM_ORDER)

    # A) Absolute scores
    ax = fig.add_subplot(grid[0, 0])
    _pre_post_panel(ax, 'Pre_Score', 'Post_Score', '#BBDEFB', '#1976D2')
    ax.set_ylabel('Score (%)')
    ax.set_ylim(40, 100)
    ax.legend(fontsize=9)
    ax.set_title('A) Absolute Scores', fontweight='bold')

    # B) Absolute confidence
    ax = fig.add_subplot(grid[0, 1])
    _pre_post_panel(ax, 'Pre_Conf', 'Post_Conf', '#FFE0B2', '#E65100')
    ax.set_ylabel('Confidence (1-7)')
    ax.set_ylim(1, 7)
    ax.legend(fontsize=9)
    ax.set_title('B) Absolute Confidence', fontweight='bold')

    # C) Gains + effect sizes
    ax = fig.add_subplot(grid[0, 2])
    for slot, medium in enumerate(MEDIUM_ORDER):
        rows = _rows_for(medium)
        gain = rows['Score_Gain'].mean()
        gain_se = rows['Score_Gain'].sem()
        _, p_value = stats.ttest_rel(rows['Pre_Score'], rows['Post_Score'])
        effect = cohens_d(rows['Pre_Score'], rows['Post_Score'])
        ax.bar(slot, gain, color=MEDIUM_COLORS[medium], alpha=0.7,
               yerr=gain_se * 1.96, capsize=5, width=0.6)
        marker = _stars(p_value, 'n.s.')
        ax.text(slot, gain + gain_se * 1.96 + 1.5, f'{gain:+.1f}%\nd={effect:.2f} {marker}',
                ha='center', fontsize=10, fontweight='bold')
    ax.axhline(0, color='gray', lw=1)
    ax.set_xticks(range(3))
    ax.set_xticklabels(MEDIUM_ORDER)
    ax.set_ylabel('Score Gain (%)')
    ax.set_title('C) Gains + Effect Sizes', fontweight='bold')

    # D) Gain distributions
    ax = fig.add_subplot(grid[1, 0])
    for slot, medium in enumerate(MEDIUM_ORDER):
        box = ax.boxplot(_rows_for(medium)['Score_Gain'], positions=[slot], widths=0.5,
                         patch_artist=True, showmeans=True,
                         meanprops=dict(marker='D', markerfacecolor='black', markersize=6))
        box['boxes'][0].set_facecolor(MEDIUM_COLORS[medium])
        box['boxes'][0].set_alpha(0.5)
    ax.axhline(0, color='gray', lw=0.8, ls='--')
    ax.set_xticks(range(3))
    ax.set_xticklabels(MEDIUM_ORDER)
    ax.set_ylabel('Score Gain (%)')
    ax.set_title('D) Gain Distributions', fontweight='bold')

    # E) Improved/same/declined
    ax = fig.add_subplot(grid[1, 1])
    outcome_colours = ['#43A047', '#9E9E9E', '#E53935']
    for slot, medium in enumerate(MEDIUM_ORDER):
        rows = _rows_for(medium)
        improved = (rows['Score_Gain'] > 0).sum()
        unchanged = (rows['Score_Gain'] == 0).sum()
        declined = (rows['Score_Gain'] < 0).sum()
        total = len(rows)
        ax.barh([slot - 0.15, slot, slot + 0.15],
                [improved / total * 100, unchanged / total * 100, declined / total * 100],
                height=0.12, color=outcome_colours, alpha=0.8)
        # Only the "improved" bar gets a count label.
        ax.text(improved / total * 100 + 1, slot - 0.15, f'{improved}/{total}',
                va='center', fontsize=9)
    ax.set_yticks(range(3))
    ax.set_yticklabels(MEDIUM_ORDER)
    ax.set_xlabel('% of Participants')
    ax.legend([mpatches.Patch(color='#43A047'), mpatches.Patch(color='#9E9E9E'),
               mpatches.Patch(color='#E53935')],
              ['Improved', 'Same', 'Declined'], fontsize=8, loc='lower right')
    ax.set_title('E) Improved / Same / Declined', fontweight='bold')

    # F) Stats table
    ax = fig.add_subplot(grid[1, 2])
    ax.axis('off')
    table_rows = []
    for medium in MEDIUM_ORDER:
        rows = _rows_for(medium)
        gain = rows['Score_Gain'].mean()
        _, p_value = stats.ttest_rel(rows['Pre_Score'], rows['Post_Score'])
        effect = cohens_d(rows['Pre_Score'], rows['Post_Score'])
        count = len(rows)
        marker = _stars(p_value, '')
        table_rows.append([
            medium,
            str(count),
            f'{rows["Pre_Score"].mean():.1f}',
            f'{rows["Post_Score"].mean():.1f}',
            f'{gain:+.1f}',
            f'{effect:.2f}',
            f'{p_value:.3f}{marker}',
        ])
    table = ax.table(cellText=table_rows,
                     colLabels=['Medium', 'N', 'Pre M', 'Post M', 'Gain', "Cohen's d", 'p-value'],
                     loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1.0, 1.8)
    # Row 0 of the table is the header row.
    for col in range(7):
        table[0, col].set_facecolor('#E0E0E0')
        table[0, col].set_text_props(fontweight='bold')
    for slot, medium in enumerate(MEDIUM_ORDER):
        table[slot + 1, 0].set_facecolor(MEDIUM_COLORS[medium])
        table[slot + 1, 0].set_text_props(color='white', fontweight='bold')
    ax.set_title('F) Statistical Summary', fontweight='bold', pad=20)

    fig.suptitle('Tutoring Effectiveness Dashboard', fontsize=16, fontweight='bold')
    fig.savefig(PLOT_DIR / 'B5_tutoring_dashboard.png')
    plt.close(fig)


# =============================================================================
# C. START-TO-FINISH GAINS
# =============================================================================

def plot_C1_start_to_finish(df):
    """Compare Pre-Reading scores with Post-Tutoring scores.

    Left: one faint line per (participant, topic, medium) plus a bold mean line
    per medium, whose legend entry shows the mean total gain. Right: overlaid
    histograms of the total gain per medium. Only combinations present at both
    time points are included (inner merge). Saves ``C1_start_to_finish.png``.
    """
    keys = ['Participant', 'Topic', 'Medium']

    def _scores_at(phase, column_name):
        chunk = df[df['Zeitpunkt'] == phase][keys + ['Score_Pct']].copy()
        chunk.columns = keys + [column_name]
        return chunk

    journey = _scores_at('Pre-Reading', 'Start').merge(_scores_at('Post-Tutoring', 'End'), on=keys)
    journey['Gain'] = journey['End'] - journey['Start']

    fig, (slope_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 7))

    # Individual trajectories.
    for _, record in journey.iterrows():
        slope_ax.plot([0, 1], [record['Start'], record['End']],
                      color=MEDIUM_COLORS[record['Medium']], alpha=0.25, lw=1)

    # Per-medium means.
    for medium in MEDIUM_ORDER:
        rows = journey[journey['Medium'] == medium]
        start_mean, end_mean = rows['Start'].mean(), rows['End'].mean()
        slope_ax.plot([0, 1], [start_mean, end_mean], color=MEDIUM_COLORS[medium], lw=3.5,
                      marker='o', ms=10, label=f'{medium} ({end_mean-start_mean:+.1f}%)', zorder=10)

    slope_ax.set_xticks([0, 1])
    slope_ax.set_xticklabels(['Pre-Reading\n(Start)', 'Post-Tutoring\n(End)'], fontsize=12)
    slope_ax.set_ylabel('Test Score (%)')
    slope_ax.set_ylim(0, 105)
    slope_ax.legend(title='Medium (total gain)', loc='lower right')
    slope_ax.set_title('Score Trajectory')

    # Gain histograms.
    for medium in MEDIUM_ORDER:
        rows = journey[journey['Medium'] == medium]
        hist_ax.hist(rows['Gain'], bins=10, alpha=0.5, color=MEDIUM_COLORS[medium],
                     label=f'{medium} (M={rows["Gain"].mean():.1f}%)', edgecolor='white')

    hist_ax.axvline(0, color='gray', lw=1, ls='--')
    hist_ax.set_xlabel('Total Gain (%)')
    hist_ax.set_ylabel('Count')
    hist_ax.legend(title='Medium')
    hist_ax.set_title('Gain Distribution')

    fig.suptitle('Start to Finish: Pre-Reading to Post-Tutoring', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'C1_start_to_finish.png')
    plt.close(fig)


def plot_C2_learning_gains(df):
    """Overview of reading, tutoring and total score gains per medium.

    Panels: scatter of reading gain vs tutoring gain, grouped bars of the mean
    reading and tutoring gains, and bars of the mean total gain (Post-Tutoring
    minus Pre-Reading). Error bars are 1.96 * SEM. Saves ``C2_learning_gains.png``.
    """
    wide = df.pivot_table(index=['Participant', 'Topic', 'Medium'],
                          columns='Zeitpunkt', values='Score_Pct').reset_index()

    def _phase(name):
        # A phase column, or 0 when that phase is absent from the data.
        return wide.get(name, 0)

    gains = pd.DataFrame({
        'Medium': wide['Medium'],
        'Reading': _phase('Post-Reading') - _phase('Pre-Reading'),
        'Tutoring': _phase('Post-Tutoring') - _phase('Pre-Tutoring'),
        'Total': _phase('Post-Tutoring') - _phase('Pre-Reading'),
    })

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Scatter
    ax = axes[0]
    for medium in MEDIUM_ORDER:
        rows = gains[gains['Medium'] == medium]
        ax.scatter(rows['Reading'], rows['Tutoring'], color=MEDIUM_COLORS[medium], s=60,
                   alpha=0.7, edgecolors='white', lw=0.5, label=medium)
    ax.axhline(0, color='gray', lw=0.8, alpha=0.5)
    ax.axvline(0, color='gray', lw=0.8, alpha=0.5)
    ax.set_xlabel('Reading Gain (%)')
    ax.set_ylabel('Tutoring Gain (%)')
    ax.legend(title='Medium')
    ax.set_title('Reading vs Tutoring')

    # Bar
    ax = axes[1]
    phase_stats = gains.groupby('Medium')[['Reading', 'Tutoring']].agg(['mean', 'sem'])
    centres = np.arange(3)
    bar_w = 0.35
    for slot, (phase_name, bar_colour) in enumerate([('Reading', '#1976D2'),
                                                     ('Tutoring', '#E65100')]):
        heights = [phase_stats.loc[m, (phase_name, 'mean')] for m in MEDIUM_ORDER]
        half_widths = [phase_stats.loc[m, (phase_name, 'sem')] * 1.96 for m in MEDIUM_ORDER]
        bars = ax.bar(centres + slot * bar_w - bar_w / 2, heights, bar_w, yerr=half_widths,
                      color=bar_colour, alpha=0.8, capsize=4, label=phase_name)
        for bar, value in zip(bars, heights):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, f'{value:.1f}',
                    ha='center', fontsize=9)
    ax.set_xticks(centres)
    ax.set_xticklabels(MEDIUM_ORDER)
    ax.set_ylabel('Score Gain (%)')
    ax.axhline(0, color='gray', lw=0.8)
    ax.legend()
    ax.set_title('Mean Gains by Medium')

    # Total
    ax = axes[2]
    total_stats = gains.groupby('Medium')['Total'].agg(['mean', 'sem'])
    bars = ax.bar(MEDIUM_ORDER,
                  [total_stats.loc[m, 'mean'] for m in MEDIUM_ORDER],
                  color=[MEDIUM_COLORS[m] for m in MEDIUM_ORDER], alpha=0.8,
                  yerr=[total_stats.loc[m, 'sem'] * 1.96 for m in MEDIUM_ORDER], capsize=5)
    for bar, medium in zip(bars, MEDIUM_ORDER):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f'{total_stats.loc[medium,"mean"]:.1f}%',
                ha='center', fontsize=10, fontweight='bold')
    ax.set_ylabel('Total Gain (%)')
    ax.axhline(0, color='gray', lw=0.8)
    ax.set_title('Total Learning Gain')

    fig.suptitle('Learning Gains Overview', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'C2_learning_gains.png')
    plt.close(fig)


# =============================================================================
# D. CONFIDENCE ANALYSIS
# =============================================================================

def plot_D1_confidence_vs_score(df):
    """Scatter average confidence against test score: overall, by phase, by medium.

    The first panel adds a least-squares line and Pearson r, computed on rows
    where both values are present (only when more than two such rows exist).
    The phase panel uses Matplotlib's default colour cycle; the medium panel
    uses ``MEDIUM_COLORS``. Saves ``D1_confidence_vs_score.png``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

    # Overall
    ax = axes[0]
    ax.scatter(df['Score_Pct'], df['Avg_Confidence'], alpha=0.4, s=40, c='#546E7A',
               edgecolors='white', lw=0.5)
    # Use the filtered frame directly; looking the rows up again through their
    # index labels would duplicate rows if the index were not unique.
    valid = df[['Score_Pct', 'Avg_Confidence']].dropna()
    xr, yr = valid['Score_Pct'], valid['Avg_Confidence']
    if len(xr) > 2:
        z = np.polyfit(xr, yr, 1)
        xl = np.linspace(xr.min(), xr.max(), 100)
        ax.plot(xl, np.poly1d(z)(xl), 'r-', lw=2, alpha=0.8)
        r = np.corrcoef(xr, yr)[0, 1]
        ax.text(0.05, 0.95, f'r = {r:.3f}', transform=ax.transAxes, fontsize=12, va='top',
                fontweight='bold', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    ax.set_xlabel('Test Score (%)')
    ax.set_ylabel('Avg Confidence (1-7)')
    ax.set_title('Overall')

    # By phase. The original zipped a palette into a loop variable it never
    # used; it is dropped here so the rendered colours stay exactly as before.
    ax = axes[1]
    for phase in PHASE_ORDER:
        sub = df[df['Zeitpunkt'] == phase]
        ax.scatter(sub['Score_Pct'], sub['Avg_Confidence'], alpha=0.5, s=40,
                   label=PHASE_SHORT[phase], edgecolors='white', lw=0.5)
    ax.legend(fontsize=9, title='Phase')
    ax.set_xlabel('Test Score (%)')
    ax.set_title('By Phase')

    # By medium
    ax = axes[2]
    for m in MEDIUM_ORDER:
        sub = df[df['Medium'] == m]
        ax.scatter(sub['Score_Pct'], sub['Avg_Confidence'], alpha=0.5, s=40,
                   color=MEDIUM_COLORS[m], label=m, edgecolors='white', lw=0.5)
    ax.legend(fontsize=9, title='Medium')
    ax.set_xlabel('Test Score (%)')
    ax.set_title('By Medium')

    fig.suptitle('Confidence vs Test Score', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'D1_confidence_vs_score.png')
    plt.close(fig)


def plot_D2_delta_conf_vs_score(df):
    """Scatter change in confidence against change in score for three spans.

    One panel each for the reading phase, the tutoring phase, and the total
    (Pre-Reading to Post-Tutoring). Points are coloured by medium; a
    least-squares line and Pearson r are added when a panel has more than two
    points. Rows with any missing delta are dropped before plotting, so all
    three panels use the same rows. Saves ``D2_delta_conf_vs_score.png``.
    """
    keys = ['Participant', 'Topic', 'Medium']
    score_wide = df.pivot_table(index=keys, columns='Zeitpunkt', values='Score_Pct')
    conf_wide = df.pivot_table(index=keys, columns='Zeitpunkt', values='Avg_Confidence')

    def _change(table, end_phase, start_phase):
        # Difference between two phase columns; an absent phase counts as 0.
        return table.get(end_phase, 0) - table.get(start_phase, 0)

    deltas = pd.DataFrame({
        'R_S': _change(score_wide, 'Post-Reading', 'Pre-Reading'),
        'R_C': _change(conf_wide, 'Post-Reading', 'Pre-Reading'),
        'T_S': _change(score_wide, 'Post-Tutoring', 'Pre-Tutoring'),
        'T_C': _change(conf_wide, 'Post-Tutoring', 'Pre-Tutoring'),
        'A_S': _change(score_wide, 'Post-Tutoring', 'Pre-Reading'),
        'A_C': _change(conf_wide, 'Post-Tutoring', 'Pre-Reading'),
    }).reset_index().dropna()

    panels = [
        ('R_S', 'R_C', 'Reading Phase'),
        ('T_S', 'T_C', 'Tutoring Phase'),
        ('A_S', 'A_C', 'Total'),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for ax, (score_col, conf_col, title) in zip(axes, panels):
        for medium in MEDIUM_ORDER:
            rows = deltas[deltas['Medium'] == medium]
            ax.scatter(rows[score_col], rows[conf_col], color=MEDIUM_COLORS[medium], s=50,
                       alpha=0.6, edgecolors='white', label=medium)

        x_vals = deltas[score_col].values
        y_vals = deltas[conf_col].values
        if len(x_vals) > 2:
            coeffs = np.polyfit(x_vals, y_vals, 1)
            x_line = np.linspace(x_vals.min(), x_vals.max(), 100)
            ax.plot(x_line, np.poly1d(coeffs)(x_line), 'r-', lw=1.5, alpha=0.7)
            pearson_r = np.corrcoef(x_vals, y_vals)[0, 1]
            ax.text(0.05, 0.95, f'r = {pearson_r:.3f}', transform=ax.transAxes, fontsize=11,
                    va='top', fontweight='bold',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        ax.axhline(0, color='gray', lw=0.8, alpha=0.5)
        ax.axvline(0, color='gray', lw=0.8, alpha=0.5)
        ax.set_xlabel('Score Change (%)')
        ax.set_ylabel('Confidence Change')
        ax.set_title(title)
        ax.legend(title='Medium', fontsize=8)

    fig.suptitle('Do Changes in Confidence Track Changes in Score?', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'D2_delta_conf_vs_score.png')
    plt.close(fig)


def plot_D3_calibration(df):
    """Plot how well self-reported confidence predicts test score.

    Left panel: 'Avg_Confidence' is binned with half-integer edges into the
    ratings 1-7 and the mean 'Score_Pct' per bin is drawn as a bar with a
    1.96 * SEM error bar; bins with fewer than 3 observations are omitted.

    Right panel: for every phase in PHASE_ORDER with at least 5 rows, the mean
    score per coarse confidence level (Low / Med / High) is drawn as a line.
    Every phase is aligned to the same fixed list of levels, so the x-axis
    order is always Low, Med, High; a level with no observations in a phase
    shows as a gap instead of reordering the axis or being bridged by a line.

    The figure is saved as 'D3_calibration.png' in PLOT_DIR.

    Args:
        df: Long-format test data with columns 'Score_Pct', 'Avg_Confidence'
            and 'Zeitpunkt'. Rows with a missing value in any of these are
            ignored.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    dc = df[['Score_Pct', 'Avg_Confidence', 'Zeitpunkt']].dropna().copy()

    # --- Left: overall calibration per integer rating ---
    dc['Bin'] = pd.cut(dc['Avg_Confidence'],
                       bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5],
                       labels=list('1234567'))
    cal = dc.groupby('Bin', observed=True)['Score_Pct'].agg(['mean', 'sem', 'count'])
    cal = cal[cal['count'] >= 3]
    ax1.bar(cal.index.astype(str), cal['mean'], yerr=cal['sem'] * 1.96, capsize=4,
            color='#5C6BC0', alpha=0.8, edgecolor='white')
    for idx, row in cal.iterrows():
        ax1.text(idx, row['mean'] + 2, f'n={int(row["count"])}',
                 ha='center', fontsize=8, color='gray')
    ax1.set_xlabel('Confidence Rating')
    ax1.set_ylabel('Mean Test Score (%)')
    ax1.set_title('Overall Calibration')

    # --- Right: calibration per phase on coarse levels ---
    pcol = {'Pre-Reading': '#E8EAF6', 'Post-Reading': '#9FA8DA',
            'Pre-Tutoring': '#5C6BC0', 'Post-Tutoring': '#283593'}
    level_labels = ['Low (1-2)', 'Med (3-4)', 'High (5-7)']
    for phase in PHASE_ORDER:
        sub = dc[dc['Zeitpunkt'] == phase]
        if len(sub) < 5:
            continue
        levels = pd.cut(sub['Avg_Confidence'], bins=[0.5, 2.5, 4.5, 7.5], labels=level_labels)
        ms = sub.groupby(levels, observed=True)['Score_Pct'].mean()
        # Matplotlib orders categorical ticks by first appearance, so every
        # phase must be plotted against the full, fixed list of levels.
        # Levels without data become NaN and are left as gaps.
        ms.index = ms.index.astype(str)
        ms = ms.reindex(level_labels)
        ax2.plot(level_labels, ms.to_numpy(), marker='o', lw=2, ms=8,
                 color=pcol[phase], label=PHASE_SHORT[phase])
    ax2.set_xlabel('Confidence Level')
    ax2.set_ylabel('Mean Test Score (%)')
    ax2.legend(title='Phase')
    ax2.set_title('Calibration by Phase')

    fig.suptitle('Confidence Calibration', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'D3_calibration.png')
    plt.close(fig)


# =============================================================================
# E. PERSONALITY CORRELATIONS
# =============================================================================

def plot_E1_personality_correlations(merged):
    """Draw a heatmap of Pearson correlations between traits and outcomes.

    For every trait in TRAIT_ORDER and each of seven per-participant outcome
    columns, Pearson r and its p-value are computed over the rows where both
    values are present. Pairs with three or fewer complete rows get r = NaN
    and p = 1. Cells with p < .05 are marked '*', cells with p < .01 '**'.
    The figure is saved as 'E1_personality_correlations.png' in PLOT_DIR.

    Args:
        merged: One row per participant, containing a numeric column for each
            trait in TRAIT_ORDER plus the 'Mean_*' outcome columns used here.
    """
    outcome_columns = [
        ('Mean_Score_Gain', 'Tutor\nScore Gain'),
        ('Mean_Conf_Gain', 'Tutor\nConf Gain'),
        ('Mean_Total_Gain', 'Total\nGain'),
        ('Mean_Pre_Score', 'Pre-Tutor\nScore'),
        ('Mean_Post_Score', 'Post-Tutor\nScore'),
        ('Mean_Pre_Conf', 'Pre-Tutor\nConf'),
        ('Mean_Post_Conf', 'Post-Tutor\nConf'),
    ]
    outcomes = [column for column, _ in outcome_columns]
    labels = [label for _, label in outcome_columns]

    corr = np.zeros((5, 7))
    pvals = np.zeros_like(corr)
    for row, trait in enumerate(TRAIT_ORDER):
        trait_vals = merged[trait].values
        for col, outcome in enumerate(outcomes):
            outcome_vals = merged[outcome].values
            complete = ~(np.isnan(trait_vals) | np.isnan(outcome_vals))
            if complete.sum() > 3:
                r, p = stats.pearsonr(trait_vals[complete], outcome_vals[complete])
                corr[row, col] = r
                pvals[row, col] = p
            else:
                # Too few complete pairs: blank cell, never flagged significant.
                corr[row, col] = np.nan
                pvals[row, col] = 1

    fig, ax = plt.subplots(figsize=(12, 6))
    corr_table = pd.DataFrame(corr, index=TRAIT_ORDER, columns=labels)
    sns.heatmap(corr_table, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-0.7, vmax=0.7, ax=ax, linewidths=1,
                cbar_kws={'label': 'Pearson r'})

    # Significance stars sit just below the annotated r value in each cell.
    for row, col in np.ndindex(corr.shape):
        p = pvals[row, col]
        if p < .01:
            star = '**'
        elif p < .05:
            star = '*'
        else:
            continue
        ax.text(col + 0.5, row + 0.75, star, ha='center', va='center',
                fontsize=12, fontweight='bold', color='black')

    ax.set_title('Big Five Traits vs Tutoring Outcomes (* p<.05, ** p<.01)', fontsize=13, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'E1_personality_correlations.png')
    plt.close(fig)


def plot_E2_trait_vs_score_gain(merged):
    """Scatter each Big Five trait against the mean tutoring score gain.

    One panel per trait in TRAIT_ORDER, sharing the y-axis. Every point is a
    participant and is labelled with the 'Participant' value. When more than
    three complete pairs exist, a least-squares line and a box with Pearson r
    and p (suffixed with '*' if p < .05) are added. The figure is saved as
    'E2_trait_vs_score_gain.png' in PLOT_DIR.

    Args:
        merged: One row per participant with 'Participant',
            'Mean_Score_Gain' and one numeric column per trait.
    """
    gain_col = 'Mean_Score_Gain'
    fig, axes = plt.subplots(1, 5, figsize=(22, 5), sharey=True)

    for ax, trait in zip(axes, TRAIT_ORDER):
        colour = TRAIT_COLORS[trait]
        trait_vals = merged[trait].values
        gain_vals = merged[gain_col].values
        complete = ~(np.isnan(trait_vals) | np.isnan(gain_vals))
        xs, ys = trait_vals[complete], gain_vals[complete]

        ax.scatter(xs, ys, s=60, color=colour, alpha=0.7, edgecolors='white', lw=0.5)

        # Label every plotted point with its participant id.
        labelled = merged.dropna(subset=[trait, gain_col])
        for pid, x, y in zip(labelled['Participant'], labelled[trait], labelled[gain_col]):
            ax.annotate(pid, (x, y), fontsize=7, alpha=0.5,
                        textcoords="offset points", xytext=(3, 3))

        if complete.sum() > 3:
            r, p = stats.pearsonr(xs, ys)
            coeffs = np.polyfit(xs, ys, 1)
            grid = np.linspace(xs.min(), xs.max(), 100)
            ax.plot(grid, np.poly1d(coeffs)(grid), color=colour, lw=2, alpha=0.6)
            sig = '*' if p < .05 else ''
            ax.text(0.05, 0.95, f'r={r:.2f} p={p:.3f}{sig}', transform=ax.transAxes,
                    fontsize=10, va='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        ax.axhline(0, color='gray', lw=0.5, alpha=0.5)
        ax.set_xlabel(trait, fontsize=11, fontweight='bold', color=colour)
        ax.set_xlim(1, 7)

    axes[0].set_ylabel('Mean Tutoring Score Gain (%)', fontsize=11)
    fig.suptitle('Big Five Traits vs Tutoring Score Gains', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(PLOT_DIR / 'E2_trait_vs_score_gain.png')
    plt.close(fig)


# =============================================================================
# STATS EXPORT
# =============================================================================

def _export_overall_trajectory(df):
    """Write per-phase mean and SEM of score and confidence to overall_trajectory.csv."""
    phase_means = df.groupby('Zeitpunkt', observed=True).agg(
        Mean_Score=('Score_Pct', 'mean'),
        SEM_Score=('Score_Pct', 'sem'),
        Mean_Confidence=('Avg_Confidence', 'mean'),
        SEM_Confidence=('Avg_Confidence', 'sem'),
    ).reindex(PHASE_ORDER)
    phase_means.index.name = 'Phase'
    phase_means.to_csv(STATS_DIR / 'overall_trajectory.csv', float_format='%.3f')


def _export_start_to_finish(df):
    """Write the Pre-Reading to Post-Tutoring gain and its one-sample t-test to start_to_finish.csv."""
    id_cols = ['Participant', 'Topic', 'Medium']
    pre_r = df[df['Zeitpunkt'] == 'Pre-Reading'][id_cols + ['Score_Pct']].copy()
    pre_r.columns = id_cols + ['Start']
    post_t = df[df['Zeitpunkt'] == 'Post-Tutoring'][id_cols + ['Score_Pct']].copy()
    post_t.columns = id_cols + ['End']
    sf = pre_r.merge(post_t, on=id_cols)
    sf['Gain'] = sf['End'] - sf['Start']

    # Report N for exactly the sample the t-test sees: rows whose gain is
    # missing are excluded from the test and must not be counted either.
    gains = sf['Gain'].dropna()
    t_sf, p_sf = stats.ttest_1samp(gains, 0)
    sf_summary = pd.DataFrame([{
        'Metric': 'Pre-Reading to Post-Tutoring',
        'N': len(gains),
        'Gain_Mean': gains.mean(),
        'Gain_SD': gains.std(),
        't_stat': t_sf,
        'p_value': p_sf,
    }])
    sf_summary.to_csv(STATS_DIR / 'start_to_finish.csv', index=False, float_format='%.3f')


def _export_tutoring_by_medium(df, paired):
    """Write per-medium descriptive stats, paired t-tests and effect sizes to tutoring_by_medium.csv."""
    med_rows = []
    for m in MEDIUM_ORDER:
        sub = paired[paired['Medium'] == m]
        # Paired tests compare pre against post (in that argument order).
        t_val, p_val = stats.ttest_rel(sub['Pre_Score'], sub['Post_Score'])
        d_score = cohens_d(sub['Pre_Score'], sub['Post_Score'])
        t_c, p_c = stats.ttest_rel(sub['Pre_Conf'], sub['Post_Conf'])
        d_conf = cohens_d(sub['Pre_Conf'], sub['Post_Conf'])
        # "Avg_*" columns describe all tests of this medium, not only the
        # paired tutoring rows.
        all_m = df[df['Medium'] == m]
        med_rows.append({
            'Medium': m,
            'N_pairs': len(sub),
            'Avg_Score_Mean': all_m['Score_Pct'].mean(),
            'Avg_Score_SD': all_m['Score_Pct'].std(),
            'Avg_Conf_Mean': all_m['Avg_Confidence'].mean(),
            'Avg_Conf_SD': all_m['Avg_Confidence'].std(),
            'Pre_Score_Mean': sub['Pre_Score'].mean(),
            'Post_Score_Mean': sub['Post_Score'].mean(),
            'Score_Gain_Mean': sub['Score_Gain'].mean(),
            'Score_Gain_SD': sub['Score_Gain'].std(),
            'Score_Cohens_d': d_score,
            'Score_t': t_val,
            'Score_p': p_val,
            'Pre_Conf_Mean': sub['Pre_Conf'].mean(),
            'Post_Conf_Mean': sub['Post_Conf'].mean(),
            'Conf_Gain_Mean': sub['Conf_Gain'].mean(),
            'Conf_Gain_SD': sub['Conf_Gain'].std(),
            'Conf_Cohens_d': d_conf,
            'Conf_t': t_c,
            'Conf_p': p_c,
        })
    pd.DataFrame(med_rows).to_csv(STATS_DIR / 'tutoring_by_medium.csv', index=False, float_format='%.3f')


def _export_tutoring_by_topic(df, paired):
    """Write per-topic descriptive stats, paired t-test and effect size to tutoring_by_topic.csv."""
    topic_rows = []
    for topic in df['Topic'].unique():
        sub_t = paired[paired['Topic'] == topic]
        all_t = df[df['Topic'] == topic]
        t_val, p_val = stats.ttest_rel(sub_t['Pre_Score'], sub_t['Post_Score'])
        d_score = cohens_d(sub_t['Pre_Score'], sub_t['Post_Score'])
        topic_rows.append({
            'Topic': topic,
            'N_pairs': len(sub_t),
            'Avg_Score_Mean': all_t['Score_Pct'].mean(),
            'Avg_Score_SD': all_t['Score_Pct'].std(),
            'Avg_Conf_Mean': all_t['Avg_Confidence'].mean(),
            'Avg_Conf_SD': all_t['Avg_Confidence'].std(),
            'Score_Gain_Mean': sub_t['Score_Gain'].mean(),
            'Score_Gain_SD': sub_t['Score_Gain'].std(),
            'Score_Cohens_d': d_score,
            'Score_t': t_val,
            'Score_p': p_val,
        })
    pd.DataFrame(topic_rows).to_csv(STATS_DIR / 'tutoring_by_topic.csv', index=False, float_format='%.3f')


def _export_participant_summary(df, paired):
    """Write one row per participant with phase means and gains to participant_summary.csv."""
    nan = float('nan')
    part_rows = []
    # Participant ids are sorted by the integer that follows their first character.
    for pid in sorted(df['Participant'].unique(), key=lambda x: int(x[1:])):
        sub_df = df[df['Participant'] == pid]
        sub_p = paired[paired['Participant'] == pid]
        phases = sub_df.groupby('Zeitpunkt', observed=True)['Score_Pct'].mean().reindex(PHASE_ORDER)
        pre_reading = phases.get('Pre-Reading', nan)
        post_reading = phases.get('Post-Reading', nan)
        part_rows.append({
            'Participant': pid,
            'N_Tests': len(sub_df),
            'Avg_Score_Mean': sub_df['Score_Pct'].mean(),
            'Avg_Conf_Mean': sub_df['Avg_Confidence'].mean(),
            'Pre_Reading': pre_reading,
            'Post_Reading': post_reading,
            'Pre_Tutoring': phases.get('Pre-Tutoring', nan),
            'Post_Tutoring': phases.get('Post-Tutoring', nan),
            'Reading_Gain': post_reading - pre_reading,
            'Tutoring_Gain': sub_p['Score_Gain'].mean() if len(sub_p) else nan,
        })
    pd.DataFrame(part_rows).to_csv(STATS_DIR / 'participant_summary.csv', index=False, float_format='%.3f')


def _export_tutoring_by_medium_topic(df, paired):
    """Write tutoring score gain (N, mean, SD, SEM) per medium x topic cell to tutoring_by_medium_topic.csv."""
    nan = float('nan')
    mt_rows = []
    for m in MEDIUM_ORDER:
        for topic in df['Topic'].unique():
            sub = paired[(paired['Medium'] == m) & (paired['Topic'] == topic)]
            has_rows = len(sub) > 0
            mt_rows.append({
                'Medium': m,
                'Topic': topic,
                'N': len(sub),
                'Score_Gain_Mean': sub['Score_Gain'].mean() if has_rows else nan,
                'Score_Gain_SD': sub['Score_Gain'].std() if has_rows else nan,
                'Score_Gain_SEM': sub['Score_Gain'].sem() if has_rows else nan,
            })
    pd.DataFrame(mt_rows).to_csv(STATS_DIR / 'tutoring_by_medium_topic.csv', index=False, float_format='%.3f')


def export_stats(df, paired):
    """Export the summary statistics behind the plots as CSV files in STATS_DIR.

    Writes six files: overall_trajectory.csv, start_to_finish.csv,
    tutoring_by_medium.csv, tutoring_by_topic.csv, participant_summary.csv
    and tutoring_by_medium_topic.csv. All floats are written with three
    decimals.

    Args:
        df: Long-format test data with columns 'Participant', 'Topic',
            'Medium', 'Zeitpunkt', 'Score_Pct' and 'Avg_Confidence'.
        paired: One row per paired pre/post tutoring measurement with columns
            'Participant', 'Topic', 'Medium', 'Pre_Score', 'Post_Score',
            'Score_Gain', 'Pre_Conf', 'Post_Conf' and 'Conf_Gain'.
    """
    _export_overall_trajectory(df)
    _export_start_to_finish(df)
    _export_tutoring_by_medium(df, paired)
    _export_tutoring_by_topic(df, paired)
    _export_participant_summary(df, paired)
    _export_tutoring_by_medium_topic(df, paired)

    print(f"  Stats exported to: {STATS_DIR}")


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Load the experiment data, generate every plot, then export the statistics.

    Steps:
      1. Load the test data, build the paired tutoring table and load the
         personality data.
      2. Aggregate tutoring outcomes per participant, add the mean
         Pre-Reading to Post-Tutoring gain, and inner-join with the
         personality data (participants without personality data are
         therefore left out of the section E plots).
      3. Run every plot function section by section, printing progress.
      4. Export the summary statistics via export_stats.
    """
    print("Loading data...")
    df = load_data()
    paired = build_paired_tutoring(df)
    personality = load_personality()

    # Build merged for personality analysis
    p_agg = paired.groupby('Participant').agg(
        Mean_Score_Gain=('Score_Gain','mean'), Mean_Conf_Gain=('Conf_Gain','mean'),
        Mean_Pre_Score=('Pre_Score','mean'), Mean_Post_Score=('Post_Score','mean'),
        Mean_Pre_Conf=('Pre_Conf','mean'), Mean_Post_Conf=('Post_Conf','mean'),
    ).reset_index()
    # Total gain: Post-Tutoring minus Pre-Reading score, averaged per participant
    pre_r = df[df['Zeitpunkt']=='Pre-Reading'][['Participant','Topic','Medium','Score_Pct']].copy()
    pre_r.columns = ['Participant','Topic','Medium','PreRead']
    post_t = df[df['Zeitpunkt']=='Post-Tutoring'][['Participant','Topic','Medium','Score_Pct']].copy()
    post_t.columns = ['Participant','Topic','Medium','PostTutor']
    total = pre_r.merge(post_t, on=['Participant','Topic','Medium'])
    total['TotalGain'] = total['PostTutor'] - total['PreRead']
    tg = total.groupby('Participant')['TotalGain'].mean().reset_index()
    tg.columns = ['Participant','Mean_Total_Gain']
    # Left join keeps participants that have tutoring pairs but no total gain.
    p_agg = p_agg.merge(tg, on='Participant', how='left')
    merged = p_agg.merge(personality, on='Participant', how='inner')

    print(f"  {len(df)} test entries, {paired['Participant'].nunique()} participants, "
          f"{len(merged)} with personality data\n")

    # Generate all plots; each entry is (code, description, zero-argument callable)
    sections = [
        ("A. Overall Learning Trajectory", [
            ("A1", "Overall trajectory (score + confidence)", lambda: plot_A1_trajectory(df)),
            ("A2", "Trajectory by medium", lambda: plot_A2_trajectory_by_medium(df)),
            ("A3", "Trajectory by topic", lambda: plot_A3_trajectory_by_topic(df)),
            ("A4", "Participant-level heatmaps", lambda: plot_A4_heatmap(df)),
        ]),
        ("B. Tutoring Phase Deep-Dive", [
            ("B1", "Paired slopes by medium (with stats)", lambda: plot_B1_tutoring_slopes_by_medium(paired)),
            ("B2", "Paired slopes by topic (with stats)", lambda: plot_B2_tutoring_slopes_by_topic(paired)),
            ("B3", "Tutoring gain by medium (effect sizes)", lambda: plot_B3_tutoring_gain_by_medium(paired)),
            ("B4", "Medium x topic interaction", lambda: plot_B4_tutoring_medium_topic(paired)),
            ("B5", "Tutoring effectiveness dashboard", lambda: plot_B5_tutoring_dashboard(paired)),
        ]),
        ("C. Start-to-Finish Gains", [
            ("C1", "Pre-Reading to Post-Tutoring paired", lambda: plot_C1_start_to_finish(df)),
            ("C2", "Learning gains overview", lambda: plot_C2_learning_gains(df)),
        ]),
        ("D. Confidence Analysis", [
            ("D1", "Confidence vs test score scatter", lambda: plot_D1_confidence_vs_score(df)),
            ("D2", "Change in confidence vs change in score", lambda: plot_D2_delta_conf_vs_score(df)),
            ("D3", "Confidence calibration", lambda: plot_D3_calibration(df)),
        ]),
        ("E. Personality Correlations", [
            ("E1", "Big Five vs tutoring outcomes heatmap", lambda: plot_E1_personality_correlations(merged)),
            ("E2", "Trait vs tutoring score gain", lambda: plot_E2_trait_vs_score_gain(merged)),
        ]),
    ]

    # Count plots as they are produced so the summary line cannot drift out
    # of sync when entries are added to or removed from `sections`.
    n_plots = 0
    for section_name, plots in sections:
        print(f"{section_name}")
        for code, desc, fn in plots:
            fn()
            n_plots += 1
            print(f"  [{code}] {desc}")

    print(f"\n{n_plots} plots saved to: {PLOT_DIR}")

    print("\nExporting statistics...")
    export_stats(df, paired)
    print("Done.")


# Run the plotting and stats pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()