added simple outlier detection

This commit is contained in:
tom.hempel
2025-09-24 17:35:06 +02:00
parent 4e6a6c999d
commit c332161a5e
2 changed files with 77 additions and 6 deletions

View File

@@ -77,6 +77,42 @@ def read_marks(csv_path: Path) -> pd.Series:
return ts
def clean_rr_ms(rr_df: pd.DataFrame, col: str = 'rr_ms', source_name: str | None = None) -> pd.DataFrame:
"""Basic NN editing for RR in ms with interpolation and reporting.
Steps:
- Coerce to numeric and mark non-finite as NaN (count)
- Mark out-of-range [300, 2000] ms as NaN (count)
- Mark robust outliers via 15s rolling median/MAD (z > 3.5) as NaN (count)
- Time-based interpolation to fill flagged values (then ffill/bfill)
- Print counts summary
"""
if rr_df is None or rr_df.empty or col not in rr_df.columns:
return rr_df
df = rr_df.copy()
df[col] = pd.to_numeric(df[col], errors='coerce')
# Track flags (only threshold filtering per request)
nonfinite_mask = ~pd.notna(df[col])
range_mask = (df[col] < 300) | (df[col] > 2000)
# Combine flags: non-finite or out-of-range
flagged = nonfinite_mask | range_mask
# Set flagged to NaN for interpolation
df.loc[flagged, col] = np.nan
# Interpolate in time, then ffill/bfill for edges
if isinstance(df.index, pd.DatetimeIndex):
df[col] = df[col].interpolate(method='time', limit_direction='both')
else:
df[col] = df[col].interpolate(limit_direction='both')
df[col] = df[col].ffill().bfill()
# Reporting
if source_name is None:
source_name = 'RR cleaning'
print(f"{source_name} - RR filter: nonfinite={int(nonfinite_mask.sum())}, out_of_range={int(range_mask.sum())}, total_flagged={int(flagged.sum())}")
return df
def segment_bounds_from_marks(marks: pd.Series, start_ts: pd.Timestamp, end_ts: pd.Timestamp) -> list[tuple[pd.Timestamp, pd.Timestamp]]:
"""Create segments between consecutive marks, plus the final segment from last mark to end.
@@ -242,6 +278,7 @@ def process_recording(rec_dir: Path, plots_root: Path) -> None:
hr_df = read_signal_csv(hr_csv, 'hr')
rr_df = read_signal_csv(rr_csv, 'rr_ms')
rr_df = clean_rr_ms(rr_df, 'rr_ms')
marks = read_marks(ts_csv)
if hr_df.empty and rr_df.empty: