Module 3.2: Causal Prediction & Forecasting

Estimated time: 20 min · Prerequisites: previous modules

What You'll Learn

  1. Why causal features make better predictions
  2. Using the Models class for prediction
  3. Causal feature selection
  4. Comparing causal vs. non-causal predictions

Why Causal Prediction?

Standard ML uses ALL correlated features. But:

  • Spurious correlations break under distribution shift
  • Using effects as predictors can hurt generalization
  • Causal features are more robust

Key insight: Using only CAUSAL predictors often beats using ALL predictors!

Setup: Create Prediction Scenario

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from tigramite import data_processing as pp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests.parcorr import ParCorr
from tigramite.models import Models
from tigramite.toymodels import structural_causal_processes as toys

# Create a system where we want to predict Y
np.random.seed(42)

def lin_f(x): return x

# X0, X1 are TRUE causes of X2 (our target)
# X3 is correlated with X2 but NOT a cause (effect of X2!)
links = {
    0: [((0, -1), 0.7, lin_f)],
    1: [((1, -1), 0.6, lin_f)],
    2: [((2, -1), 0.3, lin_f), ((0, -1), 0.5, lin_f), ((1, -2), 0.4, lin_f)],  # Target
    3: [((3, -1), 0.5, lin_f), ((2, -1), 0.6, lin_f)],  # X3 is EFFECT of X2!
}

T = 1000
data, _ = toys.structural_causal_process(links, T=T, seed=42)
var_names = ['Feature_A', 'Feature_B', 'Target', 'Spurious']

# Prediction scenario:
# - Feature_A (X0) CAUSES Target (X2) at lag 1
# - Feature_B (X1) CAUSES Target (X2) at lag 2
# - Spurious (X3) is CORRELATED but NOT a cause!
# - (Spurious is actually CAUSED BY Target)
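
Before any formal testing, it helps to eyeball the four generated series. A quick sketch using tigramite's plotting helper (plain matplotlib would work just as well):

import tigramite.plotting as tp

# Visual sanity check of the four generated series
tp.plot_timeseries(dataframe=pp.DataFrame(data, var_names=var_names))
plt.show()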

The Spurious Correlation Trap

# Check lag-1 cross-correlations - Spurious looks predictive!
print("Lag-1 correlations with next-step Target:")
for i, name in enumerate(var_names):
    if i != 2:  # Skip target itself
        corr = np.corrcoef(data[:-1, i], data[1:, 2])[0, 1]
        print(f"  {name}: {corr:.3f}")

# Spurious has HIGH correlation but is NOT a valid predictor!

Warning: Spurious shows a high correlation with Target, but using it as a predictor would be misleading - it's an EFFECT of Target, not a cause!

Step 1: Discover Causal Parents

# First, discover the causal structure
dataframe = pp.DataFrame(data, var_names=var_names)
parcorr = ParCorr(significance='analytic')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=0)
results = pcmci.run_pcmciplus(tau_max=5, pc_alpha=0.05)

# Find causal parents of Target (index 2)
target_idx = 2
print("Discovered causal parents of Target:")
for i in range(4):
    for tau in range(1, 6):  # lagged links only; tau=0 is of no use for forecasting
        # graph[i, j, tau] == '-->' means variable i at lag tau causes variable j
        if results['graph'][i, target_idx, tau] == '-->':
            val = results['val_matrix'][i, target_idx, tau]
            print(f"  {var_names[i]}(t-{tau}) → Target: strength={val:.3f}")

Step 2: Compare Prediction Approaches

# Split data into train/test
train_end = 800

# Approach 1: Use ALL features (standard ML)
def create_features_all(data, target_idx=2, max_lag=5):
    """Create lagged features from ALL variables"""
    T = data.shape[0]
    features = []
    for lag in range(1, max_lag + 1):
        for var in range(data.shape[1]):
            # Variable `var` at time t-lag, aligned with Target at time t
            features.append(data[max_lag - lag : T - lag, var])
    X = np.column_stack(features)   # shape (T - max_lag, n_vars * max_lag)
    y = data[max_lag:, target_idx]  # Target at time t
    return X, y

# Approach 2: Use only CAUSAL features
def create_features_causal(data, target_idx=2):
    """Create features from only the discovered causal parents"""
    # Based on our discovered graph: Feature_A(t-1) and Feature_B(t-2)
    X = np.column_stack([
        data[1:-1, 0],  # Feature_A at lag 1
        data[:-2, 1],   # Feature_B at lag 2
    ])
    y = data[2:, target_idx]  # Target at time t
    return X, y

# Train and evaluate
X_all, y_all = create_features_all(data)
X_causal, y_causal = create_features_causal(data)

# All-features model (feature rows start at t = max_lag = 5,
# so index train_end-5 corresponds to time train_end)
model_all = LinearRegression()
model_all.fit(X_all[:train_end-5], y_all[:train_end-5])
pred_all = model_all.predict(X_all[train_end-5:])
mse_all = mean_squared_error(y_all[train_end-5:], pred_all)

# Causal-features model (feature rows start at t = 2)
model_causal = LinearRegression()
model_causal.fit(X_causal[:train_end-2], y_causal[:train_end-2])
pred_causal = model_causal.predict(X_causal[train_end-2:])
mse_causal = mean_squared_error(y_causal[train_end-2:], pred_causal)

print("Prediction Results:")
print(f"  All features (20 total): MSE = {mse_all:.4f}")
print(f"  Causal features only (2): MSE = {mse_causal:.4f}")
print(f"\nCausal features use 10% of variables but similar/better performance!")

Using Tigramite's Models Class

# Extract causal parents from the graph
def get_causal_parents(results, target_idx):
    """Extract the lagged causal parents of target_idx from PCMCI results"""
    parents = {target_idx: []}
    n_vars, _, n_lags = results['graph'].shape
    for i in range(n_vars):
        for tau in range(1, n_lags):  # lagged links only
            # graph[i, j, tau] == '-->' means i at lag tau causes j
            if results['graph'][i, target_idx, tau] == '-->':
                parents[target_idx].append((i, -tau))
    return parents

parents = get_causal_parents(results, target_idx=2)
print(f"Causal parents of Target: {parents}")

# Use the Models class (already imported above) to fit Target on its causal parents
model = Models(
    dataframe=dataframe,
    model=LinearRegression(),
    data_transform=None,
    verbosity=0
)

# Fit Target on its discovered causal parents
# (in tigramite 5.x this method is fit_full_model with selected_variables)
model.fit_full_model(
    all_parents=parents,
    selected_variables=[2],  # fit a model for Target only
    tau_max=5
)

print(f"Model fitted with {len(parents[2])} causal parent(s)")
# For linear models, the fitted coefficients per parent can be inspected:
print("Fitted coefficients:", model.get_coefs())

Key Takeaways

  1. Causal features = causes of the target (not just correlations)
  2. Fewer features, similar/better performance - causal selection is efficient
  3. More robust - causal predictors work under distribution shift
  4. Avoid reverse causation - don't use effects of the target as predictors!
  5. Use PCMCI first to identify which variables are valid predictors