author    Dennis Kobert <dennis@kobert.dev>    2025-03-10 21:36:30 +0100
committer Dennis Kobert <dennis@kobert.dev>    2025-03-25 15:52:04 +0100
commit    776dee32aca73ded5c0720afbe25ef2aae7b67f3
tree      1e4c8b9bde5c944869134e0ed751315efe09198a /power_predictor.py
parent    c2d63b96cbbd3be653fa0023fadae7df3babaf32
Implement benchmarking
Diffstat (limited to 'power_predictor.py')
-rw-r--r--  power_predictor.py  235
1 file changed, 235 insertions, 0 deletions
diff --git a/power_predictor.py b/power_predictor.py
new file mode 100644
index 0000000..67c73ae
--- /dev/null
+++ b/power_predictor.py
@@ -0,0 +1,235 @@
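+"""Predict CPU package power from frequency and perf-counter samples.
+
+Pipeline sketch: load CSV -> KNN-impute gaps -> rank features by mutual
+information -> train a small PyTorch MLP -> report MSE/RMSE/MAE/R².
+"""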
+import pandas as pd
+import numpy as np
+from sklearn.impute import KNNImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.feature_selection import mutual_info_regression
+from sklearn.model_selection import train_test_split
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+import matplotlib.pyplot as plt
+
+# Step 1: Load and prepare the data
+def load_data(csv_path):
+ # Read the CSV file
+ df = pd.read_csv(csv_path)
+
+ # Extract target (power usage)
+ y = df['package_power_j'].values
+
+    # Extract features (CPU frequency and performance counters),
+    # skipping the timestamp, duration_ms, and package_power_j columns
+    X = df.iloc[:, 3:].copy()  # Starting from cpu_frequency_mhz
+
+ # Print information about the dataset
+ print(f"Loaded dataset with {X.shape[0]} samples and {X.shape[1]} features")
+ print(f"Feature names: {X.columns.tolist()}")
+
+ return X, y
+
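+# Assumed CSV layout (illustrative; the actual counter names may differ):
+#   timestamp, duration_ms, package_power_j, cpu_frequency_mhz, <counters...>
+# The target occupies column index 2, hence features start at index 3 above.
+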
+# Step 2: Handle missing values using KNN imputation
+def impute_missing_values(X):
+ # Replace empty strings with NaN
+ X = X.replace('', np.nan)
+
+ # Convert all values to float
+ X = X.astype(float)
+
+ # Count missing values per column
+ missing_counts = X.isna().sum()
+ print("Missing values per column:")
+ for col, count in missing_counts.items():
+ if count > 0:
+ print(f" {col}: {count} ({count/len(X)*100:.1f}%)")
+
+ # Impute missing values using KNN
+ imputer = KNNImputer(n_neighbors=5)
+ X_imputed = imputer.fit_transform(X)
+
+ return pd.DataFrame(X_imputed, columns=X.columns)
+
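+# Note: KNNImputer computes neighbor distances on the raw feature scale, so
+# large-magnitude counters dominate the metric; scaling before imputation
+# (e.g. with StandardScaler) is a common alternative worth evaluating here.
+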
+# Step 3: Feature importance analysis
+def analyze_feature_importance(X, y):
+ # Calculate mutual information scores
+ mi_scores = mutual_info_regression(X, y)
+
+ # Create a DataFrame of features and their importance scores
+ feature_importance = pd.DataFrame({
+ 'Feature': X.columns,
+ 'Importance': mi_scores
+ })
+
+ # Sort by importance
+ feature_importance = feature_importance.sort_values('Importance', ascending=False)
+
+    # Plot feature importance (invert the y-axis so the top feature sits on top,
+    # since barh draws the first row at the bottom)
+    plt.figure(figsize=(10, 6))
+    plt.barh(feature_importance['Feature'][:10], feature_importance['Importance'][:10])
+    plt.gca().invert_yaxis()
+ plt.xlabel('Mutual Information Score')
+ plt.title('Top 10 Most Important Features')
+ plt.tight_layout()
+ plt.savefig('feature_importance.png')
+
+ print("\nTop 5 most important features:")
+ for i, (_, row) in enumerate(feature_importance.head(5).iterrows(), 1):
+ print(f"{i}. {row['Feature']} - importance: {row['Importance']:.4f}")
+
+ return feature_importance
+
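+# Minimal sketch (hypothetical helper, not called from main) of pruning the
+# feature set to the k highest-scoring columns from the table above; k=10 is
+# an arbitrary choice.
+def select_top_features(X, feature_importance, k=10):
+    top = feature_importance['Feature'].head(k).tolist()
+    return X[top]
+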
+# Step 4: Define the neural network model in PyTorch
+class PowerEstimator(nn.Module):
+ def __init__(self, input_size):
+ super(PowerEstimator, self).__init__()
+ self.model = nn.Sequential(
+ nn.Linear(input_size, 64),
+ nn.ReLU(),
+ nn.Dropout(0.2),
+ nn.Linear(64, 32),
+ nn.ReLU(),
+ nn.Linear(32, 1)
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
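+# Shape check (illustrative): with 20 input features,
+#   PowerEstimator(20)(torch.randn(4, 20)).shape == (4, 1),
+# which lines up with the (-1, 1) target tensors built in train_model below.
+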
+# Step 5: Train the model
+def train_model(X, y, batch_size=32, epochs=100, lr=0.001):
+ # Split data into training and validation sets
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # Scale the features
+ scaler = StandardScaler()
+ X_train_scaled = scaler.fit_transform(X_train)
+ X_val_scaled = scaler.transform(X_val)
+
+ # Convert to PyTorch tensors
+ X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
+ y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
+ X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
+ y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
+
+ # Create data loaders
+ train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+ val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
+ val_loader = DataLoader(val_dataset, batch_size=batch_size)
+
+ # Initialize model, loss function, and optimizer
+ model = PowerEstimator(X_train.shape[1])
+ criterion = nn.MSELoss()
+ optimizer = optim.Adam(model.parameters(), lr=lr)
+
+ # Training loop
+ train_losses = []
+ val_losses = []
+
+ for epoch in range(epochs):
+ # Training
+ model.train()
+ train_loss = 0.0
+ for inputs, targets in train_loader:
+ optimizer.zero_grad()
+ outputs = model(inputs)
+ loss = criterion(outputs, targets)
+ loss.backward()
+ optimizer.step()
+ train_loss += loss.item() * inputs.size(0)
+
+ train_loss /= len(train_loader.dataset)
+ train_losses.append(train_loss)
+
+ # Validation
+ model.eval()
+ val_loss = 0.0
+ with torch.no_grad():
+ for inputs, targets in val_loader:
+ outputs = model(inputs)
+ loss = criterion(outputs, targets)
+ val_loss += loss.item() * inputs.size(0)
+
+ val_loss /= len(val_loader.dataset)
+ val_losses.append(val_loss)
+
+ # Print progress
+ if (epoch + 1) % 10 == 0:
+ print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
+
+ # Plot training history
+ plt.figure(figsize=(10, 5))
+ plt.plot(train_losses, label='Training Loss')
+ plt.plot(val_losses, label='Validation Loss')
+ plt.xlabel('Epoch')
+ plt.ylabel('MSE Loss')
+ plt.title('Training and Validation Loss')
+ plt.legend()
+ plt.savefig('training_history.png')
+
+ return model, scaler
+
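+# Persistence sketch (hypothetical file names): the scaler must be stored
+# alongside the weights, because inference requires identical feature scaling.
+#   torch.save(model.state_dict(), 'power_estimator.pt')
+#   joblib.dump(scaler, 'scaler.joblib')  # joblib is a scikit-learn dependency
+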
+# Step 6: Evaluate the model
+def evaluate_model(model, X, y, scaler):
+ # Scale the features
+ X_scaled = scaler.transform(X)
+
+    # Convert features to a PyTorch tensor (the metrics below use numpy directly)
+    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
+
+ # Make predictions
+ model.eval()
+ with torch.no_grad():
+ y_pred = model(X_tensor).squeeze().numpy()
+
+ # Calculate metrics
+ mse = np.mean((y_pred - y) ** 2)
+ rmse = np.sqrt(mse)
+ mae = np.mean(np.abs(y_pred - y))
+ r2 = 1 - (np.sum((y - y_pred) ** 2) / np.sum((y - np.mean(y)) ** 2))
+
+ print("\nModel Evaluation:")
+ print(f"MSE: {mse:.6f}")
+ print(f"RMSE: {rmse:.6f}")
+ print(f"MAE: {mae:.6f}")
+ print(f"R²: {r2:.6f}")
+
+ # Plot actual vs predicted values
+ plt.figure(figsize=(8, 8))
+ plt.scatter(y, y_pred, alpha=0.5)
+ plt.plot([min(y), max(y)], [min(y), max(y)], 'r--')
+ plt.xlabel('Actual Power Usage (J)')
+ plt.ylabel('Predicted Power Usage (J)')
+ plt.title('Actual vs Predicted Power Usage')
+ plt.tight_layout()
+ plt.savefig('prediction_scatter.png')
+
+ return mse, rmse, mae, r2
+
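+# Inference sketch for unseen samples (hypothetical helper, not called in
+# main). Note that evaluate_model above scores the full dataset, including
+# the 80% split the model was trained on, so its metrics read optimistically.
+def predict_power(model, scaler, X_new):
+    model.eval()
+    with torch.no_grad():
+        features = torch.tensor(scaler.transform(X_new), dtype=torch.float32)
+        return model(features).squeeze(-1).numpy()
+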
+# Step 7: Main function
+def main():
+ csv_path = 'logs.csv'
+
+ # Load and prepare data
+ print("Loading data...")
+ X, y = load_data(csv_path)
+
+ # Impute missing values
+ print("\nImputing missing values...")
+ X_imputed = impute_missing_values(X)
+
+ # Analyze feature importance
+ print("\nAnalyzing feature importance...")
+ feature_importance = analyze_feature_importance(X_imputed, y)
+
+ # Train the model
+ print("\nTraining model...")
+ model, scaler = train_model(X_imputed, y, batch_size=8, epochs=200)
+
+ # Evaluate the model
+ print("\nEvaluating model...")
+ evaluate_model(model, X_imputed, y, scaler)
+
+ print("\nDone!")
+
+if __name__ == "__main__":
+ main()