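"""Select the best subset of CPU performance counters for predicting package
power usage (package_power_j, measured in joules) from a logged CSV.

The script compares four selection strategies (mutual-information ranking,
recursive feature elimination, greedy decorrelation, and exhaustive search
over the top candidates), cross-validates each with a random forest, and
saves diagnostic plots to the working directory.

Note: this sketch assumes logs.csv has timestamp, duration_ms, and
package_power_j as its first three columns, followed by the counter columns.
"""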
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_squared_error
from itertools import combinations


def load_and_preprocess_data(csv_path):
    """Load the measurement CSV and return features X and target y."""
    df = pd.read_csv(csv_path)

    # Target: package power usage per sample (joules)
    y = df['package_power_j'].values

    # Features: CPU frequency and performance counters.
    # Assumes the first three columns are timestamp, duration_ms, package_power_j.
    X = df.iloc[:, 3:]

    # Coerce everything to numeric; empty strings and stray text become NaN
    X = X.apply(pd.to_numeric, errors='coerce')

    # Impute missing values using KNN
    imputer = KNNImputer(n_neighbors=5)
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    return X, y
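
# Expected logs.csv layout (counter names here are hypothetical, for
# illustration only; any numeric counter columns work):
# timestamp,duration_ms,package_power_j,cpu_frequency_mhz,instructions,llc_misses,...
# 1700000000.123,100,1.2345,2400,1.2e9,3.4e6,...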


def calculate_mutual_information(X, y):
    """Rank features by mutual information with the target."""
    mi_scores = mutual_info_regression(X, y)

    # Collect features and their scores, sorted by importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': mi_scores
    }).sort_values('Importance', ascending=False)

    print("\nFeature importance by mutual information:")
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
        print(f"{i}. {row['Feature']} - importance: {row['Importance']:.4f}")

    # Visualize feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('Mutual Information Score')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    return feature_importance


def find_best_features_rfe(X, y, n_features=5):
    """Select n_features via Recursive Feature Elimination with a random forest."""
    print("\nRunning Recursive Feature Elimination...")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    selector = RFE(estimator=model, n_features_to_select=n_features, step=1)
    selector.fit(X, y)

    # Keep the columns RFE retained
    selected_features = X.columns[selector.support_].tolist()
    print(f"Selected features (RFE): {selected_features}")
    return selected_features


def select_uncorrelated_features(X, feature_importances, top_n=5, correlation_threshold=0.75):
    """Greedily pick top_n important features that are mutually uncorrelated."""
    print(f"\nSelecting {top_n} uncorrelated features...")

    # Importance lookup table
    importance_dict = {row['Feature']: row['Importance']
                       for _, row in feature_importances.iterrows()}

    # Absolute pairwise correlations
    corr_matrix = X.corr().abs()

    # Start with the most important feature
    top_feature = feature_importances.iloc[0]['Feature']
    selected = [top_feature]
    print(f"Starting with top feature: {top_feature}")

    candidates = X.columns.tolist()
    candidates.remove(top_feature)

    # Greedily add the most important feature whose correlation with every
    # already-selected feature stays below the threshold
    while len(selected) < top_n and candidates:
        best_feature = None
        max_importance = -np.inf
        for feature in candidates:
            correlations = [corr_matrix.loc[feature, sel] for sel in selected]
            if max(correlations) < correlation_threshold:
                importance = importance_dict[feature]
                if importance > max_importance:
                    max_importance = importance
                    best_feature = feature
        if best_feature:
            selected.append(best_feature)
            candidates.remove(best_feature)
            max_corr = max(corr_matrix.loc[best_feature, sel] for sel in selected[:-1])
            print(f"Added {best_feature} (max correlation with selected: {max_corr:.3f})")
        else:
            # No candidate satisfies the threshold; relax it and retry
            old_threshold = correlation_threshold
            correlation_threshold += 0.05
            print(f"No features satisfy threshold {old_threshold:.2f}, relaxing to {correlation_threshold:.2f}")

    print(f"Selected uncorrelated features: {selected}")
    return selected


def evaluate_feature_combination(X, y, feature_combo):
    """Cross-validate a random forest on one feature subset; return mean and std R²."""
    X_subset = X[list(feature_combo)]
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    return scores.mean(), scores.std()


def find_best_feature_combination(X, y, n_features=5, top_k=10):
    """Exhaustively test combinations drawn from the top_k most important features."""
    print(f"\nFinding the best combination of {n_features} features...")

    # Always include frequency if available
    freq_col = 'cpu_frequency_mhz' if 'cpu_frequency_mhz' in X.columns else None

    # For efficiency, limit the search to the top_k most important features
    feature_importances = calculate_mutual_information(X, y)
    top_features = feature_importances.head(top_k)['Feature'].tolist()

    if freq_col:
        # Frequency is appended to every combination, so draw the remaining
        # n_features - 1 from the other top candidates
        candidates = [feat for feat in top_features if feat != freq_col]
        combo_size = n_features - 1
    else:
        candidates = top_features
        combo_size = n_features

    best_score = -np.inf
    best_combo = None
    best_std = 0

    # Try all combinations
    combos = list(combinations(candidates, combo_size))
    print(f"Testing {len(combos)} combinations...")
    for i, combo in enumerate(combos):
        if i % 10 == 0:
            print(f"Evaluating combination {i+1}/{len(combos)}")
        features = list(combo)
        if freq_col:
            features.append(freq_col)
        score, std = evaluate_feature_combination(X, y, features)
        if score > best_score:
            best_score = score
            best_std = std
            best_combo = features

    print(f"Best feature combination (R²={best_score:.4f}±{best_std:.4f}): {best_combo}")
    return best_combo, best_score, best_std
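
# Note on search size: with the defaults top_k=10 and n_features=5, and the
# frequency column reserved, the loop above evaluates C(9, 4) = 126 subsets
# (C(10, 4) = 210 if cpu_frequency_mhz does not rank in the top ten).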


def compare_feature_subsets(X, y, n_features=5):
    """Build candidate feature sets with four methods and cross-validate each."""
    print("\nComparing different feature selection methods...")

    # Always include frequency if available
    freq_col = 'cpu_frequency_mhz' if 'cpu_frequency_mhz' in X.columns else None

    # Get feature importances
    feature_importances = calculate_mutual_information(X, y)

    # Method 1: top features by mutual information
    if freq_col:
        mi_top = feature_importances['Feature'].tolist()
        if freq_col in mi_top:
            mi_top.remove(freq_col)
        mi_top = mi_top[:n_features - 1] + [freq_col]
    else:
        mi_top = feature_importances['Feature'].tolist()[:n_features]

    # Methods 2 and 3: run on the non-frequency columns, so that appending
    # the frequency column always yields exactly n_features features
    if freq_col:
        X_rest = X.drop(columns=[freq_col])
        fi_rest = feature_importances[feature_importances['Feature'] != freq_col]
        rfe_selected = find_best_features_rfe(X_rest, y, n_features - 1) + [freq_col]
        uncorrelated = select_uncorrelated_features(X_rest, fi_rest, n_features - 1) + [freq_col]
    else:
        rfe_selected = find_best_features_rfe(X, y, n_features)
        uncorrelated = select_uncorrelated_features(X, feature_importances, n_features)

    # Method 4: best combination (limited search)
    best_combo, _, _ = find_best_feature_combination(X, y, n_features, top_k=10)

    # Define subsets to test
    subsets = {
        'mutual_info_top': mi_top,
        'rfe_selected': rfe_selected,
        'uncorrelated': uncorrelated,
        'best_combination': best_combo
    }

    # Visualize the correlation matrix
    plt.figure(figsize=(12, 10))
    corr_matrix = X.corr()
    plt.imshow(corr_matrix, cmap='coolwarm', interpolation='none', vmin=-1, vmax=1)
    plt.colorbar()
    plt.xticks(range(len(X.columns)), X.columns, rotation=90)
    plt.yticks(range(len(X.columns)), X.columns)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')

    # Evaluate each subset with a RandomForestRegressor
    results = {}
    for name, features in subsets.items():
        scores = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=42),
                                 X[features], y, cv=5, scoring='r2')
        results[name] = {
            'r2_mean': scores.mean(),
            'r2_std': scores.std(),
            'features': features
        }

    print("\nResults for different feature subsets:")
    for name, res in results.items():
        print(f"{name}: R²={res['r2_mean']:.4f}±{res['r2_std']:.4f}, Features: {res['features']}")

    # Bar chart of results
    plt.figure(figsize=(10, 6))
    methods = list(results.keys())
    scores = [results[m]['r2_mean'] for m in methods]
    errors = [results[m]['r2_std'] for m in methods]
    plt.bar(methods, scores, yerr=errors, capsize=5)
    plt.ylabel('R² Score')
    plt.title('Performance of Different Feature Selection Methods')
    plt.ylim(0.90, 1.0)  # Adjust as needed for your data
    plt.tight_layout()
    plt.savefig('feature_selection_comparison.png')

    # Pick the best method by mean cross-validated R²
    best_method = max(results.items(), key=lambda x: x[1]['r2_mean'])
    print(f"\nBest method: {best_method[0]} with R²={best_method[1]['r2_mean']:.4f}")
    print(f"Final recommended feature set: {best_method[1]['features']}")
    return results, best_method[1]['features']


def final_model_evaluation(X, y, selected_features):
    """Train and test a random forest on the chosen features and plot diagnostics."""
    print("\nEvaluating final model with selected features...")

    # Prepare data
    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Final model performance: R²={r2:.4f}, RMSE={rmse:.6f}")

    # Plot actual vs predicted
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
    plt.xlabel('Actual Power Usage (J)')
    plt.ylabel('Predicted Power Usage (J)')
    plt.title(f'Actual vs Predicted Power Usage with Selected Features\n'
              f'R²={r2:.4f}, RMSE={rmse:.6f}')
    plt.tight_layout()
    plt.savefig('final_model_performance.png')

    # Feature importances in the final model
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances in Final Model')
    plt.bar(range(X_selected.shape[1]), importances[indices])
    plt.xticks(range(X_selected.shape[1]), np.array(selected_features)[indices], rotation=90)
    plt.tight_layout()
    plt.savefig('final_model_feature_importances.png')
    return model, r2, rmse


def main():
    # Load and preprocess data
    print("Loading and preprocessing data...")
    X, y = load_and_preprocess_data('logs.csv')

    # Compare different feature selection methods
    results, best_features = compare_feature_subsets(X, y, n_features=5)

    # Evaluate the final model with the winning subset
    model, r2, rmse = final_model_evaluation(X, y, best_features)

    print("\nAnalysis complete! The optimal set of 5 performance counters has been identified.")
    print(f"Final selected features: {best_features}")
    print(f"Model performance: R²={r2:.4f}, RMSE={rmse:.6f}")


if __name__ == "__main__":
    main()
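
# Usage sketch (the script filename is hypothetical; expects logs.csv in the
# working directory as described in the module docstring):
#   $ python select_power_features.py
# Outputs: feature_importance.png, correlation_matrix.png,
# feature_selection_comparison.png, final_model_performance.png,
# final_model_feature_importances.png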