summaryrefslogtreecommitdiff
path: root/best_counters.py
blob: feb59003530b105a70862960e8f359b4cf19ac6a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_squared_error
from itertools import combinations

def load_and_preprocess_data(csv_path, target_col='package_power_j',
                             feature_start_col=3, n_neighbors=5):
    """Load the measurement CSV and return (X, y) ready for modeling.

    Parameters
    ----------
    csv_path : str or path-like
        Path to the CSV log file.
    target_col : str
        Name of the target column (energy per sample, in joules).
        Defaults to the previously hard-coded 'package_power_j'.
    feature_start_col : int
        Index of the first feature column; earlier columns are treated
        as metadata and skipped. Defaults to 3, matching the assumed
        layout (timestamp, duration_ms, power, then counters) —
        verify against the logger if the CSV schema changes.
    n_neighbors : int
        Neighbor count for KNN imputation of missing counter values.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Imputed feature matrix and target vector.
    """
    df = pd.read_csv(csv_path)

    # Target: energy usage per sample.
    y = df[target_col].values

    # Features: everything after the metadata columns.
    X = df.iloc[:, feature_start_col:]

    # Empty strings appear when a counter was not captured; treat as missing.
    X = X.replace('', np.nan)
    X = X.astype(float)

    # KNN imputation fills gaps from the most similar samples.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    return X, y

def calculate_mutual_information(X, y):
    """Score every feature of X by mutual information with target y.

    Prints the top-10 features, saves a bar chart to
    'feature_importance.png', and returns a DataFrame with columns
    'Feature' and 'Importance' sorted by importance, descending.
    """
    mi_scores = mutual_info_regression(X, y)

    # Pair each column with its MI score.
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': mi_scores
    })

    # Most informative first.
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    # Print top features
    print("\nFeature importance by mutual information:")
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
        print(f"{i}. {row['Feature']} - importance: {row['Importance']:.4f}")

    # Visualize feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('Mutual Information Score')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    # BUG FIX: close the figure — this function is called several times per
    # run, and unclosed figures accumulate memory (matplotlib warns after 20).
    plt.close()

    return feature_importance

def find_best_features_rfe(X, y, n_features=5):
    """Pick n_features columns of X via recursive feature elimination,
    using a random forest as the ranking estimator.

    Returns the selected column names as a list, in column order.
    """
    print("\nRunning Recursive Feature Elimination...")
    estimator = RandomForestRegressor(n_estimators=100, random_state=42)
    rfe = RFE(estimator=estimator, n_features_to_select=n_features, step=1)
    rfe.fit(X, y)

    # Keep the columns RFE retained, preserving original order.
    chosen = [col for col, keep in zip(X.columns, rfe.support_) if keep]
    print(f"Selected features (RFE): {chosen}")
    return chosen

def select_uncorrelated_features(X, feature_importances, top_n=5, correlation_threshold=0.75):
    """Greedily pick up to top_n mutually decorrelated features.

    Starts from the most important feature, then repeatedly adds the most
    important remaining feature whose absolute correlation with every
    already-selected feature is below correlation_threshold. If no
    candidate qualifies, the threshold is relaxed by 0.05 and the search
    continues.

    feature_importances must be a DataFrame with 'Feature' and
    'Importance' columns sorted descending (as produced by
    calculate_mutual_information). Returns the selected feature names.
    """
    print(f"\nSelecting {top_n} uncorrelated features...")

    # Importance lookup by feature name.
    importance_of = dict(zip(feature_importances['Feature'],
                             feature_importances['Importance']))

    # Pairwise absolute correlations between all features.
    abs_corr = X.corr().abs()

    # Seed the selection with the single most important feature.
    seed = feature_importances.iloc[0]['Feature']
    selected = [seed]
    print(f"Starting with top feature: {seed}")

    remaining = X.columns.tolist()
    remaining.remove(seed)

    # Greedy loop: best-scoring candidate that clears the correlation bar.
    while len(selected) < top_n and remaining:
        pick = None
        pick_score = -np.inf

        for cand in remaining:
            worst_corr = max(abs_corr.loc[cand, s] for s in selected)
            if worst_corr < correlation_threshold and importance_of[cand] > pick_score:
                pick_score = importance_of[cand]
                pick = cand

        if pick:
            selected.append(pick)
            remaining.remove(pick)
            print(f"Added {pick} (correlation with selected: {max([abs_corr.loc[pick, sel] for sel in selected[:-1]])})")
        else:
            # Nothing qualifies — loosen the bar and retry.
            old_threshold = correlation_threshold
            correlation_threshold += 0.05
            print(f"No features satisfy threshold {old_threshold}, relaxing to {correlation_threshold}")

    print(f"Selected uncorrelated features: {selected}")
    return selected

def evaluate_feature_combination(X, y, feature_combo):
    """Cross-validate a random forest on the given feature subset.

    Returns (mean R², std of R²) over 5 folds.
    """
    subset = X[list(feature_combo)]
    cv_scores = cross_val_score(
        RandomForestRegressor(n_estimators=100, random_state=42),
        subset, y, cv=5, scoring='r2')
    return cv_scores.mean(), cv_scores.std()

def find_best_feature_combination(X, y, n_features=5, top_k=10):
    """Search combinations of the top_k most informative features for the
    best n_features subset, scored by 5-fold cross-validated R².

    If a 'cpu_frequency_mhz' column exists it is always included, and the
    combinatorial search fills the remaining n_features - 1 slots.

    Returns (best_features, mean_r2, std_r2).
    """
    print(f"\nFinding the best combination of {n_features} features...")

    # Always include frequency if available
    freq_col = 'cpu_frequency_mhz' if 'cpu_frequency_mhz' in X.columns else None

    # For efficiency, limit the search to the top_k most important features
    feature_importances = calculate_mutual_information(X, y)
    top_features = feature_importances.head(top_k)['Feature'].tolist()

    # BUG FIX: the previous code truncated the candidate pool down to
    # n_features (or n_features - 1) entries before enumerating combinations
    # of that same size, which collapsed the "search over top_k" to a single
    # combination. Keep the full top_k pool; only remove the always-included
    # frequency column from it.
    if freq_col:
        candidates = [feat for feat in top_features if feat != freq_col]
        combo_size = n_features - 1  # one slot reserved for frequency
    else:
        candidates = top_features
        combo_size = n_features

    best_score = -np.inf
    best_combo = None
    best_std = 0

    # Try all combinations
    total_combos = sum(1 for _ in combinations(candidates, combo_size))
    print(f"Testing {total_combos} combinations...")

    for i, combo in enumerate(combinations(candidates, combo_size)):
        if i % 10 == 0:
            print(f"Evaluating combination {i+1}/{total_combos}")

        features = list(combo)
        if freq_col:
            # freq_col was filtered out of candidates, so it is never in combo.
            features.append(freq_col)

        score, std = evaluate_feature_combination(X, y, features)

        if score > best_score:
            best_score = score
            best_std = std
            best_combo = features

    print(f"Best feature combination (R²={best_score:.4f}±{best_std:.4f}): {best_combo}")
    return best_combo, best_score, best_std

def compare_feature_subsets(X, y, n_features=5):
    """Build feature subsets with four selection methods and compare them.

    Methods: top mutual information, RFE, greedy uncorrelated selection,
    and exhaustive search over the top-10 features. Each subset is scored
    by 5-fold cross-validated R² with a random forest. If a
    'cpu_frequency_mhz' column exists it is forced into every subset.

    Side effects: saves 'correlation_matrix.png' and
    'feature_selection_comparison.png', prints a summary.

    Returns (results_dict, best_feature_list).
    """
    print("\nComparing different feature selection methods...")

    # Always include frequency if available
    freq_col = 'cpu_frequency_mhz' if 'cpu_frequency_mhz' in X.columns else None

    # Get feature importances
    feature_importances = calculate_mutual_information(X, y)

    # Method 1: Top features by mutual information
    if freq_col:
        mi_top = feature_importances['Feature'].tolist()
        if freq_col in mi_top:
            mi_top.remove(freq_col)
        mi_top = mi_top[:n_features-1] + [freq_col]
    else:
        mi_top = feature_importances['Feature'].tolist()[:n_features]

    # Method 2: Recursive Feature Elimination
    rfe_selected = find_best_features_rfe(X, y, n_features if not freq_col else n_features-1)
    if freq_col and freq_col not in rfe_selected:
        rfe_selected.append(freq_col)

    # Method 3: Uncorrelated features
    uncorrelated = select_uncorrelated_features(X, feature_importances,
                                              n_features if not freq_col else n_features-1)
    if freq_col and freq_col not in uncorrelated:
        uncorrelated.append(freq_col)

    # Method 4: Best combination (limited search)
    best_combo, _, _ = find_best_feature_combination(X, y, n_features, top_k=10)

    # Define subsets to test
    subsets = {
        'mutual_info_top': mi_top,
        'rfe_selected': rfe_selected,
        'uncorrelated': uncorrelated,
        'best_combination': best_combo
    }

    # Create a visual representation of the correlation matrix
    plt.figure(figsize=(12, 10))
    corr_matrix = X.corr()
    plt.imshow(corr_matrix, cmap='coolwarm', interpolation='none', vmin=-1, vmax=1)
    plt.colorbar()
    plt.xticks(range(len(X.columns)), X.columns, rotation=90)
    plt.yticks(range(len(X.columns)), X.columns)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    # BUG FIX: close saved figures so repeated runs don't leak them.
    plt.close()

    # Evaluate each subset with a RandomForestRegressor
    results = {}
    for name, features in subsets.items():
        X_subset = X[features]
        scores = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=42),
                                X_subset, y, cv=5, scoring='r2')

        results[name] = {
            'r2_mean': scores.mean(),
            'r2_std': scores.std(),
            'features': features
        }

    # Print results
    print("\nResults for different feature subsets:")
    for name, res in results.items():
        print(f"{name}: R²={res['r2_mean']:.4f}±{res['r2_std']:.4f}, Features: {res['features']}")

    # Create bar chart of results
    plt.figure(figsize=(10, 6))
    methods = list(results.keys())
    scores = [results[m]['r2_mean'] for m in methods]
    errors = [results[m]['r2_std'] for m in methods]

    plt.bar(methods, scores, yerr=errors, capsize=5)
    plt.ylabel('R² Score')
    plt.title('Performance of Different Feature Selection Methods')
    # NOTE(review): hard-coded y-range assumes all methods score above 0.90 —
    # bars below that vanish; adjust if scores drop.
    plt.ylim(0.90, 1.0)  # Adjust as needed
    plt.tight_layout()
    plt.savefig('feature_selection_comparison.png')
    plt.close()

    # Find the best method
    best_method = max(results.items(), key=lambda x: x[1]['r2_mean'])
    print(f"\nBest method: {best_method[0]} with R²={best_method[1]['r2_mean']:.4f}")
    print(f"Final recommended feature set: {best_method[1]['features']}")

    return results, best_method[1]['features']

def final_model_evaluation(X, y, selected_features):
    """Train and evaluate the final random forest on the chosen features.

    Holds out 20% of the data, reports R² and RMSE on it, and saves
    'final_model_performance.png' (actual vs predicted) and
    'final_model_feature_importances.png'.

    Returns (fitted_model, r2, rmse).
    """
    print("\nEvaluating final model with selected features...")

    # Prepare data
    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"Final model performance: R²={r2:.4f}, RMSE={rmse:.6f}")

    # Plot actual vs predicted, with the identity line for reference.
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
    plt.xlabel('Actual Power Usage (J)')
    plt.ylabel('Predicted Power Usage (J)')
    plt.title(f'Actual vs Predicted Power Usage with Selected Features\nR²={r2:.4f}, RMSE={rmse:.6f}')
    plt.tight_layout()
    plt.savefig('final_model_performance.png')
    # BUG FIX: close saved figures so repeated runs don't leak them.
    plt.close()

    # Feature importances in the final model, sorted descending.
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances in Final Model')
    plt.bar(range(X_selected.shape[1]), importances[indices])
    plt.xticks(range(X_selected.shape[1]), np.array(selected_features)[indices], rotation=90)
    plt.tight_layout()
    plt.savefig('final_model_feature_importances.png')
    plt.close()

    return model, r2, rmse

def main():
    """Run the full pipeline: load data, compare feature-selection
    methods, and evaluate the final model on the winning subset."""
    print("Loading and preprocessing data...")
    X, y = load_and_preprocess_data('logs.csv')

    # Pick the best 5-feature subset across all selection methods.
    _, best_features = compare_feature_subsets(X, y, n_features=5)

    # Train/evaluate the final model on the held-out split.
    _, r2, rmse = final_model_evaluation(X, y, best_features)

    print("\nAnalysis complete! The optimal set of 5 performance counters has been identified.")
    print(f"Final selected features: {best_features}")
    print(f"Model performance: R²={r2:.4f}, RMSE={rmse:.6f}")

if __name__ == "__main__":
    main()