Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

User import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import

User
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from kmodes.kprototypes import KPrototypes
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Load the preprocessed dataset
file_path = r"C:\Users\ankit\Downloads\Ankit_proj\Ankit_proj\Clustering\Air_Traffic_Passenger_Statistics.csv"
data = pd.read_csv(file_path)
# Select the features for clustering
features =['Published Airline', 'GEO Region', 'Passenger Count', 'Adjusted Passenger Count', 'Year']
# Define the preprocessing for numerical and categorical features
numerical_features =['Passenger Count', 'Adjusted Passenger Count', 'Year']
categorical_features =['Published Airline', 'GEO Region']
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(), categorical_features)
],
remainder='passthrough'
)
# Convert categorical columns to one-hot encoding
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(drop='first'), categorical_features) # Use drop='first' to handle multicollinearity
],
remainder='passthrough'
)
# Modify preprocessing to handle categorical variables and sparse data
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(drop='first'), categorical_features) # Adjust encoding method as needed
],
remainder='passthrough'
)
# Preprocess the data
try:
data_preprocessed = preprocessor.fit_transform(data)
except ValueError as e:
print("Error during preprocessing:", e)
# Handle the error, potentially by inspecting the data further or adjusting preprocessing steps
# Determine categorical feature indices for K-Prototypes
cat_features_indices = list(range(len(numerical_features), len(numerical_features)+ len(categorical_features)))
# Perform K-Prototypes clustering if data preprocessing was successful
if 'data_preprocessed' in locals():
# Perform K-Prototypes clustering
kproto = KPrototypes(n_clusters=4, init='Cao', n_init=5, verbose=1)
cluster_labels_kproto = kproto.fit_predict(data_preprocessed, categorical=cat_features_indices)
# Add cluster labels to the original dataframe
data['Cluster_KPrototypes']= cluster_labels_kproto
# Extract the cluster centroids
centroids = kproto.cluster_centroids_
# Apply hierarchical clustering to the centroids
Z = linkage(centroids, method='ward')
# Plot dendrogram for the hierarchical clustering
plt.figure(figsize=(10,7))
dendrogram(Z, labels=[f'Cluster {i}' for i in range(len(centroids))])
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Cluster')
plt.ylabel('Distance')
plt.show()
# Plot clusters in a scatter plot using original features
plt.figure(figsize=(10,6))
sns.scatterplot(x='Passenger Count', y='Adjusted Passenger Count', hue='Cluster_KPrototypes', data=data, palette='viridis', alpha=0.7)
plt.xlabel('Passenger Count')
plt.ylabel('Adjusted Passenger Count')
plt.title('Hybrid Clustering: K-Prototypes + Hierarchical Clustering')
plt.legend(title='Cluster')
plt.show()
# Compute cluster evaluation metrics
# Silhouette Score
silhouette_avg = silhouette_score(data_preprocessed, cluster_labels_kproto)
print(f'Silhouette Score: {silhouette_avg:.4f}')
# Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(data_preprocessed, cluster_labels_kproto)
print(f'Calinski-Harabasz Index: {calinski_harabasz:.4f}')
# Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(data_preprocessed, cluster_labels_kproto)
print(f'Davies-Bouldin Index: {davies_bouldin:.4f}')
Error during preprocessing: For a sparse output, all columns should be a numeric or convertible to a numeric.
my dataset columns value are index int64
Activity Period int64
Operating Airline object
Operating Airline IATA Code object
Published Airline object
Published Airline IATA Code object
GEO Summary object
GEO Region object
Activity Type Code object
Price Category Code object
Terminal object
Boarding Area object
Passenger Count int64
Adjusted Activity Type Code object
Adjusted Passenger Count int64
Year int64
Month object
dtype: object
resolve the error, and provide error free code with the output.Kindly refr

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Introduction To Data Mining

Authors: Pang Ning Tan, Michael Steinbach, Vipin Kumar

1st Edition

321321367, 978-0321321367

More Books

Students also viewed these Databases questions