Answered step by step

Verified Expert Solution

Link Copied!

Question

1 Approved Answer

Posted on Jul 26, 2024

User import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import

User

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from kmodes.kprototypes import KPrototypes

from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn.metrics import silhouette

_

score, calinski

_

harabasz

_

score, davies

_

bouldin

_

score

# Load the preprocessed dataset

file

_

path

=

"

\

Users

\

ankit

\

Downloads

\

Ankit

_

proj

\

Ankit

_

proj

\

Clustering

\

Air

_

Traffic

_

Passenger

_

Statistics.csv

"

data

=

.

read

_

csv

(

file

_

path

)

# Select the features for clustering

features

= ['

Published Airline', 'GEO Region', 'Passenger Count', 'Adjusted Passenger Count', 'Year'

]

# Define the preprocessing for numerical and categorical features

numerical

_

features

= ['

Passenger Count', 'Adjusted Passenger Count', 'Year'

]

categorical

_

features

= ['

Published Airline', 'GEO Region'

]

# Create a column transformer for preprocessing

preprocessor

=

ColumnTransformer

(

transformers

= [

('

num

',

StandardScaler

(),

numerical

_

features

),

('

cat

',

OneHotEncoder

(),

categorical

_

features

)

],

remainder

=

'passthrough'

)

# Convert categorical columns to one

-

hot encoding

preprocessor

=

ColumnTransformer

(

transformers

= [

('

num

',

StandardScaler

(),

numerical

_

features

),

('

cat

',

OneHotEncoder

(

drop

=

'first'

),

categorical

_

features

)

# Use drop

=

'first' to handle multicollinearity

],

remainder

=

'passthrough'

)

# Modify preprocessing to handle categorical variables and sparse data

preprocessor

=

ColumnTransformer

(

transformers

= [

('

num

',

StandardScaler

(),

numerical

_

features

),

('

cat

',

OneHotEncoder

(

drop

=

'first'

),

categorical

_

features

)

# Adjust encoding method as needed

],

remainder

=

'passthrough'

)

# Preprocess the data

try:

data

_

preprocessed

=

preprocessor.fit

_

transform

(

data

)

except ValueError as e:

("

Error during preprocessing:", e

)

# Handle the error, potentially by inspecting the data further or adjusting preprocessing steps

# Determine categorical feature indices for K

-

Prototypes

cat

_

features

_

indices

=

list

(

range

(

len

(

numerical

_

features

),

len

(

numerical

_

features

) +

len

(

categorical

_

features

)))

# Perform K

-

Prototypes clustering if data preprocessing was successful

if 'data

_

preprocessed' in locals

()

# Perform K

-

Prototypes clustering

kproto

=

KPrototypes

(

_

clusters

= 4,

init

=

'Cao', n

_

init

= 5,

verbose

= 1)

cluster

_

labels

_

kproto

=

kproto.fit

_

predict

(

data

_

preprocessed, categorical

=

cat

_

features

_

indices

)

# Add cluster labels to the original dataframe

data

['

Cluster

_

KPrototypes'

] =

cluster

_

labels

_

kproto

# Extract the cluster centroids

centroids

=

kproto.cluster

_

centroids

_

# Apply hierarchical clustering to the centroids

=

linkage

(

centroids

,

method

=

'ward'

)

# Plot dendrogram for the hierarchical clustering

plt

.

figure

(

figsize

= (10, 7))

dendrogram

(

,

labels

= [

'

Cluster

{

}'

for i in range

(

len

(

centroids

))])

plt

.

title

('

Hierarchical Clustering Dendrogram'

)

plt

.

xlabel

('

Cluster

')

plt

.

ylabel

('

Distance

')

plt

.

show

()

# Plot clusters in a scatter plot using original features

plt

.

figure

(

figsize

= (10, 6))

sns

.

scatterplot

(

=

'Passenger Count', y

=

'Adjusted Passenger Count', hue

=

'Cluster

_

KPrototypes', data

=

data, palette

=

'viridis', alpha

= 0.7)

plt

.

xlabel

('

Passenger Count'

)

plt

.

ylabel

('

Adjusted Passenger Count'

)

plt

.

title

('

Hybrid Clustering: K

-

Prototypes

+

Hierarchical Clustering'

)

plt

.

legend

(

title

=

'Cluster'

)

plt

.

show

()

# Compute cluster evaluation metrics

# Silhouette Score

silhouette

_

avg

=

silhouette

_

score

(

data

_

preprocessed, cluster

_

labels

_

kproto

)

(

'

Silhouette Score:

{

silhouette

_

avg:

. 4

}')

# Calinski

-

Harabasz Index

calinski

_

harabasz

=

calinski

_

harabasz

_

score

(

data

_

preprocessed, cluster

_

labels

_

kproto

)

(

'

Calinski

-

Harabasz Index:

{

calinski

_

harabasz:

. 4

}')

# Davies

-

Bouldin Index

davies

_

bouldin

=

davies

_

bouldin

_

score

(

data

_

preprocessed, cluster

_

labels

_

kproto

)

(

'

Davies

-

Bouldin Index:

{

davies

_

bouldin:

. 4

}')

Error during preprocessing: For a sparse output, all columns should be a numeric or convertible to a numeric.

my dataset columns value are index int

64

Activity Period int

64

Operating Airline object

Operating Airline IATA Code object

Published Airline object

Published Airline IATA Code object

GEO Summary object

GEO Region object

Activity Type Code object

Price Category Code object

Terminal object

Boarding Area object

Passenger Count int

64

Adjusted Activity Type Code object

Adjusted Passenger Count int

64

Year int

64

Month object

dtype: object

resolve the error, and provide error free code with the output.Kindly refr

Step by Step Solution

There are 3 Steps involved in it

Step: 1

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

Step: 3

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Introduction To Data Mining

Authors: Pang Ning Tan, Michael Steinbach, Vipin Kumar

1st Edition

★★★★★

5.26 Southwest Co-op produces bags of fertilizer, and it is concerned about impurity content. It is believed that the weights of impurities per bag are normally distributed with a mean of 12.2 grams...

Answered: 1 week ago

Previous Question Next Question