Data Science



pip install numpy pandas matplotlib scikit-learn scipy

pip install statsmodels



 Logistic Regression


import pandas as pd

from sklearn.datasets import load_iris

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix


iris = load_iris()

X = iris.data

y = (iris.target == 0).astype(int)  # binary target: 1 if setosa, 0 otherwise


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)


y_pred_logistic = logistic_model.predict(X_test)


print("Logistic Regression Metrics")

print("Accuracy:", accuracy_score(y_test, y_pred_logistic))

print("Precision:", precision_score(y_test, y_pred_logistic))

print("Recall:", recall_score(y_test, y_pred_logistic))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_logistic)


print("\nClassification Report:\n", classification_report(y_test, y_pred_logistic ))



 Linear Regression (change the data values as given)


import pandas as pd

from sklearn.linear_model import LinearRegression


data = {

    "Experience": [2, 10, 4, 20, 8, 12, 22],

    "Salary": [30000, 95000, 45000, 178000, 84000, 120000, 200000]

}


df = pd.DataFrame(data)


X = df[['Experience']]

y = df['Salary']


model = LinearRegression()

model.fit(X, y)

y_pred = model.predict(X)


print("First few predictions:")

for i in range(5):

    print(f"Years of Experience: {X.iloc[i, 0]} → Predicted Salary: Rs. {y_pred[i]:.0f}")


 Linear Regression (predicting petal width from petal length)



from sklearn.datasets import load_iris

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt


data = load_iris()

X = data.data[:, 2].reshape(-1, 1)  # petal length (cm)

y = data.data[:, 3]                 # petal width (cm)


model = LinearRegression().fit(X, y)

y_pred = model.predict(X)


print("R² Score:", model.score(X, y))


plt.scatter(X, y, color='blue')

plt.plot(X, y_pred, color='red')

plt.xlabel("Petal Length (cm)")

plt.ylabel("Petal Width (cm)")

plt.title("Linear Regression")

plt.show()
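For reference, the fitted slope and intercept can be printed alongside the R² score (a minimal addition to the script above):

print(f"Slope: {model.coef_[0]:.3f}, Intercept: {model.intercept_:.3f}")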



 Create a CSV file from the given data (with missing-value and outlier handling).



import pandas as pd

data = {

    'Rollno': [1, 2, 3, 4, 5, 6, 7, 8, 9],

    'Name': ['Sudin', 'Shaima', 'Raina', 'Paul', 'Rahul', 'Gopal', 'Yatin', 'Jim', 'Nima'],

    'Age': [44, 46, 27, 38, 46, None, 59, 36, 45],

    'Marks': [47, 86, 45, None, 45, 67, 45, 34, 32],

    'Class': ['FY', 'SY', 'TY', 'SY', 'FY', 'TY', 'FY', 'FY', 'TY']

}


df = pd.DataFrame(data)


df['Age'] = df['Age'].fillna(df['Age'].mean().round())

df['Marks'] = df['Marks'].fillna(df['Marks'].median())


Q1, Q3 = df['Marks'].quantile([0.25, 0.75])

IQR = Q3 - Q1

df = df[df['Marks'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]


print("\nCleaned Data:\n", df)





 Feature Scaling techniques: standardization and normalization (using a sample dataset)




import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Dataset
df = pd.DataFrame({
    'Make': ['Honda', 'Honda', 'Toyota', 'Nissan', 'Toyota', 'Honda', 'Ford', 'Chevrolet', 'Chevrolet', 'Dodge'],
    'Model': ['Accord', 'Accord', 'Camry', 'Altima', 'Corolla', 'Civic', 'F-150', 'Silverado', 'Impala', 'Charger'],
    'Color': ['Red', 'Blue', 'Black', 'Green', 'Black', 'White', 'Black', 'Green', 'Silver', 'Silver'],
    'Mileage': [63512, 95135, 75006, 69847, 87278, 138789, 89073, 109231, 34853, 58173],
    'Sell Price': [4000, 2500, 45000, 3826, 2224, 2723, 3950, 4959, 3791, 4349]
})

# Selecting numerical columns
num_df = df[['Mileage', 'Sell Price']]

# Apply Standardization
standard = StandardScaler().fit_transform(num_df)
standard_df = pd.DataFrame(standard, columns=['Mileage_Standard', 'Price_Standard'])

# Apply Normalization
normalized = MinMaxScaler().fit_transform(num_df)
normalized_df = pd.DataFrame(normalized, columns=['Mileage_Normal', 'Price_Normal'])

# Combine with original data (optional)
result = pd.concat([df[['Make', 'Model', 'Color']], standard_df, normalized_df], axis=1)

# Output
print("Transformed Data:\n")
print(result.head())
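For reference: standardization rescales each feature to zero mean and unit variance, z = (x - mean) / std, while min-max normalization maps each feature onto [0, 1], x_scaled = (x - min) / (max - min).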


Feature Scaling: standardization/normalization (Iris dataset)


# Feature Scaling: standardization/normalization

import pandas as pd

from sklearn.datasets import load_iris

from sklearn.preprocessing import MinMaxScaler, StandardScaler


iris = load_iris()

df = pd.DataFrame(iris.data, columns = iris.feature_names)

print(df.head())


minmax = MinMaxScaler().fit_transform(df)  # fit on the full dataset, not just the head

print("\nData after Min-Max scaling (first five rows):\n", minmax[:5])


standard = StandardScaler().fit_transform(df)

print("\nData after Standard scaling (first five rows):\n", standard[:5])



 Principal Component Analysis (PCA)


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA


iris = load_iris()

X = iris.data

y = iris.target


X_scaled = StandardScaler().fit_transform(X)


pca = PCA()

X_pca = pca.fit_transform(X_scaled)

explained = np.cumsum(pca.explained_variance_ratio_)


plt.plot(explained, marker='o')

plt.title("Cumulative Explained Variance")

plt.xlabel("Number of Components")

plt.ylabel("Explained Variance")

plt.grid(True)

plt.show()


n_components = np.argmax(explained >= 0.95) + 1  # first component count reaching 95% cumulative variance

print("Components needed to explain 95% variance:", n_components)


X_reduced = PCA(n_components=n_components).fit_transform(X_scaled)


plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='viridis')

plt.title("PCA (Reduced to 2D)")

plt.xlabel("PC1")

plt.ylabel("PC2")

plt.colorbar(label="Target")

plt.grid(True)

plt.show()
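A small follow-up that can be appended to the script above to see how much variance each individual component carries:

print("Per-component explained variance ratios:", pca.explained_variance_ratio_)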




K-Means Algorithm: Elbow Method




import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

data = load_iris()
X = StandardScaler().fit_transform(data.data)

# Within-cluster sum of squares (inertia) for k = 1..10
inertia = [KMeans(n_clusters=k, random_state=0).fit(X).inertia_ for k in range(1, 11)]
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

km = KMeans(n_clusters=3, random_state=0)  # k chosen from the elbow in the plot above
labels = km.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title('K-Means Clusters')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar()
plt.show()

df = pd.DataFrame(data.data, columns=data.feature_names)
df['Cluster'] = labels
print(df.groupby('Cluster').mean())




K-Means Algorithm: Silhouette Analysis





import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

iris = load_iris()
X = StandardScaler().fit_transform(iris.data)

# Mean silhouette score for each k in 2..10
scores = [silhouette_score(X, KMeans(n_clusters=k, random_state=0).fit_predict(X)) for k in range(2, 11)]
plt.plot(range(2, 11), scores, marker='o')
plt.title('Silhouette Analysis')
plt.xlabel('Clusters')
plt.ylabel('Score')
plt.grid(True)
plt.show()

best_k = scores.index(max(scores)) + 2
print("Best number of clusters:", best_k)
clusters = KMeans(n_clusters=best_k, random_state=0).fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis')
plt.title(f'K-Means Clustering (k={best_k})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()

df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Cluster'] = clusters
print("\nCluster-wise Mean Characteristics:\n", df.groupby('Cluster').mean())




 Decision Tree Model




#Decision Tree Classification
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

feature_names = iris.feature_names
target_names = iris.target_names

clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X, y)

plt.figure(figsize=(12, 8))
plot_tree(clf, filled=True, feature_names=feature_names, class_names=target_names)
plt.title("Decision Tree for Iris Dataset")

plt.show()
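The script above trains and plots on the full dataset. An optional extension is a held-out evaluation, reusing sklearn's train_test_split and accuracy_score (the 80/20 split is an arbitrary choice):

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Evaluate on a held-out 20% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf2 = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
print("Test accuracy:", accuracy_score(y_test, clf2.predict(X_test)))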


One-Sample T-Test


from scipy import stats

scores = [72, 88, 64, 74, 67, 79, 85, 75, 89, 77]

hypothesized_mean = 70

t_stat, p_value = stats.ttest_1samp(scores, hypothesized_mean)

print("T-statistic:", t_stat)

print("P-value:", p_value)

