Data Science



pip install numpy pandas matplotlib scikit-learn scipy

pip install statsmodels



 Logistic Regression


import pandas as pd

from sklearn.datasets import load_iris

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix


iris = load_iris()

X = iris.data

y = (iris.target == 0).astype(int)  # binary target: 1 if setosa, 0 otherwise


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)


y_pred_logistic = logistic_model.predict(X_test)


print("Logistic Regression Metrics")

print("Accuracy:", accuracy_score(y_test, y_pred_logistic))

print("Precision:", precision_score(y_test, y_pred_logistic))

print("Recall:", recall_score(y_test, y_pred_logistic))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_logistic)


print("\nClassification Report:\n", classification_report(y_test, y_pred_logistic ))



 Linear Regression (change the data values as given)


import pandas as pd

from sklearn.linear_model import LinearRegression


data = {

    "Experience": [2, 10, 4, 20, 8, 12, 22],

    "Salary": [30000, 95000, 45000, 178000, 84000, 120000, 200000]

}


df = pd.DataFrame(data)


X = df[['Experience']]

y = df['Salary']


model = LinearRegression()

model.fit(X, y)

y_pred = model.predict(X)


print("First few predictions:")

for i in range(5):

    print(f"Years of Experience: {X.iloc[i, 0]} → Predicted Salary: Rs. {y_pred[i]:.0f}")


 Linear Regression (predicting petal width from petal length)



from sklearn.datasets import load_iris

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt


data = load_iris()

X = data.data[:, 2].reshape(-1, 1)  # petal length (cm)

y = data.data[:, 3]                 # petal width (cm)


model = LinearRegression().fit(X, y)

y_pred = model.predict(X)


print("R² Score:", model.score(X, y))


plt.scatter(X, y, color='blue')

plt.plot(X, y_pred, color='red')

plt.xlabel("Petal Length (cm)")

plt.ylabel("Petal Width (cm)")

plt.title("Linear Regression")

plt.show()
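For reference, the fitted slope and intercept can be printed alongside the R² score (a minimal addition to the script above):

print(f"Slope: {model.coef_[0]:.3f}, Intercept: {model.intercept_:.3f}")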



 Create a CSV file from the given data (with missing-value and outlier handling).



import pandas as pd

data = {

    'Rollno': [1, 2, 3, 4, 5, 6, 7, 8, 9],

    'Name': ['Sudin', 'Shaima', 'Raina', 'Paul', 'Rahul', 'Gopal', 'Yatin', 'Jim', 'Nima'],

    'Age': [44, 46, 27, 38, 46, None, 59, 36, 45],

    'Marks': [47, 86, 45, None, 45, 67, 45, 34, 32],

    'Class': ['FY', 'SY', 'TY', 'SY', 'FY', 'TY', 'FY', 'FY', 'TY']

}


df = pd.DataFrame(data)


df['Age'] = df['Age'].fillna(df['Age'].mean().round())

df['Marks'] = df['Marks'].fillna(df['Marks'].median())


Q1, Q3 = df['Marks'].quantile([0.25, 0.75])

IQR = Q3 - Q1

df = df[df['Marks'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]


print("\nCleaned Data:\n", df)





 Feature Scaling techniques: standardization and normalization (using a sample dataset)




import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Dataset
df = pd.DataFrame({
    'Make': ['Honda', 'Honda', 'Toyota', 'Nissan', 'Toyota', 'Honda', 'Ford', 'Chevrolet', 'Chevrolet', 'Dodge'],
    'Model': ['Accord', 'Accord', 'Camry', 'Altima', 'Corolla', 'Civic', 'F-150', 'Silverado', 'Impala', 'Charger'],
    'Color': ['Red', 'Blue', 'Black', 'Green', 'Black', 'White', 'Black', 'Green', 'Silver', 'Silver'],
    'Mileage': [63512, 95135, 75006, 69847, 87278, 138789, 89073, 109231, 34853, 58173],
    'Sell Price': [4000, 2500, 45000, 3826, 2224, 2723, 3950, 4959, 3791, 4349]
})

# Selecting numerical columns
num_df = df[['Mileage', 'Sell Price']]

# Apply Standardization
standard = StandardScaler().fit_transform(num_df)
standard_df = pd.DataFrame(standard, columns=['Mileage_Standard', 'Price_Standard'])

# Apply Normalization
normalized = MinMaxScaler().fit_transform(num_df)
normalized_df = pd.DataFrame(normalized, columns=['Mileage_Normal', 'Price_Normal'])

# Combine with original data (optional)
result = pd.concat([df[['Make', 'Model', 'Color']], standard_df, normalized_df], axis=1)

# Output
print("Transformed Data:\n")
print(result.head())
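For reference: standardization rescales each feature to zero mean and unit variance, z = (x - mean) / std, while min-max normalization maps each feature onto [0, 1], x_scaled = (x - min) / (max - min).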


Feature Scaling: standardization/normalization (Iris dataset)


# Feature Scaling: standardization/normalization

import pandas as pd

from sklearn.datasets import load_iris

from sklearn.preprocessing import MinMaxScaler, StandardScaler


iris = load_iris()

df = pd.DataFrame(iris.data, columns = iris.feature_names)

print(df.head())


minmax = MinMaxScaler().fit_transform(df)  # fit on the full dataset, not just the head

print("\nData after Min-Max scaling (first five rows):\n", minmax[:5])


standard = StandardScaler().fit_transform(df)

print("\nData after Standard scaling (first five rows):\n", standard[:5])



 Principal Component Analysis (PCA)


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA


iris = load_iris()

X = iris.data

y = iris.target


X_scaled = StandardScaler().fit_transform(X)


pca = PCA()

X_pca = pca.fit_transform(X_scaled)

explained = np.cumsum(pca.explained_variance_ratio_)


plt.plot(explained, marker='o')

plt.title("Cumulative Explained Variance")

plt.xlabel("Number of Components")

plt.ylabel("Explained Variance")

plt.grid(True)

plt.show()


n_components = np.argmax(explained >= 0.95) + 1  # first component count reaching 95% cumulative variance

print("Components needed to explain 95% variance:", n_components)


X_reduced = PCA(n_components=n_components).fit_transform(X_scaled)


plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='viridis')

plt.title("PCA (Reduced to 2D)")

plt.xlabel("PC1")

plt.ylabel("PC2")

plt.colorbar(label="Target")

plt.grid(True)

plt.show()
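A small follow-up that can be appended to the script above to see how much variance each individual component carries:

print("Per-component explained variance ratios:", pca.explained_variance_ratio_)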




K-Means Algorithm: Elbow Method




import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

data = load_iris()
X = StandardScaler().fit_transform(data.data)

# Within-cluster sum of squares (inertia) for k = 1..10
inertia = [KMeans(n_clusters=k, random_state=0).fit(X).inertia_ for k in range(1, 11)]
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

km = KMeans(n_clusters=3, random_state=0)  # k chosen from the elbow in the plot above
labels = km.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title('K-Means Clusters')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar()
plt.show()

df = pd.DataFrame(data.data, columns=data.feature_names)
df['Cluster'] = labels
print(df.groupby('Cluster').mean())




K-Means Algorithm: Silhouette Analysis





import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

iris = load_iris()
X = StandardScaler().fit_transform(iris.data)

# Mean silhouette score for each k in 2..10
scores = [silhouette_score(X, KMeans(n_clusters=k, random_state=0).fit_predict(X)) for k in range(2, 11)]
plt.plot(range(2, 11), scores, marker='o')
plt.title('Silhouette Analysis')
plt.xlabel('Clusters')
plt.ylabel('Score')
plt.grid(True)
plt.show()

best_k = scores.index(max(scores)) + 2
print("Best number of clusters:", best_k)
clusters = KMeans(n_clusters=best_k, random_state=0).fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis')
plt.title(f'K-Means Clustering (k={best_k})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()

df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Cluster'] = clusters
print("\nCluster-wise Mean Characteristics:\n", df.groupby('Cluster').mean())




 Decision Tree Model




#Decision Tree Classification
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

feature_names = iris.feature_names
target_names = iris.target_names

clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X, y)

plt.figure(figsize=(12, 8))
plot_tree(clf, filled=True, feature_names=feature_names, class_names=target_names)
plt.title("Decision Tree for Iris Dataset")

plt.show()
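The script above trains and plots on the full dataset. An optional extension is a held-out evaluation, reusing sklearn's train_test_split and accuracy_score (the 80/20 split is an arbitrary choice):

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Evaluate on a held-out 20% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf2 = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
print("Test accuracy:", accuracy_score(y_test, clf2.predict(X_test)))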


One-Sample T-Test


from scipy import stats

scores = [72, 88, 64, 74, 67, 79, 85, 75, 89, 77]

hypothesized_mean = 70

t_stat, p_value = stats.ttest_1samp(scores, hypothesized_mean)

print("T-statistic:", t_stat)

print("P-value:", p_value)

