Today I will share 5 basic machine learning algorithms, which are commonly asked in machine learning interviews:
- Logistic Regression
- KMeans
- K Nearest Neighbors (KNN)
- Native Bayes
- Gradient Boosting Decision Tree (GBDT)
Python Code Implementation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
class LogisticRegression:
def __init__(self, learning_rate=0.01, num_iter=1000, fit_intercept=True, verbose=False):
self.learning_rate = learning_rate
self.num_iter = num_iter
self.fit_intercept = fit_intercept
self.verbose = verbose
def add_intercept(self, X):
intercept = np.ones((X.shape[0], 1))
return np.concatenate((intercept, X), axis=1)
def sigmoid(self, z):
return 1 / (1 + np.exp(-z))
def loss(self, h, y):
return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def fit(self, X, y):
if self.fit_intercept:
X = self.add_intercept(X)
# weights initialization
self.theta = np.zeros(X.shape[1])
for i in range(self.num_iter):
z = np.dot(X, self.theta)
h = self.sigmoid(z)
gradient = np.dot(X.T, (h - y)) / y.size
self.theta -= self.learning_rate * gradient
if self.verbose == True and i % 10000 == 0:
z = np.dot(X, self.theta)
h = self.sigmoid(z)
print("loss: %f" % {self.loss(h, y)})
def predict_prob(self, X):
if self.fit_intercept:
X = self.add_intercept(X)
return self.sigmoid(np.dot(X, self.theta))
def predict(self, X, threshold = 0.5):
return self.predict_prob(X) >= threshold
def score(self, X, y, threshold = 0.5):
return (self.predict(X, threshold) == y).mean()
iris = datasets.load_iris()
X, y = iris.data, (iris.target != 0)*1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))
Python Code Implementation
import numpy as np
class KMeans:
def __init__(self, n_clusters=5, max_iter=300, random_state=42):
self.n_clusters = n_clusters
self.max_iter = max_iter
self.random_state = random_state
def fit(self, X):
np.random.seed(self.random_state)
initial = np.random.permutation(X.shape[0])[:self.n_clusters]
self.cluster_centers_ = X[initial]
for _ in range(self.max_iter):
self.labels_ = [self._nearest(self.cluster_centers_, x) for x in X]
indices = [[i for i, l in enumerate(self.labels_) if l == j]
for j in range(self.n_clusters)]
X_by_cluster = [X[i] for i in indices]
# update the clusters
self.cluster_centers_ = [c.sum(axis=0) / len(c) for c in X_by_cluster]
# sum of square distances from the closest cluster - WCSS
self.inertia_ = sum(((self.cluster_centers_[l] - x)**2).sum()
for x, l in zip(X, self.labels_))
return self
def _nearest(self, clusters, x):
return np.argmin([self._distance(x, c) for c in clusters])
def _distance(self, a, b):
return np.sqrt(((a - b)**2).sum())
def predict(self, X):
return self.labels_
def transform(self, X):
return [[self._distance(x, c) for c in self.cluster_centers_] for x in X]
def fit_predict(self, X):
return self.fit(X).predict(X)
def fit_transform(self, X):
return self.fit(X).transform(X)
def score(self, X):
return -self.inertia_
X = np.random.random((50, 10))
kmeans = KMeans(n_clusters=5, random_state=1)
kmeans.fit(X)
print(kmeans.inertia_)
Python Code Implementation
import numpy as np
from collections import defaultdict
from sklearn import datasets
from sklearn.model_selection import train_test_split
class KNeighborsClassifier:
def __init__(self, n_neighbors=5, weights='uniform', p=2):
self.n_neighbors = n_neighbors
self.weights = weights
self.p = p
def fit(self, X, y):
self.X = X
self.y = y
return self
def _distance(self, data1, data2):
"""1: Manhattan, 2: Euclidean"""
if self.p == 1:
return sum(abs(data1 - data2))
elif self.p == 2:
return np.sqrt(sum((data1 - data2)**2))
raise ValueError("p not recognized: should be 1 or 2")
def _compute_weights(self, distances):
if self.weights == 'uniform':
return [(1, y) for d, y in distances]
elif self.weights == 'distance':
matches = [(1, y) for d, y in distances if d == 0]
return matches if matches else [(1/d, y) for d, y in distances]
raise ValueError("weights not recognized: should be 'uniform' or 'distance'")
def _predict_one(self, test):
distances = sorted((self._distance(x, test), y) for x, y in zip(self.X, self.y))
weights = self._compute_weights(distances[:self.n_neighbors])
weights_by_class = defaultdict(float)
for d, c in weights:
weights_by_class[c] += d
return max(weights_by_class, key = weights_by_class.get)
def predict(self, X):
return [self._predict_one(x) for x in X]
def score(self, X, y):
return sum(1 for p, t in zip(self.predict(X), y) if p == t) / len(y)
iris = datasets.load_iris()
X_train, X_temp, y_train, y_temp = \
train_test_split(iris.data, iris.target, test_size=.4)
X_validation, X_test, y_validation, y_test = \
train_test_split(X_temp, y_temp, test_size=.5)
neighbor = KNeighborsClassifier().fit(X_train, y_train)
print(neighbor.score(X_train, y_train))
print(neighbor.score(X_validation, y_validation))
Python Code Implementation
One is for Native Bayes in text/spam classification. Here we add Laplace smoothing (check CS229 for details).
import collections
import math
class NB:
def __init__(self):
self.num_messages = {}
self.log_class_priors = {}
self.vocab = set()
self.labels = set()
self.word_counts = collections.defaultdict(lambda: collections.Counter())
def clean(self, s):
translator = str.maketrans("", "", string.punctuation)
return s.translate(translator)
def tokenize(self, text):
text = self.clean(text).lower()
return re.split("\W+", text)
def fit(self, X, y):
n = len(y)
self.labels = set(y)
self.num_messages = collections.Counter(y)
for label in self.labels:
self.log_class_priors[label] = 1.0 * self.num_messages[label] / n
for x, label in zip(X, y):
tokens = self.tokenize(x)
counts = collections.Counter(tokens)
for word, count in counts.items():
self.vocab.add(word)
self.word_counts[label][word] += count
return self
def predict(self, X):
res = []
for x in X:
tokens = self.tokenize(x)
counts = collections.Counter(tokens)
score = collections.defaultdict(float)
for word, _ in counts.items():
if word not in self.vocab:
continue
# add Laplace smoothing
for label in self.labels:
score[label] += math.log( (self.word_counts[label][word] + 1) / (self.num_messages[label] + len(self.vocab)))
# add prior
for label in self.labels:
score[label] += self.log_class_priors[label]
res.append(max(score, key = score.get))
return res
def score(self, X, y):
return sum(self.predict(x) == label for x, label in zip(X, y)) / len(y)
Another is for Gaussian Native Bayes, which assumes a normal distribution for the likelihood.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
class GaussianNB:
def __init__(self):
pass
def fit(self, X, y):
separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
# get the mean and std for each feature under each class data
self.model = np.array([np.c_[np.mean(i, axis=0), np.std(i, axis=0)]
for i in separated])
return self
def _prob(self, x, mean, std):
return np.log(1.0/ (np.sqrt(2 * np.pi) * std)) - ((x - mean)**2 / (2 * std**2))
def predict_log_proba(self, X):
return [[sum(self._prob(i, *s) for s, i in zip(para, x)) for para in self.model] for x in X]
def predict(self, X):
return np.argmax(self.predict_log_proba(X), axis=1)
def score(self, X, y):
return sum(self.predict(X) == y) / len(y)
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
nb = GaussianNB().fit(X_train, y_train)
print(nb.score(X_train, y_train))
print(nb.score(X_test, y_test))
Python Code Implementation
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np
class GBDT:
def __init__(self, max_iter = 50, learning_rate = 0.1):
self.max_iter = max_iter
self.learning_rate = learning_rate
self.f0 = None
self.regressors = []
def compute_loss(self, y, y_hat):
# mse square loss
return ((y-y_hat)**2)/2
def loss_gradient(self, y, y_hat):
return -(y-y_hat)
def fit(self, X, y):
y_hat = np.array([y.mean()]*len(y))
self.f0 = y_hat
print(self.compute_loss(y, y_hat).mean())
for i in range(self.max_iter):
residuals = -self.loss_gradient(y, y_hat)
regressor = DecisionTreeRegressor(max_depth=1)
regressor.fit(X, residuals)
self.regressors.append(regressor)
predictions = regressor.predict(X)
y_hat = y_hat + self.learning_rate * predictions
print(self.compute_loss(y, y_hat).mean())
return self
def predict(self, X):
y_hat = np.array([self.f0[0]]*len(X))
for regressor in self.regressors:
y_hat = y_hat + self.learning_rate * regressor.predict(X)
return y_hat
gbdt = GBDT(max_iter = 300, learning_rate = 0.05)
boston = load_boston()
X = boston.data
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
gbdt.fit(X_train, y_train)
y_hat = gbdt.predict(X_test)
print(np.sum((y_test-y_hat)**2)/len(y_test))