*Python Codes Used in the Book*
(sorted by section numbers; sections with no code underneath have no code in the book)
Last updated or corrected August 3, 2022 @ 2:49PM MT. Please send corrections to daniel.denis@umontana.edu .
Chapter 1
1.1
1.2
1.3
1.4
1.5
1.6
1.6.1
1.7
1.8
1.9
1.10
Chapter 2
2.1
2.2
2.3
import numpy
x = numpy.array([0, 1, 2, 3, 4, 5])
numpy.sum(x)
import numpy as np
x = np.array([0, 1, 2, 3, 4, 5])
np.sum(x)
import numpy as numpywhichisquitefascinating
numpywhichisquitefascinating.sum(x)
x = np.array([1, 5, 8, 2, 7, 4])
x = np.array([1, 5, 8, 2, 7, 4])
x[4]
y = np.array([4, 6, 8, 2, 4, 1])
np.concatenate([x, y])
2.4
2.5
data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75,
85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2]}
import pandas as pd
df = pd.DataFrame(data)
df
import statistics
ac = df['ac']
z_numerator = (ac - statistics.mean(ac))
z_denominator = statistics.stdev(ac)
z = z_numerator/z_denominator
z
from scipy import stats
ac = df['ac']
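# ddof=1 uses the sample (n-1) standard deviation, matching statistics.stdev above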
stats.zscore(ac, axis=0, ddof=1)
import matplotlib.pyplot as plt
ac = df['ac']
z = stats.zscore(ac, axis=0, ddof=1)
ac_hist = plt.hist(ac)
z_hist = plt.hist(z)
import scipy as sp
sp.stats.skew(ac)
sp.stats.kurtosis(ac)
2.6
data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75,
85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2]}
import pandas as pd
df = pd.DataFrame(data)
df
df['ac']
ac = df['ac']
mean = sum(ac)/len(ac)
mean
import statistics
mean = statistics.mean(ac)
mean
import numpy as np
np.mean(ac)
import math
math.sqrt(89.20)
sd = statistics.pstdev(ac)
sd
statistics.median(ac)
statistics.mode(ac)
2.7
import pandas as pd
iq_data = pd.read_csv('iq.data.txt')
iq_data
dataset = pd.read_csv("iq_data.txt", delimiter="\t")
dataset
2.8
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
2.9
import numpy as np
np.random.randn(100)
2.10
import math
math.e
math.pi
2.11
cov_matrix = pd.DataFrame([(10, 20), (5, 15), (20, 12), (8, 17)],
columns=['var1', 'var2'])
cov_matrix
cov_matrix.cov()
cov_matrix.corr()
2.11.1
import numpy as np
np.zeros((4, 4))
import numpy as np
np.empty((4, 4))
import numpy as np
np.ones((4, 2))
import numpy as np
A = np.matrix('1 2; 3 4')
A
import numpy as np
B = np.array([[1, 2],
[3, 4]])
B
import numpy as np
np.transpose(B)
import numpy as np
np.matrix.trace(B)
import numpy as np
np.matrix.diagonal(B)
2.11.2
B = np.array([[1, 2], [3, 4]])
B
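# np.linalg.eig returns the eigenvalues and a matrix whose columns are the eigenvectors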
results = np.linalg.eig(B)
results
Chapter 3
3.1
3.2
import pandas as pd
data = pd.read_csv("population.csv")
data.head()
state = data['NAME']
state
pop_change_2019 = data['PPOPCHG_2019']
pop_change_2019
N_pop_change_2019 = data['NPOPCHG_2019']
N_pop_change_2019
import matplotlib.pyplot as plt
import numpy as np
plt.plot(data["NPOPCHG_2019"], data["PPOPCHG_2019"], "o")
plt.xlabel("NPOPCHG_2019")
plt.ylabel("PPOPCHG_2019")
3.3
3.4
x = [10, 15, 16, 23, 27, 38, 43, 56, 57, 60]
y = [5, 8, 9, 13, 16, 20, 40, 45, 67, 75]
import matplotlib.pyplot as plt
import numpy as np
plt.hist2d(x, y, bins=(50, 50), cmap=plt.cm.Reds)
3.5
import matplotlib.pyplot as plt
import seaborn as sns
df = sns.load_dataset('iris')
df.head()
sns.pairplot(df, kind="scatter", hue="species",
markers=["o", "s", "D"], palette="Set2")
plt.show()
sns.pairplot(df, kind="scatter", hue="species",
plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()
3.6
import numpy as np
import matplotlib.pyplot as plt
height = [10, 20, 30, 75, 100]
bars = ('A', 'B', 'C', 'D', 'E')
y_pos = np.arange(len(bars))
plt.bar(y_pos, height, color=(0.2, 0.4, 0.4, 0.6))
3.7
import seaborn as sns
df = sns.load_dataset('iris')
import matplotlib.pyplot as plt
sns.distplot(df["sepal_length"], color="skyblue", label="Sepal Length")
sns.distplot(df["sepal_width"], color="red", label="Sepal Width")
plt.show()
3.8
x = np.random.rand(40)
y = np.random.rand(40)
z = np.random.rand(40)
plt.scatter(x, y, s=z*1000, alpha=0.5)
plt.show()
plt.scatter(x, y, s=z*10000, alpha=0.5)
plt.show()
3.9
import pandas as pd
df = pd.DataFrame([8,8,1,2], index=['a', 'b', 'c',
'd'], columns=['x'])
df.plot(kind='pie', subplots=True, figsize=(8, 8))
3.10
import seaborn as sns
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.random((5,5)),
columns=["a","b","c","d","e"])
df
map = sns.heatmap(df)
import seaborn as sns
df = sns.load_dataset('iris')
sns.jointplot(x=df["sepal_length"],
y=df["sepal_width"], kind='hex')
sns.jointplot(x=df["sepal_length"],
y=df["sepal_width"], kind='scatter')
sns.jointplot(x=df["sepal_length"],
y=df["sepal_width"], kind='kde')
3.11
import matplotlib.pyplot as plt
import numpy as np
values=np.cumsum(np.random.randn(1000,1))
plt.plot(values)
3.12
Chapter 4
4.1
4.2
4.3
x = [0, 2, 6, 7, 15]
y = [0, 1, 8, 13, 20]
df = pd.DataFrame(x, y)
df
import scipy.stats
scipy.stats.pearsonr(x,y)
scipy.stats.spearmanr(x,y)
plt.scatter(x, y)
4.4
4.5
import numpy as np
np.random.seed(1)
x = np.random.randint(0, 50, 1000)
y = x + np.random.normal(0, 10, 1000)
np.corrcoef(x, y)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
plt.scatter(x, y)
galton = pd.read_csv('Galton.csv')
galton.head()
from scipy import stats
pearson_coef, p_value = stats.pearsonr(galton["child"], galton["parent"])
print("Pearson Correlation: ", pearson_coef, "and a P-value of:", p_value)
import seaborn as sns
sns.pairplot(galton)
parent = galton['parent']
child = galton['child']
columns = ['child', 'parent']
ax1 = galton.plot.scatter(x = 'child', y = 'parent')
4.6
iq = [105, 98, 110, 105, 95]
df = pd.DataFrame(iq)
df
from scipy import stats
stats.ttest_1samp(df, 100.0)
from scipy import stats
stats.ttest_ind(parent, child)
plt.figure(figsize=(10, 7))
sns.distplot(parent)
plt.figure(figsize=(9, 5))
sns.distplot(child)
sns.boxplot(parent)
sns.boxplot(child)
4.7
trial_1 = [10, 12.1, 9.2, 11.6, 8.3, 10.5]
trial_2 = [8.2, 11.2, 8.1, 10.5, 7.6, 9.5]
paired_data = pd.DataFrame(trial_1, trial_2)
paired_data
stats.ttest_rel(trial_1, trial_2)
4.8
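# one-sided binomial test: probability of 2 or more successes in 5 trials with p = 0.5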
stats.binom_test(2, n=5, p=0.5, alternative='greater')
4.9
from scipy.stats import chisquare
chisquare([16, 16, 16, 16, 16])
chisquare([16, 15, 16, 15, 16])
chisquare([16, 15, 10, 8, 25])
4.10
import numpy as np
matrix = np.array([[20, 10],
[5, 15]])
matrix
from scipy.stats import chi2_contingency
obs = np.array([[20, 10], [5, 15]])
chi2_contingency(obs)
Chapter 5
5.1
5.2
5.3
5.4
5.5
5.6
5.7
from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.05
power = 0.8
analysis = TTestIndPower()
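# solve_power solves for whichever argument is passed as None (here nobs1, the per-group sample size)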
result = analysis.solve_power(effect, power=power, nobs1=None,
ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)
from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.05
power = 0.9
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None,
ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)
5.8
from statsmodels.stats.power import TTestIndPower
effect = 0.2
alpha = 0.05
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None,
ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)
5.9
from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.20
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None,
ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)
5.10
5.11
Chapter 6
6.1
6.2
6.3
data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75,
85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4]}
df = pd.DataFrame(data)
df
df['ac'].hist(by=df['teach'])
df.sum()
df.mean()
df.std()
df.median()
df.describe()
6.4
import scipy as sp
ac = df['ac']
sp.stats.shapiro(ac)
import scipy as sp
teach = df['teach']
sp.stats.levene(ac, teach)
6.5
df.info()
import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('ac ~ C(teach)', data=df).fit()
table = sm.stats.anova_lm(model, typ=2)
print(table)
6.6
6.7
6.8
from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)
post = pairwise_tukeyhsd(df['ac'], df['teach'])
print(post)
6.9
data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75,
85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2]}
df_2 = pd.DataFrame(data)
df_2
import statsmodels.api as sm
from statsmodels.formula.api import ols
model_2 = ols('ac ~ C(teach) + C(text)', data = df_2).fit()
table_2 = sm.stats.anova_lm(model_2, typ = 2)
print(table_2)
6.10
6.11
6.12
import statsmodels.api as sm
from statsmodels.formula.api import ols
model_3 = ols('ac ~ C(teach) + C(text) + C(teach)*C(text)',
data = df_2).fit()
table_3 = sm.stats.anova_lm(model_3, typ = 2)
print(table_3)
6.13
import statsmodels.api as sm
res = model_3.resid
fig = sm.qqplot(res, line='s')
6.14
rat = pd.DataFrame({'rat': np.repeat([1, 2, 3, 4, 5, 6], 3),
'trial': np.tile([1, 2, 3], 6),
'time': [10.0, 8.2, 5.3, 12.1, 11.2, 9.1,
9.2, 8.1, 4.6, 11.6, 10.5, 8.1,
8.3, 7.6, 5.5, 10.5, 9.5, 8.1]})
rat
from statsmodels.stats.anova import AnovaRM
print(AnovaRM(data = rat, depvar = 'time', subject = 'rat',
within = ['trial']).fit())
6.15
6.15.1
6.15.2
6.15.3
6.15.4
data = {'grade':[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'studytime': [30, 25, 59, 42, 31, 140, 90, 95, 170, 120]}
df = pd.DataFrame(data)
df
grade = df['grade']
studytime = df['studytime']
import scipy
from scipy.stats import mannwhitneyu
sample1 = 30, 25, 59, 42, 31
sample2 = 140, 90, 95, 170, 120
stat, p = mannwhitneyu(sample1, sample2)
stat, p
6.15.5
x = [70, 67, 65, 75, 76, 73]
y = [69, 68, 70, 76, 77, 75]
z = [85, 86, 85, 76, 75, 73]
w = [95, 94, 89, 94, 93, 91]
from scipy import stats
stats.kruskal(x, y, z, w)
stats.kruskal(x, y)
pip install scikit-posthocs
import scikit_posthocs as sp
v = [[70, 67, 65, 75, 76, 73], [69, 68, 70, 76, 77, 75], [85,
86, 85, 76, 75, 73], [95, 94, 89, 94, 93, 91]]
sp.posthoc_nemenyi(v)
Chapter 7
7.1
7.2
7.3
7.4
7.5
7.6
7.7
7.8
7.9
pip install pyreadstat
import pyreadstat
df, meta = pyreadstat.read_sav("iq_data.sav")
df
y = df["verbal"]
x = df["quant"]
import numpy as np
import matplotlib.pyplot as plt
plt.scatter(x, y)
plt.title("Scatterplot of Verbal on Quant")
plt.xlabel("Quant”)
plt.ylabel("Verbal")
import statsmodels.api as sm
x = sm.add_constant(x)
x
model = sm.OLS(y, x).fit()
print_model = model.summary()
print(print_model)
7.10
7.11
7.12
7.13
import pyreadstat
df, meta = pyreadstat.read_sav("iq_data.sav")
df
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
X = df[['quant', 'analytic']]
Y = df['verbal']
regr = linear_model.LinearRegression()
regr.fit(X, Y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
new_quant = 20
new_analytic = 25
print('predicted verbal: \n', regr.predict([[new_quant,
new_analytic]]))
*** To get p-values, use the following code (not found in book):
import statsmodels.api as sm
X = df[['quant', 'analytic']]
Y = df['verbal']
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
predictions = model.predict(X)
print_model = model.summary()
print(print_model)
7.14
7.15
7.16
7.17
Chapter 8
8.1
8.2
8.3
odds = 0.5/(1-0.5)
odds
import numpy as np
np.log(odds)
odds = 1.0/(1.0 - 1.0)   # raises ZeroDivisionError: odds are undefined when p = 1
odds = 0.0/(1.0 - 0.0)   # odds equal 0 when p = 0
odds
2**3
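# np.log is the natural log; exp(log(2)) returns 2, confirming they are inverses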
np.log(2)
np.exp(0.6931471805599453)
8.4
import pandas as pd
data = {'oring' : [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0],
'temp' : [53, 57, 58, 63, 66, 67, 67, 67, 68, 69, 70, 70, 70, 70,
72, 73, 75, 75, 76, 76, 78, 79, 81]}
df_challenger = pd.DataFrame(data)
print(df_challenger)
df_challenger['oring'].value_counts()
import seaborn as sns
sns.countplot(x='oring', data = df_challenger, palette='hls')
import statsmodels.api as sm
y = df_challenger['oring']
X = df_challenger['temp']
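# sm.Logit does not add an intercept automatically; add_constant appends the constant column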
X_const = sm.add_constant(X)
model = sm.Logit(y, X_const)
results = model.fit()
print(results.summary())
predicted_logit = 15.0429 - 0.2322*X
predicted logit = 15.0429 - 0.2322(X)
                = 15.0429 - 0.2322(53)
                = 2.7363
predicted_logit
from scipy import stats
stats.spearmanr(predicted_logit, X)
import matplotlib.pyplot as plt
import numpy as np
plt.plot(predicted_logit, X)
plt.xlabel("predicted logit")
plt.ylabel("temp")
plt.title("temp as a function of logit")
8.5
import pandas as pd
import numpy as np
import statsmodels.api as sm
df = pd.read_csv('Smarket.csv', index_col=0, parse_dates=True)
df.head()
import statsmodels.formula.api as smf
# model formula: Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume
change = np.where(df['Direction']=='Up', 1, 0)
model_constant = sm.add_constant(df[['Lag1', 'Lag2', 'Lag3',
'Lag4', 'Lag5', 'Volume']])
model = sm.GLM(change, model_constant,
family=sm.families.Binomial()).fit()
print(model.summary())
import numpy as np
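# exponentiate the Lag1 coefficient to get an odds ratio, then convert it to a probability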
np.exp(-0.0731)
prob = np.exp(-0.0731)/(1 + np.exp(-0.0731))
prob
8.5.1
model_constant = sm.add_constant(df[['Lag1']])
model = sm.GLM(change, model_constant,
family=sm.families.Binomial()).fit()
print(model.summary())
8.6
Chapter 9
9.1
9.2
9.3
9.4
9.4.1
9.4.2
9.4.3
9.4.4
9.5
9.6
import pandas as pd
data = {'quant' : [5, 2, 6, 9, 8, 7, 9, 10, 10],
'verbal' : [2, 1, 3, 7, 9, 8, 8, 10, 9],
'train' : [1, 1, 1, 2, 2, 2, 3, 3, 3]}
df_manova = pd.DataFrame(data)
print(df_manova)
print(df_manova.dtypes)
cols = ['train']
for col in cols:
    df_manova[col] = df_manova[col].astype('category')
print(df_manova.dtypes)
from statsmodels.multivariate.manova import MANOVA
maov = MANOVA.from_formula('quant + verbal ~ train', data =
df_manova)
print(maov.mv_test())
9.7
9.8
9.9
9.10
data_discrim = {'y' : [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'x1' : [4, 3, 3, 2, 2, 8, 7, 5, 3, 3],
'x2' : [2, 1, 2, 2, 5, 3, 4, 5, 4, 2]}
df_discrim = pd.DataFrame(data_discrim)
df_discrim
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
X = np.array([[4, 2], [3, 1], [3, 2], [2, 2], [2, 5], [8, 3], [7,
4], [5, 5], [3, 4], [3, 2]])
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
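# two classes allow at most one discriminant direction; recent scikit-learn versions reject n_components=2, hence the refit below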
lda = LinearDiscriminantAnalysis(n_components=1)
model = lda.fit(X, y)
scores = lda.transform(X)
scores
print(lda.scalings_)
y = -3.283 + 0.49739549(x1) + 0.43107609(x2)
  = -3.283 + 0.49739549(4) + 0.43107609(2)
  = -3.283 + 1.98958196 + 0.86215218
  = -0.43126586
m = np.dot(lda.means_ - lda.xbar_, lda.scalings_)
m
pred=model.predict(X)
pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(confusion_matrix(pred, y))
9.11
train = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
X = np.array([[5, 2], [2, 1], [6, 3], [9, 7], [8, 9], [7, 8], [9,
8], [10, 10], [10, 9]])
X
model = lda.fit(X, train)
model
print(lda.scalings_)
lda.transform(X).shape
lda.transform(X)
pred = model.predict(X)
pred
print(confusion_matrix(pred, train))
9.12
9.13
Chapter 10
10.1
x = [.00, .90, 1.80, 2.60, 3.30, 4.40, 5.20, 6.10, 6.50, 7.40]
y = [5.90, 5.40, 4.40, 4.60, 3.50, 3.70, 2.80, 2.80, 2.40, 1.50]
pca_data = pd.DataFrame(x, y)
pca_data
import statistics
statistics.variance(x)
statistics.variance(y)
total_variance = statistics.variance(x) + statistics.variance(y)
total_variance
10.2
data = np.array([x, y])
data
covMatrix = np.cov(data, bias = False)
covMatrix
import seaborn as sn
sn.heatmap(covMatrix, annot=True, fmt='g')
import numpy.linalg as la
eigen = la.eig(covMatrix)
eigen
eigenvalue, eigenvector = la.eig(covMatrix)
eigenvalue, eigenvector
10.3
import pandas as pd
data = {'x': [0, 0.9, 1.8, 2.6, 3.3, 4.4, 5.2, 6.1, 6.5, 7.4],
'y': [5.9, 5.4, 4.4, 4.6, 3.5, 3.7, 2.8, 2.8, 2.4, 1.5]}
df = pd.DataFrame(data,columns = ['x', 'y'])
corrMatrix = df.corr()
print(corrMatrix)
import numpy.linalg as la
eigen = la.eig(corrMatrix)
eigen
10.4
10.5
A = np.array([[5.9, 0.0], [5.4, 0.9], [4.4, 1.8], [4.6, 2.6],
[3.5, 3.3], [3.7, 4.4], [2.8, 5.2], [2.8, 6.1], [2.4, 6.5], [1.5,
7.4]])
A
import matplotlib.pyplot as plt
plt.plot(x, y)
plt.plot(x, y, 'o', color='black')
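# PCA on the raw coordinates; explained_variance_ returns the eigenvalues of the sample covariance matrix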
from sklearn.decomposition import PCA
pca = PCA(2)
pca.fit(A)
print(pca.components_)
print(pca.explained_variance_)
from sklearn.decomposition import PCA
pca = PCA(1)
pca.fit(A)
print(pca.components_)
print(pca.explained_variance_)
10.6
10.7
10.8
data = pd.read_csv('usarrests.csv')
data.head(10)
df = pd.DataFrame(data, columns=['Murder', 'Assault', 'Urbanpop',
'Rape'])
df
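# note: 'Urbanpop' above does not match the capitalization in the file, so that column loads as NaN; the corrected spelling follows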
df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop',
'Rape'])
df
import numpy as np
np.mean(df)
np.var(df)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
scaled_data
scaled_data_df = pd.DataFrame(scaled_data, columns = df.columns)
scaled_data_df
np.mean(scaled_data_df)
np.var(scaled_data_df)
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca.fit(scaled_data_df)
pca.components_
print(pca.explained_variance_ratio_)
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca.fit(df)
pca.components_
print(pca.explained_variance_)
df.var()
df.std()
10.9
pip install pca
from pca import pca
pip install yellowbrick
import yellowbrick
model = pca(n_components=4)
X = scaled_data
results = model.fit_transform(X)
results
fig, ax = model.scatter()
fig, ax = model.biplot(n_feat=4)
# PCA here is yellowbrick's visualizer, imported so the name does not clash with sklearn's PCA
from yellowbrick.features import PCA
visualizer = PCA(scale=True, proj_features=True)
visualizer.fit_transform(X)
visualizer.show()
fig, ax = model.plot()
Chapter 11
11.1
11.2
11.3
11.4
11.5
import pandas as pd
import numpy as np
pca_data = np.array([[0, 5.90], [.90, 5.40], [1.80, 4.40], [2.60,
4.60], [3.30, 3.50], [4.40, 3.70], [5.20, 2.80], [6.10, 2.80],
[6.50, 2.40], [7.40, 1.50]])
pca_data
pip install factor_analyzer
import factor_analyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value,p_value=calculate_bartlett_sphericity(pca_data)
chi_square_value, p_value
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all,kmo_model=calculate_kmo(pca_data)
kmo_model
pip install FactorAnalyzer
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer(rotation=None, method='minres', n_factors=1)
fa.fit(pca_data)
fa.loadings_
fa.get_communalities()
11.6
data = pd.read_csv('usarrests.csv')
data.head(10)
pip install factor_analyzer
import pandas as pd
from factor_analyzer import FactorAnalyzer
df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop',
'Rape'])
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
scaled_data
fa = FactorAnalyzer()
fa.set_params(n_factors=4, rotation=None)
fa.fit(scaled_data)
fa.loadings_
ev, v = fa.get_eigenvalues()
ev
fa.get_communalities()
(0.84370394)**2 + (-0.37474146)**2 + (-0.07321271)**2
1-0.85762759
fa_varimax = FactorAnalyzer(rotation='varimax')
fa_varimax.fit(scaled_data)
fa_varimax.loadings_
fa_varimax.get_communalities()
Chapter 12
12.1
12.2
12.3
12.4
12.5
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
iris = sns.load_dataset("iris")
print(iris.head())
df = iris[['sepal_length', 'sepal_width', 'petal_length',
'petal_width']]
df.head(3)
kmeans = KMeans(n_clusters=3)
kmeans.fit(df)
kmeans.predict(df)
centroids = kmeans.cluster_centers_
centroids
kpredict = kmeans.predict(df)
plt.scatter(iris['petal_length'], iris['petal_width'], c =
kpredict, cmap = 'cool')
plt.scatter(iris['sepal_length'], iris['sepal_width'], c =
kpredict, cmap = 'cool')
cluster = kmeans.labels_
cluster1 = iris.loc[cluster == 0]
cluster2 = iris.loc[cluster == 1]
cluster3 = iris.loc[cluster == 2]
cluster1.describe()
12.6
12.7
df = iris[['sepal_length', 'sepal_width', 'petal_length',
'petal_width']]
iris.drop(['species'], axis=1, inplace = True)
iris
import seaborn as sns
sns.pairplot(iris)
from sklearn.cluster import AgglomerativeClustering
groups = AgglomerativeClustering(n_clusters=3,
affinity='euclidean', linkage='single')
groups.fit_predict(iris)
iris
plt.scatter(iris['petal_length'], iris['petal_width'], c =
groups.labels_, cmap='cool')
End of Code.