# I was greatly inspired by StatsWithJuliaBook
# https://statisticswithjulia.org/index.html
using CSV, Statistics, StatsBase, DataFrames, Statistics
using Random, Distributions, StatsPlots
function my_median(x::Vector)
n = length(x)
if isodd(n)
return sort(x)[(n+1)÷2]
else
#! This is a convention that can be changed
# takes the smallest of the two middle points -> this definition works with qualitative variables
return sort(x)[n÷2]
end
end
my_median (generic function with 1 method)
my_median(["A","F","B","C","D","E"])
"C"
my_median([1, 5, 6, 8, 0.2, π, -1.2,1])
1.0
function my_quantile(x::Vector, α::Real)
n = length(x)
if !isinteger(n*α) # If it is NOT an integer (! denote the negation here)
return sort(x)[ceil(Int, n*α)]
else
#! Here it works only for reals
return 1/2*(sort(x)[Int(n*α)] + sort(x)[Int(n*α) + 1] ) # some convention
end
end
my_quantile (generic function with 1 method)
my_quantile(["A","F","B","C","D","E","F","D","A"], 0.99)
"F"
my_quantile([1, 5, 6, 8, 0.2, π, -1.2,1], 0.5)
2.0707963267948966
N = 100
distribution = Exponential()
X = rand(distribution, N)
α_range = 1e-3:1e-3:1-1e-3
plot(α_range, quantile(distribution, α_range), label = "Theoritical quantile")
plot!(α_range, quantile(X, α_range), label = "Empirical quantile")
xlabel!("α", legend = :topleft)
histogram(X, label="Some data")
summarystats(X)
Summary Stats: Length: 100 Missing Count: 0 Mean: 0.940679 Minimum: 0.004529 1st Quartile: 0.289284 Median: 0.696518 3rd Quartile: 1.301311 Maximum: 3.828397
Laplace distribution $ f_{\mathrm{Laplace}}(x) = \dfrac{\lambda}{2}e^{-\lambda |x|} $
μ = 20
d₁ = Normal(μ, μ)
d₂ = Laplace(μ, μ/1.5)
n = 1000
X = rand(d₁,n)
Y = rand(d₂,n);
println("E(X) = ", mean(X), ", σ₁ = ", std(X))
println("E(Y) = ", mean(Y), ", σ₂ = ", std(Y))
E(X) = 21.03795030904557, σ₁ = 20.607354180304306 E(Y) = 20.939345572216812, σ₂ = 18.944859691781925
histogram([X,Y], alpha = 0.5, label=["Normal distribution Data" "Laplace distribution Data"], yscale =:identity)
qqnorm(X, ms=3, msw=0, label="Normal distribution Data")
qqnorm!(Y, ms=3, msw=0, label="Laplace distribution Data",
xlabel="Normal Theoretical Quantiles",
ylabel="Data Empirical Quantiles", legend=:topleft)
μ = 20
d₁ = Normal(-2,sqrt(5/3))
d₂ = TDist(5) # Student distribution
n = 10_000
X = rand(d₁,n)
Y = rand(d₂,n);
println("E(X) = ", mean(X), ", σ₁ = ", std(X))
println("E(Y) = ", mean(Y), ", σ₂ = ", std(Y))
E(X) = -2.0011132997932584, σ₁ = 1.2829128841947814 E(Y) = -0.010827771686638463, σ₂ = 1.2553641434714244
histogram([X,Y], alpha = 0.5, label=["Normal distribution Data" "Student's t-distribution Data"])
qqnorm(X, ms=3, msw=0, label="Normal Data")
qqnorm!(Y, ms=3, msw=0, label="Student's t-distribution Data",
xlabel="Normal Theoretical Quantiles",
ylabel="Data Empirical Quantiles", legend=:topleft)
qqplot(X, Y, ms=3, msw=0, c = 3, label=:none)
xlabel!("Normal Theoretical Quantiles")
ylabel!("Student's t-distribution Data")
qqplot(X, Y, ms=3, msw=0, c = 1, label=:none)
xlabel!("X")
ylabel!("Y")
#savefig("quiz_2_qqplot.png")
boxplot([Y,X], label=["A" "B"], legend = :bottom)
#savefig("quiz_2_boxplot.png")
plot(quantile(Normal(2),0:0.001:0.9), label = :none)
boxplot([X,Y], label=["Normal Data" "Student's t-distribution Data"], legend = :bottom)
#There are much more outliers in the Student's t-distribution while the rest is similar.
0% means total equality
100% means one person has it all
n = 100
X = 100*rand(n);
f(x) = 1/(1+x)
f (generic function with 1 method)
Y = f.(X) + 0.01*rand(n);
# Is Y correlated with X?
scatter(X,Y, label = :none)
xlabel!("X")
ylabel!("Y")
# Pearson correlation coefficient
cor(X,Y)
-0.6704291228492733
# Spearman correlation coefficient
corspearman(X,Y)
-0.9523912391239124
n = 100
X = 100*rand(n);
Y = 100*rand(n);
# Is Y correlated with X?
scatter(X,Y, label = :none)
xlabel!("X")
ylabel!("Y")
# Pearson correlation coefficient
cor(X,Y)
0.14085127808241357
# Spearman correlation coefficient
corspearman(X,Y)
0.1573957395739574
Y = -(X) +5 *randn(n);
# Is Y correlated with X?
scatter(X,Y, label = :none)
println(cor(X,Y))
println(corspearman(X,Y))
xlabel!("X")
ylabel!("Y")
-0.984976017088089 -0.9825622562256225
N = 1000
X = rand(MixtureModel([Normal(-3,1), Normal(2,1)]), N)
boxplot(X)