using RDatasets # Import some datasets from R
using Distributions #
using StatsPlots, StatsBase
using LaTeXStrings #
x = [0.31, 0.29, 0.34, 0.06]
names = ["A", "B", "C", "D"]
pie(names, x, size = (1000,500))
p1 = bar(names, x, label = :none)
ylims!(0,1)
The Iris data set:
# Display DataFrame
iris = dataset("datasets", "iris")
SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Categorical… | |
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
8 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
10 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
11 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
12 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
13 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
14 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
15 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
16 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
17 | 5.4 | 3.9 | 1.3 | 0.4 | setosa |
18 | 5.1 | 3.5 | 1.4 | 0.3 | setosa |
19 | 5.7 | 3.8 | 1.7 | 0.3 | setosa |
20 | 5.1 | 3.8 | 1.5 | 0.3 | setosa |
21 | 5.4 | 3.4 | 1.7 | 0.2 | setosa |
22 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
23 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
24 | 5.1 | 3.3 | 1.7 | 0.5 | setosa |
25 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
26 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
27 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
28 | 5.2 | 3.5 | 1.5 | 0.2 | setosa |
29 | 5.2 | 3.4 | 1.4 | 0.2 | setosa |
30 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
# Sort data set
sort(iris, :SepalLength)
SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Categorical… | |
1 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
2 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
3 | 4.4 | 3.0 | 1.3 | 0.2 | setosa |
4 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
5 | 4.5 | 2.3 | 1.3 | 0.3 | setosa |
6 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
8 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
9 | 4.6 | 3.2 | 1.4 | 0.2 | setosa |
10 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
11 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
12 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
13 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
14 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
15 | 4.8 | 3.1 | 1.6 | 0.2 | setosa |
16 | 4.8 | 3.0 | 1.4 | 0.3 | setosa |
17 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
18 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
19 | 4.9 | 3.1 | 1.5 | 0.2 | setosa |
20 | 4.9 | 3.6 | 1.4 | 0.1 | setosa |
21 | 4.9 | 2.4 | 3.3 | 1.0 | versicolor |
22 | 4.9 | 2.5 | 4.5 | 1.7 | virginica |
23 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
24 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
25 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
26 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
27 | 5.0 | 3.2 | 1.2 | 0.2 | setosa |
28 | 5.0 | 3.5 | 1.3 | 0.3 | setosa |
29 | 5.0 | 3.5 | 1.6 | 0.6 | setosa |
30 | 5.0 | 3.3 | 1.4 | 0.2 | setosa |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
@df iris plot(x-> ecdf(:PetalLength)(x),0.5,8, label = :none)
@df iris scatter!(:PetalLength, zeros(length(:PetalLength)), label = "Observations") # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Sepal Length: Empirical CDF")
xlabel!("Length (cm)")
N = 30 # Number of realization
distribution = Exponential(1) # Exponential distribution of parameter λ = 1
X = rand(distribution, N) # Generate N realization of one random variable X=X₁
30-element Vector{Float64}: 0.4195716851118271 1.126095617007512 0.3060691733047449 0.0930094977561999 0.884458929329341 1.965236281242868 2.1258115097592873 0.6653917010885777 0.6967661713701083 0.45853221364001673 0.2898235172480393 1.4839751324172998 0.41392202037492043 ⋮ 0.2012400346112175 0.20667094059578098 0.011333649823350842 1.211464679952854 3.1123661178876314 1.1778674615737303 0.866288671922453 0.027839554079315904 0.5229709383161959 0.6379734335140371 0.3784073756970474 2.4133658303940386
x_range = -0.5:0.01:maximum(X)*1.5
plot(x-> ecdf(X)(x),x_range, label = "Empirical CDF")
scatter!(X, zeros(N) , label = "Realizations", c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
plot!(x -> cdf(distribution, x), x_range, label = "CDF")
ylims!(-0.1,1.1, legend = :bottomright)
histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: absolute count")
Different bin number
histogram(X, label = :none, bins = 1000)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: Too much bin!")
Same area histogram: 'box' in the histogram contains the same number of sample points and all boxes have the same area
ea_histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count (same area bins)")
Relative Frequency
# Not adapted for continuous distributions
histogram(X, label = :none, norm = :probability)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Relative count -> height sums to one")
# Not adapted for continuous distributions
histogram(X, label = :none, norm = :pdf)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("p.d.f. -> integrates to 1")
The histogram is one type of pdf estimate. Kernel density estimate is another type (more generic since you can recover histograms for uniform Kernels $K(x)$
#Kernel density estimate function
fₙ(x::Real, X::Vector, h::Real, K::Function) = sum(K((x-X[i])/h) for i in eachindex(X))/(length(X)*h)
fₙ (generic function with 1 method)
# Parameter choice -> In general problem dependent
# Test different value
h = 0.15 # Width -> Critical parameter
# Test different Kernel
# Kernel function -> Critical choice
# K(x) = pdf(Normal(0,1), x) # Gaussian kernel
# K(x) = pdf(Uniform(-1/2,1/2), x) # Uniform ("Histogram") kernel
K(x) = 3/4*max(1-x^2, 0) # Epanechnikov (minimizes MSE)
K (generic function with 1 method)
x_range = -0.5:0.01:maximum(X)*1.5
# Uncomment to see the histogram estimate
#histogram(X, label = :none, bins = 5, norm = :pdf, alpha = 0.5)
plot(x -> pdf(distribution, x), x_range, label = "True pdf", lw = 2, c = 1)
scatter!(X, zeros(N), label = "Realizations") # Show the realization of X₁,⋯,Xₙ on the x-axis
# Uncomment to see each individual Kernel of the sum
#[plot!(x -> K((x-X[i])/h), x_range, label = :none, lw = 2, c = 3) for i in eachindex(X)]
plot!(x -> fₙ(x, X, h, K), x_range, label = "kde", lw = 2)