using RDatasets # Import some datasets from R
using Distributions #
using StatsPlots, StatsBase
using LaTeXStrings #
x = [0.31, 0.29, 0.34, 0.06]
names = ["A", "B", "C", "D"]
pie(names, x, size = (1000,500))
p1 = bar(names, x, label = :none)
ylims!(0,1)
The Iris data set:
# Display DataFrame
iris = dataset("datasets", "iris")
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Categorical… | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 8 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 10 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 11 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 12 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
| 13 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
| 14 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
| 15 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 16 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 17 | 5.4 | 3.9 | 1.3 | 0.4 | setosa |
| 18 | 5.1 | 3.5 | 1.4 | 0.3 | setosa |
| 19 | 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 20 | 5.1 | 3.8 | 1.5 | 0.3 | setosa |
| 21 | 5.4 | 3.4 | 1.7 | 0.2 | setosa |
| 22 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
| 23 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
| 24 | 5.1 | 3.3 | 1.7 | 0.5 | setosa |
| 25 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
| 26 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
| 27 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
| 28 | 5.2 | 3.5 | 1.5 | 0.2 | setosa |
| 29 | 5.2 | 3.4 | 1.4 | 0.2 | setosa |
| 30 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
# Sort data set
sort(iris, :SepalLength)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Categorical… | |
| 1 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
| 2 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 3 | 4.4 | 3.0 | 1.3 | 0.2 | setosa |
| 4 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
| 5 | 4.5 | 2.3 | 1.3 | 0.3 | setosa |
| 6 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 8 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
| 9 | 4.6 | 3.2 | 1.4 | 0.2 | setosa |
| 10 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 11 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
| 12 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
| 13 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
| 14 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
| 15 | 4.8 | 3.1 | 1.6 | 0.2 | setosa |
| 16 | 4.8 | 3.0 | 1.4 | 0.3 | setosa |
| 17 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 18 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 19 | 4.9 | 3.1 | 1.5 | 0.2 | setosa |
| 20 | 4.9 | 3.6 | 1.4 | 0.1 | setosa |
| 21 | 4.9 | 2.4 | 3.3 | 1.0 | versicolor |
| 22 | 4.9 | 2.5 | 4.5 | 1.7 | virginica |
| 23 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 24 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 25 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
| 26 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
| 27 | 5.0 | 3.2 | 1.2 | 0.2 | setosa |
| 28 | 5.0 | 3.5 | 1.3 | 0.3 | setosa |
| 29 | 5.0 | 3.5 | 1.6 | 0.6 | setosa |
| 30 | 5.0 | 3.3 | 1.4 | 0.2 | setosa |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
@df iris plot(x-> ecdf(:PetalLength)(x),0.5,8, label = :none)
@df iris scatter!(:PetalLength, zeros(length(:PetalLength)), label = "Observations") # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Sepal Length: Empirical CDF")
xlabel!("Length (cm)")
N = 30 # Number of realization
distribution = Exponential(1) # Exponential distribution of parameter λ = 1
X = rand(distribution, N) # Generate N realization of one random variable X=X₁
30-element Vector{Float64}:
0.4195716851118271
1.126095617007512
0.3060691733047449
0.0930094977561999
0.884458929329341
1.965236281242868
2.1258115097592873
0.6653917010885777
0.6967661713701083
0.45853221364001673
0.2898235172480393
1.4839751324172998
0.41392202037492043
⋮
0.2012400346112175
0.20667094059578098
0.011333649823350842
1.211464679952854
3.1123661178876314
1.1778674615737303
0.866288671922453
0.027839554079315904
0.5229709383161959
0.6379734335140371
0.3784073756970474
2.4133658303940386
x_range = -0.5:0.01:maximum(X)*1.5
plot(x-> ecdf(X)(x),x_range, label = "Empirical CDF")
scatter!(X, zeros(N) , label = "Realizations", c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
plot!(x -> cdf(distribution, x), x_range, label = "CDF")
ylims!(-0.1,1.1, legend = :bottomright)
histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: absolute count")
Different bin number
histogram(X, label = :none, bins = 1000)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: Too much bin!")
Same area histogram: 'box' in the histogram contains the same number of sample points and all boxes have the same area
ea_histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count (same area bins)")
Relative Frequency
# Not adapted for continuous distributions
histogram(X, label = :none, norm = :probability)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Relative count -> height sums to one")
# Not adapted for continuous distributions
histogram(X, label = :none, norm = :pdf)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("p.d.f. -> integrates to 1")
The histogram is one type of pdf estimate. Kernel density estimate is another type (more generic since you can recover histograms for uniform Kernels $K(x)$
#Kernel density estimate function
fₙ(x::Real, X::Vector, h::Real, K::Function) = sum(K((x-X[i])/h) for i in eachindex(X))/(length(X)*h)
fₙ (generic function with 1 method)
# Parameter choice -> In general problem dependent
# Test different value
h = 0.15 # Width -> Critical parameter
# Test different Kernel
# Kernel function -> Critical choice
# K(x) = pdf(Normal(0,1), x) # Gaussian kernel
# K(x) = pdf(Uniform(-1/2,1/2), x) # Uniform ("Histogram") kernel
K(x) = 3/4*max(1-x^2, 0) # Epanechnikov (minimizes MSE)
K (generic function with 1 method)
x_range = -0.5:0.01:maximum(X)*1.5
# Uncomment to see the histogram estimate
#histogram(X, label = :none, bins = 5, norm = :pdf, alpha = 0.5)
plot(x -> pdf(distribution, x), x_range, label = "True pdf", lw = 2, c = 1)
scatter!(X, zeros(N), label = "Realizations") # Show the realization of X₁,⋯,Xₙ on the x-axis
# Uncomment to see each individual Kernel of the sum
#[plot!(x -> K((x-X[i])/h), x_range, label = :none, lw = 2, c = 3) for i in eachindex(X)]
plot!(x -> fₙ(x, X, h, K), x_range, label = "kde", lw = 2)