using RDatasets # Import some datasets from R
using Distributions # 
using StatsPlots, StatsBase
using LaTeXStrings #


x = [0.31, 0.29, 0.34, 0.06]
names = ["A", "B", "C", "D"]
pie(names, x, size = (1000,500))


p1 = bar(names, x, label = :none)
ylims!(0,1)


# Display DataFrame
iris = dataset("datasets", "iris")


# Sort data set
sort(iris, :SepalLength)


@df iris plot(x-> ecdf(:PetalLength)(x),0.5,8, label = :none)
@df iris scatter!(:PetalLength, zeros(length(:PetalLength)), label = "Observations") # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Sepal Length: Empirical CDF")
xlabel!("Length (cm)")


N = 30 # Number of realization
distribution = Exponential(1) # Exponential distribution of parameter λ = 1
X = rand(distribution, N) # Generate N realization of one random variable X=X₁

30-element Vector{Float64}:
 0.4195716851118271
 1.126095617007512
 0.3060691733047449
 0.0930094977561999
 0.884458929329341
 1.965236281242868
 2.1258115097592873
 0.6653917010885777
 0.6967661713701083
 0.45853221364001673
 0.2898235172480393
 1.4839751324172998
 0.41392202037492043
 ⋮
 0.2012400346112175
 0.20667094059578098
 0.011333649823350842
 1.211464679952854
 3.1123661178876314
 1.1778674615737303
 0.866288671922453
 0.027839554079315904
 0.5229709383161959
 0.6379734335140371
 0.3784073756970474
 2.4133658303940386


x_range = -0.5:0.01:maximum(X)*1.5
plot(x-> ecdf(X)(x),x_range, label = "Empirical CDF")
scatter!(X, zeros(N) , label = "Realizations", c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
plot!(x -> cdf(distribution, x), x_range, label = "CDF")
ylims!(-0.1,1.1, legend = :bottomright)


histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: absolute count")


histogram(X, label = :none, bins = 1000)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count: Too much bin!")


ea_histogram(X, label = :none)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Absolute count (same area bins)")


# Not adapted for continuous distributions
histogram(X, label = :none, norm = :probability)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("Relative count -> height sums to one")


# Not adapted for continuous distributions
histogram(X, label = :none, norm = :pdf)
scatter!(X, zeros(N) , label = :none, c = 2) # Show the realization of X₁,⋯,Xₙ on the x-axis
title!("p.d.f. -> integrates to 1")


#Kernel density estimate function
fₙ(x::Real, X::Vector, h::Real, K::Function) = sum(K((x-X[i])/h) for i in eachindex(X))/(length(X)*h)

fₙ (generic function with 1 method)


# Parameter choice -> In general problem dependent

# Test different value
h = 0.15 # Width -> Critical parameter

# Test different Kernel
# Kernel function -> Critical choice
# K(x) = pdf(Normal(0,1), x) # Gaussian kernel
# K(x) = pdf(Uniform(-1/2,1/2), x) # Uniform ("Histogram") kernel
K(x) = 3/4*max(1-x^2, 0) # Epanechnikov (minimizes MSE)

K (generic function with 1 method)


x_range = -0.5:0.01:maximum(X)*1.5
# Uncomment to see the histogram estimate
#histogram(X, label = :none, bins = 5, norm = :pdf, alpha = 0.5)
plot(x -> pdf(distribution, x), x_range, label = "True pdf", lw = 2, c = 1)

scatter!(X, zeros(N), label = "Realizations") # Show the realization of X₁,⋯,Xₙ on the x-axis

# Uncomment to see each individual Kernel of the sum
#[plot!(x -> K((x-X[i])/h), x_range, label = :none, lw = 2, c = 3) for i in eachindex(X)]

plot!(x -> fₙ(x, X, h, K), x_range, label = "kde", lw = 2)

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
	Float64	Float64	Float64	Float64	Categorical…
1	5.1	3.5	1.4	0.2	setosa
2	4.9	3.0	1.4	0.2	setosa
3	4.7	3.2	1.3	0.2	setosa
4	4.6	3.1	1.5	0.2	setosa
5	5.0	3.6	1.4	0.2	setosa
6	5.4	3.9	1.7	0.4	setosa
7	4.6	3.4	1.4	0.3	setosa
8	5.0	3.4	1.5	0.2	setosa
9	4.4	2.9	1.4	0.2	setosa
10	4.9	3.1	1.5	0.1	setosa
11	5.4	3.7	1.5	0.2	setosa
12	4.8	3.4	1.6	0.2	setosa
13	4.8	3.0	1.4	0.1	setosa
14	4.3	3.0	1.1	0.1	setosa
15	5.8	4.0	1.2	0.2	setosa
16	5.7	4.4	1.5	0.4	setosa
17	5.4	3.9	1.3	0.4	setosa
18	5.1	3.5	1.4	0.3	setosa
19	5.7	3.8	1.7	0.3	setosa
20	5.1	3.8	1.5	0.3	setosa
21	5.4	3.4	1.7	0.2	setosa
22	5.1	3.7	1.5	0.4	setosa
23	4.6	3.6	1.0	0.2	setosa
24	5.1	3.3	1.7	0.5	setosa
25	4.8	3.4	1.9	0.2	setosa
26	5.0	3.0	1.6	0.2	setosa
27	5.0	3.4	1.6	0.4	setosa
28	5.2	3.5	1.5	0.2	setosa
29	5.2	3.4	1.4	0.2	setosa
30	4.7	3.2	1.6	0.2	setosa
⋮	⋮	⋮	⋮	⋮	⋮

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
	Float64	Float64	Float64	Float64	Categorical…
1	4.3	3.0	1.1	0.1	setosa
2	4.4	2.9	1.4	0.2	setosa
3	4.4	3.0	1.3	0.2	setosa
4	4.4	3.2	1.3	0.2	setosa
5	4.5	2.3	1.3	0.3	setosa
6	4.6	3.1	1.5	0.2	setosa
7	4.6	3.4	1.4	0.3	setosa
8	4.6	3.6	1.0	0.2	setosa
9	4.6	3.2	1.4	0.2	setosa
10	4.7	3.2	1.3	0.2	setosa
11	4.7	3.2	1.6	0.2	setosa
12	4.8	3.4	1.6	0.2	setosa
13	4.8	3.0	1.4	0.1	setosa
14	4.8	3.4	1.9	0.2	setosa
15	4.8	3.1	1.6	0.2	setosa
16	4.8	3.0	1.4	0.3	setosa
17	4.9	3.0	1.4	0.2	setosa
18	4.9	3.1	1.5	0.1	setosa
19	4.9	3.1	1.5	0.2	setosa
20	4.9	3.6	1.4	0.1	setosa
21	4.9	2.4	3.3	1.0	versicolor
22	4.9	2.5	4.5	1.7	virginica
23	5.0	3.6	1.4	0.2	setosa
24	5.0	3.4	1.5	0.2	setosa
25	5.0	3.0	1.6	0.2	setosa
26	5.0	3.4	1.6	0.4	setosa
27	5.0	3.2	1.2	0.2	setosa
28	5.0	3.5	1.3	0.3	setosa
29	5.0	3.5	1.6	0.6	setosa
30	5.0	3.3	1.4	0.2	setosa
⋮	⋮	⋮	⋮	⋮	⋮

Pie plot VS Bar plot¶

Example of famous dataset¶

First data plot¶

Example on simulated data¶

Empirical CDF¶

Histogram¶

Kernel density estimate¶