Statistics and Data Manipulation¶

(Back to Overview)

Aside: Piping, Mapping, and Filtering¶

In [3]:
import Base: filter, map
filter(f::Function)::Function = x -> filter(f, x)
map(f::Function)::Function    = x -> map(f, x)
Out[3]:
map (generic function with 67 methods)
In [30]:
x = collect(range(0, 10))
Out[30]:
11-element Vector{Int64}:
  0
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
In [31]:
x |> map(x->x+1)
Out[31]:
11-element Vector{Int64}:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
In [32]:
x |> map(x->x+1) |> filter(x->x%2==0)
Out[32]:
5-element Vector{Int64}:
  2
  4
  6
  8
 10
In [33]:
using Chain
In [35]:
@chain x begin
    map(x->x+1, _)
    filter(x->x%2==0, _)
end
Out[35]:
5-element Vector{Int64}:
  2
  4
  6
  8
 10

Statistics and Feature Extraction¶

In [7]:
using Dates
using TimeZones
using MySQL
using DataFrames
using JSON

import Base: @kwdef

data_directory = joinpath(@__DIR__, "..", "..", "..", "exercises", "fridge")

timestamp(t; target_tz::VariableTimeZone=tz"America/Los_Angeles") = astimezone(
    ZonedDateTime(Dates.unix2datetime(t), tz"GMT"), target_tz
)

function get_emeter_data(conn::MySQL.Connection, name::String, idx::Int64)
    DBInterface.execute(
        DBInterface.prepare(conn,
            "SET @v1 := (SELECT emeter_start FROM $(name) WHERE idx = $(idx))"
        )
    )
    DBInterface.execute(
        DBInterface.prepare(conn,
            "SET @v2 := (SELECT emeter_end FROM $(name) WHERE idx = $(idx))"
        )
    )
    events = DBInterface.execute(
        DBInterface.prepare(conn,
            "SELECT * FROM $(name)_emeter WHERE idx BETWEEN @v1 and @v2"
        )
    ) |> DataFrame
    events = transform(
        events,
        :t => (x->timestamp.(x)) => :timestamp
    )
    return events
end

@kwdef mutable struct DBCredentials
    host::String
    name::String
    username::String
    password::String
end

db_credenitals = DBCredentials(
    ;JSON.parsefile(
        joinpath(data_directory, "db.json"),
        dicttype=Dict{Symbol,String}
    )...
)
# db_credenitals.host = "127.0.0.1";

db_conn = DBInterface.connect(
    MySQL.Connection, db_credenitals.host,
    db_credenitals.username, db_credenitals.password,
    db=db_credenitals.name
)

name = "Arstotzka_Ministry_of_Energy"
idx_max = 61
idx_blacklist = [
    1,2,3,4,5,6,7,8,9,  # Debugging
    51  #  Battery Charger
]

all_emeter_data = DataFrame[]

for idx=1:idx_max
    if idx in idx_blacklist
        continue
    end
    push!(all_emeter_data, get_emeter_data(db_conn, name, idx))
end

all_emeter_data = vcat(all_emeter_data...);
In [8]:
describe(all_emeter_data)
Out[8]:

7 rows × 7 columns (omitted printing of 2 columns)

variablemeanminmedianmax
SymbolUnion…AnyUnion…Any
1idx2.4833e6122.48353e64958709
2t1.6465e91.64349e91.6465e91.64955e9
3ma414.049060.09012
4mv1.21104e5106168121184.0124168
5mw45782.764667246.0918210
6wh14888.4012321.037943
7timestamp2022-01-29T12:19:41.336-08:002022-04-09T17:29:25.218-07:00
In [9]:
all_emeter_data[end,:]
Out[9]:

DataFrameRow (7 columns)

idxtmamvmwwhtimestamp
Int32Float64?Int32?Int32?Int32?Int32?ZonedDat…
495035049587091.64955e9591200377106252002022-04-09T17:29:25.218-07:00
In [10]:
data = @view all_emeter_data[end-Int(2e5):end, :]
Out[10]:

200,001 rows × 7 columns

idxtmamvmwwhtimestamp
Int32Float64?Int32?Int32?Int32?Int32?ZonedDat…
147587091.64929e9601216047243210872022-04-06T17:06:07.072-07:00
247587101.64929e9601216047293210872022-04-06T17:06:08.072-07:00
347587111.64929e9601216047293210872022-04-06T17:06:09.074-07:00
447587121.64929e9601216047293210872022-04-06T17:06:10.075-07:00
547587131.64929e9601215597293210872022-04-06T17:06:11.077-07:00
647587141.64929e9601215597293210872022-04-06T17:06:12.078-07:00
747587151.64929e9601215597315210872022-04-06T17:06:13.079-07:00
847587161.64929e9601215597315210872022-04-06T17:06:14.081-07:00
947587171.64929e9601215597315210872022-04-06T17:06:15.082-07:00
1047587181.64929e9601215607301210872022-04-06T17:06:16.083-07:00
1147587191.64929e9601214847248210872022-04-06T17:06:22.745-07:00
1247587201.64929e9601214847248210872022-04-06T17:06:26.675-07:00
1347587211.64929e9601214847300210872022-04-06T17:06:28.463-07:00
1447587221.64929e9601214847300210872022-04-06T17:06:29.464-07:00
1547587231.64929e9601214847300210872022-04-06T17:06:30.465-07:00
1647587241.64929e9601214847300210872022-04-06T17:06:31.466-07:00
1747587251.64929e9601214847297210872022-04-06T17:06:32.467-07:00
1847587261.64929e9601215147284210872022-04-06T17:06:33.468-07:00
1947587271.64929e9601212817292210872022-04-06T17:06:40.013-07:00
2047587281.64929e9601210677219210872022-04-06T17:07:00.210-07:00
2147587291.64929e9601210677219210872022-04-06T17:07:01.211-07:00
2247587301.64929e9601212417238210872022-04-06T17:07:02.212-07:00
2347587311.64929e9601212417238210872022-04-06T17:07:07.864-07:00
2447587321.64929e9601212417238210872022-04-06T17:07:08.865-07:00
2547587331.64929e9601212417238210872022-04-06T17:07:09.867-07:00
2647587341.64929e9601215257238210872022-04-06T17:07:10.868-07:00
2747587351.64929e9601215257238210872022-04-06T17:07:11.869-07:00
2847587361.64929e9601215257306210872022-04-06T17:07:12.871-07:00
2947587371.64929e9601215257306210872022-04-06T17:07:13.872-07:00
3047587381.64929e9601215257306210872022-04-06T17:07:14.873-07:00
⋮⋮⋮⋮⋮⋮⋮⋮
In [14]:
plot(data[:, :mv])
Out[14]:
In [15]:
using StatsBase, LinearAlgebra

t = @view all_emeter_data[!, :mv]
h = fit(Histogram, t, 1:8:maximum(t))
h = normalize(h, mode=:density)
Out[15]:
Histogram{Float64, 1, Tuple{StepRange{Int64, Int64}}}
edges:
  1:8:124161
weights: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.875, 0.0, 0.625, 0.75, 0.0, 0.0, 0.0, 0.0]
closed: left
isdensity: true
In [16]:
e = collect(h.edges[1])
x = e[1:end-1] + diff(e)/2
y = h.weights
plot(x[y.>0], y[y.>0], legend=:topleft)
Out[16]:

For interactive feature extraction, explore exercises/fridge/Analyze Fridge.ipynb notebook on GitHub

In [19]:
using AverageShiftedHistograms
In [22]:
kd = ash(t);
plot(kd)
Out[22]: