Commit 09df0e9a authored by Christian Marius Lillelund
Browse files

Feature 2020data

parent 93835b5a
......@@ -2,10 +2,10 @@ stages:
- test
test:
image: "python:3.7"
image: "python:3.8"
stage: test
script:
- apt-get update -qy
- apt-get install -y python-dev python-pip
- pip install -r requirements.txt
- pytest
\ No newline at end of file
- python setup.py test
\ No newline at end of file
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
.PHONY: clean data test lint requirements sync_data_to_s3 sync_data_from_s3
#################################################################################
# GLOBALS #
......@@ -38,6 +38,9 @@ clean:
lint:
flake8 src
test:
PYTHONPATH=. pytest
## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
......
This diff is collapsed.
%% Cell type:code id: tags:
```
import pandas as pd
import numpy as np
import tensorflow as tf
# Load the processed falls data. Forward slashes keep the relative path
# valid on Linux/macOS as well as Windows (backslashes are only a path
# separator on Windows).
df = pd.read_csv('../data/processed/falls.csv')
# Split the comma-separated 'Ats' strings into one column per position;
# expand=True already yields a DataFrame, so no extra wrapping is needed.
ats = df.Ats.str.split(pat=",", expand=True)
# Keep only the 'Falls' column and place the split columns next to it.
df = pd.concat([df[['Falls']], ats], axis=1)
```
%% Cell type:code id: tags:
```
def get_embedding_size(unique_values: int, min_size: int = 2,
                       max_size: int = 50) -> int:
    """Return the embedding width to use for a categorical feature.

    The width is half the number of distinct values, rounded up, then
    capped at ``max_size`` and finally raised to at least ``min_size``.

    Args:
        unique_values: Number of distinct values the category takes.
        min_size: Smallest width ever returned (default 2).
        max_size: Cap applied before the lower bound (default 50).

    Returns:
        The clamped embedding width as a plain ``int``.
    """
    half = int(np.ceil(unique_values / 2))
    # Cap first, then apply the floor, so the floor wins if the two conflict.
    return max(min_size, min(half, max_size))
class Category:
    """Describe one categorical feature.

    Stores the name it is referred to by (``alias``), how many distinct
    values it takes (``unique_values``) and the embedding width derived
    from that count (``embedding_size``).
    """

    def __init__(self, alias: str, unique_values: int):
        self.alias, self.unique_values = alias, unique_values
        # The width is computed from the cardinality by get_embedding_size.
        self.embedding_size = get_embedding_size(unique_values)
```
%% Cell type:code id: tags:
```
# Build one Category per feature column; the target column is not a feature.
target_name = 'Falls'
category_list = []
for category in df:
    if category == target_name:
        continue
    # Iterating the frame yields column labels; nunique() gives the number
    # of distinct values in that column.
    category_list.append(Category(category, df[category].nunique()))
```
%% Cell type:code id: tags:
```
df
```
%% Output
Falls 0 1 2 3 4 5 6 7 8 ... 13 14 15 \
0 0 1809 0 0 0 0 0 0 0 0 ... 0 0 0
1 0 0436 0 0 0 0 0 0 0 0 ... 0 0 0
2 0 0436 0 0 0 0 0 0 0 0 ... 0 0 0
3 0 0933 1815 1206 0433 1812 0 0 0 0 ... 0 0 0
4 0 1218 0933 0912 1222 0433 1830 1803 1812 0 ... 0 0 0
... ... ... ... ... ... ... ... ... ... .. ... .. .. ..
34693 0 2803 0 0 0 0 0 0 0 0 ... 0 0 0
34694 0 0436 0 0 0 0 0 0 0 0 ... 0 0 0
34695 0 0436 0 0 0 0 0 0 0 0 ... 0 0 0
34696 0 0436 0 0 0 0 0 0 0 0 ... 0 0 0
34697 0 1206 0933 1203 1222 0433 0 0 0 0 ... 0 0 0
16 17 18 19 20 21 22
0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0
... .. .. .. .. .. .. ..
34693 0 0 0 0 0 0 0
34694 0 0 0 0 0 0 0
34695 0 0 0 0 0 0 0
34696 0 0 0 0 0 0 0
34697 0 0 0 0 0 0 0
[34698 rows x 24 columns]
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import src.models.model_features as mf
import src.models.preprocessor as pp
import src.config as cfg
import models.model_features as mf
import models.preprocessor as pp
import config as cfg
# Reset all display options, then allow up to 100 columns to be shown.
pd.reset_option('^display.', silent=True)
pd.set_option('display.max_columns', 100)

# Load data
df = pd.read_csv('../data/interim/timeseries.csv')

# Make features
df = mf.make_citizen_training(df)
df = mf.make_citizen_ats(df)

# Determine if people completed or failed.
# Completed: rows with NumberWeeksSum >= 8 and NumberTrainingSum >= 7,
# reduced to the first such row per CitizenId.
df_completed = df.loc[(df['NumberWeeksSum'] >= 8)
                      & (df['NumberTrainingSum'] >= 7)].drop_duplicates(subset='CitizenId')
# Failed: every citizen not in the completed set, keeping their last row.
df_failed = df.drop(df[df.CitizenId.isin(df_completed.CitizenId)].index) \
    .drop_duplicates(subset='CitizenId', keep='last')

# Select features to use
feature_list = ['CitizenId'] + cfg.GENERAL_FEATURES + cfg.TRAINING_FEATURES + cfg.ATS_FEATURES
df_completed = df_completed[feature_list]
df_failed = df_failed[feature_list]

# Generate and save profiles.
# Forward slashes match the input path above and work on every platform;
# the strings have no placeholders, so they are plain literals.
profile_completed = ProfileReport(df_completed)
profile_failed = ProfileReport(df_failed)
profile_completed.to_file("../reports/profile_completed.html")
profile_failed.to_file("../reports/profile_failed.html")
```
%% Output
Summarize dataset: 100%|██████████| 38/38 [00:30<00:00, 1.23it/s, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:04<00:00, 4.27s/it]\nRender HTML: 100%|██████████| 1/1 [00:04<00:00, 4.61s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 18.52it/s]\nSummarize dataset: 100%|██████████| 38/38 [00:31<00:00, 1.20it/s, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:04<00:00, 4.19s/it]\nRender HTML: 100%|██████████| 1/1 [00:04<00:00, 4.04s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 22.22it/s]\n
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment