
[Dataset] OTTO – Multi-Objective Recommender System

carayoon 2022. 11. 30. 16:05

- Dataset Introduction

 

(링크 : https://www.kaggle.com/competitions/otto-recommender-system/data?select=test.jsonl)

 


 

  • The modeling goal for this dataset is to predict e-commerce clicks, cart additions, and orders. We therefore need to build a multi-objective recommender system from the logs of prior sessions.
  • The train data contains the full e-commerce session data.
  • In the test phase, we must predict, for each session, the aid (article id / product id) labels paired with each event type. (e.g., predictions take the form of session_number - event_type - aids label pairs)
session_type        predicted labels (aids)
12899779_clicks     129004 126836 118524
12899779_carts      129004 126836 118524
12899779_orders     129004 126836 118524
12899780_clicks     129004 126836 118524
12899780_carts      129004 126836 118524
  • The event types are clicks, carts, and orders.
  • Each event within a session has a timestamp (ts) value.
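The table above maps directly to the submission file format: one row per session/type pair, with up to 20 space-separated aids as the label string. A minimal sketch of building such a frame (column names follow the table above; the aid values are placeholders, not real predictions):

```python
import pandas as pd

sessions = [12899779, 12899780]
predicted_aids = [129004, 126836, 118524]  # placeholder predictions

# One row per (session, event type) pair, labels capped at 20 aids.
rows = []
for session in sessions:
    for event_type in ("clicks", "carts", "orders"):
        rows.append({
            "session_type": f"{session}_{event_type}",
            "labels": " ".join(str(aid) for aid in predicted_aids[:20]),
        })

submission = pd.DataFrame(rows)
print(submission.head())
```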

 

- Evaluation Method

 

Scoring uses a weighted Recall.

As the scoring formula shows, the Recall on orders carries the highest weight, so predicting the products a user actually orders is the most important sub-task.

Up to 20 aid labels may be predicted per row.
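The formula (an image in the original post) is, per the competition's evaluation page, a weighted sum of per-type recall@20: score = 0.10·R_clicks + 0.30·R_carts + 0.60·R_orders, where ground-truth set sizes are capped at 20. A minimal sketch of that metric (function names and toy data are mine):

```python
def recall_at_20(predictions, ground_truth):
    """predictions / ground_truth: dicts mapping session -> iterable of aids."""
    hits, total = 0, 0
    for session, truth in ground_truth.items():
        truth = set(truth)
        preds = list(predictions.get(session, []))[:20]  # at most 20 labels count
        hits += len(truth & set(preds))
        total += min(len(truth), 20)  # ground truth capped at 20
    return hits / total if total else 0.0

def weighted_score(preds_by_type, truth_by_type):
    weights = {"clicks": 0.10, "carts": 0.30, "orders": 0.60}
    return sum(w * recall_at_20(preds_by_type[t], truth_by_type[t])
               for t, w in weights.items())

# Toy example: perfect on clicks and orders, half recall on carts.
preds = {"clicks": {1: [10, 20]}, "carts": {1: [10]}, "orders": {1: [30]}}
truth = {"clicks": {1: [10]}, "carts": {1: [10, 40]}, "orders": {1: [30]}}
print(weighted_score(preds, truth))  # 0.10*1.0 + 0.30*0.5 + 0.60*1.0 = 0.85
```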

 

 

Multi objective CF

0. Data Overview

In [1]:
import json
import pandas as pd
from pathlib import Path
import os
import random
import numpy as np
from datetime import timedelta
In [25]:
DATA_PATH = Path('/Users/soyoon-yoon/Kaggle mining/Multi_CF')
TRAIN_PATH = DATA_PATH/'train.jsonl'
TEST_PATH = DATA_PATH/'test.jsonl'
In [8]:
sample_size = 10000
chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize = sample_size)
In [9]:
with open(TRAIN_PATH, 'r') as f:
    print(f"We have {len(f.readlines()):,} lines in the training data")
We have 2,963,606 lines in the training data
In [12]:
for c in chunks:
    sample_train_df = c
    print(c)
    break
       session                                             events
10000    10000  [{'aid': 1033792, 'ts': 1659305201724, 'type':...
10001    10001  [{'aid': 476264, 'ts': 1659305201745, 'type': ...
10002    10002  [{'aid': 1754433, 'ts': 1659305201774, 'type':...
10003    10003  [{'aid': 1536959, 'ts': 1659305201875, 'type':...
10004    10004  [{'aid': 287161, 'ts': 1659305201899, 'type': ...
...        ...                                                ...
19995    19995  [{'aid': 1481519, 'ts': 1659305842045, 'type':...
19996    19996  [{'aid': 1109584, 'ts': 1659305842183, 'type':...
19997    19997  [{'aid': 1647277, 'ts': 1659305842315, 'type':...
19998    19998  [{'aid': 753948, 'ts': 1659305842328, 'type': ...
19999    19999  [{'aid': 1690380, 'ts': 1659305842502, 'type':...

[10000 rows x 2 columns]
In [13]:
sample_train_df
Out[13]:
session events
10000 10000 [{'aid': 1033792, 'ts': 1659305201724, 'type':...
10001 10001 [{'aid': 476264, 'ts': 1659305201745, 'type': ...
10002 10002 [{'aid': 1754433, 'ts': 1659305201774, 'type':...
10003 10003 [{'aid': 1536959, 'ts': 1659305201875, 'type':...
10004 10004 [{'aid': 287161, 'ts': 1659305201899, 'type': ...
... ... ...
19995 19995 [{'aid': 1481519, 'ts': 1659305842045, 'type':...
19996 19996 [{'aid': 1109584, 'ts': 1659305842183, 'type':...
19997 19997 [{'aid': 1647277, 'ts': 1659305842315, 'type':...
19998 19998 [{'aid': 753948, 'ts': 1659305842328, 'type': ...
19999 19999 [{'aid': 1690380, 'ts': 1659305842502, 'type':...

10000 rows × 2 columns

In [45]:
sample_train_df.loc[10543, :]['events'][:10]
Out[45]:
[{'aid': 602784, 'ts': 1659305228407, 'type': 'clicks'},
 {'aid': 602784, 'ts': 1659305254095, 'type': 'carts'},
 {'aid': 1456023, 'ts': 1659305269630, 'type': 'clicks'},
 {'aid': 1456023, 'ts': 1659305313768, 'type': 'carts'},
 {'aid': 1466811, 'ts': 1659305321905, 'type': 'clicks'},
 {'aid': 1456023, 'ts': 1659305346764, 'type': 'clicks'},
 {'aid': 602784, 'ts': 1659305500999, 'type': 'orders'},
 {'aid': 1456023, 'ts': 1659305500999, 'type': 'orders'},
 {'aid': 884043, 'ts': 1659305559962, 'type': 'clicks'},
 {'aid': 1343414, 'ts': 1659305600039, 'type': 'clicks'}]
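This session shows the typical funnel: the same aid is clicked, carted, then ordered. A small sketch tallying event types per aid, assuming an `events` list shaped like the output above (the list here is a trimmed copy of it):

```python
from collections import defaultdict

events = [
    {"aid": 602784, "ts": 1659305228407, "type": "clicks"},
    {"aid": 602784, "ts": 1659305254095, "type": "carts"},
    {"aid": 602784, "ts": 1659305500999, "type": "orders"},
    {"aid": 1456023, "ts": 1659305269630, "type": "clicks"},
]

# Per-aid funnel: how many clicks/carts/orders each article received.
funnel = defaultdict(lambda: {"clicks": 0, "carts": 0, "orders": 0})
for e in events:
    funnel[e["aid"]][e["type"]] += 1

print(dict(funnel))
```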
In [22]:
sample_train_df.set_index('session', drop=True, inplace=True)
sample_train_df.head()
Out[22]:
events
session
10000 [{'aid': 1033792, 'ts': 1659305201724, 'type':...
10001 [{'aid': 476264, 'ts': 1659305201745, 'type': ...
10002 [{'aid': 1754433, 'ts': 1659305201774, 'type':...
10003 [{'aid': 1536959, 'ts': 1659305201875, 'type':...
10004 [{'aid': 287161, 'ts': 1659305201899, 'type': ...
In [37]:
example_session = sample_train_df.iloc[100].item()

time_elapsed = example_session[-1]["ts"] - example_session[0]["ts"]

# The timestamp is in milliseconds since 00:00:00 UTC on 1 January 1970
print(f'The first session elapsed: {str(timedelta(milliseconds=time_elapsed))} \n')
The first session elapsed: 24 days, 0:33:11.682000 
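Since `ts` is in milliseconds since the Unix epoch, individual values can also be turned into readable datetimes with pandas (using a `ts` value from the sample session above):

```python
import pandas as pd

ts_ms = 1659305201724  # a 'ts' value from the sample above
dt = pd.to_datetime(ts_ms, unit="ms")  # interpret as ms since the epoch (UTC)
print(dt)  # 2022-07-31 22:06:41.724000
```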

In [38]:
# Count the frequency of actions within the session
action_counts = {}
for action in example_session:
    action_counts[action['type']] = action_counts.get(action['type'], 0) + 1  
print(f'The first session contains the following frequency of actions: {action_counts}')
The first session contains the following frequency of actions: {'clicks': 116, 'carts': 6}
In [28]:
with open(TEST_PATH, 'r') as f:
    print(f"We have {len(f.readlines()):,} lines in the test data")
We have 1,671,803 lines in the test data
In [27]:
sample_size = 150

chunks = pd.read_json(TEST_PATH, lines=True, chunksize = sample_size)

for c in chunks:
    sample_test_df = c
    break
In [31]:
sample_test_df.loc[0, 'events']
#sample_train_df.loc[10000, :]['events']
Out[31]:
[{'aid': 59625, 'ts': 1661724000278, 'type': 'clicks'}]
In [43]:
sample_test_df.loc[2, 'events']
Out[43]:
[{'aid': 141736, 'ts': 1661724000559, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1661724022851, 'type': 'clicks'},
 {'aid': 57315, 'ts': 1661724170835, 'type': 'clicks'},
 {'aid': 194067, 'ts': 1661724246188, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1661780623778, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1661781274081, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1661781409993, 'type': 'carts'},
 {'aid': 199008, 'ts': 1661804151788, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1662060028567, 'type': 'clicks'},
 {'aid': 199008, 'ts': 1662060064706, 'type': 'clicks'},
 {'aid': 918667, 'ts': 1662060160406, 'type': 'clicks'}]
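A common next step is to flatten the nested `events` lists into one long DataFrame with one row per event, e.g. via `explode` plus dict expansion. A sketch on a toy frame shaped like `sample_test_df` (the events are copied from the outputs above):

```python
import pandas as pd

# Toy frame shaped like sample_test_df: one row per session,
# 'events' holds a list of {'aid', 'ts', 'type'} dicts.
df = pd.DataFrame({
    "session": [0, 2],
    "events": [
        [{"aid": 59625, "ts": 1661724000278, "type": "clicks"}],
        [{"aid": 141736, "ts": 1661724000559, "type": "clicks"},
         {"aid": 199008, "ts": 1661781409993, "type": "carts"}],
    ],
})

# One row per event, then expand each event dict into its own columns.
long = df.explode("events", ignore_index=True)
long = pd.concat(
    [long["session"], pd.json_normalize(long["events"].tolist())], axis=1
)
print(long)
```

This long format is usually much easier to aggregate (e.g. counts per aid and type) than the nested JSON lines.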