-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdataset.py
More file actions
84 lines (72 loc) · 2.58 KB
/
dataset.py
File metadata and controls
84 lines (72 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# coding=utf-8
import numpy as np
import torch
from torch.utils.data import Dataset
from copy import copy
class CHDataset(Dataset):
    """Dataset for Convex Hull Problem data.

    Every non-blank line of the data file is expected to look like
    ``x1 y1 x2 y2 ... output i1 i2 ...``: whitespace-separated point
    coordinates, the literal word ``output``, then the 1-based indices of
    the points that make up the target sequence.

    Each sample is a 5-tuple ``(inp, inp_len, outp_in, outp_out, outp_len)``:

    * ``inp``      -- float32 array of shape ``(max_in_seq_len + 1, 2)``:
      a START row, the points, then END-row padding.
    * ``inp_len``  -- array ``[number of points + 1]`` (START included).
    * ``outp_in``  -- float32 array of shape ``(max_out_seq_len + 1, 2)``:
      a START row, the coordinates of the target points, then padding.
    * ``outp_out`` -- integer array of length ``max_out_seq_len + 1``:
      the target indices, a ``0`` END marker, then ``0`` padding.
    * ``outp_len`` -- array ``[number of target indices + 1]`` (END included).

    Args:
        filename : the dataset file name
        max_in_seq_len : maximum input sequence length (in points)
        max_out_seq_len : maximum output sequence length (in indices)
        lineCountLimit : stop after loading this many samples; ``None``
            (the default) loads the whole file

    Raises:
        ValueError: if a line has an odd number of coordinates, exceeds one
            of the maximum lengths, or references a point index outside
            ``1..number of points``.
    """

    def __init__(self, filename, max_in_seq_len, max_out_seq_len, lineCountLimit=None):
        super(CHDataset, self).__init__()
        self.max_in_seq_len = max_in_seq_len
        self.max_out_seq_len = max_out_seq_len
        self.lineCountLimit = lineCountLimit
        # START and END share the same all-zero coordinate pair; index 0 is
        # therefore reserved and real points are addressed from 1.
        self.START = [0, 0]
        self.END = [0, 0]
        self._load_data(filename)

    def _load_data(self, filename):
        """Parse ``filename`` into ``self.data``, honouring ``lineCountLimit``."""
        data = []
        with open(filename, 'r') as f:
            for line in f:
                if self.lineCountLimit is not None and len(data) == self.lineCountLimit:
                    break
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data.append(self._parse_line(line))
        self.data = data

    def _parse_line(self, line):
        """Turn one stripped data line into a padded sample tuple."""
        inp_text, outp_text = line.split('output')
        # split() without an argument collapses runs of whitespace, so
        # double spaces or tabs do not produce empty tokens.
        coords = [float(tok) for tok in inp_text.split()]
        hull = [int(tok) for tok in outp_text.split()]

        if len(coords) % 2:
            raise ValueError(
                "odd number of coordinates ({}) in line: {!r}".format(len(coords), line))
        num_points = len(coords) // 2

        inp_len = num_points + 1  # +1 for the START row
        if inp_len > self.max_in_seq_len + 1:
            raise ValueError(
                "input has {} points, more than max_in_seq_len={}".format(
                    num_points, self.max_in_seq_len))

        outp_len = len(hull) + 1  # +1 for the END marker
        if outp_len > self.max_out_seq_len + 1:
            # Without this check the padding below would be negative and the
            # sample would silently end up with a different shape.
            raise ValueError(
                "output has {} indices, more than max_out_seq_len={}".format(
                    len(hull), self.max_out_seq_len))

        for idx in hull:
            if not 1 <= idx <= num_points:
                raise ValueError(
                    "output index {} outside 1..{} in line: {!r}".format(
                        idx, num_points, line))

        # Encoder input: START row, the points, END-row padding.
        in_pad = self.max_in_seq_len + 1 - inp_len
        inp = self.START + coords + self.END * in_pad

        # Decoder input: START row, coordinates of each target point, padding.
        out_pad = self.max_out_seq_len + 1 - outp_len
        outp_in = list(self.START)
        for idx in hull:
            outp_in += coords[2 * (idx - 1): 2 * idx]
        outp_in += self.START * out_pad

        # Decoder target: the indices, a 0 END marker, then 0 padding.
        outp_out = hull + [0] * (1 + out_pad)

        return (
            np.array(inp, dtype="float32").reshape([-1, 2]),
            np.array([inp_len]),
            np.array(outp_in, dtype="float32").reshape([-1, 2]),
            np.array(outp_out),
            np.array([outp_len]),
        )

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(inp, inp_len, outp_in, outp_out, outp_len)`` for ``index``."""
        inp, inp_len, outp_in, outp_out, outp_len = self.data[index]
        return inp, inp_len, outp_in, outp_out, outp_len
if __name__ == "__main__":
    # Manual smoke test: build the dataset from the training file with
    # max_in_seq_len=50, max_out_seq_len=11 and a 50-line limit, then dump
    # the encoder input, decoder target and decoder input of sample 0.
    train_ds = CHDataset("./data/convex_hull_50_train.txt", 50, 11, 50)
    first_sample = train_ds[0]
    inp, inp_len, outp_in, outp_out, outp_len = first_sample
    for array in (inp, outp_out, outp_in):
        print(array)