Long-short term memory in time-series data

Time-series data, such as stock-market prices, are usually dependent on the previous n historical data points.

Recurrent Neural Network (RNN) is applied to sequence data to capture the intra-sequence dependence. Long short-term memory network (LSTM) is a variant of RNN that captures the long-term impact of history on short-term signal behaviour.

Key difference between a simple RNN and LSTM:

  • Simple RNN: later output nodes of the network are less sensitive to the input at time t=1: gradient vanishes.
  • LSTM: Preserves the gradient by implementing “forget” and “input” gates.

LSTM holds the following components in each layer:

  1. Inputs: Previous output (h_{t-1}) and current input (x_t)
  2. Forget gate:
    • System: \sigma decides whether to throw away information from the previous cell state C_{t-1}
    • Output: f_t = \sigma(W_f \times [h_{t-1}, x_t] + b_f), a number between 0 and 1 for each entry of the cell state C_{t-1}.
  3. Input gate:
    • System 1: \sigma decides which values will be updated
      • Output: i_t = \sigma(W_i \times [h_{t-1}, x_t]+b_i)
    • System 2: tanh creates a vector of new candidate values
      • Output: \tilde{C_t} = tanh(W_C \times [h_{t-1}, x_t] + b_C)
    • Output: i_t \times \tilde{C_t}
  4. Cell state update (combining the forget and input gates):
    • C_t = f_t \times C_{t-1} + i_t \times \tilde{C_t}
  5. Final Process:
    • System 1: \sigma decides what parts of the cell state C_t will be output
      • Output: o_t = \sigma(W_o \times [h_{t-1}, x_t] + b_o)
    • System 2: tanh to generate a value between -1 and 1, multiply by o_t
    • Output: h_t = o_t \times tanh(C_t)

Further reading:

Here are two implementations, using TensorFlow and PyTorch:

The TensorFlow version is a bit more involved, but you can play around with the architecture:

# Tensorflow LSTM
# from https://www.datacamp.com/community/tutorials/lstm-python-stock-market

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): `raw_data` is assumed to be a pandas DataFrame with 'high'
# and 'low' columns, loaded earlier in the original article -- confirm.
train_test_split_i = 3230
# DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the
# drop-in replacement and returns the same ndarray.
high_prices = raw_data['high'].to_numpy()
low_prices = raw_data['low'].to_numpy()
mid_prices = (high_prices + low_prices) / 2.0

# Split data 2:1 training vs testing
train_data = mid_prices[:train_test_split_i]
test_data = mid_prices[train_test_split_i:]

# Normalize data: MinMaxScaler expects 2-D (n_samples, n_features) input.
scaler = MinMaxScaler()
train_data = train_data.reshape(-1, 1)
test_data = test_data.reshape(-1, 1)

# Smoothing: fit/transform the scaler window-by-window so one price regime
# does not squash the scale of the others.
smoothing_window_size = 800
for di in range(0, 3200, smoothing_window_size):
    scaler.fit(train_data[di:di + smoothing_window_size, :])
    train_data[di:di + smoothing_window_size, :] = scaler.transform(
        train_data[di:di + smoothing_window_size, :])
# Normalize the leftover tail (rows 3200..train_test_split_i-1); `di` still
# holds the last loop value (2400), so di+smoothing_window_size == 3200.
scaler.fit(train_data[di + smoothing_window_size:, :])
train_data[di + smoothing_window_size:, :] = scaler.transform(
    train_data[di + smoothing_window_size:, :])

# Reshape data back to 1-D
train_data = train_data.reshape(-1)

# Normalize test data.
# NOTE(review): the test set is scaled with the scaler fitted on the *last*
# training window only -- confirm this is intended.
test_data = scaler.transform(test_data).reshape(-1)

# Exponential moving average smoothing of the training series (in place).
EMA = 0.0
gamma = 0.1
for ti in range(train_test_split_i):
    EMA = gamma * train_data[ti] + (1 - gamma) * EMA
    train_data[ti] = EMA
all_mid_data = np.concatenate([train_data, test_data], axis=0)

# Long Short-Term Memory models
class DataGeneratorSeq(object):
    """Generates unrolled (input, label) batches from a 1-D price series.

    The usable series is split into `batch_size` equal segments; batch
    element b walks its own segment, so one batch mixes samples from across
    the whole series. Labels are prices a random 0-4 steps ahead of the
    corresponding input price.
    """

    def __init__(self, prices, batch_size, num_unroll):
        self._prices = prices
        # Usable length: leave `num_unroll` prices of headroom at the end.
        self._prices_length = len(self._prices) - num_unroll
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        # Segment length per batch element; cursor b starts at segment b.
        self._segments = self._prices_length // self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        """Return one (batch_data, batch_labels) pair of float32 arrays."""
        batch_data = np.zeros((self._batch_size), dtype=np.float32)
        batch_labels = np.zeros((self._batch_size), dtype=np.float32)

        for b in range(self._batch_size):
            # A cursor that ran off the usable range restarts at a random
            # position within its first b+1 segments.
            if self._cursor[b] + 1 >= self._prices_length:
                self._cursor[b] = np.random.randint(0, (b + 1) * self._segments)
            batch_data[b] = self._prices[self._cursor[b]]
            # Label: a price 0-4 steps ahead. Clamp the index so it cannot
            # run past the end of the series (bug fix: the unclamped index
            # could overflow when num_unroll < 4).
            label_idx = min(self._cursor[b] + np.random.randint(0, 5),
                            len(self._prices) - 1)
            batch_labels[b] = self._prices[label_idx]
            self._cursor[b] = (self._cursor[b] + 1) % self._prices_length
        return batch_data, batch_labels

    def unroll_batches(self):
        """Return `num_unroll` consecutive batches as two parallel lists."""
        unroll_data, unroll_labels = [], []
        for ui in range(self._num_unroll):
            data, labels = self.next_batch()
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels

    def reset_indices(self):
        """Re-randomise every cursor within its allowed range."""
        for b in range(self._batch_size):
            self._cursor[b] = np.random.randint(
                0, min((b + 1) * self._segments, self._prices_length - 1))

# Sanity check: draw one set of unrolled batches and show exactly what the
# generator would feed the network at each unrolled step.
dg = DataGeneratorSeq(train_data,5,5)
u_data, u_labels = dg.unroll_batches()

ui = 0
for dat, lbl in zip(u_data, u_labels):
    print('\nUnrolled index %d' %ui)
    print('\tInputs: ', dat)
    print('\tOutputs: ', lbl)
    ui += 1

# ---- Hyperparameters for the TF1.x deep LSTM model ----
D = 1 # Dimensionality of the data
num_unrollings = 50 # Number of consecutive time steps unrolled per training batch
batch_size = 500 # Number of samples in a batch
num_nodes = [200, 200, 150] # Number of hidden nodes in each layer of the deep LSTM stack
n_layers = len(num_nodes) # Number of layers
dropout = 0.2 # Dropout amount

# Reinitialize: wipe the default graph so re-running this script does not
# stack duplicate ops/variables (TF1.x graph mode; under TF2 this is
# tf.compat.v1.reset_default_graph).
tf.reset_default_graph()

# Inputs: one placeholder per unrolled step -- step ui is fed a
# (batch_size, D) input slice and its (batch_size, 1) target.
train_inputs = [tf.placeholder(tf.float32, shape=[batch_size,D],name='train_inputs_%d'%ui) for ui in range(num_unrollings)]
train_outputs = [tf.placeholder(tf.float32, shape=[batch_size,1], name = 'train_outputs_%d'%ui) for ui in range(num_unrollings)]

# Set up the LSTM and Regression layers
# One Xavier-initialised LSTMCell per stacked layer.
lstm_cells = [
    tf.contrib.rnn.LSTMCell(num_units=num_nodes[li],
                           state_is_tuple=True,
                           initializer=tf.contrib.layers.xavier_initializer())
    for li in range(n_layers)]

# Dropout on outputs and recurrent state (inputs kept at keep_prob 1.0).
# The wrapped stack is used for training; the bare `multi_cell` below is
# the dropout-free version used for sampling/prediction.
drop_lstm_cells = [
    tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=1.0,
                                 output_keep_prob=1.0-dropout,
                                 state_keep_prob=1.0-dropout)
    for lstm in lstm_cells]
drop_multi_cell = tf.contrib.rnn.MultiRNNCell(drop_lstm_cells)
multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)

# Linear regression (readout) layer: maps the top LSTM layer's output to a
# single predicted price.
w = tf.get_variable('w',shape=[num_nodes[-1],1],
                   initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b',initializer=tf.random_uniform([1],-0.1,0.1))

# Per-layer cell state (c) and hidden state (h), kept across unrolled
# batches. Non-trainable: they are updated via tf.assign ops elsewhere in
# the script, not by the optimizer.
c = [tf.Variable(tf.zeros([batch_size, num_nodes[li]]), trainable=False) for li in range(n_layers)]
h = [tf.Variable(tf.zeros([batch_size, num_nodes[li]]), trainable=False) for li in range(n_layers)]
initial_state = [tf.contrib.rnn.LSTMStateTuple(c[li], h[li]) for li in range(n_layers)]

# Formatting: stack the per-step placeholders into one time-major
# (num_unrollings, batch_size, D) tensor for dynamic_rnn.
all_inputs = tf.concat([tf.expand_dims(t,0) for t in train_inputs], axis=0)
all_lstm_outputs, state = tf.nn.dynamic_rnn(
    drop_multi_cell, all_inputs, initial_state=tuple(initial_state),
    time_major=True, dtype=tf.float32)
# Merge the time and batch axes so a single matmul produces every prediction.
all_lstm_outputs = tf.reshape(all_lstm_outputs, [batch_size*num_unrollings, num_nodes[-1]])

all_outputs = tf.nn.xw_plus_b(all_lstm_outputs, w, b)
# Back to one (batch_size, 1) prediction tensor per unrolled step.
split_outputs = tf.split(all_outputs,num_unrollings, axis=0)

# Training loss
# Mean Squared Error (MSE), summed over the unrolled steps. The control
# dependency also copies the final LSTM state back into the c/h variables,
# so state persists across consecutive training batches.
loss = 0.0
with tf.control_dependencies([tf.assign(c[li],state[li][0]) for li in range(n_layers)]+
                            [tf.assign(h[li],state[li][1]) for li in range(n_layers)]):
    loss += sum([tf.reduce_mean(0.5*(split_outputs[ui]-train_outputs[ui])**2) for ui in range(num_unrollings)])

# Learning-rate schedule: global_step is only incremented manually (via
# inc_gstep, when validation loss stalls), so each increment halves the
# rate (decay_rate=0.5), floored at tf_min_learning_rate.
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.assign(global_step, global_step+1)
tf_learning_rate = tf.placeholder(shape=None, dtype=tf.float32)
tf_min_learning_rate = tf.placeholder(shape=None, dtype=tf.float32)

learning_rate = tf.maximum(
    tf.train.exponential_decay(tf_learning_rate,global_step,
                               decay_steps=1, decay_rate=0.5,
                              staircase=True),
    tf_min_learning_rate)

# Optimizer: Adam with global-norm gradient clipping at 5.0 to tame
# exploding RNN gradients.
optimizer = tf.train.AdamOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimizer = optimizer.apply_gradients(zip(gradients,v))

# Defining prediction related TF calculations: a parallel single-sample
# (batch of 1) graph that shares the weights of the training stack but has
# its own persistent c/h state variables.
sample_inputs = tf.placeholder(tf.float32, shape=[1,D])
sample_c = [tf.Variable(tf.zeros([1, num_nodes[li]]), trainable=False) for li in range(n_layers)]
sample_h = [tf.Variable(tf.zeros([1, num_nodes[li]]), trainable=False) for li in range(n_layers)]
initial_sample_state = [tf.contrib.rnn.LSTMStateTuple(sample_c[li],sample_h[li]) for li in range(n_layers)]

# Op that zeroes the sampling state between independent prediction runs.
reset_sample_states = tf.group(*[[tf.assign(sample_c[li],tf.zeros([1, num_nodes[li]])) for li in range(n_layers)]+
                               [tf.assign(sample_h[li],tf.zeros([1, num_nodes[li]])) for li in range(n_layers)]])
# Uses the dropout-free multi_cell (inference mode).
sample_outputs, sample_state = tf.nn.dynamic_rnn(multi_cell, tf.expand_dims(sample_inputs,0),
                                                initial_state=tuple(initial_sample_state),
                                                time_major=True,
                                                dtype=tf.float32)
# Write the new state back into sample_c/sample_h before emitting the
# scalar prediction, so successive calls continue the sequence.
with tf.control_dependencies([tf.assign(sample_c[li],sample_state[li][0]) for li in range(n_layers)]+
                            [tf.assign(sample_h[li], sample_state[li][1]) for li in range(n_layers)]):
    sample_prediction = tf.nn.xw_plus_b(tf.reshape(sample_outputs,[1,-1]),w,b)

# Initialise before Running
epochs = 30
valid_summary = 1 # Report/validate every `valid_summary` epochs
n_predict_once = 50 # Number of future steps predicted from each test point
train_seq_length = train_data.size
train_mse_ot = [] # Training MSE over time
test_mse_ot = [] # Test MSE over time
predictions_over_time = [] # Per-epoch lists of prediction sequences (for plotting)

session = tf.InteractiveSession()
tf.global_variables_initializer().run()
loss_nondecrease_count = 0
loss_nondecrease_threshold = 2 # Halve the LR after this many stalled epochs
average_loss = 0

data_gen = DataGeneratorSeq(train_data,batch_size,num_unrollings)
x_axis_seq = []
# Test starting points: every n_predict_once-th index within the first 100
# test rows (two points with the values above).
test_points_seq = np.arange(train_test_split_i,train_test_split_i+100,n_predict_once).tolist()

# Main loop: train on all batches, then roll out multi-step predictions
# from each test point and track the test MSE.
for ep in range(epochs):
    # =================== Training =======================
    for step in range(train_seq_length//batch_size):
        u_data, u_labels = data_gen.unroll_batches()
        feed_dict = {}
        for ui, (dat,lbl) in enumerate(zip(u_data,u_labels)):
            feed_dict[train_inputs[ui]] = dat.reshape(-1,1)
            feed_dict[train_outputs[ui]] = lbl.reshape(-1,1)
        feed_dict.update({tf_learning_rate: 0.0001, tf_min_learning_rate:0.000001})
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
    # =================== Validation =======================
    if (ep+1) % valid_summary == 0:
        # Average the accumulated loss over the reporting window.
        average_loss = average_loss/(valid_summary*(train_seq_length//batch_size))
        if (ep+1)%valid_summary==0:
            print('Average loss at step %d: %f' %(ep+1, average_loss))
        train_mse_ot.append(average_loss)
        average_loss = 0
        predictions_seq, mse_test_loss_seq = [], []
    # =================== Update and make prediction =======================
    for w_i in test_points_seq:
        mse_test_loss = 0.0
        our_predictions = []
        # NOTE(review): `(ep+1)-valid_summary==0` holds only when
        # ep+1 == valid_summary (the first reporting epoch), unlike the
        # modulo test above -- x-axis values are collected just once,
        # presumably because they never change; confirm this is intended.
        if (ep+1)-valid_summary==0:
            x_axis=[]
        # Warm up the sampling LSTM state on the prices leading up to w_i.
        # NOTE(review): this range stops at w_i-2; w_i-1 is fed separately
        # below -- confirm the off-by-one is intentional.
        for tr_i in range(w_i-num_unrollings+1, w_i-1):
            current_price = all_mid_data[tr_i]
            # `feed_dict` still holds the training placeholders from the last
            # training step; TF ignores fed placeholders the fetched op
            # doesn't use, so only sample_inputs matters here.
            feed_dict[sample_inputs] = np.array(current_price).reshape(1,1)
            _ = session.run(sample_prediction, feed_dict=feed_dict)
        
        feed_dict = {}
        current_price = all_mid_data[w_i-1]
        feed_dict[sample_inputs] = np.array(current_price).reshape(1,1)
        # Roll out n_predict_once steps, feeding each prediction back in.
        for pred_i in range(n_predict_once):
            pred = session.run(sample_prediction, feed_dict=feed_dict)
            # NOTE(review): np.asscalar is deprecated (removed in NumPy
            # 1.23); pred.item() is the modern equivalent.
            our_predictions.append(np.asscalar(pred))
            feed_dict[sample_inputs] = np.asarray(pred).reshape(-1,1)
            if (ep+1)-valid_summary==0:
                x_axis.append(w_i+pred_i)
            mse_test_loss += 0.5 *(pred-all_mid_data[w_i+pred_i])**2
        # Zero the sampling state before the next test point.
        session.run(reset_sample_states)
        predictions_seq.append(np.array(our_predictions))
        mse_test_loss/=n_predict_once
        mse_test_loss_seq.append(mse_test_loss)
        if (ep+1)-valid_summary==0:
            x_axis_seq.append(x_axis)
    current_test_mse = np.mean(mse_test_loss_seq)
    
    # learning rate decay logic: if the test MSE has not improved on its
    # best value, count a stalled epoch; otherwise reset the counter.
    if len(test_mse_ot)>0 and current_test_mse>min(test_mse_ot):
        loss_nondecrease_count += 1
    else:
        loss_nondecrease_count = 0
    
    # After too many stalled epochs, bump global_step, which halves the
    # learning rate through the exponential_decay schedule.
    if loss_nondecrease_count > loss_nondecrease_threshold:
        session.run(inc_gstep)
        loss_nondecrease_count = 0
        # Decreasing learning rate by 0.5
    test_mse_ot.append(current_test_mse)
    print('\tTest MSE: %.5f'%np.mean(mse_test_loss_seq))
    predictions_over_time.append(predictions_seq)
    print('\tFinished Predictions') 

# Plot
# NOTE(review): assumes `plt` (matplotlib.pyplot) and `raw_data` are
# available from earlier in the original article -- confirm the imports.

# Epoch whose predictions to highlight in the second panel (picked manually
# after inspecting the runs).
best_prediction_epoch = 28

plt.figure(figsize = (18,18))
plt.subplot(2,1,1)
# Full mid-price series in blue as the reference curve.
plt.plot(range(raw_data.shape[0]),all_mid_data,color='b')


# How the predictions change over time: every 3rd epoch's predictions are
# overlaid, fading from transparent (early epochs) to opaque (late epochs).
start_alpha = 0.25
alpha = np.arange(start_alpha,1.1,(1.0-start_alpha)/len(predictions_over_time[::3]))
for p_i, p in enumerate(predictions_over_time[::3]):
    for xval,yval in zip(x_axis_seq,p):
        plt.plot(xval,yval,color='r',alpha=alpha[p_i])
plt.title('Evolution of Test Predictions Over Time', fontsize=18)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Mid Price',fontsize=18)
plt.xlim(train_test_split_i,len(raw_data))

# Best prediction epoch only, fully opaque.
plt.subplot(2,1,2)
plt.plot(range(raw_data.shape[0]),all_mid_data,color='b')
for xval,yval in zip(x_axis_seq,predictions_over_time[best_prediction_epoch]):
    plt.plot(xval,yval,color='r')
plt.title('Best Test Predictions Over Time', fontsize=18)
plt.xlabel('Date',fontsize=18)
plt.ylabel('Mid Price',fontsize=18)
plt.xlim(train_test_split_i,len(raw_data))
plt.show()

Pytorch has a built-in LSTM model:

# LSTM with PyTorch
# https://github.com/jessicayung/blog-code-snippets/blob/master/lstm-pytorch/lstm-baseline.py
import torch
import torch.nn as nn

# Parameters

# Training/testing parameters
num_train = 3230 # (2/3)*len(raw_data)

# Network architecture parameters
input_size = 20       # length of each sliding price window
# When `per_element` is True the LSTM reads one timestep at a time;
# otherwise it consumes the whole window as one feature vector.
per_element = True
lstm_input_size = 1 if per_element else input_size

# Hidden layers
h1 = 32               # hidden units per LSTM layer
output_dim = 1        # scalar price prediction
num_layers = 2        # stacked LSTM layers
learning_rate = 1e-3
num_epochs = 500
dtype = torch.float

# Split training/testing data

# NOTE(review): `raw_data` is assumed to be a pandas DataFrame with 'high'
# and 'low' columns, loaded earlier in the original article -- confirm.
# DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the
# drop-in replacement.
high_prices = raw_data['high'].to_numpy()
low_prices = raw_data['low'].to_numpy()
# Pad with zeros so a full input_size-long window exists for every index.
mid_prices = np.concatenate(((high_prices + low_prices) / 2.0, np.zeros(input_size)))

# Split data 2:1 training vs testing
y_train = mid_prices[:num_train]
y_test = mid_prices[num_train:]
# One sliding window of length input_size per original row.
X = np.reshape([mid_prices[i:i + input_size] for i in range(len(raw_data))],
               (len(raw_data), input_size))

# Into Torch: target shape (input_size, batch, 1) as nn.LSTM expects.
# NOTE(review): view() reshapes row-major rather than transposing, so dim 0
# here is not actually the timestep axis of each window -- confirm whether
# this matches the original author's intent.
X_train = torch.from_numpy(X[:num_train]).type(torch.Tensor).view([input_size, -1, 1])
X_test = torch.from_numpy(X[num_train:]).type(torch.Tensor).view([input_size, -1, 1])
y_train = torch.from_numpy(y_train).type(torch.Tensor).view(-1)
y_test = torch.from_numpy(y_test).type(torch.Tensor).view(-1)

# Build model
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear readout over the last time step.

    Args:
        input_dim: features per time step fed to the LSTM.
        hidden_dim: hidden units in each LSTM layer.
        batch_size: fixed batch size the model is run with.
        output_dim: size of the final linear output (default 1).
        num_layers: number of stacked LSTM layers (default 2).
    """

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1, num_layers=2):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.output_dim = output_dim
        # Define the LSTM layer (expects (seq_len, batch, feature) input).
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, self.output_dim)

    def init_hidden(self):
        """Return zeroed (h_0, c_0) tensors for a fresh sequence."""
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input):
        """Forward pass through the LSTM layer.

        Returns a 1-D tensor of `batch_size` predictions computed from the
        final time step's hidden output.
        """
        # Bug fix: the original never passed the state set via init_hidden()
        # into the LSTM, so resetting the hidden state was a no-op. Pass it
        # explicitly; when unset, None makes nn.LSTM default to zeros, which
        # is numerically identical to the original behaviour.
        hidden = getattr(self, "hidden", None)
        lstm_out, hidden_n = self.lstm(
            input.view(len(input), self.batch_size, -1), hidden)
        # Detach the stored state so it does not keep the old autograd graph
        # alive across training iterations (truncated BPTT).
        self.hidden = tuple(h.detach() for h in hidden_n)
        # Only the final time step feeds the regression layer.
        y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1))
        return y_pred.view(-1)
# Instantiate the model, loss and optimiser. The whole training set is one
# batch, so batch_size == num_train.
model = LSTM(lstm_input_size, h1, batch_size=num_train,
            output_dim=output_dim, num_layers=num_layers)
# `size_average=False` was deprecated and later removed from torch;
# reduction='sum' is the exact equivalent (sum of squared errors).
loss_fn = torch.nn.MSELoss(reduction='sum')
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train model: full-batch gradient descent -- every epoch runs the entire
# training set through the network once.
hist = np.zeros(num_epochs)
for t in range(num_epochs):
    # Re-initialise the LSTM hidden state to zeros at the start of each epoch.
    model.hidden = model.init_hidden()
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)
    if t%100 == 0:
        print ("Epoch %s MSE: %f" %(t, loss.item()))
    hist[t] = loss.item()
    
    # Standard backprop step: clear gradients, backpropagate, update weights.
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
# Plot preds and performance: final-epoch predictions vs. training targets
# on top, training-loss curve below.
plt.figure(figsize = (18,18))
plt.subplot(2,1,1)
plt.plot(y_pred.detach().numpy(), label="Preds")
plt.plot(y_train.detach().numpy(), label="Data")

plt.subplot(2,1,2)
plt.plot(hist, label="Training Loss")
plt.legend()
plt.show()

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s