<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>http://quantsoftware.gatech.edu/index.php?action=history&amp;feed=atom&amp;title=Deep_Q-Learning</id>
	<title>Deep Q-Learning - Revision history</title>
	<link rel="self" type="application/atom+xml" href="http://quantsoftware.gatech.edu/index.php?action=history&amp;feed=atom&amp;title=Deep_Q-Learning"/>
	<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;action=history"/>
	<updated>2026-05-25T01:30:33Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.34.4</generator>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2586&amp;oldid=prev</id>
		<title>Petosa at 23:56, 12 February 2018</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2586&amp;oldid=prev"/>
		<updated>2018-02-12T23:56:07Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:56, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l30&quot; &gt;Line 30:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 30:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Be in state &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Be in state &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;argmax_a&lt;/del&gt;(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;argmax&amp;lt;sub&amp;gt;a&amp;lt;/sub&amp;gt;&lt;/ins&gt;(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Using this experience tuple, perform value iteration with Bellman&amp;#039;s equation.  &lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Using this experience tuple, perform value iteration with Bellman&amp;#039;s equation.  &lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l58&quot; &gt;Line 58:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 58:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Be in state &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Be in state &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;argmax_a&lt;/del&gt;(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;argmax&amp;lt;sub&amp;gt;a&amp;lt;/sub&amp;gt;&lt;/ins&gt;(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Add this experience tuple to the bucket. If your bucket overflows its size, remove the oldest entry.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Add this experience tuple to the bucket. If your bucket overflows its size, remove the oldest entry.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2585&amp;oldid=prev</id>
		<title>Petosa: /* Deep Q-Learning Algorithm */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2585&amp;oldid=prev"/>
		<updated>2018-02-12T23:55:33Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Deep Q-Learning Algorithm&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:55, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l61&quot; &gt;Line 61:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 61:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Add this experience tuple to the bucket. If your bucket overflows its size, remove the oldest entry.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;This unit of training has yielded an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Add this experience tuple to the bucket. If your bucket overflows its size, remove the oldest entry.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;Experience replay step. If you have more than &amp;lt;code&amp;gt;BATCH_SIZE&amp;lt;/code&amp;gt; (usually 32) entries in the bucket, sample &amp;lt;code&amp;gt;BATCH_SIZE&amp;lt;/code&amp;gt; experience tuples from the bucket without replacement. For each sampled tuple &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;, generate a new training instance with x-value &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;. The y-value of each instance is where things get a little complicated, so pay attention. Let the variable &amp;lt;code&amp;gt;P = Q(s)&amp;lt;/code&amp;gt;, all current Q-value predictions for all actions. If the &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from our experience tuple is the i&amp;lt;sup&amp;gt;th&amp;lt;/sup&amp;gt; action in the neural net, then we &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;updated &lt;/del&gt;&amp;lt;b&amp;gt;only the Q-value corresponding to &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt;&amp;lt;/b&amp;gt; with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;. The y-value of each sample is set to its respective &amp;lt;code&amp;gt;P&amp;lt;/code&amp;gt;, and the model is updated with a training session on just these samples.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li style=&amp;quot;color:navy&amp;quot;&amp;gt;Experience replay step. If you have more than &amp;lt;code&amp;gt;BATCH_SIZE&amp;lt;/code&amp;gt; (usually 32) entries in the bucket, sample &amp;lt;code&amp;gt;BATCH_SIZE&amp;lt;/code&amp;gt; experience tuples from the bucket without replacement. For each sampled tuple &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;, generate a new training instance with x-value &amp;lt;code&amp;gt;s&amp;lt;/code&amp;gt;. The y-value of each instance is where things get a little complicated, so pay attention. Let the variable &amp;lt;code&amp;gt;P = Q(s)&amp;lt;/code&amp;gt;, all current Q-value predictions for all actions. If the &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from our experience tuple is the i&amp;lt;sup&amp;gt;th&amp;lt;/sup&amp;gt; action in the neural net, then we &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;update &lt;/ins&gt;&amp;lt;b&amp;gt;only the Q-value corresponding to &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt;&amp;lt;/b&amp;gt; with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;. The y-value of each sample is set to its respective &amp;lt;code&amp;gt;P&amp;lt;/code&amp;gt;, and the model is updated with a training session on just these samples.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Loop the above training steps until your epoch completes. Keep doing more epochs (reset to initial state) until the reward achieved by your agent converges and no longer improves. Note that the neural network weights persist between epochs.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Loop the above training steps until your epoch completes. Keep doing more epochs (reset to initial state) until the reward achieved by your agent converges and no longer improves. Note that the neural network weights persist between epochs.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2584&amp;oldid=prev</id>
		<title>Petosa: /* Notes */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2584&amp;oldid=prev"/>
		<updated>2018-02-12T23:54:52Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Notes&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:54, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l75&quot; &gt;Line 75:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 75:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Notes==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Notes==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;There are many different weight initializations to experiment with - &lt;/del&gt;glorot, he ...&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;Hyperparameters:&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;There are many different network structure to experiment with.&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;ul&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Research into bucket &lt;/del&gt;size&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Network architecture&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Prioritized bucket&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Weight initilizations (&lt;/ins&gt;glorot, he...&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;)&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Network architecture&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Optimizer&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Learning rate&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Bucket size&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Sample training batch &lt;/ins&gt;size&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Gamma&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Explore starting probability&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Explore ending probability&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Explore probability decay rate&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;Target network update interval&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/ul&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Resources==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Resources==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2583&amp;oldid=prev</id>
		<title>Petosa: /* Advanced Deep Q-Learning */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2583&amp;oldid=prev"/>
		<updated>2018-02-12T23:49:08Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Advanced Deep Q-Learning&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:49, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l70&quot; &gt;Line 70:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 70:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instead of calculating targets with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;, we have a separate target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; that we will use in &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q~(s&amp;#039;))&amp;lt;/code&amp;gt;. &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; has a structure identical to &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;. Every 1000 or so training iterations, we update &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt;&amp;#039;s weight to be the same as &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;&amp;#039;s weights. In this way, there will be reduced correlation between &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, mitigating the aforementioned cat-chasing-tail scenario.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instead of calculating targets with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;, we have a separate target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; that we will use in &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q~(s&amp;#039;))&amp;lt;/code&amp;gt;. &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; has a structure identical to &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;. Every 1000 or so training iterations, we update &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt;&amp;#039;s weight to be the same as &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;&amp;#039;s weights. In this way, there will be reduced correlation between &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, mitigating the aforementioned cat-chasing-tail scenario.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;One problem in the DQN algorithm is that the agent tends to overestimate the Q-function value, due to the max in the formula used to set targets: &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;. The solution is to use two Q-functions, Q&amp;lt;sub&amp;gt;1&amp;lt;/sub&amp;gt; and Q&amp;lt;sub&amp;gt;2&amp;lt;/sub&amp;gt;, which are independently learned. One function is then used to determine the maximizing action and second to estimate its value.  For our purposes, the target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; is relatively independent of &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, so we can use &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; as our networks for double Q-learning. So our update now becomes &amp;lt;code&amp;gt;P[i] = r + gamma*Q~(s&amp;#039;,argmax&amp;lt;sub&amp;gt;a&amp;lt;/sub&amp;gt;(Q(s&amp;#039;,a))&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;gt;&amp;lt;b&amp;gt;Double Q-learning.&amp;lt;/b&lt;/ins&gt;&amp;gt; One problem in the DQN algorithm is that the agent tends to overestimate the Q-function value, due to the max in the formula used to set targets: &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;. The solution is to use two Q-functions, Q&amp;lt;sub&amp;gt;1&amp;lt;/sub&amp;gt; and Q&amp;lt;sub&amp;gt;2&amp;lt;/sub&amp;gt;, which are independently learned. One function is then used to determine the maximizing action and second to estimate its value.  For our purposes, the target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; is relatively independent of &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, so we can use &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; as our networks for double Q-learning. So our update now becomes &amp;lt;code&amp;gt;P[i] = r + gamma*Q~(s&amp;#039;,argmax&amp;lt;sub&amp;gt;a&amp;lt;/sub&amp;gt;(Q(s&amp;#039;,a))&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2582&amp;oldid=prev</id>
		<title>Petosa: /* Advanced Deep Q-Learning */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2582&amp;oldid=prev"/>
		<updated>2018-02-12T23:46:59Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Advanced Deep Q-Learning&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:46, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l68&quot; &gt;Line 68:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 68:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;DQNs tend not to be stable. Because the loss is calculated from the same network that weights are applied to, you tend to see oscillations in policies. Think of this like a cat chasing its own tail. There are a few tricks you can use to mitigate these problems.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;DQNs tend not to be stable. Because the loss is calculated from the same network that weights are applied to, you tend to see oscillations in policies. Think of this like a cat chasing its own tail. There are a few tricks you can use to mitigate these problems.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instead of calculating targets with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;, we have a separate target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; that we will use in &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q~(s&amp;#039;))&amp;lt;/code&amp;gt;. &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; has a structure identical &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;tot &lt;/del&gt;&amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;. Every 1000 or so training iterations, we update &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt;&amp;#039;s weight to be the same as &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;&amp;#039;s weights. In this way, there will be reduced &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;qcorrelation &lt;/del&gt;between &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, mitigating the aforementioned cat-chasing-tail &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;sceneario&lt;/del&gt;.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instead of calculating targets with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;, we have a separate target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; that we will use in &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q~(s&amp;#039;))&amp;lt;/code&amp;gt;. &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; has a structure identical &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;to &lt;/ins&gt;&amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;. Every 1000 or so training iterations, we update &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt;&amp;#039;s weight to be the same as &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;&amp;#039;s weights. In this way, there will be reduced &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;correlation &lt;/ins&gt;between &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, mitigating the aforementioned cat-chasing-tail &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;scenario.&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;li&amp;gt;One problem in the DQN algorithm is that the agent tends to overestimate the Q-function value, due to the max in the formula used to set targets: &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;. The solution is to use two Q-functions, Q&amp;lt;sub&amp;gt;1&amp;lt;/sub&amp;gt; and Q&amp;lt;sub&amp;gt;2&amp;lt;/sub&amp;gt;, which are independently learned. One function is then used to determine the maximizing action and second to estimate its value.  For our purposes, the target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; is relatively independent of &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, so we can use &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; as our networks for double Q-learning. So our update now becomes &amp;lt;code&amp;gt;P[i] = r + gamma*Q~(s&amp;#039;,argmax&amp;lt;sub&amp;gt;a&amp;lt;/sub&amp;gt;(Q(s&amp;#039;,a))&amp;lt;/code&amp;gt;&lt;/ins&gt;.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2581&amp;oldid=prev</id>
		<title>Petosa at 23:30, 12 February 2018</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2581&amp;oldid=prev"/>
		<updated>2018-02-12T23:30:25Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:30, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l64&quot; &gt;Line 64:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 64:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/ol&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Loop the above training steps until your epoch completes. Keep doing more epochs (reset to initial state) until the reward achieved by your agent converges and no longer improves. Note that the neural network weights persist between epochs.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Loop the above training steps until your epoch completes. Keep doing more epochs (reset to initial state) until the reward achieved by your agent converges and no longer improves. Note that the neural network weights persist between epochs.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;==Advanced Deep Q-Learning==&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;DQNs tend not to be stable. Because the loss is calculated from the same network that weights are applied to, you tend to see oscillations in policies. Think of this like a cat chasing its own tail. There are a few tricks you can use to mitigate these problems.&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;ol&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instead of calculating targets with &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q(s&amp;#039;))&amp;lt;/code&amp;gt;, we have a separate target network &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; that we will use in &amp;lt;code&amp;gt;P[i] = r + gamma*max(Q~(s&amp;#039;))&amp;lt;/code&amp;gt;. &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; has a structure identical tot &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;. Every 1000 or so training iterations, we update &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt;&amp;#039;s weight to be the same as &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;&amp;#039;s weights. In this way, there will be reduced qcorrelation between &amp;lt;code&amp;gt;Q~&amp;lt;/code&amp;gt; and &amp;lt;code&amp;gt;Q&amp;lt;/code&amp;gt;, mitigating the aforementioned cat-chasing-tail sceneario.&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;/ol&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Notes==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Notes==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2580&amp;oldid=prev</id>
		<title>Petosa: /* Motivation */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2580&amp;oldid=prev"/>
		<updated>2018-02-12T23:23:32Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Motivation&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:23, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l5&quot; &gt;Line 5:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 5:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Brief History ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Brief History ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2579&amp;oldid=prev</id>
		<title>Petosa: /* Advanced Deep Q-Learning= */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2579&amp;oldid=prev"/>
		<updated>2018-02-12T23:23:23Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Advanced Deep Q-Learning=&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:23, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l6&quot; &gt;Line 6:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 6:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;==Advanced Deep Q-Learning===&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;DQNs tend not to be stable. Because the loss is calculated from the same network that weights are applied to, you tend to see oscillations in policies. Think of this like a cat chasing its own tail. There are a few tricks you can use to mitigate these problems.&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;ol&amp;gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instea&amp;lt;/li&amp;gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;/ol&amp;gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2578&amp;oldid=prev</id>
		<title>Petosa: /* Motivation */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2578&amp;oldid=prev"/>
		<updated>2018-02-12T23:23:02Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Motivation&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 23:23, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l5&quot; &gt;Line 5:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 5:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Brief History ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Brief History ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Deep Q-learning was introduced in 2015 by Google&amp;#039;s DeepMind in a Nature article called &amp;lt;i&amp;gt;Human-level control through deep reinforcement learning.&amp;lt;/i&amp;gt; DeepMind used Q-learning backed by a deep neural network to train an agent to play Atari games from raw pixel data - often outperforming humans. Previous attempts to combine reinforcement learning with neural networks had largely failed due to unstable learning. To address these instabilities, DeepMind introduced a mechanism by which the algorithm stores all of the agent&amp;#039;s experiences and then randomly samples and replays these experiences to provide diverse and decorrelated training data.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;==Advanced Deep Q-Learning===&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;DQNs tend not to be stable. Because the loss is calculated from the same network that weights are applied to, you tend to see oscillations in policies. Think of this like a cat chasing its own tail. There are a few tricks you can use to mitigate these problems.&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;ol&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;li&amp;gt;&amp;lt;b&amp;gt;Target network.&amp;lt;/b&amp;gt; Instea&amp;lt;/li&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt; &lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;/ol&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Advantages ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
	<entry>
		<id>http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2576&amp;oldid=prev</id>
		<title>Petosa: /* Table-based Q-learning */</title>
		<link rel="alternate" type="text/html" href="http://quantsoftware.gatech.edu/index.php?title=Deep_Q-Learning&amp;diff=2576&amp;oldid=prev"/>
		<updated>2018-02-12T16:31:16Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Table-based Q-learning&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 16:31, 12 February 2018&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l32&quot; &gt;Line 32:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 32:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = argmax_a(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;With probability &amp;lt;code&amp;gt;p&amp;lt;/code&amp;gt;, choose a random action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; from the action space (explore). Otherwise, choose action &amp;lt;code&amp;gt;a = argmax_a(Q(s,a))&amp;lt;/code&amp;gt; (exploit).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Take action &amp;lt;code&amp;gt;a&amp;lt;/code&amp;gt; and witness the reward &amp;lt;code&amp;gt;r&amp;lt;/code&amp;gt; and the new state you are in &amp;lt;code&amp;gt;s&amp;#039;&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;This unit of training has &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;yielding &lt;/del&gt;an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Using this experience tuple, perform value iteration with Bellman&amp;#039;s equation.  &lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;This unit of training has &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;yielded &lt;/ins&gt;an experience tuple - &amp;lt;code&amp;gt;&amp;lt;s, a, s&amp;#039;, r&amp;gt;&amp;lt;/code&amp;gt;. Using this experience tuple, perform value iteration with Bellman&amp;#039;s equation.  &lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;Legend: gamma = discount rate, alpha = learning rate.&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt; &lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;Legend: gamma = discount rate, alpha = learning rate.&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Petosa</name></author>
		
	</entry>
</feed>