-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFuNs.html
1495 lines (918 loc) · 56.7 KB
/
FuNs.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html class="theme-next mist use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
<meta name="google-site-verification" content="zu-9nWphPjrzXV8v514mkHknIz4dNfHlib56-KNAu44">
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">
<script src="/lib/pace/pace.min.js?v=1.0.2"></script>
<link href="/lib/pace/pace-theme-flash.min.css?v=1.0.2" rel="stylesheet">
<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
<script>
// DaoVoice live-chat widget loader (standard async-snippet pattern):
// creates a window.daovoice stub that queues calls made before the real
// widget script loads, then injects the widget <script> ahead of the
// first script tag on the page, matching the page's protocol.
(function(i,s,o,g,r,a,m){i["DaoVoiceObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;a.charset="utf-8";m.parentNode.insertBefore(a,m)})(window,document,"script",('https:' == document.location.protocol ? 'https:' : 'http:') + "//widget.daovoice.io/widget/356f1943.js","daovoice")
// Initialise the widget with this site's app id, then trigger an update.
daovoice('init', {
app_id: "356f1943"
});
daovoice('update');
</script>
<link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">
<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">
<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16.png?v=5.1.4">
<link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">
<meta name="keywords" content="rl,hrl">
<link rel="alternate" href="/atom.xml" title="Keavnn'Blog" type="application/atom+xml">
<script>
// Hexo per-post password gate. The generator injects the configured
// password into the empty string literals below; for this post no
// password is set, so if('') is always falsy and the whole block is a
// no-op. (The prompt text asks for the article password; the alert
// reports a wrong password and navigates back.)
(function(){
if(''){
if (prompt('请输入文章密码','') !== ''){
alert('密码错误!');
history.back();
}
}
})();
</script>
<meta name="description" content="这篇论文提出了FuNs,将智能体决策分为两层——Manager产生子目标、Worker产生动作行为。两层均使用A2C方式进行优化,且梯度互不影响。 推荐程度中等: h-DQN式分层,PG式优化 隐藏状态空间设置子目标,不需要先验知识 上下两层策略均使用A2C的更新方式 应用于离散动作空间">
<meta name="keywords" content="rl,hrl">
<meta property="og:type" content="article">
<meta property="og:title" content="FeUdal Networks for Hierarchical Reinforcement Learning">
<meta property="og:url" content="http://StepNeverStop.github.io/FuNs.html">
<meta property="og:site_name" content="Keavnn'Blog">
<meta property="og:description" content="这篇论文提出了FuNs,将智能体决策分为两层——Manager产生子目标、Worker产生动作行为。两层均使用A2C方式进行优化,且梯度互不影响。 推荐程度中等: h-DQN式分层,PG式优化 隐藏状态空间设置子目标,不需要先验知识 上下两层策略均使用A2C的更新方式 应用于离散动作空间">
<meta property="og:locale" content="zh-Hans">
<meta property="og:image" content="http://stepneverstop.github.io/FuNs/illustration.png">
<meta property="og:image" content="http://stepneverstop.github.io/FuNs/illustration.png">
<meta property="og:updated_time" content="2020-04-27T04:46:44.286Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="FeUdal Networks for Hierarchical Reinforcement Learning">
<meta name="twitter:description" content="这篇论文提出了FuNs,将智能体决策分为两层——Manager产生子目标、Worker产生动作行为。两层均使用A2C方式进行优化,且梯度互不影响。 推荐程度中等: h-DQN式分层,PG式优化 隐藏状态空间设置子目标,不需要先验知识 上下两层策略均使用A2C的更新方式 应用于离散动作空间">
<meta name="twitter:image" content="http://stepneverstop.github.io/FuNs/illustration.png">
<script type="text/javascript" id="hexo.configurations">
// Runtime configuration consumed by the NexT theme's client-side bundle.
var NexT = window.NexT || {};
var CONFIG = {
// Site root path and theme scheme/version.
root: '/',
scheme: 'Mist',
version: '5.1.4',
// Sidebar placement, visibility trigger and scroll-percentage display.
sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":true,"onmobile":true},
fancybox: true,
tabs: true,
// Entrance animations for the post block, header, body and sidebar.
motion: {"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
// Duoshuo comment-system identity (userId '0' = anonymous site owner).
duoshuo: {
userId: '0',
author: '博主'
},
// Algolia search settings; empty credentials mean Algolia search is
// not configured for this site (the theme falls back to local search).
algolia: {
applicationID: '',
apiKey: '',
indexName: '',
hits: {"per_page":10},
labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
}
};
</script>
<link rel="canonical" href="http://StepNeverStop.github.io/FuNs.html">
<title>FeUdal Networks for Hierarchical Reinforcement Learning | Keavnn'Blog</title>
</head>
<body itemscope="" itemtype="http://schema.org/WebPage" lang="zh-Hans">
<div class="container sidebar-position-left page-post-detail">
<div class="headband"></div>
<a href="https://github.com/StepNeverStop" class="github-corner" aria-label="View source on GitHub" rel="external nofollow" target="_blank"><svg width="80" height="80" viewbox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"/><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"/><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"/></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
<header id="header" class="header" itemscope="" itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-wrapper">
<div class="site-meta ">
<div class="custom-logo-site-title">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<span class="site-title">Keavnn'Blog</span>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<h1 class="site-subtitle" itemprop="description">If it is to be, it is up to me.</h1>
</div>
<div class="site-nav-toggle">
<button>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
</button>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section">
<i class="menu-item-icon fa fa-fw fa-home"></i> <br>
首页
</a>
</li>
<li class="menu-item menu-item-about">
<a href="/about/" rel="section">
<i class="menu-item-icon fa fa-fw fa-user"></i> <br>
关于
</a>
</li>
<li class="menu-item menu-item-tags">
<a href="/tags/" rel="section">
<i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
标签
</a>
</li>
<li class="menu-item menu-item-categories">
<a href="/categories/" rel="section">
<i class="menu-item-icon fa fa-fw fa-th"></i> <br>
分类
</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section">
<i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
归档
</a>
</li>
<li class="menu-item menu-item-search">
<a href="javascript:;" class="popup-trigger">
<i class="menu-item-icon fa fa-search fa-fw"></i> <br>
搜索
</a>
</li>
</ul>
<div class="site-search">
<div class="popup search-popup local-search-popup">
<div class="local-search-header clearfix">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<span class="popup-btn-close">
<i class="fa fa-times-circle"></i>
</span>
<div class="local-search-input-wrapper">
<input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input">
</div>
</div>
<div id="local-search-result"></div>
</div>
</div>
</nav>
</div>
</header>
<main id="main" class="main">
<div class="main-inner">
<div class="content-wrap">
<div id="content" class="content">
<div id="posts" class="posts-expand">
<article class="post post-type-normal" itemscope="" itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://StepNeverStop.github.io/FuNs.html">
<span hidden itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<meta itemprop="name" content="Keavnn">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/Kicon.jpg">
</span>
<span hidden itemprop="publisher" itemscope="" itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Keavnn'Blog">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">FeUdal Networks for Hierarchical Reinforcement Learning</h2>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2020-04-27T10:33:16+08:00">
2020-04-27
</time>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-calendar-check-o"></i>
</span>
<span class="post-meta-item-text">更新于:</span>
<time title="更新于" itemprop="dateModified" datetime="2020-04-27T12:46:44+08:00">
2020-04-27
</time>
</span>
<span class="post-category">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope="" itemtype="http://schema.org/Thing">
<a href="/categories/ReinforcementLearning/" itemprop="url" rel="index">
<span itemprop="name">ReinforcementLearning</span>
</a>
</span>
</span>
<div class="post-wordcount">
<span class="post-meta-item-icon">
<i class="fa fa-file-word-o"></i>
</span>
<span class="post-meta-item-text">字数统计:</span>
<span title="字数统计">
3.4k
</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-clock-o"></i>
</span>
<span class="post-meta-item-text">阅读时长 ≈</span>
<span title="阅读时长">
14
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<p><img src="./FuNs/illustration.png" alt=""></p>
<p>这篇论文提出了FuNs,将智能体决策分为两层——Manager产生子目标、Worker产生动作行为。两层均使用A2C方式进行优化,且梯度互不影响。</p>
<p>推荐程度中等:</p>
<ul>
<li>h-DQN式分层,PG式优化</li>
<li>隐藏状态空间设置子目标,不需要先验知识</li>
<li>上下两层策略均使用A2C的更新方式</li>
<li>应用于离散动作空间</li>
</ul>
<a id="more"></a>
<h1 id="简介"><a href="#简介" class="headerlink" title="简介"></a>简介</h1><p>论文地址:<a href="http://arxiv.org/abs/1703.01161" rel="external nofollow" target="_blank">http://arxiv.org/abs/1703.01161</a></p>
<p>pytorch复现代码:<a href="https://github.com/dnddnjs/feudal-montezuma" rel="external nofollow" target="_blank">https://github.com/dnddnjs/feudal-montezuma</a></p>
<p>这篇论文引入了FeUdal Networks(FuNs),它是一个新奇的强化学习分层决策结构,它将决策模型分为Manager和Worker:</p>
<ul>
<li>Manager,在lower temporal resolution(低时间尺度)上做higher level决策,产生子目标。这个子目标是输入观察observation的隐状态空间上的方向向量,用于指定在$s_t$时刻之后$c$步应该朝着隐空间的什么方向移动;</li>
<li>Worker,在higher temporal resolution(更密级的时间尺度)上做lower level决策,产生执行的动作。</li>
</ul>
<h2 id="关注点"><a href="#关注点" class="headerlink" title="关注点"></a>关注点</h2><p>如何创建能够学习将其行为分解为有意义的原语,然后重用它们以更有效地获取新行为的智能体是一个长期存在的研究问题。</p>
<blockquote>
<p>How to create agents that can learn to decompose their behaviour into meaningful primitives and then reuse them to more efficiently acquire new behaviours is a long standing research question. The solution to this question may be an important stepping stone towards agents with general intelligence and competence.</p>
</blockquote>
<h2 id="主要贡献"><a href="#主要贡献" class="headerlink" title="主要贡献"></a>主要贡献</h2><p>这篇论文的创新点和特点主要有以下几个:</p>
<ol>
<li>提出一个一致的,端到端的,可微的模型,体现和概括了Feudal RL的原则</li>
<li>虽然上下两层都使用A2C的更新方式,但是上层策略的损失函数构造是根据自己提出的<strong>Transition Policy Gradients</strong>,利用了子目标的语义意义</li>
<li>提出了一个新奇的RNN结构,用在Manager模块中——<strong>Dilated LSTM</strong>,它增强了RNN的记忆能力,允许梯度在大的时间间隔内流动,允许在数百步长上进行有效的反向传播</li>
<li>上层控制器产生的子目标不再是显式的状态,而是隐状态空间上的方向向量</li>
</ol>
<h2 id="优点-效果"><a href="#优点-效果" class="headerlink" title="优点/效果"></a>优点/效果</h2><ul>
<li><p>FuNs大大提高了长期的信用分配和记忆。</p>
<blockquote>
<p>FuN significantly improves long-term credit assignment and memorisation.</p>
</blockquote>
</li>
<li><p>鼓励与Manager设定的不同目标相关联的子策略的出现。</p>
<blockquote>
<p>encourages the emergence of sub-policies associated with different goals set by the Manager.</p>
</blockquote>
</li>
</ul>
<h1 id="文中精要"><a href="#文中精要" class="headerlink" title="文中精要"></a>文中精要</h1><p>下图为FuNs的整体框架示意图。</p>
<p><img src="./FuNs/illustration.png" alt=""></p>
<p>解析:</p>
<ul>
<li><p>上图中的灰色部分均为可学习、可微分的网络变量,在Manager框中即由Manager梯度更新,反之亦然。上图中$f^{\text {percept }}$既不属于Manager也不属于Worker,文中也没有提到它如何优化,我<strong>猜想</strong>它是通过Manager和Worker的Critic网络共享梯度更新优化的。</p>
</li>
<li><p>Manager使用Transition Policy Gradient进行优化,Worker使用Policy Gradient进行优化</p>
</li>
<li><p>$z_{t}=f^{\text {percept }}\left(x_{t}\right)$是特征提取层,CNN之类的,将观测值转换为长度为d的向量</p>
</li>
<li><p>$s_{t}=f^{M s p a c e}\left(z_{t}\right)$是Manager模块中的特征变换层,由几层全连接组成,从图上看维度不变,还是d</p>
</li>
<li><p>$h_{t}^{M}, \hat{g}_{t}=f^{M r n n}\left(s_{t}, h_{t-1}^{M}\right) ; g_{t}=\hat{g}_{t} /\left|\hat{g}_{t}\right|$是Manager模块的子目标输出层,子目标做了归一化操作,其中$f^{M r n n}$是论文中提出的Dilated LSTM</p>
</li>
<li><p>$w_{t}=\phi\left(\sum_{i=t-c}^{t} g_{i}\right)$是将子目标变换为一个可以与Worker动作表示矩阵相乘的向量,看形式上应该属于一个滑动窗口,窗口长度为$c$,在这个长度上一直做移动加和,每一个时间步都根据当前观测值输出子目标$g_i$,然后连续$c$个时间步的子目标加和之后通过$\phi$进行线性变换。</p>
<blockquote>
<p>A linear transform $\phi$ maps a goal $g_t$ into an embedding vector $w_t\in R^k$ , which is then combined via product with matrix $U_t$ (Workers output) to produce policy $\pi$.</p>
</blockquote>
</li>
<li><p>$h^{W}, U_{t}=f^{W r n n}\left(z_{t}, h_{t-1}^{W}\right)$是Worker的LSTM层,这里没有使用Dilated LSTM</p>
</li>
<li><p>$\pi_{t}=\operatorname{SoftMax}\left(U_{t} w_{t}\right)$是Worker的最终动作概率分布输出层,从文章看起来,FuNs只能应用于离散动作空间,因为其下层策略要产生的矩阵为$\mathrm{U}_{\mathrm{t}} \in \mathrm{R}^{|\mathrm{a}| \mathrm{xk}}$,即需要了解动作的数量。</p>
</li>
<li><p>Worker中的$k$为每个动作embedding向量的长度</p>
</li>
</ul>
<h2 id="Manager损失"><a href="#Manager损失" class="headerlink" title="Manager损失"></a>Manager损失</h2><p>Manager的损失函数,或者说是优化目标的梯度是这样的:</p>
<script type="math/tex; mode=display">
\nabla g_{t}=A_{t}^{M} \nabla_{\theta} d_{\cos }\left(s_{t+c}-s_{t}, g_{t}(\theta)\right)</script><script type="math/tex; mode=display">
A_{t}^{M}=R_{t}-V_{t}^{M}\left(x_{t}, \theta\right)</script><p>其中,$d_{\cos }(\alpha, \beta)=\alpha^{T} \beta /(|\alpha||\beta|)$是余弦相似度。与传统的PG损失不同,这里没有使用$log$操作,而且使用余弦相似度。由于Critic网络的输入是$x_t$,我猜想上面结构图中的percept部分是由Critic网络的梯度优化的。</p>
<p>注意,虽然$s_{t+c}$与$s_t$也是由Manager模块产生的,但是在优化中$s_{t+c}-s_{t}$并不传导梯度。</p>
<p>传统的PG目标函数梯度应该是这样的:</p>
<script type="math/tex; mode=display">
\nabla_{\theta} \pi_{t}=\mathbb{E}\left[\left(R_{t}-V\left(s_{t}\right)\right) \nabla_{\theta} \log p\left(a_t | s_{t}\right)\right]</script><p>作者根据分层强化学习中上层策略产生不是动作,而是子目标,将上边式子通过分析、推理改写成下边这种形式:</p>
<p>$o_{t}=\mu\left(s_{t}, \theta\right)$选择子策略,$p\left(s_{t+c} | s_{t}, o_{t}\right)$表示在子策略条件下经过$c$步决策之后的隐状态分布,$\pi^{T P}\left(s_{t+c} | s_{t}\right)=p\left(s_{t+c} | s_{t}, \mu\left(s_{t}, \theta\right)\right)$描述给定起始状态的结束状态的分布,$s_{t+c}=\pi^{T P}\left(s_{t}\right)$是转移函数。</p>
<script type="math/tex; mode=display">
\nabla_{\theta} \pi_{t}^{T P}=\mathbb{E}\left[\left(R_{t}-V\left(s_{t}\right)\right) \nabla_{\theta} \log p\left(s_{t+c} | s_{t}, \mu\left(s_{t}, \theta\right)\right)\right]</script><p>作者为了推导出$\nabla_{\theta} d_{\cos }\left(s_{t+c}-s_{t}, g_{t}(\theta)\right)$这种形式,假设转移模式是一种特殊的形式:$s_{t+c}-s_t$这个隐状态空间上的实际“运动”方向向量服从<strong><a href="https://en.wikipedia.org/wiki/Von_Mises%E2%80%93Fisher_distribution" rel="external nofollow" target="_blank">von Mises-Fisher</a></strong>分布,这个分布的均值即为上层策略产生的$g_t$,根据这个分布的性质,可以推导出如下公式:</p>
<script type="math/tex; mode=display">
p\left(s_{t+c} | s_{t}, o_{t}\right) \propto e^{d_{\cos }\left(s_{t+c}-s_{t}, g_{t}\right)}</script><p>由此取对数可以推导出上边的梯度形式。</p>
<blockquote>
<p>A naive application of policy gradients requires the agent to learn from samples of these trajectories. But if we know where these trajectories are likely to end up, by modelling the transitions, then we can skip directly over the Worker’s behaviour and instead follow the policy gradient of the predicted transition. FuN assumes a particular form for the transition model: that the direction in state-space, s t+c −s t , follows a von Mises-Fisher distribution. Specifically, if the mean direction of the von Mises-Fisher distribution is given by g(o t ) (which for compactness we write as g t ) we would have p(s t+c | s t , o t ) ∝ e d cos (s t+c −s t ,g t ) . If this functional form were indeed correct, then we see that our proposed update heuristic for the Manager, eqn.7, is in fact the proper form for the transition policy gradient arrived at in eqn.10.</p>
</blockquote>
<h2 id="Worker损失"><a href="#Worker损失" class="headerlink" title="Worker损失"></a>Worker损失</h2><p>Worker的损失与传统的A2C一致:</p>
<script type="math/tex; mode=display">
\nabla \pi_{t}=A_{t}^{D} \nabla_{\theta} \log \pi\left(a_{t} | x_{t} ; \theta\right)</script><script type="math/tex; mode=display">
A_{t}^{D}=\left(R_{t}+\alpha R_{t}^{I}-V_{t}^{D}\left(x_{t} ; \theta\right)\right)</script><p>这里优势函数的target目标既包含外部奖励$R_t$,也包含内部奖励$R_t^I$,并用一个超参数$\alpha$来控制内部奖励的影响程度。注意,这里的Critic输入同样为$x_t$,所以我<strong>猜想</strong>特征表示部分由Manager和Worker各自的Critic共享梯度进行优化。</p>
<p>内部奖励是这么定义的:</p>
<script type="math/tex; mode=display">
r_{t}^{I}=1 / c \sum_{i=1}^{c} d_{\cos }\left(s_{t}-s_{t-i}, g_{t-i}\right)</script><p>从这个公式可以看出,每一步的内部奖励需要往前计算$c$步余弦相似度,因此,如果设置的子目标持续步长$c$很大,那么将会引入额外的轨迹长度倍的计算开销。</p>
<p>作者使用方向向量,是因为与假设可以将智能体带到(可能)任意新的绝对位置相比,Worker能够更可靠地引起状态在隐空间上的方向转移。(真拗口)</p>
<blockquote>
<p>We use directions because it is more feasible for the Worker to be able to reliably cause directional shifts in the latent state than it is to assume that the Worker can take us to (potentially) arbitrary new absolute locations.</p>
<p>Note that the Worker’s intrinsic reward (eqn. 8) is based on the log-likelihood of state trajectory. Through that the FuN architecture actively encourages the functional form of the transition model to hold true. Because the Worker is learning to achieve the Manager’s direction, its transitions should, over time, closely follow a distribution around this direction, and hence our approximation for transition policy gradients should hold reasonably well.</p>
</blockquote>
<h2 id="Dilated-LSTM"><a href="#Dilated-LSTM" class="headerlink" title="Dilated LSTM"></a>Dilated LSTM</h2><p>说实话,这里没有完全理解。</p>
<p>公式是这样的:</p>
<script type="math/tex; mode=display">
g_{t}=L S T M\left(s_{t}, \hat{h}_{t-1}^{t \% r} ; \theta^{L S T M}\right)</script><p>其中$\theta^{L S T M}$是共享的LSTM网络参数,cell_state是一个组,$h=\left\{\hat{h}^{i}\right\}_{i=1}^{r}$。$r$是一个dilation radius,也就是说这个LSTM网络包含许多个cell_state,也就是core。式子中的百分号<code>%</code>是做一个模的操作。然后作者说他们在实验中radius的设置与horizon相同,即$c=r$。</p>
<p><strong>我的猜想是这样的</strong>:</p>
<p>作者虽然想让上层策略在一个粗粒度的时间上进行决策,即产生子目标,但是下层策略的内部奖励依赖连续的子目标,也就是每一步的子目标。在这种情况下,上层策略必须每一步都产生子目标,那么既然产生了,不优化不就浪费了嘛,怎么利用呢?于是作者对每一时刻的子目标也做了平滑处理,即$w_{t}=\phi\left(\sum_{i=t-c}^{t} g_{i}\right)$。</p>
<p>问题在于作者想让上层策略的子目标可以持续$c$个时间步,$c$个时间步之后便失效,也就是说,对于上层策略,它的经验序列应该是这样的$s_t, s_{t+c}, s_{t+2c} … s_{t+nc}$,但是由于每一时间步都存在子目标,那么也存在这样的序列$s_{t+1}, s_{t+c+1}, s_{t+2c+1} … s_{t+nc+1}$。针对这种情况,其实有两种处理手段:</p>
<ol>
<li>将收集到的连续序列$s_t, s_{t+1}…$按时间步$c$切分成$c$条轨迹,然后分批次输入到LSTM中进行训练。其实相当于数据预处理过程</li>
<li>直接给LSTM输入收集到的连续序列$s_t, s_{t+1}…$,设置$r=c$组cell_state,也就是为$c$条轨迹设置$c$个不同的起始cell_state,由此来处理不切分的交叉时间尺度的经验序列,让梯度随着cell_state按不同轨迹自动传播。</li>
</ol>
<p>作者就是使用了第2种的处理方式。另外,作者在做对比实验的时候,给LSTM设置步长为40,给Dilated LSTM设置步长400,子目标持续步长$c=10$,也应该是这个道理,因为400/10=40,其实每个子目标序列的持续时间都是一致的,都是40个时间步。</p>
<p>从这个角度分析,我觉得作者说“它促进了更长时间的信用分配以及增强了RNN记忆能力”是一个噱头,因为其实是它只是在共享了LSTM的参数的基础上,对$c$条连续上层子目标序列<strong>分别</strong>做了LSTM运算,本质上没有将长时间的记忆融合进去。</p>
<h1 id="与h-DQN和OC的比较"><a href="#与h-DQN和OC的比较" class="headerlink" title="与h-DQN和OC的比较"></a>与h-DQN和OC的比较</h1><p>FuNs可以视为是h-DQN与OC的部分结合,但又不尽相同。</p>
<h2 id="h-DQN-vs-FuNs"><a href="#h-DQN-vs-FuNs" class="headerlink" title="h-DQN vs. FuNs"></a>h-DQN vs. FuNs</h2><p>相同:</p>
<ul>
<li>都是显式地将决策模型分为两层,上层决策子目标,下层决策具体动作</li>
<li>上层都是最大化外部环境累计期望回报</li>
<li>下层都使用了内在奖励</li>
<li>两者都将子目标作为下层策略的输入</li>
</ul>
<p>不同:</p>
<ul>
<li>FuNs上层产生的子目标是智能体的观测值在隐状态空间下的方向向量,而h-DQN上层产生的子目标是需要人为设计的,往往在原始观测状态空间上设置。</li>
<li>FuNs不需要先验知识</li>
<li>FuNs优化下层模型时,既使用了环境外部奖励,也使用了基于子目标产生的内部奖励,而h-DQN只使用了内部奖励</li>
<li>h-DQN是Q-Learning式的优化方式,即最小化均方误差,而FuNs是PG式的优化方式,即最大化累计期望奖励</li>
</ul>
<h2 id="OC-vs-FuNs"><a href="#OC-vs-FuNs" class="headerlink" title="OC vs. FuNs"></a>OC vs. FuNs</h2><p>相同:</p>
<ul>
<li>两者都是用了PG式的更新方式</li>
<li>两者都是端到端的</li>
</ul>
<p>不同:</p>
<ul>
<li>OC需要判断option的终止条件,FuNs固定上层策略的步长$c$</li>
<li>OC需要指定option的数量(离散),而FuNs产生的子目标为方向向量(连续)</li>
<li>OC需要为每个option构造一个下层策略模型,而FuNs共用一个下层策略模型</li>
<li>OC的下层策略输入不包含option,只是根据option选到下层策略,而FuNs的下层策略需要包含子目标输入</li>
<li>OC没有引入内在奖励,FuNs使用了内在奖励</li>
<li>OC可以应用于连续动作空间,而FuNs用于离散动作空间</li>
</ul>
</div>
<div>
<div>
<div style="text-align:center;color: #ccc;font-size:14px;">-------------本文结束<i class="fa fa-heart"></i>感谢您的阅读-------------</div>
</div>
</div>
<div>
<div class="my_post_copyright">
<script src="//cdn.bootcss.com/clipboard.js/1.5.10/clipboard.min.js"></script>
<!-- JS库 sweetalert 可修改路径 -->
<script src="https://cdn.bootcss.com/jquery/2.0.0/jquery.min.js"></script>
<script src="https://unpkg.com/sweetalert/dist/sweetalert.min.js"></script>
<p><span>本文标题:</span><a href="/FuNs.html">FeUdal Networks for Hierarchical Reinforcement Learning</a></p>
<p><span>文章作者:</span><a href="/" title="访问 Keavnn 的个人博客">Keavnn</a></p>
<p><span>发布时间:</span>2020年04月27日 - 10:04</p>
<p><span>最后更新:</span>2020年04月27日 - 12:04</p>
<p><span>原始链接:</span><a href="/FuNs.html" title="FeUdal Networks for Hierarchical Reinforcement Learning">http://StepNeverStop.github.io/FuNs.html</a>
<span class="copy-path" title="点击复制文章链接"><i class="fa fa-clipboard" data-clipboard-text="http://StepNeverStop.github.io/FuNs.html" aria-label="复制成功!"></i></span>
</p>
<p><span>许可协议:</span><i class="fa fa-creative-commons"></i> <a rel="external nofollow" href="https://creativecommons.org/licenses/by-nc-sa/4.0/" target="_blank" title="Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)">署名-非商业性使用-相同方式共享 4.0 国际</a> 转载请保留原文链接及作者。</p>
</div>
<script>
// Copy-to-clipboard for the post URL (ClipboardJS reads the
// data-clipboard-text attribute of the .fa-clipboard icon).
//
// Fix: register the 'success' handler ONCE at setup time. The original
// code bound clipboard.on('success', ...) inside a jQuery click handler,
// so every click on the icon stacked one more success listener and the
// confirmation dialog fired an increasing number of times per copy.
var clipboard = new Clipboard('.fa-clipboard');
clipboard.on('success', function(){
  swal({
    title: "",
    text: '复制成功',
    icon: "success",
    showConfirmButton: true
  });
});
</script>
</div>
<div>
<div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
<div>如果您获得了帮助,也可以资助一下小的啦~</div>
<button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
<span>打赏啦</span>
</button>
<div id="QR" style="display: none;">
<div id="wechat" style="display: inline-block">
<img id="wechat_qr" src="/images/wechatpay.jpg" alt="Keavnn 微信">
<p>微信</p>
</div>
<div id="alipay" style="display: inline-block">
<img id="alipay_qr" src="/images/alipay.jpg" alt="Keavnn 支付宝">
<p>支付宝</p>
</div>
</div>
</div>
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/rl/" rel="tag"> <i class="fa fa-tag"></i> rl</a>
<a href="/tags/hrl/" rel="tag"> <i class="fa fa-tag"></i> hrl</a>
</div>
<div class="post-nav">
<div class="post-nav-next post-nav-item">
<a href="/raspverry4b.html" rel="next" title="记录在树莓派4B上的配置命令">
<i class="fa fa-chevron-left"></i> 记录在树莓派4B上的配置命令
</a>
</div>
<span class="post-nav-divider"></span>
<div class="post-nav-prev post-nav-item">
<a href="/rl-code-pit.html" rel="prev" title="在DRL路上踩过的坑">
在DRL路上踩过的坑 <i class="fa fa-chevron-right"></i>
</a>
</div>
</div>
</footer>
</div>
</article>
<div class="post-spread">
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<div class="addthis_inline_share_toolbox">
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5cefbfc88c13b0e7" async="async"></script>
</div>
</div>
</div>
</div>
<div class="comments" id="comments">
<div id="lv-container" data-id="city" data-uid="MTAyMC80MTk0NS8xODQ5MQ=="></div>
</div>
</div>
<div class="sidebar-toggle">
<div class="sidebar-toggle-line-wrap">
<span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
</div>
</div>
<aside id="sidebar" class="sidebar">
<div id="sidebar-dimmer"></div>
<div class="sidebar-inner">
<ul class="sidebar-nav motion-element">
<li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
文章目录
</li>
<li class="sidebar-nav-overview" data-target="site-overview-wrap">
站点概览
</li>
</ul>
<section class="site-overview-wrap sidebar-panel">
<div class="site-overview">
<div class="site-author motion-element" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" src="/images/Kicon.jpg" alt="Keavnn">
<p class="site-author-name" itemprop="name">Keavnn</p>
<p class="site-description motion-element" itemprop="description">If it is to be, it is up to me.</p>
</div>
<nav class="site-state motion-element">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">51</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/index.html">
<span class="site-state-item-count">11</span>
<span class="site-state-item-name">分类</span>
</a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/index.html">
<span class="site-state-item-count">26</span>
<span class="site-state-item-name">标签</span>
</a>
</div>
</nav>
<div class="feed-link motion-element">
<a href="/atom.xml" rel="alternate">
<i class="fa fa-rss"></i>
RSS
</a>
</div>
<div class="links-of-author motion-element">
<span class="links-of-author-item">
<a href="https://github.com/StepNeverStop" target="_blank" title="GitHub" rel="external nofollow">
<i class="fa fa-fw fa-github"></i>GitHub</a>
</span>
<span class="links-of-author-item">
<a href="mailto:[email protected]" target="_blank" title="E-Mail" rel="external nofollow">
<i class="fa fa-fw fa-envelope"></i>E-Mail</a>
</span>
</div>
<div class="cc-license motion-element" itemprop="license">
<a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" target="_blank" rel="external nofollow">
<img src="/images/cc-by-nc-sa.svg" alt="Creative Commons">
</a>
</div>
<div class="links-of-blogroll motion-element links-of-blogroll-inline">
<div class="links-of-blogroll-title">
<i class="fa fa-fw fa-link"></i>
推荐阅读
</div>
<ul class="links-of-blogroll-list">
<li class="links-of-blogroll-item">
<a href="https://bluefisher.github.io" title="Fisher Chang" target="_blank" rel="external nofollow">Fisher Chang</a>
</li>
</ul>
</div>
</div>
</section>
<!--noindex-->
<section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
<div class="post-toc">
<div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#简介"><span class="nav-number">1.</span> <span class="nav-text">简介</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#关注点"><span class="nav-number">1.1.</span> <span class="nav-text">关注点</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#主要贡献"><span class="nav-number">1.2.</span> <span class="nav-text">主要贡献</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#优点-效果"><span class="nav-number">1.3.</span> <span class="nav-text">优点/效果</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#文中精要"><span class="nav-number">2.</span> <span class="nav-text">文中精要</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#Manager损失"><span class="nav-number">2.1.</span> <span class="nav-text">Manager损失</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Worker损失"><span class="nav-number">2.2.</span> <span class="nav-text">Worker损失</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Dilated-LSTM"><span class="nav-number">2.3.</span> <span class="nav-text">Dilated LSTM</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#与h-DQN和OC的比较"><span class="nav-number">3.</span> <span class="nav-text">与h-DQN和OC的比较</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#h-DQN-vs-FuNs"><span class="nav-number">3.1.</span> <span class="nav-text">h-DQN vs. FuNs</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#OC-vs-FuNs"><span class="nav-number">3.2.</span> <span class="nav-text">OC vs. FuNs</span></a></li></ol></li></ol></div>
</div>
</section>
<!--/noindex-->
</div>
</aside>
</div>
</main>
<footer id="footer" class="footer">
<div class="footer-inner">
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<div class="copyright">© <span itemprop="copyrightYear">2020</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">Keavnn</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-area-chart"></i>
</span>
<span class="post-meta-item-text">Site words total count:</span>
<span title="Site words total count">80.3k</span>
</div>
<div class="powered-by">
<i class="fa fa-user-md"></i><span id="busuanzi_container_site_pv">
本站总访问量<span id="busuanzi_value_site_pv"></span>次
</span>
</div>
<!-- <div class="theme-info">
<div class="powered-by"></div>
<span class="post-count">博客全站共80.3k字</span>
</div> -->
</div>
</footer>
<div class="back-to-top">
<i class="fa fa-arrow-up"></i>
<span id="scrollpercent"><span>0</span>%</span>
</div>
</div>
<script type="text/javascript">
// Polyfill guard used by the theme's scripts: if window.Promise is
// missing or is not a genuine native function (some legacy browsers
// expose broken shims), null it out so later feature detection falls
// through to the bundled fallback implementation.
var promiseTag = Object.prototype.toString.call(window.Promise);
if (promiseTag !== '[object Function]') {
  window.Promise = null;
}
</script>