Refine documents of PARL by zenghsh3 · Pull Request #43 · PaddlePaddle/PARL · GitHub

Refine documents of PARL #43


Merged: 22 commits, merged Jan 18, 2019
Binary file added .github/Aircraft.gif
Binary file added .github/Breakout.gif
Binary file added .github/Half-Cheetah.gif
Binary file added .github/NeurlIPS2018.gif
6 changes: 5 additions & 1 deletion README.md
@@ -67,7 +67,7 @@ agent = AtariAgent(algorithm)
# Install:
### Dependencies
- Python 2.7 or 3.5+.
-- PaddlePaddle >=1.2.1 (We try to make our repository always compatible with newest version PaddlePaddle)
+- PaddlePaddle >=1.2.1 (We try to make our repository always compatible with latest version PaddlePaddle)


```
@@ -80,3 +80,7 @@ pip install --upgrade git+https://github.com/PaddlePaddle/PARL.git
- [DDPG](examples/DDPG/)
- [PPO](examples/PPO/)
- [Winning Solution for NIPS2018: AI for Prosthetics Challenge](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)

<img src=".github/NeurlIPS2018.gif" width = "300" height ="200" alt="NeurlIPS2018"/> <img src=".github/Half-Cheetah.gif" width = "300" height ="200" alt="Half-Cheetah"/> <img src=".github/Breakout.gif" width = "200" height ="200" alt="Breakout"/>
<br>
<img src=".github/Aircraft.gif" width = "808" height ="300" alt="NeurlIPS2018"/>
Binary file removed docs/ct.png
301 changes: 0 additions & 301 deletions docs/design_doc.md

This file was deleted.

Binary file removed docs/framework.png
Binary file removed docs/model.png
Binary file removed docs/relation.png
Binary file removed docs/step.png
Binary file added examples/DDPG/.benchmark/DDPG_HalfCheetah-v2.png
4 changes: 4 additions & 0 deletions examples/DDPG/README.md
@@ -7,11 +7,15 @@ Based on PARL, the DDPG model of deep reinforcement learning is reproduced, and
### Mujoco games introduction
Please see [here](https://github.com/openai/mujoco-py) to learn more about Mujoco games.

### Benchmark result
- HalfCheetah-v2
<img src=".benchmark/DDPG_HalfCheetah-v2.png"/>

## How to use
### Dependencies:
+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
+ tqdm
+ mujoco-py>=1.50.1.0
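(Editor's aside, not part of the diff: a minimal sanity-check sketch for the Mujoco dependencies listed above. It assumes gym and mujoco-py are already installed and only exercises the HalfCheetah-v2 environment used in the benchmark with random actions; the 100-step horizon is arbitrary.)

```python
import gym  # needs mujoco-py>=1.50.1.0 and a working MuJoCo install for this env

env = gym.make("HalfCheetah-v2")
obs = env.reset()
total_reward = 0.0
for _ in range(100):
    action = env.action_space.sample()         # random action, just to exercise the env
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        obs = env.reset()
env.close()
print("random-policy return over 100 steps:", total_reward)
```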
Binary file added examples/DQN/.benchmark/DQN_Pong.png
7 changes: 6 additions & 1 deletion examples/DQN/README.md
@@ -7,15 +7,20 @@ Based on PARL, the DQN model of deep reinforcement learning is reproduced, and t
### Atari games introduction
Please see [here](https://gym.openai.com/envs/#atari) to learn more about Atari games.

### Benchmark result
- Pong
<img src=".benchmark/DQN_Pong.png"/>

## How to use
### Dependencies:
+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
+ tqdm
+ opencv-python
+ ale_python_interface
+ atari_py
+ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment)


### Start Training:
14 changes: 7 additions & 7 deletions examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md
@@ -11,7 +11,7 @@ For more technical details about our solution, we provide:
3. [[Link]](https://drive.google.com/file/d/1W-FmbJu4_8KmwMIzH0GwaFKZ0z1jg_u0/view?usp=sharing) A poster briefly introducing our solution in NeurIPS2018 competition workshop.
3. (coming soon) A full academic paper detailing our solution, including the entire training pipeline, related work, and experiments that analyze the importance of each key ingredient.

-**Note**: Reproducibility is a long-standing issue in reinforcement learning field. We have tried to guarantee that our code is reproducible, testing each training sub-task three times. However, there are still some factors that prevent us from achieving the same performance. One problem is the choice time of a convergence model during curriculum learning. Choosing a sensible and natural gait visually is crucial for subsequent training, but the definition of what is a good gait varies from different people.
+**Note**: Reproducibility is a long-standing issue in reinforcement learning field. We have tried to guarantee that our code is reproducible, testing each training sub-task three times. However, there are still some factors that prevent us from achieving the same performance. One problem is the choice time of a convergence model during curriculum learning. Choosing a sensible and natural gait visually is crucial for subsequent training, but the definition of what is a good gait varies from person to person.

<p align="center">
<img src="image/demo.gif" alt="PARL" width="500"/>
@@ -60,7 +60,7 @@ For final submission, we test our model in 500 CPUs, running 10 episodes per CPU
python simulator_server.py --port [PORT] --ensemble_num 1

# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type RunFastest
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type RunFastest
```

#### 2. Target: run at 3.0 m/s
@@ -71,7 +71,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [RunFastest model]

# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 3.0 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 3.0 \
--act_penalty_lowerbound 1.5
```

@@ -83,7 +83,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [FixedTargetSpeed 3.0m/s model]

# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 2.0 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 2.0 \
--act_penalty_lowerbound 0.75
```

@@ -99,7 +99,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [FixedTargetSpeed 2.0m/s model]

# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 1.25 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 1.25 \
--act_penalty_lowerbound 0.6
```

@@ -109,10 +109,10 @@ As mentioned before, the selection of model that used to fine-tune influence lat
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 12 --warm_start_batchs 1000 \
-  --restore_model_path [FixedTargetSpeed 1.25m/s] --restore_from_one_head
+  --restore_model_path [FixedTargetSpeed 1.25m/s model] --restore_from_one_head

# client (Suggest: 100+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \
--act_penalty_coeff 7.0 --vel_penalty_coeff 20.0 --discrete_data --stage 3
```

3 changes: 2 additions & 1 deletion examples/PPO/README.md
@@ -16,8 +16,9 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco

## How to use
### Dependencies:
-+ python2.7 or python3.5+
++ python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
+ tqdm
+ mujoco-py>=1.50.1.0
11 changes: 7 additions & 4 deletions examples/PPO/mujoco_agent.py
@@ -15,7 +15,6 @@
import numpy as np
import parl.layers as layers
from paddle import fluid
-from sklearn.utils import shuffle
from parl.framework.agent_base import Agent
from parl.utils import logger

@@ -183,12 +182,16 @@ def value_learn(self, obs, value):

 all_loss = []
 for _ in range(self.value_learn_times):
-    obs_train, value_train = shuffle(obs_train, value_train)
+    random_ids = np.arange(obs_train.shape[0])
+    np.random.shuffle(random_ids)
+    shuffle_obs_train = obs_train[random_ids]
+    shuffle_value_train = value_train[random_ids]
     start = 0
     while start < data_size:
         end = start + self.value_batch_size
-        value_loss = self._batch_value_learn(obs_train[start:end, :],
-                                             value_train[start:end])
+        value_loss = self._batch_value_learn(
+            shuffle_obs_train[start:end, :],
+            shuffle_value_train[start:end])
         all_loss.append(value_loss)
         start += self.value_batch_size
 return np.mean(all_loss)
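For readers skimming the diff: the change above drops `sklearn.utils.shuffle` in favor of a plain NumPy permutation, so the observation and value arrays are shuffled with one shared index order and stay paired row for row. A standalone sketch of the same pattern (function and variable names here are illustrative, not taken from the PR):

```python
import numpy as np

def shuffle_in_unison(obs, values):
    """Shuffle two arrays along axis 0 with one shared permutation,
    so row i of `obs` stays paired with element i of `values`."""
    assert obs.shape[0] == values.shape[0]
    ids = np.arange(obs.shape[0])
    np.random.shuffle(ids)            # in-place permutation of the indices
    return obs[ids], values[ids]      # fancy indexing returns shuffled copies

# Toy usage: 5 observations of dimension 3 and their value targets
obs = np.arange(15, dtype="float32").reshape(5, 3)
values = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype="float32")
obs_shuf, values_shuf = shuffle_in_unison(obs, values)
```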
1 change: 1 addition & 0 deletions examples/QuickStart/README.md
@@ -6,6 +6,7 @@ Based on PARL, train a agent to play CartPole game with policy gradient algorith

+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym

### Start Training:
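(Editor's aside, not from the PR: the QuickStart example above trains a CartPole agent with a policy gradient method. The core of that recipe is weighting the log-probability of each taken action by the discounted return that followed it. Below is a minimal NumPy sketch of just the return computation, with gamma chosen arbitrarily; it is not the QuickStart training script itself.)

```python
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Compute G_t = r_t + gamma * G_{t+1} for one episode (REINFORCE targets)."""
    returns = np.zeros(len(rewards), dtype="float32")
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: a short CartPole-style episode with reward 1.0 per step
print(discounted_returns([1.0, 1.0, 1.0], gamma=0.9))  # roughly [2.71, 1.9, 1.0]
```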
13 changes: 0 additions & 13 deletions parl/algorithm_zoo/__init__.py

This file was deleted.

174 changes: 0 additions & 174 deletions parl/algorithm_zoo/simple_algorithms.py

This file was deleted.

13 changes: 0 additions & 13 deletions parl/common/__init__.py

This file was deleted.
