diff --git a/include/caffe/layers/bias_layer.hpp b/include/caffe/layers/bias_layer.hpp
new file mode 100644
index 00000000000..eedc3aaa351
--- /dev/null
+++ b/include/caffe/layers/bias_layer.hpp
@@ -0,0 +1,54 @@
+#ifndef CAFFE_BIAS_LAYER_HPP_
+#define CAFFE_BIAS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Computes a sum of two input Blobs, with the shape of the
+ *        latter Blob "broadcast" to match the shape of the former.
+ *        Equivalent to tiling the latter Blob, then computing the elementwise
+ *        sum.
+ *
+ * The second input may be omitted, in which case it is learned as a parameter
+ * of the layer (blobs_[0]). The layer may run in-place (top[0] == bottom[0]).
+ */
+template <typename Dtype>
+class BiasLayer : public Layer<Dtype> {
+ public:
+  explicit BiasLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Bias"; }
+  virtual inline int MinBottomBlobs() const { return 1; }  // learned bias
+  virtual inline int MaxBottomBlobs() const { return 2; }  // bias = bottom[1]
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  Blob<Dtype> bias_multiplier_;  // inner_dim_ ones, (re)filled by Reshape
+  int outer_dim_, bias_dim_, inner_dim_, dim_;  // extents cached by Reshape
+};
+
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_BIAS_LAYER_HPP_
diff --git a/include/caffe/layers/channelwise_affine_layer.hpp b/include/caffe/layers/channelwise_affine_layer.hpp
deleted file mode 100644
index 6d8ac98b6ed..00000000000
--- a/include/caffe/layers/channelwise_affine_layer.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef CAFFE_CHANNELWISE_AFFINE_LAYER_HPP_
-#define CAFFE_CHANNELWISE_AFFINE_LAYER_HPP_
-
-#include <vector>
-#include "caffe/blob.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/layers/neuron_layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-
-namespace caffe {
-    /**
-     * @brief Affine non-linearity function @f$
-     *         y = ax+b
-     *     @f$, could be used after batch normalization layer
-     *
-     */
-template <typename Dtype>
-class ChannelwiseAffineLayer : public NeuronLayer<Dtype> {
- public:
-      /**
-       * @param param provides ChannelwiseAffineParameter ChannelwiseAffine_param,
-       *     with ChannelwiseAffineLayer options:
-       *   - slope_filler (\b optional, FillerParameter,
-        *     default {'type': constant 'value':1.0001}).
-      *   - bias_filler (\b optional, FillerParameter,
-        *     default {'type': constant 'value':0.0001}).
-      *   - channel_shared (\b optional, default false).
-      *     slopes and biases are shared across channels.
-      */
-     explicit ChannelwiseAffineLayer(const LayerParameter& param)
-         : NeuronLayer<Dtype>(param) {}
-     virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-                             const vector<Blob<Dtype>*>& top);
-     virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-                             const vector<Blob<Dtype>*>& top);
-     virtual inline const char* type() const { return "ChannelwiseAffine"; }
-
- protected:
-     /**
-      * @param bottom input Blob vector (length 1)
-      *   -# @f$ (N \times C \times ...) @f$
-      *      the inputs @f$ x @f$
-      * @param top output Blob vector (length 1)
-      *   -# @f$ (N \times C \times ...) @f$
-      *      the computed outputs for each channel @f$i@f$ @f$
-      *        y_i = a_i x_i + b_i
-      *      @f$.
-      */
-     virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-                              const vector<Blob<Dtype>*>& top);
-     virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-                              const vector<Blob<Dtype>*>& top);
-     /**
-      * @brief Computes the error gradient w.r.t. the ChannelwiseAffine inputs.
-      *
-      * @param top output Blob vector (length 1), providing the error gradient with
-      *      respect to the outputs
-      *   -# @f$ (N \times C \times ...) @f$
-      *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-      *      with respect to computed outputs @f$ y @f$
-      * @param propagate_down see Layer::Backward.
-      * @param bottom input Blob vector (length 1)
-      *   -# @f$ (N \times C \times ...) @f$
-      *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
-      *      diff with gradients @f$
-      *        \frac{\partial E}{\partial x_i} = \left\{
-       *        \begin{array}{lr}
-          *            a_i \frac{\partial E}{\partial y_i}
-      *        \end{array} \right.
-      *      @f$.
-      *      If param_propagate_down_[0] is true, it fills the diff with gradients
-      *      @f$
-      *        \frac{\partial E}{\partial a_i} = \left\{
-         *        \begin{array}{lr}
-         *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i}
-          *        \end{array} \right.
-          *      @f$.
-          *      If param_propagate_down_[1] is true, it fills the diff with gradients
-          *      @f$
-          *        \frac{\partial E}{\partial b_i} = \left\{
-              *        \begin{array}{lr}
-              *             frac{\partial E}{\partial y_i}
-              *        \end{array} \right.
-          *      @f$.
-          */
-     virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-                               const vector<bool>& propagate_down,
-                               const vector<Blob<Dtype>*>& bottom);
-     virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-                               const vector<bool>& propagate_down,
-                               const vector<Blob<Dtype>*>& bottom);
-     bool channel_shared_;
-     Blob<Dtype> multiplier_;
-     // dot multiplier for backward computation of params
-     Blob<Dtype> bias_multiplier_;
-     Blob<Dtype> backward_buff_;
-     // temporary buffer for backward computation
-     Blob<Dtype> bottom_memory_;
-     // memory for in-place computation
-};
-}  // namespace caffe
-
-#endif  // CAFFE_CHANNELWISE_AFFINE_LAYER_HPP_
diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp
new file mode 100644
index 00000000000..924df2e51ab
--- /dev/null
+++ b/include/caffe/layers/scale_layer.hpp
@@ -0,0 +1,83 @@
+#ifndef CAFFE_SCALE_LAYER_HPP_
+#define CAFFE_SCALE_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/bias_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes a product of two input Blobs, with the shape of the
+ *        latter Blob "broadcast" to match the shape of the former.
+ *        Equivalent to tiling the latter Blob, then computing the elementwise
+ *        product.
+ *
+ * The second input may be omitted, in which case it is learned as a parameter
+ * of the layer (blobs_[0], filled with ones unless a filler is specified).
+ */
+template <typename Dtype>
+class ScaleLayer: public Layer<Dtype> {
+ public:
+  explicit ScaleLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Scale"; }
+  // Takes one bottom (learned scale) or two (scale = bottom[1]); one top.
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * In the below shape specifications, @f$ i @f$ denotes the value of the
+   * `axis` field given by `this->layer_param_.scale_param().axis()`, after
+   * canonicalization (i.e., conversion from negative to positive index,
+   * if applicable).
+   *
+   * @param bottom input Blob vector (length 1 or 2)
+   *   -# @f$ (d_0 \times ... \times
+   *           d_i \times ... \times d_j \times ... \times d_n) @f$
+   *      the first factor @f$ x @f$
+   *   -# @f$ (d_i \times ... \times d_j) @f$
+   *      the second factor @f$ y @f$ (the learned parameter when omitted)
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (d_0 \times ... \times
+   *           d_i \times ... \times d_j \times ... \times d_n) @f$
+   *      the product @f$ z = x y @f$ computed after "broadcasting" y.
+   *      Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$,
+   *      then computing the elementwise product.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  shared_ptr<Layer<Dtype> > bias_layer_;  // presumably the "Bias" sublayer
+  vector<Blob<Dtype>*> bias_bottom_vec_;
+  vector<bool> bias_propagate_down_;
+  int bias_param_id_;
+
+  Blob<Dtype> sum_multiplier_;
+  Blob<Dtype> sum_result_;
+  Blob<Dtype> temp_;
+  int axis_;  // canonicalized scale_param.axis (set in LayerSetUp)
+  int outer_dim_, scale_dim_, inner_dim_;
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_SCALE_LAYER_HPP_
diff --git a/src/caffe/layers/bias_layer.cpp b/src/caffe/layers/bias_layer.cpp
new file mode 100644
index 00000000000..0a786b5db98
--- /dev/null
+++ b/src/caffe/layers/bias_layer.cpp
@@ -0,0 +1,121 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// Creates and fills the bias blob when it is a learned parameter (one bottom,
+// no preloaded blobs); new param_propagate_down_ entries default to true.
+template <typename Dtype>
+void BiasLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const bool learn_bias = (bottom.size() == 1);
+  if (learn_bias && !this->blobs_.empty()) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else if (learn_bias) {
+    const BiasParameter& param = this->layer_param_.bias_param();
+    const vector<int>& in_shape = bottom[0]->shape();
+    const int axis = bottom[0]->CanonicalAxisIndex(param.axis());
+    const int num_axes = param.num_axes();
+    CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
+                           << "or -1 to extend to the end of bottom[0]";
+    if (num_axes >= 0) {
+      CHECK_GE(bottom[0]->num_axes(), axis + num_axes)
+          << "bias blob's shape extends past bottom[0]'s shape when applied "
+          << "starting with bottom[0] axis = " << axis;
+    }
+    const vector<int> bias_shape(in_shape.begin() + axis,
+        (num_axes < 0) ? in_shape.end() : in_shape.begin() + axis + num_axes);
+    this->blobs_.resize(1);
+    this->blobs_[0].reset(new Blob<Dtype>(bias_shape));
+    shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(param.filler()));
+    filler->Fill(this->blobs_[0].get());
+  }
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+// Validates the bias shape against bottom[0] and caches the outer/bias/inner
+// extents; reshapes top[0] (unless in-place) and the all-ones multiplier.
+template <typename Dtype>
+void BiasLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const BiasParameter& param = this->layer_param_.bias_param();
+  Blob<Dtype>* bias = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
+  const int bias_axes = bias->num_axes();
+  // Scalar bias: every axis choice is equivalent, so use 0 (outer_dim_ == 1).
+  const int axis = bias_axes ? bottom[0]->CanonicalAxisIndex(param.axis()) : 0;
+  CHECK_GE(bottom[0]->num_axes(), axis + bias_axes)
+      << "bias blob's shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis;
+  for (int i = 0; i < bias_axes; ++i) {
+    CHECK_EQ(bottom[0]->shape(axis + i), bias->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << axis + i
+        << ") and bias->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis);
+  bias_dim_ = bias->count();
+  inner_dim_ = bottom[0]->count(axis + bias_axes);
+  dim_ = bias_dim_ * inner_dim_;
+  if (bottom[0] != top[0]) top[0]->ReshapeLike(*bottom[0]);
+  bias_multiplier_.Reshape(vector<int>(1, inner_dim_));
+  // Refill with ones only if needed; skip the probe if inner_dim_ == 0, where
+  // index inner_dim_ - 1 would fall before the start of the buffer.
+  if (inner_dim_ > 0 &&
+      bias_multiplier_.cpu_data()[inner_dim_ - 1] != Dtype(1)) {
+    caffe_set(inner_dim_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+  }
+}
+
+// Adds the bias to bottom[0], broadcasting it across the inner dimension.
+// For each of the outer_dim_ slices, a K = 1 GEMM accumulates
+// bias (bias_dim_ x 1) times the all-ones row (1 x inner_dim_) into top.
+template <typename Dtype>
+void BiasLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  Blob<Dtype>* bias = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
+  const Dtype* bias_data = bias->cpu_data();
+  Dtype* out = top[0]->mutable_cpu_data();
+  if (bottom[0] != top[0]) {  // not in-place: seed top with the input
+    caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(), out);
+  }
+  for (int n = 0; n < outer_dim_; ++n, out += dim_) {
+    caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, bias_dim_, inner_dim_, Dtype(1),
+        Dtype(1), bias_data, bias_multiplier_.cpu_data(), Dtype(1), out);
+  }
+}
+
+// Backward pass (CPU). The input gradient equals the top gradient, so it is
+// copied unless the layer runs in-place. The bias gradient is the top
+// gradient summed over the inner dimension and over all outer slices.
+template <typename Dtype>
+void BiasLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0] && bottom[0] != top[0]) {
+    caffe_copy(bottom[0]->count(), top[0]->cpu_diff(),
+        bottom[0]->mutable_cpu_diff());
+  }
+  const bool learned = (bottom.size() == 1);
+  // Stop here unless the bias gradient was requested.
+  if (learned ? !this->param_propagate_down_[0] : !propagate_down[1]) return;
+  const Dtype* slice_diff = top[0]->cpu_diff();
+  Blob<Dtype>* bias = learned ? this->blobs_[0].get() : bottom[1];
+  Dtype* bias_diff = bias->mutable_cpu_diff();
+  // A learned bias accumulates into its existing diff (beta = 1 throughout);
+  // a bottom bias is overwritten by the first slice (beta = 0), then summed.
+  for (int n = 0; n < outer_dim_; ++n, slice_diff += dim_) {
+    const Dtype beta = Dtype(learned || n > 0);
+    caffe_cpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1), slice_diff,
+        bias_multiplier_.cpu_data(), beta, bias_diff);
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(BiasLayer);
+#endif
+
+INSTANTIATE_CLASS(BiasLayer);
+REGISTER_LAYER_CLASS(Bias);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/bias_layer.cu b/src/caffe/layers/bias_layer.cu
new file mode 100644
index 00000000000..8ac913a5d7b
--- /dev/null
+++ b/src/caffe/layers/bias_layer.cu
@@ -0,0 +1,59 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// Element-wise kernel: out[i] = in[i] + bias[(i / inner_dim) % bias_dim].
+template <typename Dtype>
+__global__ void BiasForward(const int n, const Dtype* in, const Dtype* bias,
+    const int bias_dim, const int inner_dim, Dtype* out) {
+  CUDA_KERNEL_LOOP(i, n) {
+    const Dtype addend = bias[(i / inner_dim) % bias_dim];
+    out[i] = in[i] + addend;
+  }
+}
+
+// GPU forward: a single BiasForward launch sized by top[0]->count().
+template <typename Dtype>
+void BiasLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* in = bottom[0]->gpu_data();
+  Blob<Dtype>* bias = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
+  const Dtype* bias_data = bias->gpu_data();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  BiasForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, in, bias_data, bias_dim_, inner_dim_, top[0]->mutable_gpu_data());
+}
+
+// Backward pass (GPU); mirrors Backward_cpu. The input gradient equals the
+// top gradient, so it is copied unless the layer runs in-place. The bias
+// gradient is the top gradient summed over the inner dimension and slices.
+template <typename Dtype>
+void BiasLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0] && bottom[0] != top[0]) {
+    caffe_copy(bottom[0]->count(), top[0]->gpu_diff(),
+        bottom[0]->mutable_gpu_diff());
+  }
+  const bool learned = (bottom.size() == 1);
+  // Stop here unless the bias gradient was requested.
+  if (learned ? !this->param_propagate_down_[0] : !propagate_down[1]) return;
+  const Dtype* slice_diff = top[0]->gpu_diff();
+  Blob<Dtype>* bias = learned ? this->blobs_[0].get() : bottom[1];
+  Dtype* bias_diff = bias->mutable_gpu_diff();
+  // beta = 1 accumulates into the existing diff; beta = 0 (first slice of a
+  // bottom bias only) overwrites it instead.
+  for (int n = 0; n < outer_dim_; ++n, slice_diff += dim_) {
+    const Dtype beta = Dtype(learned || n > 0);
+    caffe_gpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1), slice_diff,
+        bias_multiplier_.gpu_data(), beta, bias_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(BiasLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/channelwise_affine_layer.cpp b/src/caffe/layers/channelwise_affine_layer.cpp
deleted file mode 100644
index e9f31fb10e3..00000000000
--- a/src/caffe/layers/channelwise_affine_layer.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/layers/channelwise_affine_layer.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom[0]->num_axes(), 2)
-      << "Number of axes of bottom blob must be >=2.";
-  ChannelwiseAffineParameter channelwise_affine_param =
-        this->layer_param().channelwise_affine_param();
-  int channels = bottom[0]->channels();
-  channel_shared_ = channelwise_affine_param.channel_shared();
-  if (this->blobs_.size() > 0) {
-    LOG(INFO) << "Skipping parameter initialization";
-  } else {
-    this->blobs_.resize(2);
-    if (channel_shared_) {
-      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(0)));
-      this->blobs_[1].reset(new Blob<Dtype>(vector<int>(0)));
-
-    } else {
-      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(1, channels)));
-      this->blobs_[1].reset(new Blob<Dtype>(vector<int>(1, channels)));
-    }
-    shared_ptr<Filler<Dtype> > filler;
-    if (channelwise_affine_param.has_slope_filler()) {
-      filler.reset(GetFiller<Dtype>(channelwise_affine_param.slope_filler()));
-    } else {
-      FillerParameter filler_param;
-      filler_param.set_type("constant");
-      filler_param.set_value(1.0001);
-      filler.reset(GetFiller<Dtype>(filler_param));
-    }
-    filler->Fill(this->blobs_[0].get());
-
-    if (channelwise_affine_param.has_bias_filler()) {
-      filler.reset(GetFiller<Dtype>(channelwise_affine_param.bias_filler()));
-    } else {
-      FillerParameter filler_param;
-      filler_param.set_type("constant");
-      filler_param.set_value(0.0001);
-      filler.reset(GetFiller<Dtype>(filler_param));
-    }
-    filler->Fill(this->blobs_[1].get());
-  }
-  if (channel_shared_) {
-    CHECK_EQ(this->blobs_[0]->count(), 1)
-        << "Slope size is inconsistent with prototxt config";
-  } else {
-    CHECK_EQ(this->blobs_[0]->count(), channels)
-        << "Slope size is inconsistent with prototxt config";
-  }
-
-  // Propagate gradients to the parameters (as directed by backward pass).
-  this->param_propagate_down_.resize(this->blobs_.size(), true);
-  multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
-  bias_multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
-  backward_buff_.Reshape(vector<int>(1, bottom[0]->count(1)));
-  caffe_set(multiplier_.count(), Dtype(1.0),
-            multiplier_.mutable_cpu_data());
-  caffe_set(bias_multiplier_.count(), Dtype(1.0),
-            bias_multiplier_.mutable_cpu_data());
-}
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom[0]->num_axes(), 2)
-      << "Number of axes of bottom blob must be >=2.";
-  top[0]->ReshapeLike(*bottom[0]);
-  if (bottom[0] == top[0]) {
-    // For in-place computation
-    bottom_memory_.ReshapeLike(*bottom[0]);
-  }
-  int height  = 1;
-  int width = 1;
-  if (bottom[0]->num_axes() > 2) {
-    height = bottom[0]->shape(2);
-    width = bottom[0]->shape(3);
-  }
-  vector<int> bias_multiplier_shape(1, height * width);
-  bias_multiplier_.Reshape(bias_multiplier_shape);
-  caffe_set(bias_multiplier_.count(), Dtype(1),
-      bias_multiplier_.mutable_cpu_data());
-}
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->cpu_data();
-  const Dtype* bias_data = this->blobs_[1]->cpu_data();
-  // For in-place computation
-  if (bottom[0] == top[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data());
-  }
-  // if channel_shared, channel index in the following computation becomes
-  // always zero.
-  const int div_factor = channel_shared_ ? channels : 1;
-  for (int i = 0; i < count; ++i) {
-    int c = (i / dim) % channels / div_factor;
-    top_data[i] = bottom_data[i] * slope_data[c] + bias_data[c];
-     }
-}
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::Backward_cpu(
-    const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* slope_data = this->blobs_[0]->cpu_data();
-
-  const Dtype* top_diff = top[0]->cpu_diff();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->shape(1);
-  const int num = bottom[0]->shape(0);
-  int height = 1;
-  int width = 1;
-  if (bottom[0]->num_axes() > 2) {
-    height = bottom[0]->shape(2);
-    width = bottom[0]->shape(3);
-  }
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.cpu_data();
-  }
-
-  // if channel_shared, channel index in the following computation becomes
-  // always zero.
-  const int div_factor = channel_shared_ ? channels : 1;
-
-  // Propagte to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-
-  if (this->param_propagate_down_[1]) {
-    Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
-    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
-    for (int n = 0; n < num; ++n) {
-      caffe_cpu_gemv<Dtype>(CblasNoTrans, channels, height * width, 1.,
-        top_diff + top[0]->offset(n),
-        bias_multiplier_.cpu_data(), 1., bias_diff);
-    }
-  }
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff();
-        caffe_set(this->blobs_[0]->count(), Dtype(0), slope_diff);
-    for (int i = 0; i < count; ++i) {
-      int c = (i / dim) % channels / div_factor;
-      slope_diff[c] += top_diff[i] * bottom_data[i];
-    }
-  }
-
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    for (int i = 0; i < count; ++i) {
-      int c = (i / dim) % channels / div_factor;
-      bottom_diff[i] = slope_data[c] * top_diff[i];
-    }
-  }
-}
-
-
-#ifdef CPU_ONLY
-STUB_GPU(ChannelwiseAffineLayer);
-#endif
-
-INSTANTIATE_CLASS(ChannelwiseAffineLayer);
-REGISTER_LAYER_CLASS(ChannelwiseAffine);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/channelwise_affine_layer.cu b/src/caffe/layers/channelwise_affine_layer.cu
deleted file mode 100644
index 2066b26560b..00000000000
--- a/src/caffe/layers/channelwise_affine_layer.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/layers/channelwise_affine_layer.hpp"
-
-namespace caffe {
-
-// CUDA kernel for forward
-template <typename Dtype>
-__global__ void ChannelwiseAffineForward(const int n, const int channels,
-    const int dim, const Dtype* in, Dtype* out, const Dtype* slope_data,
-    const Dtype* bias_data, const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out[index] = in[index] * slope_data[c] + bias_data[c];
-  }
-}
-
-// CUDA kernel for bottom backward
-template <typename Dtype>
-__global__ void ChannelwiseAffineBackward(const int n,
-    const int channels, const int dim, const Dtype* in_diff,
-    Dtype* out_diff, const Dtype* slope_data, const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out_diff[index] = slope_data[c] * in_diff[index];
-  }
-}
-
-// CUDA kernel for element-wise parameter backward
-template <typename Dtype>
-__global__ void ChannelwiseAffineParamSlopeBackward(const int n,
-    const int rows, const int rowPitch, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-      out_diff[index] = in_diff[index] * in_data[index];
-      for ( int k = 1; k < rows; k++ ) {
-          out_diff[index] += in_diff[index + k*rowPitch]
-          * in_data[index + k*rowPitch];
-      }
-  }
-}
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->gpu_data();
-  const Dtype* bias_data = this->blobs_[1]->gpu_data();
-  const int div_factor = channel_shared_ ? channels : 1;
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
-  }
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ChannelwiseAffineForward<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(
-      count, channels, dim, bottom_data, top_data,
-      slope_data, bias_data, div_factor);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-void ChannelwiseAffineLayer<Dtype>::Backward_gpu(
-    const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const int count = bottom[0]->count();
-  const int num = bottom[0]->shape(0);
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->shape(1);
-  int height  = 1;
-  int width = 1;
-  if (bottom[0]->num_axes() > 2) {
-    height = bottom[0]->shape(2);
-    width = bottom[0]->shape(3);
-  }
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.gpu_data();
-  }
-  // Propagate to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-  if (this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      caffe_gpu_set(this->blobs_[1]->count(), Dtype(0.0), bias_diff);
-    // Gradient with respect to bias
-      for (int n = 0; n < num; ++n) {
-          caffe_gpu_gemv<Dtype>(
-            CblasNoTrans, channels, height * width, (Dtype)1.,
-            top_diff + top[0]->offset(n), bias_multiplier_.gpu_data(),
-            (Dtype)1., bias_diff);
-      }
-  }
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
-    int cdim = channels * dim;
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      ChannelwiseAffineParamSlopeBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
-          CAFFE_CUDA_NUM_THREADS>>>(
-          cdim, num, top[0]->offset(1), top_diff ,
-          bottom_data,
-          backward_buff_.mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-      if (channel_shared_) {
-        Dtype d = 0;
-        caffe_gpu_dot<Dtype>(cdim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(d), slope_diff);
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, Dtype(1.),
-             backward_buff_.gpu_diff(), multiplier_.gpu_data(), Dtype(1.),
-            slope_diff);
-      }
-  }
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* slope_data = this->blobs_[0]->gpu_data();
-    int div_factor = channel_shared_ ? channels : 1;
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    ChannelwiseAffineBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-        count, channels, dim, top_diff, bottom_diff, slope_data, div_factor);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ChannelwiseAffineLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp
new file mode 100644
index 00000000000..2af65450b51
--- /dev/null
+++ b/src/caffe/layers/scale_layer.cpp
@@ -0,0 +1,219 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/layers/scale_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Creates the learned scale parameter (when the scale is not supplied
+ *        as bottom[1]) and, if bias_term is set, an internal Bias layer whose
+ *        parameter blob is shared with this layer's blobs_.
+ *
+ * @param bottom bottom[0] is the input to scale; an optional bottom[1]
+ *        supplies the scale factors instead of a learned parameter.
+ * @param top passed through to the internal Bias layer's SetUp.
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const ScaleParameter& param = this->layer_param_.scale_param();
+  if (bottom.size() == 1 && this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else if (bottom.size() == 1) {
+    // scale is a learned parameter; initialize it
+    axis_ = bottom[0]->CanonicalAxisIndex(param.axis());
+    const int num_axes = param.num_axes();
+    CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
+                           << "or -1 to extend to the end of bottom[0]";
+    if (num_axes >= 0) {
+      CHECK_GE(bottom[0]->num_axes(), axis_ + num_axes)
+          << "scale blob's shape extends past bottom[0]'s shape when applied "
+          << "starting with bottom[0] axis = " << axis_;
+    }
+    this->blobs_.resize(1);
+    // The scale parameter takes the shape of bottom[0]'s axes
+    // [axis_, axis_ + num_axes), or through the last axis if num_axes == -1.
+    const vector<int>::const_iterator& shape_start =
+        bottom[0]->shape().begin() + axis_;
+    const vector<int>::const_iterator& shape_end =
+        (num_axes == -1) ? bottom[0]->shape().end() : (shape_start + num_axes);
+    vector<int> scale_shape(shape_start, shape_end);
+    this->blobs_[0].reset(new Blob<Dtype>(scale_shape));
+    FillerParameter filler_param(param.filler());
+    if (!param.has_filler()) {
+      // Default to unit (1) filler for identity operation.
+      filler_param.set_type("constant");
+      filler_param.set_value(1);
+    }
+    shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(filler_param));
+    filler->Fill(this->blobs_[0].get());
+  }
+  if (param.bias_term()) {
+    // Build an internal Bias layer configured to mirror this layer's axis
+    // and shape settings.
+    LayerParameter layer_param(this->layer_param_);
+    layer_param.set_type("Bias");
+    BiasParameter* bias_param = layer_param.mutable_bias_param();
+    bias_param->set_axis(param.axis());
+    if (bottom.size() > 1) {
+      bias_param->set_num_axes(bottom[1]->num_axes());
+    } else {
+      bias_param->set_num_axes(param.num_axes());
+    }
+    bias_param->mutable_filler()->CopyFrom(param.bias_filler());
+    bias_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
+    bias_bottom_vec_.resize(1);
+    bias_bottom_vec_[0] = bottom[0];
+    bias_layer_->SetUp(bias_bottom_vec_, top);
+    if (this->blobs_.size() + bottom.size() < 3) {
+      // blobs_ does not hold a bias yet (it contains only the scale, or it is
+      // empty because the scale comes from bottom[1]): append the Bias
+      // layer's newly created parameter.
+      bias_param_id_ = this->blobs_.size();
+      this->blobs_.resize(bias_param_id_ + 1);
+      this->blobs_[bias_param_id_] = bias_layer_->blobs()[0];
+    } else {
+      // blobs_ was already populated on entry and its last entry is the bias.
+      // Share that blob with the Bias layer instead of appending a duplicate,
+      // which would leave blobs_ one entry too long.
+      bias_param_id_ = this->blobs_.size() - 1;
+      bias_layer_->blobs()[0] = this->blobs_[bias_param_id_];
+    }
+    bias_propagate_down_.resize(1, false);
+  }
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+/**
+ * @brief Checks that the scale blob fits bottom[0] starting at the configured
+ *        axis, records the outer/scale/inner extents, and sizes top[0] (or
+ *        temp_ when computing in place) and the reduction scratch blobs.
+ *
+ * @param bottom bottom[0] is the input; bottom[1], if present, is the scale.
+ * @param top top[0] is reshaped like bottom[0] unless it aliases bottom[0].
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const ScaleParameter& param = this->layer_param_.scale_param();
+  Blob<Dtype>* scale = NULL;
+  if (bottom.size() > 1) {
+    scale = bottom[1];
+  } else {
+    scale = this->blobs_[0].get();
+  }
+  const int scale_axes = scale->num_axes();
+  // A zero-axis scale is a true scalar: the result is mathematically the same
+  // for any axis_, so the configured axis is ignored and axis_ is pinned to 0,
+  // which gives outer_dim_ == 1 and the most efficient computation.
+  if (scale_axes == 0) {
+    axis_ = 0;
+  } else {
+    axis_ = bottom[0]->CanonicalAxisIndex(param.axis());
+  }
+  CHECK_GE(bottom[0]->num_axes(), axis_ + scale_axes)
+      << "scale blob's shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis_;
+  for (int i = 0; i < scale_axes; ++i) {
+    const int bottom_axis = axis_ + i;
+    CHECK_EQ(bottom[0]->shape(bottom_axis), scale->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << bottom_axis
+        << ") and scale->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis_);
+  scale_dim_ = scale->count();
+  inner_dim_ = bottom[0]->count(axis_ + scale_axes);
+  const bool in_place = (bottom[0] == top[0]);
+  if (!in_place) {
+    top[0]->ReshapeLike(*bottom[0]);
+  } else {
+    // In-place computation: size the buffer that holds a copy of the input.
+    temp_.ReshapeLike(*bottom[0]);
+  }
+  const vector<int> sum_result_shape(1, outer_dim_ * scale_dim_);
+  sum_result_.Reshape(sum_result_shape);
+  const int sum_mult_size = std::max(outer_dim_, inner_dim_);
+  const vector<int> sum_mult_shape(1, sum_mult_size);
+  sum_multiplier_.Reshape(sum_mult_shape);
+  // Refill the multiplier with ones only when its last entry is not already 1.
+  const Dtype last_entry = sum_multiplier_.cpu_data()[sum_mult_size - 1];
+  if (last_entry != Dtype(1)) {
+    caffe_set(sum_mult_size, Dtype(1), sum_multiplier_.mutable_cpu_data());
+  }
+  if (bias_layer_) {
+    // The Bias layer works on this layer's output, so top[0] is its bottom.
+    bias_bottom_vec_[0] = top[0];
+    bias_layer_->Reshape(bias_bottom_vec_, top);
+  }
+}
+
+/**
+ * @brief CPU forward pass: multiplies each inner_dim_-long slice of bottom[0]
+ *        by its scale factor, then runs the internal Bias layer if present.
+ *
+ * @param bottom bottom[0] is the input; bottom[1], if present, is the scale.
+ * @param top top[0] receives the scaled (and optionally biased) output.
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const bool in_place = (bottom[0] == top[0]);
+  if (in_place) {
+    // top[0] aliases bottom[0], so the input is about to be overwritten.
+    // Keep a copy in temp_ for Backward. This is only needed when Backward
+    // runs, but Caffe gives no way of knowing that at Forward time.
+    caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
+               temp_.mutable_cpu_data());
+  }
+  const Blob<Dtype>* scale_blob = NULL;
+  if (bottom.size() > 1) {
+    scale_blob = bottom[1];
+  } else {
+    scale_blob = this->blobs_[0].get();
+  }
+  const Dtype* scale_data = scale_blob->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  // Walk the data as outer_dim_ x scale_dim_ slices of inner_dim_ elements;
+  // slice (n, d) is multiplied by scale_data[d].
+  int offset = 0;
+  for (int n = 0; n < outer_dim_; ++n) {
+    for (int d = 0; d < scale_dim_; ++d, offset += inner_dim_) {
+      caffe_cpu_scale(inner_dim_, scale_data[d], bottom_data + offset,
+                      top_data + offset);
+    }
+  }
+  if (bias_layer_) {
+    bias_layer_->Forward(bias_bottom_vec_, top);
+  }
+}
+
+/**
+ * @brief CPU backward pass: computes the bias gradient (via the internal Bias
+ *        layer), the scale gradient, and the gradient w.r.t. bottom[0].
+ *
+ * @param top top[0]'s diff holds the incoming gradient.
+ * @param propagate_down propagate_down[0] gates the bottom[0] gradient;
+ *        propagate_down[1] gates the scale gradient when the scale is
+ *        bottom[1].
+ * @param bottom bottom[0] is the input; bottom[1], if present, is the scale.
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // The bias gradient is delegated to the Bias layer, gated on the
+  // propagate-down flag of this layer's last parameter.
+  if (bias_layer_ &&
+      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
+    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
+  }
+  // scale_param is true when the scale is a learned parameter (blobs_[0])
+  // rather than the second bottom.
+  const bool scale_param = (bottom.size() == 1);
+  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
+  if ((!scale_param && propagate_down[1]) ||
+      (scale_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const bool in_place = (bottom[0] == top[0]);
+    // When computing in place, the original input was saved in temp_.
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->cpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scale diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
+    const bool is_eltwise = (bottom[0]->count() == scale->count());
+    Dtype* product = (is_eltwise ? scale->mutable_cpu_diff() :
+        (in_place ? temp_.mutable_cpu_data() : bottom[0]->mutable_cpu_diff()));
+    caffe_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      // Stage 1: reduce the product over the inner dimension, leaving one
+      // value per (outer, scale) pair in sum_result.
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        // Nothing to reduce; the product already has one value per pair.
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        // Single (outer, scale) pair: the scale diff is one dot product.
+        // A learned scale accumulates into its diff; a bottom scale is
+        // overwritten.
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_param) {
+          Dtype result = caffe_cpu_dot(inner_dim_, product, sum_mult);
+          *scale_diff += result;
+        } else {
+          *scale_diff = caffe_cpu_dot(inner_dim_, product, sum_mult);
+        }
+      } else {
+        // With outer_dim_ == 1 the reduction result is the scale diff itself;
+        // otherwise it goes to the sum_result_ scratch blob for stage 2.
+        // NOTE(review): when scale is a learned parameter and outer_dim_ == 1,
+        // this gemv is called with Dtype(0) and targets the scale diff
+        // directly, whereas the other learned-scale branches add to the
+        // existing diff -- confirm that overwriting here is intended.
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            scale->mutable_cpu_diff() : sum_result_.mutable_cpu_data();
+        caffe_cpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      // Stage 2: reduce over the outer dimension into the scale diff.
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_dim_ == 1) {
+          if (scale_param) {
+            Dtype result = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+            *scale_diff += result;
+          } else {
+            *scale_diff = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+          }
+        } else {
+          // Dtype(scale_param) is 1 for a learned scale and 0 for a bottom
+          // scale; it is passed in the argument slot just before the output.
+          caffe_cpu_gemv(CblasTrans, outer_dim_, scale_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
+                         scale_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    // bottom_diff = top_diff * scale, slice by slice as in Forward_cpu. This
+    // runs last because bottom[0]'s diff may have been used as scratch above.
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* scale_data = scale->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    for (int n = 0; n < outer_dim_; ++n) {
+      for (int d = 0; d < scale_dim_; ++d) {
+        const Dtype factor = scale_data[d];
+        caffe_cpu_scale(inner_dim_, factor, top_diff, bottom_diff);
+        bottom_diff += inner_dim_;
+        top_diff += inner_dim_;
+      }
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(ScaleLayer);
+#endif
+
+INSTANTIATE_CLASS(ScaleLayer);
+REGISTER_LAYER_CLASS(Scale);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/scale_layer.cu b/src/caffe/layers/scale_layer.cu
new file mode 100644
index 00000000000..fc9a8064db5
--- /dev/null
+++ b/src/caffe/layers/scale_layer.cu
@@ -0,0 +1,135 @@
+#include <cfloat>
+#include <vector>
+
+#include "caffe/layers/scale_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Kernel computing out[i] = in[i] * scale[(i / inner_dim) % scale_dim]
+ *        for every i in [0, n).
+ */
+template <typename Dtype>
+__global__ void ScaleForward(const int n, const Dtype* in,
+    const Dtype* scale, const int scale_dim, const int inner_dim,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Each run of inner_dim consecutive elements shares one scale entry.
+    const int slice = index / inner_dim;
+    const Dtype factor = scale[slice % scale_dim];
+    out[index] = in[index] * factor;
+  }
+}
+
+/**
+ * @brief Kernel computing out[i] = in[i] * scale[k] + bias[k], where
+ *        k = (i / inner_dim) % scale_dim, for every i in [0, n).
+ */
+template <typename Dtype>
+__global__ void ScaleBiasForward(const int n, const Dtype* in,
+    const Dtype* scale, const Dtype* bias,
+    const int scale_dim, const int inner_dim, Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Scale and bias are looked up with the same per-slice index.
+    const int slice = index / inner_dim;
+    const int k = slice % scale_dim;
+    out[index] = in[index] * scale[k] + bias[k];
+  }
+}
+
+/**
+ * @brief GPU forward pass: one kernel launch scales bottom[0] and, when a
+ *        bias layer is configured, adds the bias in the same kernel.
+ *
+ * @param bottom bottom[0] is the input; bottom[1], if present, is the scale.
+ * @param top top[0] receives the output.
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const bool in_place = (bottom[0] == top[0]);
+  if (in_place) {
+    // top[0] aliases bottom[0], so the input is about to be overwritten.
+    // Keep a copy in temp_ for Backward. This is only needed when Backward
+    // runs, but Caffe gives no way of knowing that at Forward time.
+    caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(),
+               temp_.mutable_gpu_data());
+  }
+  const Blob<Dtype>* scale_blob = NULL;
+  if (bottom.size() > 1) {
+    scale_blob = bottom[1];
+  } else {
+    scale_blob = this->blobs_[0].get();
+  }
+  const Dtype* scale_data = scale_blob->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (!bias_layer_) {
+    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data);
+    return;
+  }
+  // The bias is read straight from this layer's parameter blob and applied
+  // in the same kernel as the scale.
+  const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data();
+  ScaleBiasForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, scale_data, bias_data, scale_dim_, inner_dim_,
+      top_data);
+}
+
+/**
+ * @brief GPU backward pass: computes the bias gradient (via the internal Bias
+ *        layer), the scale gradient, and the gradient w.r.t. bottom[0].
+ *
+ * @param top top[0]'s diff holds the incoming gradient.
+ * @param propagate_down propagate_down[0] gates the bottom[0] gradient;
+ *        propagate_down[1] gates the scale gradient when the scale is
+ *        bottom[1].
+ * @param bottom bottom[0] is the input; bottom[1], if present, is the scale.
+ */
+template <typename Dtype>
+void ScaleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // The bias gradient is delegated to the Bias layer, gated on the
+  // propagate-down flag of this layer's last parameter.
+  if (bias_layer_ &&
+      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
+    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
+  }
+  // scale_param is true when the scale is a learned parameter (blobs_[0])
+  // rather than the second bottom.
+  const bool scale_param = (bottom.size() == 1);
+  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
+  if ((!scale_param && propagate_down[1]) ||
+      (scale_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const bool in_place = (bottom[0] == top[0]);
+    // When computing in place, the original input was saved in temp_.
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scale diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
+    const bool is_eltwise = (bottom[0]->count() == scale->count());
+    Dtype* product = (is_eltwise ? scale->mutable_gpu_diff() :
+        (in_place ? temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff()));
+    caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      // Stage 1: reduce the product over the inner dimension, leaving one
+      // value per (outer, scale) pair in sum_result.
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        // Nothing to reduce; the product already has one value per pair.
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        // Single (outer, scale) pair: the scale diff is one dot product.
+        // The CPU diff pointer is used because caffe_gpu_dot returns its
+        // result through a host pointer (compare the &result call below).
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_param) {
+          Dtype result;
+          caffe_gpu_dot(inner_dim_, product, sum_mult, &result);
+          *scale_diff += result;
+        } else {
+          caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff);
+        }
+      } else {
+        // With outer_dim_ == 1 the reduction result is the scale diff itself;
+        // otherwise it goes to the sum_result_ scratch blob for stage 2.
+        // NOTE(review): when scale is a learned parameter and outer_dim_ == 1,
+        // this gemv is called with Dtype(0) and targets the scale diff
+        // directly, whereas the other learned-scale branches add to the
+        // existing diff -- confirm that overwriting here is intended.
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data();
+        caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      // Stage 2: reduce over the outer dimension into the scale diff.
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        if (scale_dim_ == 1) {
+          Dtype* scale_diff = scale->mutable_cpu_diff();
+          if (scale_param) {
+            Dtype result;
+            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result);
+            *scale_diff += result;
+          } else {
+            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff);
+          }
+        } else {
+          // Dtype(scale_param) is 1 for a learned scale and 0 for a bottom
+          // scale; it is passed in the argument slot just before the output.
+          Dtype* scale_diff = scale->mutable_gpu_diff();
+          caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
+                         scale_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    // bottom_diff = top_diff * scale; the forward kernel is reused with the
+    // diffs in place of the data. This runs last because bottom[0]'s diff may
+    // have been used as scratch above.
+    const int count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* scale_data = scale->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, scale_data, scale_dim_, inner_dim_, bottom_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(ScaleLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index fe6209cf673..6493a72d778 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -306,7 +306,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 142 (last added: channelwise_affine_param)
+// LayerParameter next available layer-specific ID: 143 (last added: scale_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -356,7 +356,7 @@ message LayerParameter {
   optional AccuracyParameter accuracy_param = 102;
   optional ArgMaxParameter argmax_param = 103;
   optional BatchNormParameter batch_norm_param = 139;
-  optional ChannelwiseAffineParameter channelwise_affine_param = 141;
+  optional BiasParameter bias_param = 141;
   optional ConcatParameter concat_param = 104;
   optional ContrastiveLossParameter contrastive_loss_param = 105;
   optional ConvolutionParameter convolution_param = 106;
@@ -385,6 +385,7 @@ message LayerParameter {
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
+  optional ScaleParameter scale_param = 142;
   optional SigmoidParameter sigmoid_param = 124;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
@@ -499,15 +500,36 @@ message BatchNormParameter {
   optional float eps = 3 [default = 1e-5];
 }
 
-message ChannelwiseAffineParameter {
-
-  // Initial value of a_i. Default is a_i=1.0 for all i.
-  optional FillerParameter slope_filler = 1;
-
-  optional FillerParameter bias_filler = 2;
+message BiasParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar bias.
+  optional int32 axis = 1 [default = 1];
 
-  // Whether or not slope paramters are shared across channels.
-  optional bool channel_shared = 3 [default = false];
+  // (num_axes is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the bias
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to add a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.)
+  // The initialization for the learned bias parameter.
+  // Default is the zero (0) initialization, resulting in the BiasLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
 }
 
 message ContrastiveLossParameter {
@@ -972,6 +994,43 @@ message ReshapeParameter {
   optional int32 num_axes = 3 [default = -1];
 }
 
+message ScaleParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar multiplier.
+  optional int32 axis = 1 [default = 1];
+
+  // (num_axes is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the scale
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.)
+  // The initialization for the learned scale parameter.
+  // Default is the unit (1) initialization, resulting in the ScaleLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+
+  // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
+  // may be more efficient).  Initialized with bias_filler (defaults to 0).
+  optional bool bias_term = 4 [default = false];
+  optional FillerParameter bias_filler = 5;
+}
+
 message SigmoidParameter {
   enum Engine {
     DEFAULT = 0;
diff --git a/src/caffe/test/test_bias_layer.cpp b/src/caffe/test/test_bias_layer.cpp
new file mode 100644
index 00000000000..0d23d3f453c
--- /dev/null
+++ b/src/caffe/test/test_bias_layer.cpp
@@ -0,0 +1,461 @@
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+/**
+ * Test fixture for BiasLayer. Owns a 2x3x4x5 input blob, candidate bias blobs
+ * of several shapes, and an output blob; all inputs are filled with uniform
+ * random values in [1, 10] using a fixed seed.
+ */
+template <typename TypeParam>
+class BiasLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  BiasLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_eltwise_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_broadcast_0_(new Blob<Dtype>()),
+        blob_bottom_broadcast_1_(new Blob<Dtype>()),
+        blob_bottom_broadcast_2_(new Blob<Dtype>()),
+        blob_bottom_bias_(new Blob<Dtype>(vector<int>())),
+        blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);
+    // Each broadcast blob matches two consecutive axes of the 2x3x4x5 input:
+    // 2x3 (axes 0-1), 3x4 (axes 1-2) and 4x5 (axes 2-3).
+    vector<int> broadcast_shape(2);
+    broadcast_shape[0] = 2; broadcast_shape[1] = 3;
+    this->blob_bottom_broadcast_0_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 3; broadcast_shape[1] = 4;
+    this->blob_bottom_broadcast_1_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 4; broadcast_shape[1] = 5;
+    this->blob_bottom_broadcast_2_->Reshape(broadcast_shape);
+    FillerParameter filler_param;
+    filler_param.set_min(1);
+    filler_param.set_max(10);
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_bottom_eltwise_);
+    filler.Fill(this->blob_bottom_broadcast_0_);
+    filler.Fill(this->blob_bottom_broadcast_1_);
+    filler.Fill(this->blob_bottom_broadcast_2_);
+    filler.Fill(this->blob_bottom_bias_);
+    // Only the primary input is registered here; each test appends the bias
+    // blob it needs as the second bottom.
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~BiasLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_eltwise_;
+    delete blob_bottom_broadcast_0_;
+    delete blob_bottom_broadcast_1_;
+    delete blob_bottom_broadcast_2_;
+    delete blob_bottom_bias_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;              // 2x3x4x5 input
+  Blob<Dtype>* const blob_bottom_eltwise_;      // 2x3x4x5 bias (same shape)
+  Blob<Dtype>* const blob_bottom_broadcast_0_;  // 2x3 bias
+  Blob<Dtype>* const blob_bottom_broadcast_1_;  // 3x4 bias
+  Blob<Dtype>* const blob_bottom_broadcast_2_;  // 4x5 bias
+  Blob<Dtype>* const blob_bottom_bias_;         // zero-axis (empty shape) bias
+  Blob<Dtype>* const blob_top_;                 // output
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(BiasLayerTest, TestDtypesAndDevices);
+
+// Two-bottom elementwise case: the bias has the same shape as bottom[0], so
+// every output element must equal the sum of the corresponding inputs.
+TYPED_TEST(BiasLayerTest, TestForwardEltwise) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  // The bias covers all four axes of the 2x3x4x5 bottom, so it must be
+  // applied starting at axis 0; the BiasParameter default is axis 1.
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data_a = this->blob_bottom_->cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5);
+  }
+}
+
+// Same as the elementwise forward test, but top aliases bottom[0]; the result
+// is compared against a copy of the input taken before Forward runs.
+TYPED_TEST(BiasLayerTest, TestForwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  // The bias covers all four axes of the 2x3x4x5 bottom, so it must be
+  // applied starting at axis 0; the BiasParameter default is axis 1.
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_bottom_->cpu_data();
+  const int count = this->blob_bottom_->count();
+  const Dtype* in_data_a = orig_bottom.cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5);
+  }
+}
+
+// Checks that Backward yields the same bottom and bias diffs whether the
+// layer runs out of place or in place, for an elementwise (same-shape) bias.
+TYPED_TEST(BiasLayerTest, TestBackwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  // The bias covers all four axes of the 2x3x4x5 bottom, so it must be
+  // applied starting at axis 0; the BiasParameter default is axis 1.
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  // Random gradient that is fed in as the top diff for both runs.
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_bias_diff;
+  orig_bias_diff.CopyFrom(*this->blob_bottom_eltwise_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) {
+    EXPECT_NEAR(orig_bias_diff.cpu_diff()[i],
+                this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5);
+  }
+}
+
+// Single-bottom case: the bias is a learned parameter spanning every axis of
+// bottom[0] (axis 0, num_axes -1), initialized by a gaussian filler.
+TYPED_TEST(BiasLayerTest, TestForwardEltwiseWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BiasParameter* const bias_config = layer_param.mutable_bias_param();
+  bias_config->set_axis(0);
+  bias_config->set_num_axes(-1);
+  bias_config->mutable_filler()->set_type("gaussian");
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* output = this->blob_top_->cpu_data();
+  const int num_elements = this->blob_top_->count();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  // The learned bias is the layer's first (and only) parameter blob.
+  const Dtype* learned_bias = layer->blobs()[0]->cpu_data();
+  for (int idx = 0; idx < num_elements; ++idx) {
+    const Dtype expected = input[idx] + learned_bias[idx];
+    EXPECT_NEAR(output[idx], expected, 1e-5);
+  }
+}
+
+// Broadcast over the leading axes: a 2x3 bias is applied to axes 0-1 of the
+// 2x3x4x5 bottom and tiled across the remaining axes.
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  LayerParameter layer_param;
+  // The 2x3 bias lines up with bottom[0]'s first two axes (the expectation
+  // below indexes it by (n, c)), so it must start at axis 0; the
+  // BiasParameter default is axis 1.
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) +
+                      this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Broadcast over the middle axes: a 3x4 bias is applied starting at axis 1 of
+// the 2x3x4x5 bottom, so the bias entry used for (n, c, h, w) is (c, h).
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>* const input = this->blob_bottom_;
+  const Blob<Dtype>* const bias = this->blob_bottom_broadcast_1_;
+  const Blob<Dtype>* const output = this->blob_top_;
+  for (int n = 0; n < input->num(); ++n) {
+    for (int c = 0; c < input->channels(); ++c) {
+      for (int h = 0; h < input->height(); ++h) {
+        for (int w = 0; w < input->width(); ++w) {
+          EXPECT_NEAR(output->data_at(n, c, h, w),
+                      input->data_at(n, c, h, w) + bias->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// In-place variant of the middle-axes broadcast test: top aliases bottom[0],
+// so results are compared against a copy of the input taken before Forward.
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> saved_input(this->blob_bottom_->shape());
+  saved_input.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>* const result = this->blob_bottom_;
+  const Blob<Dtype>* const bias = this->blob_bottom_broadcast_1_;
+  for (int n = 0; n < result->num(); ++n) {
+    for (int c = 0; c < result->channels(); ++c) {
+      for (int h = 0; h < result->height(); ++h) {
+        for (int w = 0; w < result->width(); ++w) {
+          EXPECT_NEAR(result->data_at(n, c, h, w),
+                      saved_input.data_at(n, c, h, w) +
+                      bias->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Checks that Backward yields the same bottom and bias diffs whether the
+// layer runs out of place or in place, for a 3x4 bias applied at axis 1.
+TYPED_TEST(BiasLayerTest, TestBackwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // NOTE(review): orig_bottom is copied here but not read again in this test.
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  // Random gradient that is fed in as the top diff for both runs.
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_bias_diff;
+  orig_bias_diff.CopyFrom(*this->blob_bottom_broadcast_1_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) {
+    EXPECT_NEAR(orig_bias_diff.cpu_diff()[i],
+                this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(1);
+  param.mutable_bias_param()->set_num_axes(2);
+  param.mutable_bias_param()->mutable_filler()->set_type("gaussian");
+  BiasLayer<Dtype> bias_layer(param);
+  bias_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  bias_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& learned = *bias_layer.blobs()[0];  // layer-owned bias
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) + learned.data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastEnd) {  // bias read at (h, w)
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(2);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  BiasLayer<Dtype> bias_layer(param);
+  bias_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  bias_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& bias = *this->blob_bottom_broadcast_2_;
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) + bias.data_at(h, w, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBias) {
+  // Every output must equal its input plus the first value of bottom[1].
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter param;
+  BiasLayer<Dtype> bias_layer(param);
+  bias_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  bias_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype offset = *this->blob_bottom_bias_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] + offset, 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBiasAxis2) {
+  // Same expectation as the default-axis case, but with axis set to 2.
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(2);
+  BiasLayer<Dtype> bias_layer(param);
+  bias_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  bias_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype offset = *this->blob_bottom_bias_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] + offset, 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientEltwise) {
+  // Element-wise gradient check with blob_bottom_eltwise_ as the bias input.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientEltwise(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientEltwiseWithParam) {
+  // Single bottom, so the bias comes from the layer's own blob (axis 0,
+  // num_axes -1, gaussian filler); gradients are checked exhaustively.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(0);
+  param.mutable_bias_param()->set_num_axes(-1);
+  param.mutable_bias_param()->mutable_filler()->set_type("gaussian");
+  BiasLayer<Dtype> bias_layer(param);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastBegin) {
+  // Exhaustive gradient check with blob_bottom_broadcast_0_, default axis.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddle) {
+  // Exhaustive gradient check with blob_bottom_broadcast_1_ at axis 1.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(1);
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddleWithParam) {
+  // Exhaustive gradient check of the learned-bias path (axis 1, 2 axes).
+  // Only one bottom is supplied: a second bottom would be used as the bias
+  // instead, and the parameter configured below would never be exercised.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  layer_param.mutable_bias_param()->set_num_axes(2);
+  layer_param.mutable_bias_param()->mutable_filler()->set_type("gaussian");
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastEnd) {
+  // Exhaustive gradient check with blob_bottom_broadcast_2_ at axis 2.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(2);
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBias) {
+  // Exhaustive gradient check with blob_bottom_bias_ as the second bottom.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBiasAxis2) {
+  // Exhaustive gradient check with blob_bottom_bias_ and axis set to 2.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_bias_param()->set_axis(2);
+  BiasLayer<Dtype> bias_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &bias_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_channelwise_affine_layer.cpp b/src/caffe/test/test_channelwise_affine_layer.cpp
deleted file mode 100644
index a3e2544f77a..00000000000
--- a/src/caffe/test/test_channelwise_affine_layer.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-#include <vector>
-
-#include "gtest/gtest.h"
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/filler.hpp"
-#include "caffe/layers/channelwise_affine_layer.hpp"
-
-#include "caffe/test/test_caffe_main.hpp"
-#include "caffe/test/test_gradient_check_util.hpp"
-
-namespace caffe {
-
-template <typename TypeParam>
-class ChannelwiseAffineLayerTest : public MultiDeviceTest<TypeParam> {
-  typedef typename TypeParam::Dtype Dtype;
-
- protected:
-  ChannelwiseAffineLayerTest()
-      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
-        blob_top_(new Blob<Dtype>()) {
-    Caffe::set_random_seed(1701);
-    // fill the values
-    FillerParameter filler_param;
-    GaussianFiller<Dtype> filler(filler_param);
-    filler.Fill(this->blob_bottom_);
-    blob_bottom_vec_.push_back(blob_bottom_);
-    blob_top_vec_.push_back(blob_top_);
-  }
-  virtual ~ChannelwiseAffineLayerTest() {
-      delete blob_bottom_; delete blob_top_; }
-  Blob<Dtype>* const blob_bottom_;
-  Blob<Dtype>* const blob_top_;
-  vector<Blob<Dtype>*> blob_bottom_vec_;
-  vector<Blob<Dtype>*> blob_top_vec_;
-
-  void TestChannelwiseAffine(ChannelwiseAffineLayer<Dtype> *layer) {
-    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-    // Now, check values
-    const Dtype* bottom_data = this->blob_bottom_->cpu_data();
-    const Dtype* top_data = this->blob_top_->cpu_data();
-    const Dtype* slope_data = layer->blobs()[0]->cpu_data();
-    const Dtype* bias_data = layer->blobs()[1]->cpu_data();
-    const Dtype kDelta = 2e-5;
-    int hw = this->blob_bottom_->height() * this->blob_bottom_->width();
-    int channels = this->blob_bottom_->channels();
-    bool channel_shared =
-        layer->layer_param().channelwise_affine_param().channel_shared();
-    for (int i = 0; i < this->blob_bottom_->count(); ++i) {
-          int c = channel_shared ? 0 : (i / hw) % channels;
-          EXPECT_NEAR(top_data[i],
-                       bottom_data[i]* slope_data[c] + bias_data[c], kDelta);
-        }
-  }
-};
-TYPED_TEST_CASE(ChannelwiseAffineLayerTest, TestDtypesAndDevices);
-
-
-TYPED_TEST(ChannelwiseAffineLayerTest, TestChannelwiseAffineForward) {
-    typedef typename TypeParam::Dtype Dtype;
-    LayerParameter layer_param;
-    ChannelwiseAffineLayer<Dtype> layer(layer_param);
-    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    FillerParameter filler_param;
-    GaussianFiller<Dtype> filler(filler_param);
-    filler.Fill(layer.blobs()[0].get());
-    filler.Fill(layer.blobs()[1].get());
-    this->TestChannelwiseAffine(&layer);
-}
-
-TYPED_TEST(ChannelwiseAffineLayerTest,
-           TestChannelwiseAffineForwardChannelShared) {
-    typedef typename TypeParam::Dtype Dtype;
-    LayerParameter layer_param;
-    layer_param.mutable_channelwise_affine_param()->set_channel_shared(true);
-    ChannelwiseAffineLayer<Dtype> layer(layer_param);
-    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    this->TestChannelwiseAffine(&layer);
-}
-
-TYPED_TEST(ChannelwiseAffineLayerTest, TestChannelwiseAffineGradient) {
-    typedef typename TypeParam::Dtype Dtype;
-    LayerParameter layer_param;
-    layer_param.mutable_channelwise_affine_param()->set_channel_shared(false);
-    ChannelwiseAffineLayer<Dtype> layer(layer_param);
-    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
-    checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-                                    this->blob_top_vec_);
-}
-
-TYPED_TEST(ChannelwiseAffineLayerTest,
-           TestChannelwiseAffineGradientChannelShared) {
-    typedef typename TypeParam::Dtype Dtype;
-    LayerParameter layer_param;
-    layer_param.mutable_channelwise_affine_param()->set_channel_shared(true);
-    ChannelwiseAffineLayer<Dtype> layer(layer_param);
-    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
-    checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-                                    this->blob_top_vec_);
-}
-
-}  // namespace caffe
diff --git a/src/caffe/test/test_scale_layer.cpp b/src/caffe/test/test_scale_layer.cpp
new file mode 100644
index 00000000000..3669c2e1a4f
--- /dev/null
+++ b/src/caffe/test/test_scale_layer.cpp
@@ -0,0 +1,501 @@
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/scale_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class ScaleLayerTest : public MultiDeviceTest<TypeParam> {  // test fixture
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  ScaleLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_eltwise_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_broadcast_0_(new Blob<Dtype>()),
+        blob_bottom_broadcast_1_(new Blob<Dtype>()),
+        blob_bottom_broadcast_2_(new Blob<Dtype>()),
+        blob_bottom_scale_(new Blob<Dtype>(vector<int>())),  // empty shape
+        blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);  // seed fixed before any Fill below
+    vector<int> broadcast_shape(2);
+    broadcast_shape[0] = 2; broadcast_shape[1] = 3;  // shape (2, 3)
+    this->blob_bottom_broadcast_0_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 3; broadcast_shape[1] = 4;  // shape (3, 4)
+    this->blob_bottom_broadcast_1_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 4; broadcast_shape[1] = 5;  // shape (4, 5)
+    this->blob_bottom_broadcast_2_->Reshape(broadcast_shape);
+    FillerParameter filler_param;
+    filler_param.set_min(1);
+    filler_param.set_max(10);
+    UniformFiller<Dtype> filler(filler_param);  // min 1, max 10
+    filler.Fill(this->blob_bottom_);  // every candidate bottom is filled
+    filler.Fill(this->blob_bottom_eltwise_);
+    filler.Fill(this->blob_bottom_broadcast_0_);
+    filler.Fill(this->blob_bottom_broadcast_1_);
+    filler.Fill(this->blob_bottom_broadcast_2_);
+    filler.Fill(this->blob_bottom_scale_);
+    blob_bottom_vec_.push_back(blob_bottom_);  // tests may append bottom[1]
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~ScaleLayerTest() {  // deletes the blobs allocated above
+    delete blob_bottom_;
+    delete blob_bottom_eltwise_;
+    delete blob_bottom_broadcast_0_;
+    delete blob_bottom_broadcast_1_;
+    delete blob_bottom_broadcast_2_;
+    delete blob_bottom_scale_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;  // always bottom[0]
+  Blob<Dtype>* const blob_bottom_eltwise_;  // same 2x3x4x5 size as bottom[0]
+  Blob<Dtype>* const blob_bottom_broadcast_0_;  // reshaped to (2, 3)
+  Blob<Dtype>* const blob_bottom_broadcast_1_;  // reshaped to (3, 4)
+  Blob<Dtype>* const blob_bottom_broadcast_2_;  // reshaped to (4, 5)
+  Blob<Dtype>* const blob_bottom_scale_;  // built from an empty shape vector
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;  // bottoms handed to the layer
+  vector<Blob<Dtype>*> blob_top_vec_;  // tops handed to the layer
+};
+
+TYPED_TEST_CASE(ScaleLayerTest, TestDtypesAndDevices);  // typed-test list
+
+TYPED_TEST(ScaleLayerTest, TestForwardEltwise) {
+  // Output must be the element-by-element product of the two bottoms.
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter param;
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype* factor = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] * factor[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Snapshot the input first: Forward overwrites it (top aliases bottom[0]).
+  Blob<Dtype> saved(this->blob_bottom_->shape());
+  saved.CopyFrom(*this->blob_bottom_);
+  this->blob_top_vec_[0] = this->blob_bottom_;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter param;
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_bottom_->cpu_data();
+  const Dtype* input = saved.cpu_data();
+  const Dtype* factor = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] * factor[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestBackwardEltwiseInPlace) {
+  // Backward must yield the same bottom diffs whether the preceding Forward
+  // ran out-of-place or in-place (top aliased to bottom[0]).
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scale_diff;
+  orig_scale_diff.CopyFrom(*this->blob_bottom_eltwise_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) {
+    EXPECT_NEAR(orig_scale_diff.cpu_diff()[i],
+                this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardEltwiseWithParam) {
+  // Single bottom: the factor is the layer's own blob (axis 0, num_axes -1,
+  // gaussian filler), compared element-by-element against the input.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(0);
+  param.mutable_scale_param()->set_num_axes(-1);
+  param.mutable_scale_param()->mutable_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype* factor = scale_layer.blobs()[0]->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] * factor[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastBegin) {  // scale at (n, c)
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& scale = *this->blob_bottom_broadcast_0_;
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) * scale.data_at(n, c, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddle) {  // scale at (c, h)
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(1);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& scale = *this->blob_bottom_broadcast_1_;
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) * scale.data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Snapshot the input first: Forward overwrites it (top aliases bottom[0]).
+  Blob<Dtype> saved(this->blob_bottom_->shape());
+  saved.CopyFrom(*this->blob_bottom_);
+  this->blob_top_vec_[0] = this->blob_bottom_;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(1);
+  shared_ptr<ScaleLayer<Dtype> > scale_layer(new ScaleLayer<Dtype>(param));
+  scale_layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  scale_layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& scale = *this->blob_bottom_broadcast_1_;
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w),
+              saved.data_at(n, c, h, w) * scale.data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestBackwardBroadcastMiddleInPlace) {
+  // Backward must yield the same bottom diffs whether the preceding Forward
+  // ran out-of-place or in-place (top aliased to bottom[0]).
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(1);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scale_diff;
+  orig_scale_diff.CopyFrom(*this->blob_bottom_broadcast_1_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) {
+    EXPECT_NEAR(orig_scale_diff.cpu_diff()[i],
+                this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(1);
+  param.mutable_scale_param()->set_num_axes(2);
+  param.mutable_scale_param()->mutable_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& learned = *scale_layer.blobs()[0];  // layer-owned scale
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) * learned.data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParamAndBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(1);
+  param.mutable_scale_param()->set_num_axes(2);
+  param.mutable_scale_param()->mutable_filler()->set_type("gaussian");
+  param.mutable_scale_param()->set_bias_term(true);
+  param.mutable_scale_param()->mutable_bias_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& scale = *scale_layer.blobs()[0];  // multiplied in
+  const Blob<Dtype>& shift = *scale_layer.blobs()[1];  // then added
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              this->blob_bottom_->data_at(n, c, h, w) *
+              scale.data_at(c, h, 0, 0) + shift.data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastEnd) {  // scale at (h, w)
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(2);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Blob<Dtype>& in = *this->blob_bottom_;
+  const Blob<Dtype>& scale = *this->blob_bottom_broadcast_2_;
+  for (int n = 0; n < in.num(); ++n) {
+    for (int c = 0; c < in.channels(); ++c) {
+      for (int h = 0; h < in.height(); ++h) {
+        for (int w = 0; w < in.width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+              in.data_at(n, c, h, w) * scale.data_at(h, w, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardScale) {
+  // Every output must equal its input times the first value of bottom[1].
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter param;
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype factor = *this->blob_bottom_scale_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] * factor, 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestForwardScaleAxis2) {
+  // Same expectation as the default-axis case, but with axis set to 2.
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(2);
+  ScaleLayer<Dtype> scale_layer(param);
+  scale_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  scale_layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* actual = this->blob_top_->cpu_data();
+  const Dtype* input = this->blob_bottom_->cpu_data();
+  const Dtype factor = *this->blob_bottom_scale_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(actual[i], input[i] * factor, 1e-5);
+  }
+}
+
+TYPED_TEST(ScaleLayerTest, TestGradientEltwise) {
+  // Element-wise gradient check with blob_bottom_eltwise_ as the scale.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  ScaleLayer<Dtype> scale_layer(param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientEltwise(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(ScaleLayerTest, TestGradientEltwiseWithParam) {
+  // Single bottom with a layer-owned scale blob (axis 0, num_axes -1,
+  // gaussian filler); gradients are checked exhaustively.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(0);
+  param.mutable_scale_param()->set_num_axes(-1);
+  param.mutable_scale_param()->mutable_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_broadcast_0_ appended to the
+// bottoms and default layer parameters.
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_broadcast_1_ appended to the
+// bottoms and the scale axis set to 1.
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(1);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with axis 1, num_axes 2 and a "gaussian" filler,
+// with blob_bottom_broadcast_1_ appended to the bottoms.
+// NOTE(review): unlike TestGradientEltwiseWithParam, this test appends a
+// bottom blob while also configuring a filler. Confirm whether the layer
+// creates a learned scale in this setup, or whether the appended bottom
+// takes its place and the "WithParam" path goes unexercised.
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  ScaleParameter& scale_cfg = *param.mutable_scale_param();
+  scale_cfg.set_axis(1);
+  scale_cfg.set_num_axes(2);
+  scale_cfg.mutable_filler()->set_type("gaussian");
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_broadcast_2_ appended to the
+// bottoms and the scale axis set to 2.
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastEnd) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(2);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_scale_ appended to the bottoms
+// and default layer parameters.
+TYPED_TEST(ScaleLayerTest, TestGradientScale) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_scale_ appended to the bottoms,
+// the bias term enabled, and a "gaussian" bias filler.
+TYPED_TEST(ScaleLayerTest, TestGradientScaleAndBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  ScaleParameter& scale_cfg = *param.mutable_scale_param();
+  scale_cfg.set_bias_term(true);
+  scale_cfg.mutable_bias_filler()->set_type("gaussian");
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Exhaustive gradient check with blob_bottom_scale_ appended to the bottoms
+// and the scale axis set to 2.
+TYPED_TEST(ScaleLayerTest, TestGradientScaleAxis2) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.mutable_scale_param()->set_axis(2);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  ScaleLayer<Dtype> scale_layer(param);
+  GradientChecker<Dtype> grad_checker(1e-2, 1e-3);
+  grad_checker.CheckGradientExhaustive(
+      &scale_layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+}  // namespace caffe