Separation and generalization of ChannelwiseAffineLayer into BiasLayer
and ScaleLayer.  The behavior of ChannelwiseAffineLayer can be
reproduced by a ScaleLayer with `scale_param { bias_term: true }`.

BiasLayer and ScaleLayer each take 1 or 2 bottoms, with the output having
the same shape as the first.  The second input -- either another bottom or a
learned parameter -- will have its axes (virtually) broadcast and tiled to have
the same shape as the first, after which elementwise addition (Bias) or
multiplication (Scale) is performed.
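
As an illustration of the new usage (a sketch, not part of this diff; the layer and blob names and the constant fillers are placeholders), the old per-channel affine behavior corresponds to a Scale layer along these lines:

layer {
  name: "affine"              # placeholder name
  type: "Scale"
  bottom: "conv1"             # placeholder input, e.g. N x C x H x W
  top: "conv1_affine"
  scale_param {
    axis: 1                   # apply per channel
    num_axes: 1               # learned scale has shape (C)
    filler { value: 1 }       # multiplicative term initialized to 1
    bias_term: true           # also learn an additive term
    bias_filler { value: 0 }  # additive term initialized to 0
  }
}

With a single bottom and `bias_term: true`, the additive term is handled by an internal BiasLayer (see the `bias_layer_` member in scale_layer.hpp below).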
jeffdonahue committed Jan 23, 2016
1 parent ec04197 commit 8a67137
Showing 13 changed files with 1,702 additions and 551 deletions.
54 changes: 54 additions & 0 deletions include/caffe/layers/bias_layer.hpp
@@ -0,0 +1,54 @@
#ifndef CAFFE_BIAS_LAYER_HPP_
#define CAFFE_BIAS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
* @brief Computes a sum of two input Blobs, with the shape of the
* latter Blob "broadcast" to match the shape of the former.
* Equivalent to tiling the latter Blob, then computing the elementwise
* sum.
*
* The second input may be omitted, in which case it's learned as a parameter
* of the layer.
*/
template <typename Dtype>
class BiasLayer : public Layer<Dtype> {
public:
explicit BiasLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "Bias"; }
virtual inline int MinBottomBlobs() const { return 1; }
virtual inline int MaxBottomBlobs() const { return 2; }
virtual inline int ExactNumTopBlobs() const { return 1; }

virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

private:
Blob<Dtype> bias_multiplier_;
int outer_dim_, bias_dim_, inner_dim_, dim_;
};



} // namespace caffe

#endif // CAFFE_BIAS_LAYER_HPP_
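
A usage sketch for the two-bottom case (the blob and layer names here are hypothetical, not from this commit): the second bottom is broadcast against the first starting at `bias_param.axis` and added elementwise.

layer {
  name: "add_offset"          # hypothetical name
  type: "Bias"
  bottom: "data"              # e.g. shape N x C x H x W
  bottom: "channel_offset"    # e.g. shape (C); broadcast over N, H, W
  top: "data_plus_offset"
  bias_param { axis: 1 }      # align "channel_offset" with axis 1 (C) of "data"
}

With only one bottom, the offset instead becomes a learned parameter of the layer, initialized by `bias_param.filler` (see `LayerSetUp` in src/caffe/layers/bias_layer.cpp below).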
103 changes: 0 additions & 103 deletions include/caffe/layers/channelwise_affine_layer.hpp

This file was deleted.

83 changes: 83 additions & 0 deletions include/caffe/layers/scale_layer.hpp
@@ -0,0 +1,83 @@
#ifndef CAFFE_SCALE_LAYER_HPP_
#define CAFFE_SCALE_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/bias_layer.hpp"

namespace caffe {

/**
* @brief Computes a product of two input Blobs, with the shape of the
* latter Blob "broadcast" to match the shape of the former.
* Equivalent to tiling the latter Blob, then computing the elementwise
* product.
*
* The second input may be omitted, in which case it's learned as a parameter
* of the layer.
*/
template <typename Dtype>
class ScaleLayer: public Layer<Dtype> {
public:
explicit ScaleLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "Scale"; }
// Scale
virtual inline int MinBottomBlobs() const { return 1; }
virtual inline int MaxBottomBlobs() const { return 2; }
virtual inline int ExactNumTopBlobs() const { return 1; }

protected:
/**
* In the below shape specifications, @f$ i @f$ denotes the value of the
* `axis` field given by `this->layer_param_.scale_param().axis()`, after
* canonicalization (i.e., conversion from negative to positive index,
* if applicable).
*
* @param bottom input Blob vector (length 2)
* -# @f$ (d_0 \times ... \times
* d_i \times ... \times d_j \times ... \times d_n) @f$
* the first factor @f$ x @f$
* -# @f$ (d_i \times ... \times d_j) @f$
* the second factor @f$ y @f$
* @param top output Blob vector (length 1)
* -# @f$ (d_0 \times ... \times
* d_i \times ... \times d_j \times ... \times d_n) @f$
* the product @f$ z = x y @f$ computed after "broadcasting" y.
* Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$,
* then computing the elementwise product.
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

shared_ptr<Layer<Dtype> > bias_layer_;
vector<Blob<Dtype>*> bias_bottom_vec_;
vector<bool> bias_propagate_down_;
int bias_param_id_;

Blob<Dtype> sum_multiplier_;
Blob<Dtype> sum_result_;
Blob<Dtype> temp_;
int axis_;
int outer_dim_, scale_dim_, inner_dim_;
};


} // namespace caffe

#endif // CAFFE_SCALE_LAYER_HPP_
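
To make the shape specification above concrete, here is a hypothetical two-bottom configuration (names and shapes are illustrative only, not part of this commit):

layer {
  name: "broadcast_mul"       # hypothetical name
  type: "Scale"
  bottom: "features"          # x, e.g. shape 32 x 64 x 14 x 14
  bottom: "gate"              # y, e.g. shape 64 x 14 (matches axes 1-2 of "features")
  top: "gated_features"       # z = x * y after broadcasting y
  scale_param { axis: 1 }     # y aligns with "features" starting at axis 1
}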
121 changes: 121 additions & 0 deletions src/caffe/layers/bias_layer.cpp
@@ -0,0 +1,121 @@
#include <vector>

#include "caffe/filler.hpp"
#include "caffe/layers/bias_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void BiasLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
if (bottom.size() == 1 && this->blobs_.size() > 0) {
LOG(INFO) << "Skipping parameter initialization";
} else if (bottom.size() == 1) {
// bias is a learned parameter; initialize it
const BiasParameter& param = this->layer_param_.bias_param();
const int axis = bottom[0]->CanonicalAxisIndex(param.axis());
const int num_axes = param.num_axes();
CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
<< "or -1 to extend to the end of bottom[0]";
if (num_axes >= 0) {
CHECK_GE(bottom[0]->num_axes(), axis + num_axes)
<< "bias blob's shape extends past bottom[0]'s shape when applied "
<< "starting with bottom[0] axis = " << axis;
}
this->blobs_.resize(1);
const vector<int>::const_iterator& shape_start =
bottom[0]->shape().begin() + axis;
const vector<int>::const_iterator& shape_end =
(num_axes == -1) ? bottom[0]->shape().end() : (shape_start + num_axes);
vector<int> bias_shape(shape_start, shape_end);
this->blobs_[0].reset(new Blob<Dtype>(bias_shape));
shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(param.filler()));
filler->Fill(this->blobs_[0].get());
}
this->param_propagate_down_.resize(this->blobs_.size(), true);
}

template <typename Dtype>
void BiasLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const BiasParameter& param = this->layer_param_.bias_param();
Blob<Dtype>* bias = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
// Always set axis == 0 in special case where bias is a scalar
// (num_axes == 0). Mathematically equivalent for any choice of axis, so the
// actual setting can be safely ignored; and computation is most efficient
// with axis == 0 and (therefore) outer_dim_ == 1.
const int axis = (bias->num_axes() == 0) ?
0 : bottom[0]->CanonicalAxisIndex(param.axis());
CHECK_GE(bottom[0]->num_axes(), axis + bias->num_axes())
<< "bias blob's shape extends past bottom[0]'s shape when applied "
<< "starting with bottom[0] axis = " << axis;
for (int i = 0; i < bias->num_axes(); ++i) {
CHECK_EQ(bottom[0]->shape(axis + i), bias->shape(i))
<< "dimension mismatch between bottom[0]->shape(" << axis + i
<< ") and bias->shape(" << i << ")";
}
outer_dim_ = bottom[0]->count(0, axis);
bias_dim_ = bias->count();
inner_dim_ = bottom[0]->count(axis + bias->num_axes());
dim_ = bias_dim_ * inner_dim_;
if (bottom[0] != top[0]) {
top[0]->ReshapeLike(*bottom[0]);
}
bias_multiplier_.Reshape(vector<int>(1, inner_dim_));
if (bias_multiplier_.cpu_data()[inner_dim_ - 1] != Dtype(1)) {
caffe_set(inner_dim_, Dtype(1), bias_multiplier_.mutable_cpu_data());
}
}

template <typename Dtype>
void BiasLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bias_data =
((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
if (bottom[0] != top[0]) {
const Dtype* bottom_data = bottom[0]->cpu_data();
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
for (int n = 0; n < outer_dim_; ++n) {
caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, bias_dim_,
inner_dim_, 1, Dtype(1), bias_data,
bias_multiplier_.cpu_data(), Dtype(1), top_data);
top_data += dim_;
}
}

template <typename Dtype>
void BiasLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
if (propagate_down[0] && bottom[0] != top[0]) {
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
caffe_copy(bottom[0]->count(), top_diff, bottom_diff);
}
// in-place, we don't need to do anything with the data diff
const bool bias_param = (bottom.size() == 1);
if ((!bias_param && propagate_down[1]) ||
(bias_param && this->param_propagate_down_[0])) {
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bias_diff = (bias_param ? this->blobs_[0].get() : bottom[1])
->mutable_cpu_diff();
bool accum = bias_param;
for (int n = 0; n < outer_dim_; ++n) {
caffe_cpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1),
top_diff, bias_multiplier_.cpu_data(), Dtype(accum), bias_diff);
top_diff += dim_;
accum = true;
}
}
}

#ifdef CPU_ONLY
STUB_GPU(BiasLayer);
#endif

INSTANTIATE_CLASS(BiasLayer);
REGISTER_LAYER_CLASS(Bias);

} // namespace caffe
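
To make the bookkeeping in `Reshape` above concrete, here is a hypothetical configuration (names, shapes, and filler values are illustrative, not from this commit) together with the counts it produces:

layer {
  name: "bias_example"        # hypothetical layer
  type: "Bias"
  bottom: "feat"              # assume shape 2 x 3 x 4 x 5
  top: "feat_biased"
  bias_param {
    axis: 1
    num_axes: 2               # learned bias gets shape 3 x 4
    filler { value: 0 }
  }
}
# For a 2 x 3 x 4 x 5 bottom and a 3 x 4 bias:
#   outer_dim_ = bottom[0]->count(0, axis)                  = 2
#   bias_dim_  = bias->count()                              = 3 * 4 = 12
#   inner_dim_ = bottom[0]->count(axis + bias->num_axes())  = 5
#   dim_       = bias_dim_ * inner_dim_                     = 60
# Forward_cpu then performs outer_dim_ = 2 rank-1 GEMM updates, each adding
# the 12-element bias across the 5-element inner dimension of the top blob.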