diff --git a/include/caffe/layers/scalar_layer.hpp b/include/caffe/layers/scalar_layer.hpp
index 59882e4d5f6..f679622dde4 100644
--- a/include/caffe/layers/scalar_layer.hpp
+++ b/include/caffe/layers/scalar_layer.hpp
@@ -65,6 +65,7 @@ class ScalarLayer: public Layer<Dtype> {
 
   Blob<Dtype> sum_multiplier_;
   Blob<Dtype> sum_result_;
+  Blob<Dtype> temp_;
   int axis_;
   int outer_dim_, scalar_dim_, inner_dim_;
 };
diff --git a/src/caffe/layers/scalar_layer.cpp b/src/caffe/layers/scalar_layer.cpp
index 67988fce6df..0fa489ae976 100644
--- a/src/caffe/layers/scalar_layer.cpp
+++ b/src/caffe/layers/scalar_layer.cpp
@@ -44,12 +44,6 @@ void ScalarLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ScalarLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  // TODO: make ScalarLayer usable in-place.
-  // Currently, in-place computation is broken during Backward with
-  // propagate_down[0] && propagate_down[1], as bottom[0]'s diff is used for
-  // temporary storage of an intermediate result, overwriting top[0]'s diff
-  // if using in-place computation.
-  CHECK_NE(bottom[0], top[0]) << "ScalarLayer cannot be used in-place";
   const ScalarParameter& param = this->layer_param_.scalar_param();
   Blob<Dtype>* scalar = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
   // Always set axis_ == 0 in special case where scalar is an actual scalar
@@ -71,7 +65,11 @@ void ScalarLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   outer_dim_ = bottom[0]->count(0, axis_);
   scalar_dim_ = scalar->count();
   inner_dim_ = bottom[0]->count(axis_ + scalar->num_axes());
-  top[0]->ReshapeLike(*bottom[0]);
+  if (bottom[0] == top[0]) {  // in-place computation
+    temp_.ReshapeLike(*bottom[0]);
+  } else {
+    top[0]->ReshapeLike(*bottom[0]);
+  }
   sum_result_.Reshape(vector<int>(1, outer_dim_ * scalar_dim_));
   const int sum_mult_size = std::max(outer_dim_, inner_dim_);
   sum_multiplier_.Reshape(vector<int>(1, sum_mult_size));
@@ -84,6 +82,14 @@ template <typename Dtype>
 void ScalarLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->cpu_data();
+  if (bottom[0] == top[0]) {
+    // In-place computation; need to store bottom data before overwriting it.
+    // Note that this is only necessary for Backward; we could skip this if not
+    // doing Backward, but Caffe currently provides no way of knowing whether
+    // we'll need to do Backward at the time of the Forward call.
+    caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
+               temp_.mutable_cpu_data());
+  }
   const Dtype* scalar_data =
       ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
@@ -105,12 +111,16 @@ void ScalarLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   if ((!scalar_param && propagate_down[1]) ||
       (scalar_param && this->param_propagate_down_[0])) {
     const Dtype* top_diff = top[0]->cpu_diff();
-    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const bool in_place = (bottom[0] == top[0]);
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->cpu_data();
     // Hack: store big eltwise product in bottom[0] diff, except in the special
     // case where this layer itself does the eltwise product, in which case we
     // can store it directly in the scalar diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
     const bool is_eltwise = (bottom[0]->count() == scalar->count());
-    Dtype* product = (is_eltwise ? scalar : bottom[0])->mutable_cpu_diff();
+    Dtype* product = (is_eltwise ? scalar->mutable_cpu_diff() :
+        (in_place ? temp_.mutable_cpu_data() : bottom[0]->mutable_cpu_diff()));
     caffe_mul(top[0]->count(), top_diff, bottom_data, product);
     if (!is_eltwise) {
       Dtype* sum_result = NULL;
diff --git a/src/caffe/layers/scalar_layer.cu b/src/caffe/layers/scalar_layer.cu
index b1af488d769..9c6932723af 100644
--- a/src/caffe/layers/scalar_layer.cu
+++ b/src/caffe/layers/scalar_layer.cu
@@ -21,6 +21,14 @@ void ScalarLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   const int count = top[0]->count();
   const Dtype* bottom_data = bottom[0]->gpu_data();
+  if (bottom[0] == top[0]) {
+    // In-place computation; need to store bottom data before overwriting it.
+    // Note that this is only necessary for Backward; we could skip this if not
+    // doing Backward, but Caffe currently provides no way of knowing whether
+    // we'll need to do Backward at the time of the Forward call.
+    caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(),
+               temp_.mutable_gpu_data());
+  }
   const Dtype* scalar_data =
       ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
@@ -37,12 +45,16 @@ void ScalarLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if ((!scalar_param && propagate_down[1]) ||
       (scalar_param && this->param_propagate_down_[0])) {
     const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const bool in_place = (bottom[0] == top[0]);
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data();
     // Hack: store big eltwise product in bottom[0] diff, except in the special
     // case where this layer itself does the eltwise product, in which case we
     // can store it directly in the scalar diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
     const bool is_eltwise = (bottom[0]->count() == scalar->count());
-    Dtype* product = (is_eltwise ? scalar : bottom[0])->mutable_gpu_diff();
+    Dtype* product = (is_eltwise ? scalar->mutable_gpu_diff() :
+        (in_place ? temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff()));
     caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
     if (!is_eltwise) {
       Dtype* sum_result = NULL;
diff --git a/src/caffe/test/test_scalar_layer.cpp b/src/caffe/test/test_scalar_layer.cpp
index caba89a0d81..399d54a395e 100644
--- a/src/caffe/test/test_scalar_layer.cpp
+++ b/src/caffe/test/test_scalar_layer.cpp
@@ -86,6 +86,70 @@ TYPED_TEST(ScalarLayerTest, TestForwardEltwise) {
   }
 }
 
+TYPED_TEST(ScalarLayerTest, TestForwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Keep a pristine copy of the input; the in-place pass overwrites it.
+  Blob<Dtype> input_copy(this->blob_bottom_->shape());
+  input_copy.CopyFrom(*this->blob_bottom_);
+  this->blob_top_vec_[0] = this->blob_bottom_;  // top aliases bottom
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  shared_ptr<ScalarLayer<Dtype> > layer(new ScalarLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* result = this->blob_bottom_->cpu_data();
+  const Dtype* lhs = input_copy.cpu_data();
+  const Dtype* rhs = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(result[i], lhs[i] * rhs[i], 1e-5);
+  }
+}
+
+TYPED_TEST(ScalarLayerTest, TestBackwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Verifies that in-place Backward produces the same bottom and scalar diffs
+  // as Backward with a separate top blob, given an identical top diff.
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  shared_ptr<ScalarLayer<Dtype> > layer(new ScalarLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scalar_diff;
+  orig_scalar_diff.CopyFrom(*this->blob_bottom_eltwise_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) {
+    EXPECT_NEAR(orig_scalar_diff.cpu_diff()[i],
+                this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5);
+  }
+}
+
 TYPED_TEST(ScalarLayerTest, TestForwardEltwiseWithParam) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -151,6 +215,77 @@ TYPED_TEST(ScalarLayerTest, TestForwardBroadcastMiddle) {
   }
 }
 
+TYPED_TEST(ScalarLayerTest, TestForwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Snapshot the input first: the in-place pass overwrites blob_bottom_.
+  Blob<Dtype> input_copy(this->blob_bottom_->shape());
+  input_copy.CopyFrom(*this->blob_bottom_);
+  this->blob_top_vec_[0] = this->blob_bottom_;  // top aliases bottom
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scalar_param()->set_axis(1);
+  shared_ptr<ScalarLayer<Dtype> > layer(new ScalarLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          const Dtype expected = input_copy.data_at(n, c, h, w) *
+              this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0);
+          EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w), expected, 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ScalarLayerTest, TestBackwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Verifies that in-place Backward produces the same bottom and scalar diffs
+  // as Backward with a separate top blob, with the scalar broadcast at axis 1.
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scalar_param()->set_axis(1);
+  shared_ptr<ScalarLayer<Dtype> > layer(new ScalarLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scalar_diff;
+  orig_scalar_diff.CopyFrom(*this->blob_bottom_broadcast_1_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) {
+    EXPECT_NEAR(orig_scalar_diff.cpu_diff()[i],
+                this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5);
+  }
+}
+
 TYPED_TEST(ScalarLayerTest, TestForwardBroadcastMiddleWithParam) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;