This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit

Merge 'digits-4.0' into 'master'
Conflicts:
	digits/__init__.py
lukeyeager committed Jul 26, 2016
2 parents abfbf47 + 20362c2 commit 19f8ecf
Showing 8 changed files with 399 additions and 37 deletions.
28 changes: 17 additions & 11 deletions digits/device_query.py
@@ -73,9 +73,11 @@ class c_cudaDeviceProp(ctypes.Structure):
('managedMemSupported', ctypes.c_int),
('isMultiGpuBoard', ctypes.c_int),
('multiGpuBoardGroupID', ctypes.c_int),
# Extra space for new fields in future toolkits
('__future_buffer', ctypes.c_int * 128),
# added later with cudart.cudaDeviceGetPCIBusId
# (needed by NVML)
('pciBusID_str', ctypes.c_char * 13),
('pciBusID_str', ctypes.c_char * 16),
]

class struct_c_nvmlDevice_t(ctypes.Structure):
@@ -93,6 +95,8 @@ class c_nvmlMemory_t(ctypes.Structure):
('total', ctypes.c_ulonglong),
('free', ctypes.c_ulonglong),
('used', ctypes.c_ulonglong),
# Extra space for new fields in future toolkits
('__future_buffer', ctypes.c_ulonglong * 8),
]

class c_nvmlUtilization_t(ctypes.Structure):
@@ -102,6 +106,8 @@ class c_nvmlUtilization_t(ctypes.Structure):
_fields_ = [
('gpu', ctypes.c_uint),
('memory', ctypes.c_uint),
# Extra space for new fields in future toolkits
('__future_buffer', ctypes.c_uint * 8),
]

def get_library(name):
@@ -128,14 +134,12 @@ def get_cudart():
if cudart is not None:
return cudart
else:
for name in (
'libcudart.so.7.0',
'libcudart.so.7.5',
'libcudart.so.8.0',
'libcudart.so'):
cudart = get_library(name)
if cudart is not None:
return cudart
for major in xrange(9,5,-1):
for minor in (5,0):
cudart = get_library('libcudart.so.%d.%d' % (major, minor))
if cudart is not None:
return cudart
return get_library('libcudart.so')
return None

def get_nvml():
@@ -196,9 +200,9 @@ def get_devices(force_reload=False):
properties = c_cudaDeviceProp()
rc = cudart.cudaGetDeviceProperties(ctypes.byref(properties), x)
if rc == 0:
pciBusID_str = ' ' * 13
pciBusID_str = ' ' * 16
# also save the string representation of the PCI bus ID
rc = cudart.cudaDeviceGetPCIBusId(ctypes.c_char_p(pciBusID_str), 13, x)
rc = cudart.cudaDeviceGetPCIBusId(ctypes.c_char_p(pciBusID_str), 16, x)
if rc == 0:
properties.pciBusID_str = pciBusID_str
devices.append(properties)
@@ -281,6 +285,8 @@ def get_nvml_info(device_id):
print 'Device #%d:' % i
print '>>> CUDA attributes:'
for name, t in device._fields_:
if name in ['__future_buffer']:
continue
if not args.verbose and name not in [
'name', 'totalGlobalMem', 'clockRate', 'major', 'minor',]:
continue
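For context, the pattern the updated code relies on can be exercised in isolation. The sketch below is not part of the commit; it assumes a `libcudart` shared library is discoverable on the loader path, and it simply queries each device's PCI bus ID string into a 16-byte buffer, the same size the new code uses.

```python
# Standalone sketch (not part of this commit): query PCI bus ID strings
# through ctypes, using a 16-byte buffer as the updated code does.
import ctypes
import ctypes.util

libname = ctypes.util.find_library('cudart') or 'libcudart.so'
cudart = ctypes.cdll.LoadLibrary(libname)

count = ctypes.c_int()
if cudart.cudaGetDeviceCount(ctypes.byref(count)) == 0:
    for device_id in range(count.value):
        # 16 bytes comfortably holds "domain:bus:device.function"
        # (e.g. "0000:01:00.0") plus the terminating NUL.
        buf = ctypes.create_string_buffer(16)
        if cudart.cudaDeviceGetPCIBusId(buf, 16, device_id) == 0:
            print(buf.value)
```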
149 changes: 140 additions & 9 deletions digits/extensions/data/objectDetection/README.md
@@ -1,18 +1,130 @@
# Object Detection Data Extension

This data extension creates DIGITS datasets for Object Detection networks such as [DetectNet](https://github.com/NVIDIA/caffe/tree/caffe-0.15/examples/kitti).
This data extension creates DIGITS datasets for object detection networks such as [DetectNet](https://github.com/NVIDIA/caffe/tree/caffe-0.15/examples/kitti).

Labels are expected in KITTI format.
Refer to the Object Detection track on KITTI web site for more information.
DIGITS uses the KITTI format for object detection data.
When preparing your own data for ingestion into a dataset, you must follow the same format.

Custom class mappings may be used by specifying a comma-separated list of lower-case class names in the Object Detection dataset creation form.
Class ID #0 is intended to be reserved for `DontCare` objects.
Labels whose class is not listed in the class mappings are implicitly mapped to the `DontCare` class.
Class IDs are used by DetectNet (see `DetectNetTransformation.detectnet_groundtruth_param.object_class` fields) to recognize which objects should be included and mapped to the specified index in the coverage map.
#### Table of contents

* [Folder structure](#folder-structure)
* [Label format](#label-format)
* [Custom class mappings](#custom-class-mappings)

## Folder structure

You should have one folder containing images, and another folder containing labels.

* Image filenames are formatted like `IDENTIFIER.EXTENSION` (e.g. `000001.png` or `2.jpg`).
* Label filenames are formatted like `IDENTIFIER.txt` (e.g. `000001.txt` or `2.txt`).

These identifiers need to match.
So, if you have a `1.png` in your image directory, there must be a corresponding `1.txt` in your labels directory (see the sketch after the layout below).

If you want to include validation data, then you need separate folders for validation images and validation labels.
A typical folder layout would look something like this:
```
train/
├── images/
│   └── 000001.png
└── labels/
    └── 000001.txt
val/
├── images/
│   └── 000002.png
└── labels/
    └── 000002.txt
```
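
As a quick illustration (this helper is hypothetical, not part of DIGITS), the pairing rule above can be checked with a few lines of Python; the folder paths are placeholders taken from the layout shown.

```python
# Hypothetical helper: list images that have no matching label file.
import os

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm')

def unmatched_images(image_dir, label_dir):
    missing = []
    for filename in os.listdir(image_dir):
        identifier, extension = os.path.splitext(filename)
        if extension.lower() not in IMAGE_EXTENSIONS:
            continue
        if not os.path.exists(os.path.join(label_dir, identifier + '.txt')):
            missing.append(filename)
    return missing

print(unmatched_images('train/images', 'train/labels'))
```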

## Label format

The format for KITTI labels is explained in the `readme.txt` from the "Object development kit".
Here is the relevant portion:
```
Data Format Description
=======================
The data for training and testing can be found in the corresponding folders.
The sub-folders are structured as follows:
- image_02/ contains the left color camera images (png)
- label_02/ contains the left color camera label files (plain text files)
- calib/ contains the calibration for all four cameras (plain text file)
The label files contain the following information, which can be read and
written using the matlab tools (readLabels.m, writeLabels.m) provided within
this devkit. All values (numerical or strings) are separated via spaces,
each row corresponds to one object. The 15 columns represent:
#Values    Name      Description
----------------------------------------------------------------------------
   1    type         Describes the type of object: 'Car', 'Van', 'Truck',
                     'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
                     'Misc' or 'DontCare'
   1    truncated    Float from 0 (non-truncated) to 1 (truncated), where
                     truncated refers to the object leaving image boundaries
   1    occluded     Integer (0,1,2,3) indicating occlusion state:
                     0 = fully visible, 1 = partly occluded
                     2 = largely occluded, 3 = unknown
   1    alpha        Observation angle of object, ranging [-pi..pi]
   4    bbox         2D bounding box of object in the image (0-based index):
                     contains left, top, right, bottom pixel coordinates
   3    dimensions   3D object dimensions: height, width, length (in meters)
   3    location     3D object location x,y,z in camera coordinates (in meters)
   1    rotation_y   Rotation ry around Y-axis in camera coordinates [-pi..pi]
   1    score        Only for results: Float, indicating confidence in
                     detection, needed for p/r curves, higher is better.
Here, 'DontCare' labels denote regions in which objects have not been labeled,
for example because they have been too far away from the laser scanner. To
prevent such objects from being counted as false positives our evaluation
script will ignore objects detected in don't care regions of the test set.
You can use the don't care labels in the training set to avoid that your object
detector is harvesting hard negatives from those areas, in case you consider
non-object regions from the training images as negative examples.
The following table shows the default class-ID mappings in DIGITS:
The coordinates in the camera coordinate system can be projected in the image
by using the 3x4 projection matrix in the calib folder, where for the left
color camera for which the images are provided, P2 must be used. The
difference between rotation_y and alpha is, that rotation_y is directly
given in camera coordinates, while alpha also considers the vector from the
camera center to the object center, to compute the relative orientation of
the object with respect to the camera. For example, a car which is facing
along the X-axis of the camera coordinate system corresponds to rotation_y=0,
no matter where it is located in the X/Z plane (bird's eye view), while
alpha is zero only, when this object is located along the Z-axis of the
camera. When moving the car away from the Z-axis, the observation angle
will change.
Class name | ID
To project a point from Velodyne coordinates into the left color image,
you can use this formula: x = P2 * R0_rect * Tr_velo_to_cam * y
For the right color image: x = P3 * R0_rect * Tr_velo_to_cam * y
Note: All matrices are stored row-major, i.e., the first values correspond
to the first row. R0_rect contains a 3x3 matrix which you need to extend to
a 4x4 matrix by adding a 1 as the bottom-right element and 0's elsewhere.
Tr_xxx is a 3x4 matrix (R|t), which you need to extend to a 4x4 matrix
in the same way!
Note, that while all this information is available for the training data,
only the data which is actually needed for the particular benchmark must
be provided to the evaluation server. However, all 15 values must be provided
at all times, with the unused ones set to their default values (=invalid) as
specified in writeLabels.m. Additionally a 16'th value must be provided
with a floating value of the score for a particular detection, where higher
indicates higher confidence in the detection. The range of your scores will
be automatically determined by our evaluation server, you don't have to
normalize it, but it should be roughly linear. If you use writeLabels.m for
writing your results, this function will take care of storing all required
data correctly.
```
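
For illustration only (this is not DIGITS code), a label line in this format can be split into named fields as below; the function name and the sample line are made up.

```python
# Sketch: split one KITTI label line into the 15 fields described above.
def parse_kitti_line(line):
    fields = line.split()
    return {
        'type':       fields[0],
        'truncated':  float(fields[1]),
        'occluded':   int(fields[2]),
        'alpha':      float(fields[3]),
        'bbox':       [float(v) for v in fields[4:8]],    # left, top, right, bottom
        'dimensions': [float(v) for v in fields[8:11]],   # height, width, length
        'location':   [float(v) for v in fields[11:14]],  # x, y, z
        'rotation_y': float(fields[14]),
        # results files append a 16th value: the detection score
        'score':      float(fields[15]) if len(fields) > 15 else None,
    }

print(parse_kitti_line('Car 0.00 0 1.57 100.0 120.0 200.0 180.0 1.5 1.6 3.9 2.0 1.5 20.0 1.60'))
```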

## Custom class mappings

When creating the dataset, DIGITS has to translate from the object type string to a numerical identifier.
By default, DIGITS uses the following class mappings, which follow from the label format description above:

Class name (string in label file) | Class ID (number in database)
---------- | ---
dontcare | 0
car | 1
@@ -29,3 +141,22 @@ people | 11
cyclist | 12
tram | 13
person_sitting | 14

**NOTE:** Class 0 is treated as a special case.
See "Label format" above for a detailed description.
All classes which don't exist in the provided mapping are implicitly mapped to 0.

**NOTE:** Class 1 is also treated as a special case.
DetectNet is a single-class object detection network, and only cares about the "Car" class, which is expected to be ID 1.
You can change the mapping in the DetectNet prototxt, but it's simplest to just make the class you care about map to 1.

Custom class mappings may be used by specifying a comma-separated list of lower-case class names in the Object Detection dataset creation form.

For example, if you only want to detect pedestrians, enter `dontcare,pedestrian` in the "Custom classes" field to generate this mapping:

Class name | Class ID
---------- | ---
dontcare | 0
pedestrian | 1

All labeled objects other than "pedestrian" in your dataset will be mapped to 0.
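
The mapping behaviour can be pictured with a short sketch; this is an illustration of the rule, not the DIGITS implementation, and the function names are made up.

```python
# Illustration: build a name -> ID table from the comma-separated list and
# send every unlisted class to 0 (DontCare).
def build_mapping(custom_classes):
    names = [name.strip().lower() for name in custom_classes.split(',')]
    return dict(zip(names, range(len(names))))

mapping = build_mapping('dontcare,pedestrian')

def class_id(object_type):
    return mapping.get(object_type.lower(), 0)

print(class_id('Pedestrian'))  # 1
print(class_id('Car'))         # 0, not in the custom list, so DontCare
```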
2 changes: 1 addition & 1 deletion docs/BuildCaffe.md
Expand Up @@ -14,7 +14,7 @@ For best performance, you'll want:

Install some dependencies with Deb packages:
```sh
sudo apt-get install --no-install-recommends build-essential cmake git gfortran libatlas-base-dev libboost-all-dev libgflags-dev libgoogle-glog-dev libhdf5-serial-dev libleveldb-dev liblmdb-dev libopencv-dev libprotobuf-dev libsnappy-dev protobuf-compiler python-all-dev python-dev python-h5py python-matplotlib python-numpy python-pil python-pip python-protobuf python-scipy python-skimage python-sklearn
sudo apt-get install --no-install-recommends build-essential cmake git gfortran libatlas-base-dev libboost-all-dev libgflags-dev libgoogle-glog-dev libhdf5-serial-dev libleveldb-dev liblmdb-dev libopencv-dev libprotobuf-dev libsnappy-dev protobuf-compiler python-all-dev python-dev python-h5py python-matplotlib python-numpy python-opencv python-pil python-pip python-protobuf python-scipy python-skimage python-sklearn
```

## Download source
7 changes: 4 additions & 3 deletions docs/BuildDigitsWindows.md
Expand Up @@ -3,7 +3,7 @@
## Prerequisites
- Python2
- CUDA 7.5
- CuDNN 4
- CuDNN 5.1
- Caffe
- Graphviz

@@ -43,8 +43,9 @@ At this moment, do not install gevent yet. We need to install it after installi
CUDA 7.5 can be obtained at NVIDIA CUDA (https://developer.nvidia.com/cuda-downloads).
Please select Windows 7 to download.

### CuDNN 4
Download CuDNN 4 at NVIDIA website (https://developer.nvidia.com/cudnn).
### CuDNN 5.1
Download CuDNN 5.1 at NVIDIA website (https://developer.nvidia.com/cudnn).
Please select CuDNN 5.1 for CUDA 7.5.

### Caffe
Caffe can be obtained at (https://github.com/bvlc/caffe/tree/windows).
2 changes: 2 additions & 0 deletions examples/object-detection/.gitignore
@@ -0,0 +1,2 @@
/kitti-data/
/*.zip
50 changes: 37 additions & 13 deletions examples/object-detection/README.md
@@ -4,7 +4,7 @@ Table of Contents
=================
* [Introduction](#introduction)
* [Dataset creation](#dataset-creation)
* [Preparing the data](#preparing-the-data)
* [Downloading and preparing the KITTI data](#downloading-and-preparing-the-kitti-data)
* [Loading the data into DIGITS](#loading-the-data-into-digits)
* [Model creation](#model-creation)
* [DetectNet](#detectnet)
@@ -19,21 +19,45 @@ During inference, object detection will be materialized by drawing bounding rect

## Dataset creation

### Preparing the data
In this example, we will be using data from the Object Detection track of the KITTI Vision Benchmark Suite.
You can of course use any other data you like, but DIGITS expects object detection data to be labelled in the style of KITTI data.

This walk-through was tested using images and labels from the **Car** Object Detection track of the KITTI Vision benchmark.
Other similar datasets may be used, though you may prefer to download the KITTI dataset from their web site if you wish to replicate the results from this example.
If you do want to use your own dataset instead of KITTI, read [digits/extensions/data/objectDetection/README.md](../../digits/extensions/data/objectDetection/README.md) to format your data properly and then skip the next section.

Optionally you may split the dataset into a training set and a (usually much smaller) validation set.
Doing so is strongly recommended to assess the quality of the neural network.
### Downloading and preparing the KITTI data

The data need to be structured in the following way:
- An image folder contains supported images (.png, .jpg, .jpeg, .bmp, .ppm).
- A label folder contains .txt files in KITTI format that define the ground truth.
Note that for each image in the image folder there must be a corresponding text file in the label folder.
For example if the image folder includes an image named `foo.png` then the label folder needs to include a file named `foo.txt`.
We are unable to provide download links to the KITTI data like we can for MNIST and CIFAR, so you'll have to download a few large files yourself.
Go to http://www.cvlibs.net/datasets/kitti/eval_object.php and download these files:

Description | Filename | Size
------------ | ------------- | -------------
Left color images of object data set | `data_object_image_2.zip` | **12GB**
Training labels of object data set | `data_object_label_2.zip` | 5MB
Object development kit | `devkit_object.zip` | 1MB

Copy those files into `$DIGITS_HOME/examples/object-detection/`.

There needs to be one of the above sets of directories for each of the training and validation sets.
Then, use the `prepare_kitti_data.py` script to create a train/val split of the labelled images.
This will take a few minutes, spent mostly on unpacking the large zipfiles.
```
$ ./prepare_kitti_data.py
Extracting zipfiles ...
Unzipping data_object_label_2.zip ...
Unzipping data_object_image_2.zip ...
Unzipping devkit_object.zip ...
Calculating image to video mapping ...
Splitting images by video ...
Creating train/val split ...
Done.
```

At the end you will have your data at `$DIGITS_HOME/examples/object-detection/kitti-data/{train,val}/`.

The data is structured in the following way:
- An image folder containing supported images (`.png`, `.jpg`, etc.).
- A label folder containing `.txt` files in KITTI format that define the ground truth.
For each image in the image folder there must be a corresponding text file in the label folder.
For example if the image folder includes an image named `foo.png` then the label folder needs to include a file named `foo.txt`.

### Loading the data into DIGITS

Expand All @@ -60,7 +84,7 @@ In this example we will use **DetectNet**.
DetectNet is a GoogLeNet-derived network that is specifically tuned for Object Detection.

In order to train DetectNet, [NVcaffe](https://github.com/NVIDIA/caffe) version [0.15.1](https://github.com/NVIDIA/caffe/tree/v0.15.1) or later is required.
The [model description for DetectNet](https://github.com/NVIDIA/caffe/tree/caffe-0.15/examples/kitti) may be found in the NV-Caffe repository.
The model description for DetectNet can be found at `$CAFFE_HOME/examples/kitti/detectnet_network.prototxt` ([raw link](https://raw.githubusercontent.com/NVIDIA/caffe/caffe-0.15/examples/kitti/detectnet_network.prototxt)).

Since DetectNet is derived from GoogLeNet, it is strongly recommended to use pre-trained weights from an ImageNet-trained GoogLeNet, as this will speed up training significantly.
A suitable pre-trained GoogLeNet `.caffemodel` may be found on this [page](https://github.com/BVLC/caffe/tree/rc3/models/bvlc_googlenet).
Binary file modified examples/object-detection/form-object-detection-dataset.jpg
