From 86a3d694eb2876aaaf6048088a9d76f8472f769f Mon Sep 17 00:00:00 2001
From: Sebastian Gomez-Gonzalez <sgomez@tue.mpg.de>
Date: Thu, 14 Sep 2017 16:54:10 +0200
Subject: [PATCH] Adding support for the compilation with cuda (Not tested yet)

---
 CMakeLists.txt                     | 11 +++++-
 include/ball_tracking/img_proc.hpp | 17 ++++++++-
 src/cuda/img_proc.cu               | 61 ++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 src/cuda/img_proc.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22b96b6..8620c05 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,11 +2,19 @@ cmake_minimum_required(VERSION 3.5)
 project(ball_tracking)
 
 find_package(OpenCV REQUIRED)
+find_package(CUDA 8.0 REQUIRED)
 
 option (PYLIB "Create a Python Module with interface to some of the C++ implementations" ON)
 
 include_directories(include
-  ${OpenCV_INCLUDES})
+  ${OpenCV_INCLUDES}
+  ${CUDA_INCLUDE_DIRS})
+
+set(CUDA_ARCH "sm_30" CACHE STRING "GPU architecture handed to nvcc through -arch")
+# -fPIC: a static cu_ball_track gets linked into the SHARED ball_tracking library below
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -arch=${CUDA_ARCH} -Xcompiler -fPIC")
+cuda_add_library(cu_ball_track src/cuda/img_proc.cu)
+
 add_library(ball_tracking SHARED
   src/img_proc.cpp
   src/utils.cpp
@@ -14,6 +22,7 @@ add_library(ball_tracking SHARED
   )
 target_link_libraries(ball_tracking
   ${OpenCV_LIBS}
+  cu_ball_track
   )
 
 #Compile with C++11 support only
diff --git a/include/ball_tracking/img_proc.hpp b/include/ball_tracking/img_proc.hpp
index 6b90e12..6da0a7f 100644
--- a/include/ball_tracking/img_proc.hpp
+++ b/include/ball_tracking/img_proc.hpp
@@ -4,7 +4,6 @@
 #include <opencv2/opencv.hpp>
 #include <memory>
 #include <functional>
-#include <json.hpp>
 
 namespace ball_tracking {
 
@@ -35,6 +34,22 @@ namespace ball_tracking {
    */
   cv::Mat quadf_log_reg(cv::InputArray src, cv::InputArray bkg, cv::InputArray weights);
 
+  namespace cuda {
+
+    /**
+     * @brief Applies a pixel-wise logistic regression with quadratic features to the source
+     * image on the GPU. The single-channel image with the log-probabilities is written to dst
+     *
+     * @param[in] src The source image on the GPU
+     * @param[in] bkg A background image (Without the ball) on the GPU, same size as src
+     * @param[in] weights Vector with the 28 weights of the logistic regression, on the GPU
+     * @param[out] dst The destination image on the GPU (the function itself returns nothing)
+     * @param[in] stream The stream of execution in the GPU
+     */
+    void quadf_log_reg(const cv::cuda::GpuMat& src, const cv::cuda::GpuMat& bkg, 
+        const cv::cuda::GpuMat weights, cv::cuda::GpuMat& dst, cv::cuda::Stream& stream);
+  };
+
 };
 
 #endif
diff --git a/src/cuda/img_proc.cu b/src/cuda/img_proc.cu
new file mode 100644
index 0000000..eb1a886
--- /dev/null
+++ b/src/cuda/img_proc.cu
@@ -0,0 +1,103 @@
+
+#include "opencv2/cudev.hpp"
+#include "opencv2/opencv_modules.hpp"
+#include <ball_tracking/img_proc.hpp>
+
+using namespace cv;
+using namespace cv::cuda;
+
+namespace ball_tracking {
+
+  namespace {
+    // NUM_FEATS: per-pixel features (3 source channels, 3 background channels, bias).
+    // NUM_WEIGHTS: number of products lfeat[i]*lfeat[j] with i<=j, i.e. 7*8/2.
+    enum { NUM_FEATS = 7, NUM_WEIGHTS = 28 };
+
+    /**
+     * Kernel computing, for every pixel, the weighted sum of all the products of pairs
+     * of features (quadratic features) of the source and background pixel values.
+     *
+     * Launch layout: 2D grid of 2D blocks covering the image, with the x dimension
+     * running over columns and the y dimension over rows. Any block size is valid.
+     * Uses NUM_WEIGHTS doubles of static shared memory per block.
+     *
+     * src and bkg must have the same size as dst; weights must hold NUM_WEIGHTS
+     * values laid out either as a single row or as a single column.
+     */
+    __global__ void quadf_log_reg_kernel(const PtrStepSz<uchar3> src,
+        const PtrStepSz<uchar3> bkg, const PtrStepSz<double> weights,
+        PtrStepSz<double> dst) {
+      const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; // column
+      const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; // row
+
+      // Shared memory is private to each block, so every block loads its own copy of
+      // the weights. The index must be the position of the thread inside the block.
+      __shared__ double w[NUM_WEIGHTS];
+      const unsigned int tid = threadIdx.y*blockDim.x + threadIdx.x;
+      const unsigned int block_threads = blockDim.x*blockDim.y;
+      for (unsigned int i=tid; i<NUM_WEIGHTS; i+=block_threads) {
+        w[i] = (weights.rows == 1) ? weights(0,i) : weights(i,0);
+      }
+      // Reached by all the threads of the block (it is outside of the bounds check)
+      __syncthreads();
+
+      // PtrStepSz is indexed as (row, column)
+      if (x < src.cols && y < src.rows) {
+        double sum = 0.0;
+        const uchar3 it = src(y,x), it_bkg = bkg(y,x);
+        const double lfeat[NUM_FEATS] = {it.x/255.0, it.y/255.0, it.z/255.0,
+          it_bkg.x/255.0, it_bkg.y/255.0, it_bkg.z/255.0, 1.0};
+        // The weights are stored in the order of the upper triangle (j>=i) by rows
+        const double* wptr = w;
+        for (unsigned int i=0; i<NUM_FEATS; i++) {
+          for (unsigned int j=i; j<NUM_FEATS; j++, wptr++) {
+            sum += (*wptr) * lfeat[i] * lfeat[j];
+          }
+        }
+        dst(y,x) = sum;
+      }
+    }
+  }
+
+  namespace cuda {
+    /**
+     * Require that src and bkg are 8-bit 3-channel images of equal dimensions and
+     * that weights is a vector (single row or single column) of 28 doubles.
+     * dst is (re)allocated as a CV_64FC1 image of the size of src. An empty src
+     * produces an empty dst without launching any kernel.
+     *
+     * The kernel is enqueued on the given stream. When the default (null) stream is
+     * used, the call blocks until the GPU has finished. CUDA errors are reported as
+     * OpenCV exceptions.
+     */
+    void quadf_log_reg(const GpuMat& src, const GpuMat& bkg,
+        const GpuMat weights, GpuMat& dst, Stream& stream = Stream::Null()) {
+      CV_Assert(src.rows==bkg.rows && src.cols==bkg.cols);
+      if (src.empty()) {
+        // A grid with a zero dimension is an invalid launch configuration
+        dst.release();
+        return;
+      }
+      // The kernel reinterprets the data as uchar3 / double, so the types must match
+      CV_Assert(src.type()==CV_8UC3 && bkg.type()==CV_8UC3);
+      CV_Assert(weights.type()==CV_64FC1);
+      CV_Assert((weights.rows==1 && weights.cols==NUM_WEIGHTS) ||
+          (weights.rows==NUM_WEIGHTS && weights.cols==1));
+
+      const dim3 block(32,8);
+      const dim3 grid((src.cols + block.x - 1)/block.x,
+          (src.rows + block.y - 1)/block.y);
+
+      dst.create(src.size(), CV_64FC1);
+      cudaStream_t s = StreamAccessor::getStream(stream);
+
+      quadf_log_reg_kernel<<<grid,block,0,s>>>(src, bkg, weights, dst);
+      // Launch errors (e.g. bad configuration) are only visible through this call
+      CV_CUDEV_SAFE_CALL(cudaGetLastError());
+
+      if (s == 0)
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
+    }
+  }
+
+}
-- 
GitLab