Skip to content

CUDA Stream must be awaited before computing optical flow #24540

@ProtectedVariable

Description

@ProtectedVariable

System Information

OpenCV version: 4.8.1
Operating System / Platform: Windows 10 Pro
Compiler & compiler version: MSVC 19.35.32216.1

Detailed description

When using the Farneback optical flow on the GPU with a CUDA stream, the stream must be awaited before computing the optical flow; otherwise the results are corrupted.
In the example I provide I first downscale the image to half resolution (cuda::resize on the stream) and then compute the optical flow.
Since both operations are enqueued on the same stream, I would expect them to execute in order and produce a correct result. However, as the results below show, not synchronizing the stream before the optical flow computation produces an incorrect result.

This can lead to very confusing behavior: depending on your GPU's performance the problem might not show up at all, but on a slower GPU it produces a bad result.

Steps to reproduce

Here's an example that uses the optical flow to align images that have been rotated. Commenting out the line marked with the comment // <------------------ Why is this line necessary ---------------------- produces an invalid result.

#include <opencv2/calib3d.hpp>
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudafilters.hpp>
#include <opencv2/cudaoptflow.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;

/// Builds the synthetic test image: a 2048x2048 single-channel 16-bit
/// checkerboard made of 8x8 square cells.
///
/// The image starts out all zero and every other cell is filled with the
/// maximum 16-bit value (65535). The top-left cell is filled, and odd rows are
/// shifted by one cell so that filled and empty cells alternate in both
/// directions.
///
/// @return The checkerboard as a CV_16UC1 matrix.
cv::Mat makeCheckerboard() {
	constexpr int dim = 2048;
	constexpr int cells_per_side = 8;
	constexpr int cell_size = dim / cells_per_side;
	constexpr int maxval = (1 << 16) - 1;

	Mat board = Mat::zeros(dim, dim, CV_16UC1);
	for (int row = 0; row < cells_per_side; row++) {
		// Odd rows start one cell further right so the pattern alternates.
		const int first_x = (row % 2) * cell_size;
		// Step two cells at a time: each filled cell is drawn exactly once.
		for (int x = first_x; x < dim; x += 2 * cell_size) {
			cv::rectangle(board,
				{ x, row * cell_size, cell_size, cell_size },
				maxval, cv::FILLED);
		}
	}
	return board;
}

/// Builds a series of progressively rotated versions of an image.
///
/// Element 0 is `original`. Element i (for 1 <= i < size) is `original`
/// rotated about its centre with `i` passed as the angle argument of
/// cv::getRotationMatrix2D and a scale of 1.0, rendered at the original size.
///
/// @param size     Number of images requested. For values <= 1 the result
///                 contains only `original`.
/// @param original Source image; it is not modified.
/// @return The series, starting with `original`.
std::vector<cv::Mat> makeRotationSerie(int size, const cv::Mat& original) {
	std::vector<cv::Mat> series;
	if (size > 0) {
		series.reserve(static_cast<std::vector<cv::Mat>::size_type>(size));
	}
	series.push_back(original);

	// The rotation centre is the same for every frame, so compute it once.
	const auto center = original.size() / 2;

	for (int angle = 1; angle < size; angle++) {
		const auto rot_mat = cv::getRotationMatrix2D(
			{ static_cast<float>(center.width), static_cast<float>(center.height) },
			angle, 1.0);
		// warpAffine allocates and fully writes its destination, so starting
		// from an empty Mat avoids a needless deep copy of `original`.
		cv::Mat rotated;
		cv::warpAffine(original, rotated, rot_mat, original.size());
		series.push_back(rotated);
	}
	return series;
}

// Reproducer for the issue: aligns a series of rotated checkerboards to the
// first image using the CUDA Farneback optical flow, passing the same
// cv::cuda::Stream to every CUDA call. The aligned images are averaged into
// one image, which is displayed and written to "result.png".
int main()
{
	auto opt_flow = cv::cuda::FarnebackOpticalFlow::create();

	// 10 images: the checkerboard followed by 9 rotated versions of it.
	auto images = makeRotationSerie(10, makeCheckerboard());
	auto stream = cv::cuda::Stream();

	// Reference frame: moved into a GpuMat, then downscaled by 0.5 in both
	// directions. The resize is enqueued on `stream`.
	cuda::GpuMat init_image(images[0]);
	cuda::GpuMat init_image_resized;
	cuda::resize(init_image, init_image_resized, Size(), 0.5,
		0.5, cv::INTER_LINEAR, stream);

	// Host-side accumulator with the same size and type as the checkerboard.
	Mat result_image(2048, 2048, CV_16UC1, Scalar(0));

	for (const auto& img : images) {

		cuda::GpuMat next_image(img);

		// Downscale the current frame the same way as the reference frame,
		// again on `stream`.
		cuda::GpuMat next_image_resized;
		cuda::resize(next_image, next_image_resized, Size(), 0.5,
			0.5, cv::INTER_LINEAR, stream);

		// This synchronization is the subject of the report: the resize above
		// and the calc() below are given the same stream, yet removing this
		// call is reported to corrupt the computed flow.
		stream.waitForCompletion(); // <------------------ Why is this line necessary ----------------------
		// Compute the optical flow
		cuda::GpuMat gpu_flow;
		opt_flow->calc(init_image_resized, next_image_resized, gpu_flow, stream);

		// Wait for the flow computation before copying it into a host Mat.
		stream.waitForCompletion();
		Mat flow(gpu_flow);

		// Estimate one uniform transformation: sample the flow every 15 pixels
		// over the half-resolution (1024x1024) field and divide coordinates by
		// 0.5 to express both point sets at the original image size.
		std::vector<Point2f> moved_points;
		std::vector<Point2f> original_points;
		for (int x = 15; x < 1024; x += 15) {
			for (int y = 15; y < 1024; y += 15) {
				original_points.emplace_back(x / 0.5, y / 0.5);
				// The flow is read as one Point2f displacement per pixel.
				Point2f f = flow.at<Point2f>(y, x);
				moved_points.push_back(
					Point2f((x + f.x) / 0.5, (y + f.y) / 0.5));
			}
		}

		// NOTE(review): the OpenCV docs say estimateAffinePartial2D can return
		// an empty matrix when no transform is found; that case is not
		// checked here — confirm whether it matters for this reproducer.
		Mat warp_matrix = estimateAffinePartial2D(original_points, moved_points);

		// Warp the full-resolution frame back onto the reference frame.
		// WARP_INVERSE_MAP is passed, so warp_matrix (reference -> moved
		// points) is applied as the destination-to-source mapping.
		cuda::GpuMat next_image_aligned;
		cuda::warpAffine(next_image, next_image_aligned, warp_matrix,
			next_image.size(), INTER_LINEAR | WARP_INVERSE_MAP, 0,
			cv::Scalar(), stream);

		// Wait for the warp, then add this frame's share to the average.
		// The divisor 10 matches the number of images in the series.
		stream.waitForCompletion();
		result_image += Mat(next_image_aligned) / 10;
	}

	imshow("Result", result_image);
	waitKey(0);
	imwrite("result.png", result_image);
}

With the stream.waitForCompletion() enabled (between resize and optical flow's calc)
result

Without the stream.waitForCompletion() enabled (between resize and optical flow's calc)
result

Issue submission checklist

  • I report the issue, it's not a question
  • I checked the problem with documentation, FAQ, open issues, forum.opencv.org, Stack Overflow, etc and have not found any solution
  • I updated to the latest OpenCV version and the issue is still there
  • There is reproducer code and related data files (videos, images, onnx, etc)

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions