Qualcomm开发者专区

图像处理算法

操作系统	云服务/平台	技术难度	关注领域
Android		Intermediate	Low Power Gaming Embedded

任务目标

当运行某些算法的时候，我们需要测试硬件处理器的性能。也许我们能够在适合的处理器上运行自己的算法，或者将算法拆分后放到不同的处理器上运行，从而获得更好的性能。我希望提供一个解决方案，用来优化应用和设备性能。

所需材料/所需清单/工具

• Snapdragon Heterogeneous Compute SDK v1.0.0 - Linux

• android-ndk-r14b-linux-x86_64

源码/示例/可执行的应用程序

• Source Code

附加资料

• hetcompute_sample_ImageProcessingDemo

构建/装配说明

以下展示了在这个项目中使用到的部分。

1. 使用SDM845平台的Android设备，并且建立HetCompute SDK，Hexagon SDK和应用。

2. Ubuntu 18.04 LTS

3. Type-C data cable.

4.所有开发工作都基于这个HetComputeSDK

部署项目

1. Download Snapdragon Heterogeneous Compute SDK from https://developer.qualcomm.com/download/snapdragon-heterogeneous-compute-sdk-1.0.0.deb?referrer=node/35864, and install it to PC.

2. Download android-ndk-r14b-linux-x86_64 from https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip, and install it to PC.

3. Download Hexagon DSP SDK from https://developer.qualcomm.com/download/hexagon/hexagon-sdk-v3-3-3-linux.zip?referrer=node/6116, and install it to PC.

4. 编译应用，然后将应用和库复制到设备。编译方法可以参考 README.md 文件。

5. Push the application to /data/local/tmp and run it:

./hetcompute_sample_ImageProcessingDemo

6. 如果没有问题，可以将你的代码上传到 GitHub。

工作流程

一、开始应该做些什么？

首先我们需要创建处理器内核通道用于控制硬件

/1.0.0/samples/ImageProcessingDemo.cc

Create CPU kernel pipe
/// Denoises an image on the CPU by launching one HetCompute task per pixel.
///
/// For every pixel (x, y) a task computes the search-window weights with
/// compute_weights(), accumulates the weighted sum of the reflection-clamped
/// neighbours, and stores the normalised result in
/// output[y * img_width + x].
///
/// The time spent blocked in g->wait_for() is added to process_calc_time_cpu.
///
/// @param input  Source pixels, indexed as input[row * img_width + column].
/// @param output Destination pixels, written with the same indexing.
void
denoise_image_process_for_cpu(Pixel* input, Pixel* output)
{
    unsigned long begin_process_time = 0;
    unsigned long end_process_time   = 0;

    auto g = hetcompute::create_group("denoise_task_per_pixel");
    // Iterate through all points in the input image
    for (int y = 0; y < img_height; y++)
    {
        for (int x = 0; x < img_width; x++)
        {
            // Everything is captured by value so that a task never refers to
            // storage owned by this function's stack frame.
            g->launch([x, y, input, output] {
                // Task-private weight table, laid out as
                // w[SEARCH_WINDOW_SIZE][SEARCH_WINDOW_SIZE] flattened row by row.
                // It must NOT be shared between tasks: each task overwrites it in
                // compute_weights() and reads it back below, so a single buffer
                // shared by tasks that execute concurrently would be a data race.
                float w[SEARCH_WINDOW_SIZE * SEARCH_WINDOW_SIZE];

                // Compute weights for points in the search window.
                compute_weights(input, Point{ x, y }, w);
                float weight_sum = 0;
                float temp       = 0;
                // Denoise: compute the weighted average for this point.
                for (int i = 0; i < SEARCH_WINDOW_SIZE; i++)
                {
                    for (int j = 0; j < SEARCH_WINDOW_SIZE; j++)
                    {
                        Point neighbor;
                        neighbor.x = x - SEARCH_WINDOW_SIZE / 2 + i;
                        neighbor.y = y - SEARCH_WINDOW_SIZE / 2 + j;
                        // Map out-of-image coordinates back into the image.
                        neighbor = clamp_to_reflection(neighbor);
                        temp += w[i * SEARCH_WINDOW_SIZE + j] * input[neighbor.y * img_width + neighbor.x];
                        weight_sum += w[i * SEARCH_WINDOW_SIZE + j];
                    }
                }

                temp /= weight_sum;
                output[y * img_width + x] = static_cast<Pixel>(temp);
            });
        }
    }

    // NOTE(review): the timer starts only after every task has been launched,
    // so work that already finished during the launch loop is not counted.
    // Kept as-is to stay comparable with the GPU/DSP variants - confirm intent.
    begin_process_time = getCurrentTimeMsec();
    g->wait_for(); // wait for all the tasks to complete
    end_process_time = getCurrentTimeMsec();

    process_calc_time_cpu += (end_process_time - begin_process_time);
}

Create GPU kernel pipe
/// Denoises an image by running the "process_denoise_image" kernel on the GPU.
///
/// Steps: copy `input` and the global `similarity_weights` table into
/// HetCompute buffers, build a GPU kernel from `image_kernel_string`, run it
/// over an img_width x img_height 2D range, then convert the float results
/// back to Pixel values in `output`.
///
/// The time spent blocked in wait_for() is added to process_calc_time_gpu.
///
/// @param input  Source pixels; input_buffer_size elements are read.
/// @param output Destination pixels; one element is written per element of
///               the output buffer.
void
denoise_image_process_for_gpu(Pixel* input, Pixel* output)
{
    // Buffers created for use by the GPU device.
    auto src_buf     = hetcompute::create_buffer<unsigned char>(input_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto dst_buf     = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::gpu }));

    // Fill the two input-side buffers from host memory. Both are acquired
    // before writing and released afterwards (acquire_wi presumably grants
    // host write access - see the SDK buffer documentation).
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        src_buf[i] = input[i];
    }

    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // Kernel signature: (input image, output image, similarity weights,
    // and four unsigned scalar arguments).
    auto kernel = hetcompute::create_gpu_kernel<hetcompute::buffer_ptr<const unsigned char>,
                                                hetcompute::buffer_ptr<float>,
                                                hetcompute::buffer_ptr<const float>,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int>(image_kernel_string, "process_denoise_image");

    // 2D range spanning img_width by img_height.
    hetcompute::range<2> pixel_grid(img_width, img_height);

    // Bind the kernel, its range and its arguments into a task.
    auto denoise_task = hetcompute::create_task(kernel,
                                                pixel_grid,
                                                src_buf,
                                                dst_buf,
                                                weights_buf,
                                                SEARCH_WINDOW_SIZE,
                                                SIMILARITY_WINDOW_SIZE,
                                                img_width,
                                                img_height);

    // Start the task on the GPU.
    denoise_task->launch();

    // Block until the task is done, timing only the wait itself.
    const unsigned long wait_started = getCurrentTimeMsec();
    denoise_task->wait_for();
    const unsigned long wait_finished = getCurrentTimeMsec();

    process_calc_time_gpu += (wait_finished - wait_started);

    // Read the float results back on the host and narrow them to Pixel.
    dst_buf.acquire_ro();
    for (size_t idx = 0; idx < dst_buf.size(); ++idx)
    {
        output[idx] = static_cast<Pixel>(dst_buf[idx]);
    }
    dst_buf.release();
}

Create DSP kernel pipe
/// Denoises an image on the DSP by launching one DSP-kernel task per pixel.
///
/// Steps: copy `input` and the global `similarity_weights` table into
/// HetCompute buffers, wrap `hetcompute_dsp_denoise_image_process` as a DSP
/// kernel, launch it once for every (column, row) of the image inside a task
/// group, then convert the float results back to Pixel values in `output`.
///
/// The time spent blocked in wait_for() is added to process_calc_time_dsp.
///
/// @param input  Source pixels; input_buffer_size elements are read.
/// @param output Destination pixels; one element is written per element of
///               the output buffer.
void
denoise_image_process_for_dsp(Pixel* input, Pixel* output)
{
    // Buffers created for use by the DSP device. Note the input buffer's
    // element type is plain `char` here.
    auto src_buf     = hetcompute::create_buffer<char>(input_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto dst_buf     = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::dsp }));

    // Fill the two input-side buffers from host memory. Both are acquired
    // before writing and released afterwards (acquire_wi presumably grants
    // host write access - see the SDK buffer documentation).
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        // `input` already holds Pixel values, so the cast is an identity; the
        // value is then converted to the buffer's `char` element on assignment.
        src_buf[i] = static_cast<Pixel>(input[i]);
    }

    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // Group that collects every per-pixel task so they can be awaited together.
    auto pixel_tasks = hetcompute::create_group();

    // Wrap the DSP entry point as a HetCompute kernel.
    auto kernel = hetcompute::create_dsp_kernel<>(hetcompute_dsp_denoise_image_process);

    // One launch per pixel, row by row; the pixel coordinates are passed as
    // the last two kernel arguments.
    for (int row = 0; row < img_height; ++row)
    {
        for (int col = 0; col < img_width; ++col)
        {
            pixel_tasks->launch(kernel,
                                src_buf,
                                dst_buf,
                                weights_buf,
                                SEARCH_WINDOW_SIZE,
                                SIMILARITY_WINDOW_SIZE,
                                img_width,
                                img_height,
                                col,
                                row);
        }
    }

    // Block until every task in the group is done, timing only the wait itself.
    const unsigned long wait_started = getCurrentTimeMsec();
    pixel_tasks->wait_for();
    const unsigned long wait_finished = getCurrentTimeMsec();

    process_calc_time_dsp += (wait_finished - wait_started);

    // Read the float results back on the host and narrow them to Pixel.
    dst_buf.acquire_ro();
    for (size_t idx = 0; idx < dst_buf.size(); ++idx)
    {
        output[idx] = static_cast<Pixel>(dst_buf[idx]);
    }
    dst_buf.release();
}

贡献者信息

姓名	公司
Shen Tao shentao1012@thundersoft.com	Thundersoft
Yang Rong yangrong0925@thundersoft.com	Thundersoft
Wu kouzw0723@thundersoft.com	Thundersoft

>>浏览更多Qualcomm硬件案例：http://qualcomm.csdn.net/m/zone/qualcomm2016/project

Qualcomm 开发者专区是 Qualcomm 联合CSDN 共同打造的面向中国开发者的技术专区。致力于通过提供全球最新资讯和最多元的技术资源及支持，为开发者们打造全面一流的开发环境。本专区将以嵌入式、物联网、游戏开发、Qualcomm® 骁龙™处理器的软件优化等技术为核心，打造全面的开发者技术服务社区，为下一代高性能体验和设计带来更多的想法和灵感。

加入 Qualcomm 开发者专区

高通软件中心

通过集中式门户网站无缝管理您的高通®软件和工具

下载软件中心