图像处理算法

操作系统 云服务/平台 技术难度 关注领域
Android   Intermediate Low Power        Gaming       Embedded

任务目标

在运行某些算法时,我们需要测试硬件处理器的性能。这样我们可以把算法放到最合适的处理器上运行,或者将算法拆分后分配到不同的处理器上运行,从而获得更好的性能。本项目希望提供一个解决方案,用于优化应用和设备的性能。

所需材料/所需清单/工具

  • Snapdragon Heterogeneous Compute SDK v1.0.0 - Linux

  • android-ndk-r14b-linux-x86_64

源码/示例/可执行的应用程序

  • Source Code

附加资料

  • hetcompute_sample_ImageProcessingDemo

构建/装配说明

以下列出了本项目中用到的各个组成部分。

1. 使用SDM845平台的Android设备,并搭建好HetCompute SDK、Hexagon SDK和应用的开发环境。

2.Ubuntu 18.04LTS

3. Type-C 数据线(data cable)。

4.所有开发工作都基于这个HetComputeSDK

 

部署项目

1. Download Snapdragon Heterogeneous Compute SDK from https://developer.qualcomm.com/download/snapdragon-heterogeneous-compute-sdk-1.0.0.deb?referrer=node/35864, and install it to PC.

2. Download android-ndk-r14b-linux-x86_64 from https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip, and install it to PC.

3. Download Hexagon DSP SDK from https://developer.qualcomm.com/download/hexagon/hexagon-sdk-v3-3-3-linux.zip?referrer=node/6116, and install it to PC.

4. 编译应用,然后将应用和库复制到设备。编译方法可参考 README.md 文件。

5. Push the application to /data/local/tmp and run it:

    ./hetcompute_sample_ImageProcessingDemo

6. 如果没有问题,可以将你的代码上传到 GitHub。

 

工作流程

一、开始应该做些什么?

首先,我们需要创建处理器内核通道(kernel pipe),用于控制硬件。

/1.0.0/samples/ImageProcessingDemo.cc

 

Create CPU kernel pipe
/**
 * Denoise the image on the CPU by launching one HetCompute task per pixel.
 *
 * For every pixel (x, y) a task calls compute_weights() for the points of the
 * SEARCH_WINDOW_SIZE x SEARCH_WINDOW_SIZE search window, then stores the
 * weighted average of the neighbors (each passed through
 * clamp_to_reflection()) in output[y * img_width + x].
 * The time spent in g->wait_for() is added to process_calc_time_cpu.
 *
 * @param input  source image, read as input[y * img_width + x]
 * @param output destination image, written as output[y * img_width + x]
 */
void
denoise_image_process_for_cpu(Pixel* input, Pixel* output)
{
    auto g = hetcompute::create_group("denoise_task_per_pixel");

    // Iterate through all points in the input image
    for (int y = 0; y < img_height; y++)
    {
        for (int x = 0; x < img_width; x++)
        {
            // The pointers are captured by value: the task only needs the
            // addresses, not a reference to this function's parameters.
            g->launch([x, y, input, output] {
                // Each task owns its weight table w[SEARCH_WINDOW_SIZE][SEARCH_WINDOW_SIZE].
                // A single table shared by reference between all tasks would be
                // overwritten by whichever tasks happen to run at the same time.
                float w[SEARCH_WINDOW_SIZE * SEARCH_WINDOW_SIZE];

                // Compute weights for points in the search window.
                compute_weights(input, Point{ x, y }, w);

                float weight_sum = 0;
                float temp       = 0;

                // Denoise: compute the weighted average for this point.
                for (int i = 0; i < SEARCH_WINDOW_SIZE; i++)
                {
                    for (int j = 0; j < SEARCH_WINDOW_SIZE; j++)
                    {
                        Point neighbor;
                        neighbor.x = x - SEARCH_WINDOW_SIZE / 2 + i;
                        neighbor.y = y - SEARCH_WINDOW_SIZE / 2 + j;
                        neighbor   = clamp_to_reflection(neighbor);

                        const float weight = w[i * SEARCH_WINDOW_SIZE + j];
                        temp += weight * input[neighbor.y * img_width + neighbor.x];
                        weight_sum += weight;
                    }
                }

                temp /= weight_sum;
                output[y * img_width + x] = static_cast<Pixel>(temp);
            });
        }
    }

    // NOTE(review): the timer starts only after every task has been launched,
    // so work that already ran during the launch loop is not measured.
    // Confirm this is the intended measurement window.
    const unsigned long begin_process_time = getCurrentTimeMsec();
    g->wait_for(); // wait for all the tasks to complete
    const unsigned long end_process_time = getCurrentTimeMsec();

    process_calc_time_cpu += (end_process_time - begin_process_time);
}

Create GPU kernel pipe
/**
 * Denoise the image on the GPU with the "process_denoise_image" kernel.
 *
 * Copies the input pixels and the similarity_weights table into HetCompute
 * buffers, runs the kernel over a 2D range of img_width x img_height, and
 * copies the float results back into output, cast to Pixel.
 * The time spent waiting for the GPU task is added to process_calc_time_gpu.
 *
 * @param input  source image; input_buffer_size elements are read
 * @param output destination image; one element is written per output buffer entry
 */
void
denoise_image_process_for_gpu(Pixel* input, Pixel* output)
{
    // Buffers shared with the GPU: source pixels, float result, weight table.
    auto src_buf     = hetcompute::create_buffer<unsigned char>(input_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto dst_buf     = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::gpu }));

    // Fill the two input-side buffers from host memory before the kernel runs.
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        src_buf[i] = input[i];
    }
    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // Kernel signature: input pixels, output, weights, then four unsigned sizes.
    auto kernel = hetcompute::create_gpu_kernel<hetcompute::buffer_ptr<const unsigned char>,
                                                hetcompute::buffer_ptr<float>,
                                                hetcompute::buffer_ptr<const float>,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int>(image_kernel_string, "process_denoise_image");

    // One work item per pixel.
    hetcompute::range<2> pixel_range(img_width, img_height);

    auto task = hetcompute::create_task(kernel,
                                        pixel_range,
                                        src_buf,
                                        dst_buf,
                                        weights_buf,
                                        SEARCH_WINDOW_SIZE,
                                        SIMILARITY_WINDOW_SIZE,
                                        img_width,
                                        img_height);

    // Start the task on the GPU, then time how long we wait for it.
    task->launch();

    unsigned long wait_started = getCurrentTimeMsec();
    task->wait_for();
    unsigned long wait_finished = getCurrentTimeMsec();

    process_calc_time_gpu += (wait_finished - wait_started);

    // Read the float results back and convert them to pixels.
    dst_buf.acquire_ro();
    for (size_t i = 0; i < dst_buf.size(); ++i)
    {
        output[i] = static_cast<Pixel>(dst_buf[i]);
    }
    dst_buf.release();
}

Create DSP kernel pipe
/**
 * Denoise the image on the DSP, one task per pixel.
 *
 * Copies the input pixels and the similarity_weights table into HetCompute
 * buffers, launches hetcompute_dsp_denoise_image_process once for every
 * (column, row) position, and copies the float results back into output,
 * cast to Pixel.
 * The time spent waiting for the task group is added to process_calc_time_dsp.
 *
 * @param input  source image; input_buffer_size elements are read
 * @param output destination image; one element is written per output buffer entry
 */
void
denoise_image_process_for_dsp(Pixel* input, Pixel* output)
{
    // Buffers shared with the DSP: source pixels, float result, weight table.
    auto src_buf     = hetcompute::create_buffer<char>(input_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto dst_buf     = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::dsp }));

    // Fill the two input-side buffers from host memory before any task runs.
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        src_buf[i] = static_cast<Pixel>(input[i]);
    }
    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // All per-pixel DSP tasks are collected in one group so they can be awaited together.
    auto task_group = hetcompute::create_group();
    auto dsp_kernel = hetcompute::create_dsp_kernel<>(hetcompute_dsp_denoise_image_process);

    // Launch row by row; the last two arguments are the pixel's column and row.
    for (int row = 0; row < img_height; ++row)
    {
        for (int col = 0; col < img_width; ++col)
        {
            task_group->launch(dsp_kernel,
                               src_buf,
                               dst_buf,
                               weights_buf,
                               SEARCH_WINDOW_SIZE,
                               SIMILARITY_WINDOW_SIZE,
                               img_width,
                               img_height,
                               col,
                               row);
        }
    }

    // Time how long we wait for the whole group to finish.
    unsigned long wait_started = getCurrentTimeMsec();
    task_group->wait_for();
    unsigned long wait_finished = getCurrentTimeMsec();

    process_calc_time_dsp += (wait_finished - wait_started);

    // Read the float results back and convert them to pixels.
    dst_buf.acquire_ro();
    for (size_t i = 0; i < dst_buf.size(); ++i)
    {
        output[i] = static_cast<Pixel>(dst_buf[i]);
    }
    dst_buf.release();
}

贡献者信息

姓名 公司

Shen Tao

shentao1012@thundersoft.com
Thundersoft

Yang Rong

yangrong0925@thundersoft.com
Thundersoft

Wu

kouzw0723@thundersoft.com
Thundersoft

>>浏览更多Qualcomm硬件案例:http://qualcomm.csdn.net/m/zone/qualcomm2016/project

Qualcomm 解决方案

 

高通 AI Hub

全新高通 AI Hub 包含预优化AI模型库,支持在搭载骁龙和高通平台的终端上进行无缝部署。
该模型库为开发者提供超过75个主流的AI和生成式AI模型,比如Whisper、ControlNet、Stable Diffusion和Baichuan-7B,可在不同执行环境(runtime)中打包,能够在不同形态终端中实现卓越的终端侧AI性能、降低内存占用并提升能效。所有模型均经过优化,以充分利用高通AI引擎内所有核心(NPU、CPU和GPU)的硬件加速能力,从而使推理速度提升4倍。

了解更多

SDK 下载

本版块下载 SDK,只需简单注册,就可轻松下载。