图像处理算法
操作系统 | 云服务/平台 | 技术难度 | 关注领域 |
---|---|---|---|
Android | | Intermediate | Low Power, Gaming, Embedded |
任务目标
在运行某些算法时,我们需要测试硬件处理器的性能。也许我们可以把自己的算法放到最适合的处理器上运行,或者将算法拆分后放到不同的处理器上运行,从而获得更好的性能。我希望提供一个用于优化应用和设备性能的解决方案。
所需材料/所需清单/工具
• Snapdragon Heterogeneous Compute SDK v1.0.0 - Linux
• android-ndk-r14b-linux-x86_64
源码/示例/可执行的应用程序
附加资料
• hetcompute_sample_ImageProcessingDemo
构建/装配说明
以下展示了在这个项目中使用到的部分。
1. 使用SDM845平台的Android设备,并搭建HetCompute SDK、Hexagon SDK和应用。
2.Ubuntu 18.04LTS
3.Type-C data cable.
4.所有开发工作都基于这个HetComputeSDK
部署项目
1. Download Snapdragon Heterogeneous Compute SDK from https://developer.qualcomm.com/download/snapdragon-heterogeneous-compute-sdk-1.0.0.deb?referrer=node/35864, and install it to PC.
2. Download android-ndk-r14b-linux-x86_64 from https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip, and install it to PC.
3. Download Hexagon DSP SDK from https://developer.qualcomm.com/download/hexagon/hexagon-sdk-v3-3-3-linux.zip?referrer=node/6116, and install it to PC.
4. 编译然后复制应用和库到设备。编译方法可以参考 README.md文件
5. Push the application to /data/local/tmp and run it:
./hetcompute_sample_ImageProcessingDemo
6.如果没有问题,可以上传你的code到github
工作流程
一、开始应该做些什么?
首先我们需要创建处理器内核通道用于控制硬件
/1.0.0/samples/ImageProcessingDemo.cc
Create CPU kernel pipe
/**
 * Denoise the image on the CPU by launching one HetCompute task per pixel.
 *
 * For every pixel (x, y) with 0 <= x < img_width and 0 <= y < img_height, a
 * task computes the search-window weights for that pixel via
 * compute_weights() and stores the weighted average of the
 * reflection-clamped neighbours in output[y * img_width + x].
 *
 * @param input  Source image, indexed as input[row * img_width + col].
 * @param output Destination image, written with the same indexing.
 *
 * Side effect: adds the time spent waiting on the task group to
 * process_calc_time_cpu.
 */
void
denoise_image_process_for_cpu(Pixel* input, Pixel* output)
{
    unsigned long begin_process_time = 0;
    unsigned long end_process_time = 0;

    auto g = hetcompute::create_group("denoise_task_per_pixel");

    // Iterate through all points in the input image
    for (int y = 0; y < img_height; y++)
    {
        for (int x = 0; x < img_width; x++)
        {
            // Everything is captured by value: the coordinates change every
            // iteration, and the pointers are cheap to copy.
            g->launch([x, y, input, output] {
                // Each task owns its weight table. Tasks in the group may
                // execute concurrently, so a single table shared by reference
                // would be overwritten by other tasks while this one reads it.
                float w[SEARCH_WINDOW_SIZE * SEARCH_WINDOW_SIZE];

                // Compute weights for points in the search window.
                compute_weights(input, Point{ x, y }, w);

                float weight_sum = 0;
                float temp = 0;

                // Denoise: compute the weighted average for this point.
                for (int i = 0; i < SEARCH_WINDOW_SIZE; i++)
                {
                    for (int j = 0; j < SEARCH_WINDOW_SIZE; j++)
                    {
                        // Neighbour position, centred on (x, y).
                        Point neighbor;
                        neighbor.x = x - SEARCH_WINDOW_SIZE / 2 + i;
                        neighbor.y = y - SEARCH_WINDOW_SIZE / 2 + j;
                        neighbor = clamp_to_reflection(neighbor);

                        temp += w[i * SEARCH_WINDOW_SIZE + j] * input[neighbor.y * img_width + neighbor.x];
                        weight_sum += w[i * SEARCH_WINDOW_SIZE + j];
                    }
                }

                // NOTE(review): assumes compute_weights() never yields an
                // all-zero table; otherwise this divides by zero - confirm.
                temp /= weight_sum;
                output[y * img_width + x] = static_cast<Pixel>(temp);
            });
        }
    }

    // NOTE(review): the timer starts only after every task has been launched,
    // so work already done during the launch loop is not counted - confirm
    // this is the intended measurement.
    begin_process_time = getCurrentTimeMsec();
    g->wait_for(); // wait for all the tasks to complete
    end_process_time = getCurrentTimeMsec();
    process_calc_time_cpu += (end_process_time - begin_process_time);
}
Create GPU kernel pipe
/**
 * Denoise the image with a single GPU task covering a 2D range of
 * img_width x img_height work items.
 *
 * The input pixels and the similarity_weights table are copied into
 * HetCompute buffers, the "process_denoise_image" kernel built from
 * image_kernel_string is launched, and the float results are converted back
 * to Pixel in output.
 *
 * @param input  Source pixels; input_buffer_size elements are read.
 * @param output Destination pixels; output_buffer.size() elements are written.
 *
 * Side effect: adds the time spent in wait_for() to process_calc_time_gpu.
 */
void
denoise_image_process_for_gpu(Pixel* input, Pixel* output)
{
    // Buffers shared with the GPU.
    auto src_buf = hetcompute::create_buffer<unsigned char>(input_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto dst_buf = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::gpu }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::gpu }));

    // Fill the two input-side buffers from host memory.
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        src_buf[i] = input[i];
    }
    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // Kernel signature: input image, output image, weight table, then four
    // unsigned scalars (search window, similarity window, width, height).
    auto kernel = hetcompute::create_gpu_kernel<hetcompute::buffer_ptr<const unsigned char>,
                                                hetcompute::buffer_ptr<float>,
                                                hetcompute::buffer_ptr<const float>,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int,
                                                const unsigned int>
                  (image_kernel_string, "process_denoise_image");

    // One work item per pixel.
    hetcompute::range<2> pixel_range(img_width, img_height);

    auto task = hetcompute::create_task(kernel, pixel_range, src_buf, dst_buf,
                                        weights_buf,
                                        SEARCH_WINDOW_SIZE, SIMILARITY_WINDOW_SIZE,
                                        img_width, img_height);

    task->launch();

    // Only the wait is timed; the launch above is outside the measured span.
    const unsigned long wait_started = getCurrentTimeMsec();
    task->wait_for();
    const unsigned long wait_finished = getCurrentTimeMsec();
    process_calc_time_gpu += (wait_finished - wait_started);

    // Copy the float results back, converting each one to Pixel.
    dst_buf.acquire_ro();
    for (size_t i = 0; i < dst_buf.size(); ++i)
    {
        output[i] = static_cast<Pixel>(dst_buf[i]);
    }
    dst_buf.release();
}
Create DSP kernel pipe
/**
 * Denoise the image on the DSP by launching one DSP task per pixel.
 *
 * The input pixels and the similarity_weights table are copied into
 * HetCompute buffers; hetcompute_dsp_denoise_image_process is then launched
 * once for every (column, row) pair, and the float results are converted
 * back to Pixel in output.
 *
 * @param input  Source pixels; input_buffer_size elements are read.
 * @param output Destination pixels; output_buffer.size() elements are written.
 *
 * Side effect: adds the time spent waiting on the task group to
 * process_calc_time_dsp.
 */
void
denoise_image_process_for_dsp(Pixel* input, Pixel* output)
{
    // Buffers shared with the DSP.
    auto src_buf = hetcompute::create_buffer<char>(input_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto dst_buf = hetcompute::create_buffer<float>(output_buffer_size, hetcompute::device_set({ hetcompute::dsp }));
    auto weights_buf = hetcompute::create_buffer<float>(MAX_DIST, hetcompute::device_set({ hetcompute::dsp }));

    // Fill the two input-side buffers from host memory.
    src_buf.acquire_wi();
    weights_buf.acquire_wi();
    for (size_t i = 0; i < input_buffer_size; ++i)
    {
        src_buf[i] = static_cast<Pixel>(input[i]);
    }
    for (size_t i = 0; i < MAX_DIST; ++i)
    {
        weights_buf[i] = similarity_weights[i];
    }
    src_buf.release();
    weights_buf.release();

    // Group that collects the per-pixel tasks, and the kernel they all run.
    auto group = hetcompute::create_group();
    auto kernel = hetcompute::create_dsp_kernel<>(hetcompute_dsp_denoise_image_process);

    // Row-major sweep: one launch per pixel, passing its column and row.
    for (int row = 0; row < img_height; ++row)
    {
        for (int col = 0; col < img_width; ++col)
        {
            group->launch(kernel, src_buf, dst_buf, weights_buf,
                          SEARCH_WINDOW_SIZE, SIMILARITY_WINDOW_SIZE,
                          img_width, img_height, col, row);
        }
    }

    // Only the wait is timed; the launch loop is outside the measured span.
    const unsigned long wait_started = getCurrentTimeMsec();
    group->wait_for();
    const unsigned long wait_finished = getCurrentTimeMsec();
    process_calc_time_dsp += (wait_finished - wait_started);

    // Copy the float results back, converting each one to Pixel.
    dst_buf.acquire_ro();
    for (size_t i = 0; i < dst_buf.size(); ++i)
    {
        output[i] = static_cast<Pixel>(dst_buf[i]);
    }
    dst_buf.release();
}
贡献者信息
姓名 | 公司 |
---|---|
shentao1012@thundersoft.com | Thundersoft |
Yang Rong yangrong0925@thundersoft.com | Thundersoft |
Wu kouzw0723@thundersoft.com | Thundersoft |
>>浏览更多Qualcomm硬件案例:http://qualcomm.csdn.net/m/zone/qualcomm2016/project
Qualcomm 开发者专区是 Qualcomm 联合CSDN 共同打造的面向中国开发者的技术专区。致力于通过提供全球最新资讯和最多元的技术资源及支持,为开发者们打造全面一流的开发环境。本专区将以嵌入式、物联网、游戏开发、Qualcomm® 骁龙™处理器的软件优化等技术为核心,打造全面的开发者技术服务社区,为下一代高性能体验和设计带来更多的想法和灵感。
加入 Qualcomm 开发者专区