пятница, 8 марта 2013 г.

scaling performance of libswscale versus OpenGL

There are many ways to scale an image. One can use an existing library, or implement scaling via the OpenGL pipeline. The latter choice looks preferable, as it uses hardware that is designed to do such tasks efficiently. But how fast is OpenGL? I got the following results (seconds per scale operation) on a Celeron P4500 (it has an integrated Ironlake video chip):
  w    h     soft     gl1      gl2   gl1nocopy gl2nocopy
 256; 256;0.002582;0.005684;0.007081;0.000657;0.002122
 256; 512;0.002958;0.009531;0.011303;0.000887;0.002118
 256; 768;0.003201;0.013818;0.015991;0.000859;0.002552
 256;1024;0.003485;0.018340;0.019503;0.001634;0.002126
 512; 256;0.003370;0.009567;0.011041;0.001214;0.002147
 512; 512;0.003848;0.019100;0.019351;0.001630;0.002103
 512; 768;0.004408;0.027939;0.028123;0.001745;0.002439
 512;1024;0.004896;0.036727;0.035284;0.002099;0.002110
 768; 256;0.004105;0.014059;0.015718;0.000943;0.002477
 768; 512;0.004929;0.027467;0.027748;0.001842;0.002419
 768; 768;0.005623;0.041481;0.040696;0.002376;0.002828
 768;1024;0.006465;0.056259;0.051331;0.002579;0.002218
1024; 256;0.004955;0.019852;0.019108;0.001627;0.002216
1024; 512;0.006256;0.037294;0.035924;0.002014;0.002284
1024; 768;0.006949;0.055599;0.052430;0.002492;0.002436
1024;1024;0.007896;0.073875;0.067410;0.002786;0.002413
1280; 256;0.005839;0.023752;0.024084;0.001527;0.002734
1280; 512;0.006979;0.045716;0.043938;0.002335;0.002552
1280; 768;0.008185;0.069451;0.063058;0.002651;0.002615
1280;1024;0.009488;0.093946;0.084176;0.003533;0.002774
1536; 256;0.006622;0.029007;0.028052;0.001837;0.002686
1536; 512;0.008031;0.055607;0.051383;0.002668;0.002287
1536; 768;0.009423;0.084270;0.076563;0.003294;0.002679
1536;1024;0.010719;0.114334;0.100322;0.003849;0.003139
1792; 256;0.007528;0.033786;0.032293;0.001866;0.002958
1792; 512;0.009060;0.064492;0.059214;0.002730;0.002416
1792; 768;0.010708;0.097901;0.087288;0.003630;0.002867
1792;1024;0.012345;0.131178;0.117498;0.003919;0.004337
2048; 256;0.008322;0.038345;0.035222;0.001984;0.002280
2048; 512;0.010135;0.074094;0.067603;0.002736;0.002468
2048; 768;0.011943;0.111330;0.101121;0.003874;0.003175
2048;1024;0.013746;0.149898;0.134717;0.004516;0.004226


Here, in the gl1 case I create and destroy textures on every scale, while in the gl2 case I create two large textures once and then use them as upload targets. The last two columns represent the case where no data is copied back. As OpenGL is tailored for output, retrieving data back can be slow, and you can see here how slow it can be: around 10 times slower than doing the scale on the CPU! But when you don't need the data back (i.e. you are going to display the scaled image on screen, or perform any additional processing), it can be dramatically faster.
The source image was 600x372. The w and h columns represent the width and height of the destination. Hardware: Celeron P4500 with integrated video chip.

Here is the source of scaling.c:

#define _POSIX_C_SOURCE    199309L
#define GL_GLEXT_PROTOTYPES
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <libswscale/swscale.h>
#include <cairo/cairo.h>
#include <GL/gl.h>
#include <GL/glext.h>
#include <GL/glx.h>
#include <X11/Xlib.h>
 
/* Benchmark parameters. */
int step = 256;         /* destination width/height advance in increments of this many pixels */
int max_w = 2048;       /* largest destination width; also sizes the shared buffers/textures */
int max_h = 1024;       /* largest destination height; also sizes the shared buffers/textures */
int reps_count = 100;   /* scale operations per timing sample (results are averaged over it) */
 
/*
 * Scale a packed BGRA image on the CPU with libswscale (fast bilinear filter).
 *
 * src_buf/dst_buf   pixel data, 4 bytes per pixel
 * src_w/src_h       source dimensions in pixels
 * dst_w/dst_h       destination dimensions in pixels
 * src_pitch/dst_pitch   row strides in bytes
 *
 * A scaling context is created and released on every call, so its setup cost
 * is part of whatever the caller measures. If the context cannot be created,
 * a message is written to stderr and dst_buf is left untouched.
 */
void scale_image_libswscale(void *src_buf, int src_w, int src_h, int src_pitch,
                            void *dst_buf, int dst_w, int dst_h, int dst_pitch)
{
    /* NOTE(review): newer FFmpeg releases spell this AV_PIX_FMT_BGRA - confirm
     * against the libswscale version being built against. */
    struct SwsContext *sws_ctx = sws_getContext(src_w, src_h, PIX_FMT_BGRA,
                                                dst_w, dst_h, PIX_FMT_BGRA, SWS_FAST_BILINEAR,
                                                NULL, NULL, NULL);
    if (NULL == sws_ctx) {
        /* sws_scale() must not be handed a NULL context. */
        fprintf(stderr, "sws_getContext failed (%dx%d -> %dx%d)\n",
                src_w, src_h, dst_w, dst_h);
        return;
    }

    /* Packed format: only plane 0 is used, the remaining planes stay empty. */
    uint8_t const * const src[] = {src_buf, 0, 0, 0};
    int src_stride[] = {src_pitch, 0, 0, 0};
    uint8_t * const dst[] = {dst_buf, 0, 0, 0};
    int dst_stride[] = {dst_pitch, 0, 0, 0};

    sws_scale(sws_ctx, src, src_stride, 0, src_h, dst, dst_stride);
    sws_freeContext(sws_ctx);
}
 
void scale_image_gl(void *src_buf, int src_w, int src_h, int src_pitch,
                    void *dst_buf, int dst_w, int dst_h, int dst_pitch, int nocopy)
{
    GLuint tex[2];
    glGenTextures(2, tex);
    glBindTexture(GL_TEXTURE_2D, tex[0]);
 
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, dst_w, dst_h, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex[0], 0);
 
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0, dst_w-1, 0, dst_h-1, -1.0, 1.0);
    glViewport(0, 0, dst_w, dst_h);
    glEnable(GL_TEXTURE_2D);
 
    glPixelStorei(GL_UNPACK_ROW_LENGTH, src_pitch/4);
    glBindTexture(GL_TEXTURE_2D, tex[1]);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, src_w, src_h, 0, GL_BGRA, GL_UNSIGNED_BYTE, src_buf);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 
    glBegin(GL_QUADS);
        glTexCoord2f(0.0f, 0.0f); glVertex2i(0, 0);
        glTexCoord2f(1.0f, 0.0f); glVertex2i(dst_w - 1, 0);
        glTexCoord2f(1.0f, 1.0f); glVertex2i(dst_w - 1, dst_h - 1);
        glTexCoord2f(0.0f, 1.0f); glVertex2i(0, dst_h - 1);
    glEnd();
 
    glBindTexture(GL_TEXTURE_2D, tex[0]);
    if (!nocopy)
        glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_BYTE, dst_buf);
 
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
 
    glDeleteTextures(2, tex);
}
 
/*
 * Scale a BGRA image with OpenGL, reusing textures that were created once by
 * the caller (the "gl2" variant of the benchmark).
 *
 * The image is uploaded into the top-left src_w x src_h region of tex_src and
 * drawn as a quad covering the whole viewport of the currently bound
 * framebuffer; the result is optionally read back with glReadPixels.
 *
 * tex_src               source texture; must have been allocated with
 *                       max_w x max_h texels (as main does), because the
 *                       texture coordinates below are derived from that size
 * src_pitch/dst_pitch   row strides in bytes (4 bytes per pixel is assumed
 *                       when converting them to GL row lengths)
 * nocopy                when non-zero the result is not read back and
 *                       dst_buf is left untouched
 */
void scale_image_gl2(void *src_buf, int src_w, int src_h, int src_pitch,
                     void *dst_buf, int dst_w, int dst_h, int dst_pitch, int nocopy, GLuint tex_src)
{
    /* The image occupies only part of the max_w x max_h texture; sampling
     * 0..1 would scale the whole texture, including texels never written. */
    const GLfloat s_max = (GLfloat)src_w / (GLfloat)max_w;
    const GLfloat t_max = (GLfloat)src_h / (GLfloat)max_h;

    /* The projection and the quad below both span 0..dst-1, so the quad
     * fills the viewport exactly. */
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0, dst_w-1, 0, dst_h-1, -1.0, 1.0);
    glViewport(0, 0, dst_w, dst_h);
    glEnable(GL_TEXTURE_2D);

    /* Upload honours the source stride via the UNPACK state. */
    glBindTexture(GL_TEXTURE_2D, tex_src);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, src_pitch/4);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, src_w, src_h, GL_BGRA, GL_UNSIGNED_BYTE, src_buf);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);

    glBegin(GL_QUADS);
        glTexCoord2f(0.0f,  0.0f);  glVertex2i(0, 0);
        glTexCoord2f(s_max, 0.0f);  glVertex2i(dst_w - 1, 0);
        glTexCoord2f(s_max, t_max); glVertex2i(dst_w - 1, dst_h - 1);
        glTexCoord2f(0.0f,  t_max); glVertex2i(0, dst_h - 1);
    glEnd();

    if (!nocopy) {
        /* glReadPixels writes to client memory, which is controlled by the
         * PACK state (UNPACK only affects uploads). */
        glPixelStorei(GL_PACK_ROW_LENGTH, dst_pitch/4);
        glReadPixels(0, 0, dst_w, dst_h, GL_BGRA, GL_UNSIGNED_BYTE, dst_buf);
        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
    }
}
 
 
/* Seconds elapsed between two clock_gettime() samples. */
static double elapsed_seconds(const struct timespec *begin, const struct timespec *end)
{
    return (end->tv_nsec - begin->tv_nsec)/1e9 + (end->tv_sec - begin->tv_sec);
}

/*
 * Benchmark driver.
 *
 * Loads "kuzina.png", then for every destination size from step x step up to
 * max_w x max_h scales it reps_count times with libswscale, scale_image_gl and
 * scale_image_gl2 (the GL variants both with and without read-back), and
 * prints one row per size to stdout:
 *
 *     w;h;soft;gl1;gl2;gl1nocopy;gl2nocopy
 *
 * where the last five fields are average seconds per scale operation.
 * Errors are reported on stderr so the table on stdout stays clean.
 */
int main(void)
{
    struct timespec t_begin, t_end;
    double interval_sws, interval_gl1[2], interval_gl2[2];

    cairo_surface_t *img_surf = cairo_image_surface_create_from_png("kuzina.png");
    if (CAIRO_STATUS_SUCCESS != cairo_surface_status(img_surf)) {
        fprintf(stderr, "image load failure\n");
        exit(1);
    }

    void *src_buf = cairo_image_surface_get_data(img_surf);
    int src_w = cairo_image_surface_get_width(img_surf);
    int src_h = cairo_image_surface_get_height(img_surf);
    int src_stride = cairo_image_surface_get_stride(img_surf);

    /* One destination buffer, large enough for the biggest size tested. */
    void *dst_buf = malloc((size_t)max_w * (size_t)max_h * 4);
    if (NULL == dst_buf) {
        fprintf(stderr, "malloc failed\n");
        exit(1);
    }

    /* GL context setup; every step can fail (e.g. no X server available). */
    Display *display = XOpenDisplay(NULL);
    if (NULL == display) {
        fprintf(stderr, "cannot open X display\n");
        exit(1);
    }
    GLint att[] = {GLX_RGBA, GLX_DEPTH_SIZE, 24, GLX_DOUBLEBUFFER, None};
    XVisualInfo *vi = glXChooseVisual(display, 0, att);
    if (NULL == vi) {
        fprintf(stderr, "no suitable GLX visual\n");
        exit(1);
    }
    GLXContext glc = glXCreateContext(display, vi, NULL, GL_TRUE);
    if (NULL == glc) {
        fprintf(stderr, "cannot create GLX context\n");
        exit(1);
    }
    if (!glXMakeCurrent(display, DefaultRootWindow(display), glc)) {
        fprintf(stderr, "cannot make GLX context current\n");
        exit(1);
    }

    /* fbo_id[0] is used by scale_image_gl, fbo_id[1] by scale_image_gl2.
     * Both are bound once here; fbo_id[1] is left bound for the attachment
     * made below. */
    GLuint fbo_id[2];
    glGenFramebuffers(2, fbo_id);
    glBindFramebuffer(GL_FRAMEBUFFER, fbo_id[0]);
    glBindFramebuffer(GL_FRAMEBUFFER, fbo_id[1]);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

    /* Persistent textures for the gl2 variant, both max_w x max_h:
     * tex[0] is the render target, tex[1] the upload target. */
    GLuint tex[2];
    glGenTextures(2, tex);
    for (int i = 0; i < 2; i ++) {
        glBindTexture(GL_TEXTURE_2D, tex[i]);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, max_w, max_h, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    }
    /* fbo_id[1] is still bound, so this attaches tex[0] to it. */
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex[0], 0);

    for (int dst_w = step; dst_w <= max_w; dst_w += step) {
        for (int dst_h = step; dst_h <= max_h; dst_h += step) {
            // swscale
            clock_gettime(CLOCK_MONOTONIC_RAW, &t_begin);
            for (int k = 0; k < reps_count; k ++) {
                scale_image_libswscale(src_buf, src_w, src_h, src_stride,
                                       dst_buf, dst_w, dst_h, 4*dst_w);
            }
            clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
            interval_sws = elapsed_seconds(&t_begin, &t_end);

            for (int nocopy = 0; nocopy < 2; nocopy ++) {
                // gl scale
                clock_gettime(CLOCK_MONOTONIC_RAW, &t_begin);
                glBindFramebuffer(GL_FRAMEBUFFER, fbo_id[0]);
                for (int k = 0; k < reps_count; k ++) {
                    scale_image_gl(src_buf, src_w, src_h, src_stride,
                                   dst_buf, dst_w, dst_h, 4*dst_w, nocopy);
                }
                /* GL calls may return before the work is executed; wait for
                 * completion so the nocopy timings include the actual
                 * rendering and not only command submission. */
                glFinish();
                clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
                interval_gl1[nocopy] = elapsed_seconds(&t_begin, &t_end);

                // gl scale 2
                clock_gettime(CLOCK_MONOTONIC_RAW, &t_begin);
                glBindFramebuffer(GL_FRAMEBUFFER, fbo_id[1]);
                for (int k = 0; k < reps_count; k ++) {
                    scale_image_gl2(src_buf, src_w, src_h, src_stride,
                                    dst_buf, dst_w, dst_h, 4*dst_w, nocopy, tex[1]);
                }
                glFinish();
                clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
                interval_gl2[nocopy] = elapsed_seconds(&t_begin, &t_end);
            }

            printf("%4d;%4d;%8f;%8f;%8f;%8f;%8f\n", dst_w, dst_h, interval_sws/reps_count,
                interval_gl1[0]/reps_count, interval_gl2[0]/reps_count,
                interval_gl1[1]/reps_count, interval_gl2[1]/reps_count);
        }
    }

    /* Release GL objects while the context is still current, then tear down
     * the context and the X connection. */
    glBindFramebuffer(GL_FRAMEBUFFER, 0);
    glDeleteTextures(2, tex);
    glDeleteFramebuffers(2, fbo_id);
    glXMakeCurrent(display, None, NULL);
    glXDestroyContext(display, glc);
    XFree(vi);
    XCloseDisplay(display);

    free(dst_buf);
    cairo_surface_destroy(img_surf);

    return 0;
}
 


CMakeLists.txt:

cmake_minimum_required(VERSION 2.8)
project(sc C)

# Build as GNU C99 with the common warning sets enabled.
add_definitions(-std=gnu99 -Wall -Wextra)

# scaling.c includes Xlib headers directly, so X11 is mandatory.
find_package(X11 REQUIRED)
find_package(PkgConfig REQUIRED)
pkg_check_modules(QW REQUIRED cairo gl glu libswscale)

link_directories(${QW_LIBRARY_DIRS})
# Header search paths: pkg-config include dirs (not library dirs) plus X11.
include_directories(${QW_INCLUDE_DIRS} ${X11_INCLUDE_DIR})

add_executable(sc scaling.c)

# rt is linked for clock_gettime(), which older glibc keeps in librt.
target_link_libraries(sc ${QW_LIBRARIES} rt ${X11_LIBRARIES})

Комментариев нет:

Отправить комментарий