TUT HEVC Encoder
reg_sad_pow2_widths-sse41.h
1/*****************************************************************************
2 * This file is part of Kvazaar HEVC encoder.
3 *
4 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without modification,
8 * are permitted provided that the following conditions are met:
9 *
10 * * Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright notice, this
14 * list of conditions and the following disclaimer in the documentation and/or
15 * other materials provided with the distribution.
16 *
17 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ****************************************************************************/
32
33#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_
34#define REG_SAD_POW2_WIDTHS_SSE41_H_
35
36#include "kvazaar.h"
37
38#if KVZ_BIT_DEPTH == 8
39
40#include "missing-intel-intrinsics.h"
41#include <immintrin.h>
42
43static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2,
44 const int32_t height, const uint32_t stride1,
45 const uint32_t stride2)
46{
47 return 0;
48}
49
50static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2,
51 const int32_t height, const uint32_t stride1,
52 const uint32_t stride2)
53{
55 int32_t y;
56
57 const int32_t height_fourline_groups = height & ~3;
58 const int32_t height_residual_lines = height & 3;
59
60 for (y = 0; y < height_fourline_groups; y += 4) {
61 __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
62 __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));
63
64 a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1);
65 b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1);
66 a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2);
67 b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2);
68 a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3);
69 b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3);
70
73 }
75 for (; y < height; y++) {
76 __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
77 __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));
78
81 }
82 }
85
86 return _mm_cvtsi128_si32(sad);
87}
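The reg_sad_w* kernels in this file all compute the same quantity, a sum of absolute differences over a fixed-width block, with separate strides for the two buffers. As a point of reference, a scalar model of that computation could look like the sketch below (illustrative only, not part of the Kvazaar source; the explicit width parameter and the helper name are hypothetical).

static INLINE uint32_t reg_sad_scalar_model(const uint8_t *data1, const uint8_t *data2,
                                            int32_t width, int32_t height,
                                            uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      // Absolute difference of co-located pixels, accumulated over the block
      int32_t d = (int32_t)data1[y * stride1 + x] - (int32_t)data2[y * stride2 + x];
      sad += (d < 0) ? (uint32_t)(-d) : (uint32_t)d;
    }
  }
  return sad;
}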
88
89static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2,
90 const int32_t height, const uint32_t stride1,
91 const uint32_t stride2)
92{
94 int32_t y;
95
96 const int32_t height_fourline_groups = height & ~3;
97 const int32_t height_residual_lines = height & 3;
98
99 for (y = 0; y < height_fourline_groups; y += 4) {
104
105 a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1));
106 b_d = _mm_loadl_pd(b_d, (const double *)(data2 + (y + 0) * stride2));
107 a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1));
108 b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2));
109
110 c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1));
111 d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2));
112 c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1));
113 d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2));
114
119
124 }
126 for (; y < height; y++) {
129
132 }
133 }
136
137 return _mm_cvtsi128_si32(sad);
138}
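reg_sad_w8 above packs two 8-pixel rows into one XMM register with the double-precision half-load intrinsics before (presumably, in the elided lines) feeding 16 bytes at a time to the SAD instruction. A minimal sketch of that packing step with a hypothetical helper name; the kernel itself keeps the values in __m128d variables such as a_d and b_d.

static INLINE __m128i load_two_8px_rows(const uint8_t *row0, const uint8_t *row1)
{
  __m128d d = _mm_setzero_pd();
  d = _mm_loadl_pd(d, (const double *)row0);   // low  64 bits <- first row (8 pixels)
  d = _mm_loadh_pd(d, (const double *)row1);   // high 64 bits <- second row (8 pixels)
  return _mm_castpd_si128(d);                  // reinterpret as integer byte lanes
}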
139
140static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2,
141 const int32_t height, const uint32_t stride1,
142 const uint32_t stride2)
143{
145 int32_t y;
146 for (y = 0; y < height; y++) {
147 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1));
148 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2));
149
150 __m128i b_masked = _mm_blend_epi16(a, b, 0x3f);
153 }
156 return _mm_cvtsi128_si32(sad);
157}
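For the 12-pixel width, a full 16-byte load is used and _mm_blend_epi16 with mask 0x3f copies bytes 12..15 of a into b_masked, so those four lanes cancel to zero in the SAD. A self-contained one-row sketch of the idea (the helper name and the final fold are illustrative; the elided lines of reg_sad_w12 may accumulate differently):

static INLINE uint32_t sad_w12_one_row_sketch(const uint8_t *row1, const uint8_t *row2)
{
  __m128i a = _mm_loadu_si128((const __m128i *)row1);
  __m128i b = _mm_loadu_si128((const __m128i *)row2);
  // Words 0..5 (bytes 0..11) come from b, words 6..7 (bytes 12..15) from a,
  // so the last four byte differences contribute nothing to the SAD.
  __m128i b_masked  = _mm_blend_epi16(a, b, 0x3f);
  __m128i curr_sads = _mm_sad_epu8(a, b_masked);
  // _mm_sad_epu8 leaves two 64-bit partial sums; add the high one onto the low one.
  __m128i total = _mm_add_epi64(curr_sads, _mm_srli_si128(curr_sads, 8));
  return _mm_cvtsi128_si32(total);
}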
158
159static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2,
160 const int32_t height, const uint32_t stride1,
161 const uint32_t stride2)
162{
164 int32_t y;
165
166 const int32_t height_fourline_groups = height & ~3;
167 const int32_t height_residual_lines = height & 3;
168
169 for (y = 0; y < height_fourline_groups; y += 4) {
170 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
171 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
172 __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
173 __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
174 __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1));
175 __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2));
176 __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1));
177 __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2));
178
183
188 }
190 for (; y < height; y++) {
191 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
192 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
193
196 }
197 }
198
201 return _mm_cvtsi128_si32(sad);
202}
203
204static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2,
205 const int32_t height, const uint32_t stride1,
206 const uint32_t stride2)
207{
209 int32_t y;
210
211 const int32_t height_doublelines = height & ~1;
212 const int32_t height_parity = height & 1;
213
214 for (y = 0; y < height_doublelines; y += 2) {
215 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
216 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
217 __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
218 __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
219
222
223 e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16));
224 f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16));
225 e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16));
226 f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16));
227
230
234
238 }
239 if (height_parity) {
240 __m128i a = _mm_loadu_si128 ((const __m128i *)(data1 + y * stride1));
241 __m128i b = _mm_loadu_si128 ((const __m128i *)(data2 + y * stride2));
242 __m128i c = _mm_loadl_epi64 ((const __m128i *)(data1 + y * stride1 + 16));
243 __m128i d = _mm_loadl_epi64 ((const __m128i *)(data2 + y * stride2 + 16));
244
247
250 }
253 return _mm_cvtsi128_si32(sad);
254}
255
256static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2,
257 const int32_t width, const int32_t height, const uint32_t stride1,
258 const uint32_t stride2)
259{
260 int32_t y, x;
262
263 // Bytes of each scanline covered by whole 128-bit vectors, and the leftover bytes
264 const int32_t width_xmms = width & ~15;
265 const int32_t width_residual_pixels = width & 15;
266
267 const int32_t height_fourline_groups = height & ~3;
268 const int32_t height_residual_lines = height & 3;
269
271 const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
272 8, 9, 10, 11, 12, 13, 14, 15);
274
275 for (x = 0; x < width_xmms; x += 16) {
276 for (y = 0; y < height_fourline_groups; y += 4) {
277 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
278 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
279 __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
280 __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
281 __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
282 __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
283 __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
284 __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));
285
290
295 }
297 for (; y < height; y++) {
298 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
299 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));
300
302
304 }
305 }
306 }
307
309 for (y = 0; y < height_fourline_groups; y += 4) {
310 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
311 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
312 __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
313 __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
314 __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
315 __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
316 __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
317 __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));
318
323
328
333 }
335 for (; y < height; y++) {
336 __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
337 __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));
338
341
343 }
344 }
345 }
348
349 return _mm_cvtsi128_si32(sad);
350}
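In reg_sad_arbitrary, the lane-index constant ns appears to exist for handling the residual columns: comparing each lane index against the leftover pixel count yields a byte mask that can zero out the lanes beyond the block's right edge before the SAD. A sketch of that idea under this assumption; the exact masking sequence in the elided lines may differ.

static INLINE __m128i residual_width_mask_sketch(int32_t width_residual_pixels)
{
  const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                   8, 9, 10, 11, 12, 13, 14, 15);
  // 0xff for lanes whose index is below the residual width, 0x00 for the rest
  return _mm_cmplt_epi8(ns, _mm_set1_epi8((char)width_residual_pixels));
}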
351
352static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data,
353 int32_t height, uint32_t stride)
354{
357 int32_t y;
358
359 const int32_t height_fourline_groups = height & ~3;
360 const int32_t height_residual_lines = height & 3;
361
362 for (y = 0; y < height_fourline_groups; y += 4) {
363 __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));
364
365 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
366 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
367 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);
368
371 }
373 // Only pick the last dword, because we're comparing single dwords (lines)
375
376 for (; y < height; y++) {
377 __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
378
381 }
382 }
385
386 return _mm_cvtsi128_si32(sad);
387}
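The ver_sad_* family takes a single stride, which suggests that every picture row is compared against the same one reference row (the case of a reference block extrapolated vertically from a frame border). A scalar model of that reading, as an illustrative sketch with a hypothetical explicit width:

static uint32_t ver_sad_scalar_model(const uint8_t *pic_data, const uint8_t *ref_data,
                                     int32_t width, int32_t height, uint32_t stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      // The same reference row is reused for every picture row: ref_data is indexed by x only
      int32_t d = (int32_t)pic_data[y * stride + x] - (int32_t)ref_data[x];
      sad += (d < 0) ? (uint32_t)(-d) : (uint32_t)d;
    }
  }
  return sad;
}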
388
389static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data,
390 int32_t height, uint32_t stride)
391{
394 int32_t y;
395
396 const int32_t height_fourline_groups = height & ~3;
397 const int32_t height_residual_lines = height & 3;
398
399 for (y = 0; y < height_fourline_groups; y += 4) {
402
403 a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
404 a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));
405
406 c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
407 c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));
408
411
416 }
419
420 for (; y < height; y++) {
421 __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride));
422
425 }
426 }
429
430 return _mm_cvtsi128_si32(sad);
431}
432
433static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data,
434 int32_t height, uint32_t stride)
435{
438 int32_t y;
439
440 for (y = 0; y < height; y++) {
441 __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));
442
446 }
449 return _mm_cvtsi128_si32(sad);
450}
451
452static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data,
453 int32_t height, uint32_t stride)
454{
457 int32_t y;
458
459 const int32_t height_fourline_groups = height & ~3;
460 const int32_t height_residual_lines = height & 3;
461
462 for (y = 0; y < height_fourline_groups; y += 4) {
463 __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
464 __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
465 __m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
466 __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));
467
472
477 }
479 for (; y < height; y++) {
480 __m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
482
484 }
485 }
488
489 return _mm_cvtsi128_si32(sad);
490}
491
492static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
493 int32_t width, int32_t height, uint32_t stride)
494{
495 int32_t y, x;
497
498 // Bytes of each scanline covered by whole 128-bit vectors, and the leftover bytes
499 const int32_t width_xmms = width & ~15;
500 const int32_t width_residual_pixels = width & 15;
501
502 const int32_t height_fourline_groups = height & ~3;
503 const int32_t height_residual_lines = height & 3;
504
506 const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
507 8, 9, 10, 11, 12, 13, 14, 15);
509
510 for (x = 0; x < width_xmms; x += 16) {
512 for (y = 0; y < height_fourline_groups; y += 4) {
513 __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
514 __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
515 __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
516 __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
517
522
527 }
529 for (; y < height; y++) {
530 __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
531
533
535 }
536 }
537 }
538
541 for (y = 0; y < height_fourline_groups; y += 4) {
542 __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
543 __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
544 __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
545 __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
546
551
556
561 }
563 for (; y < height; y++) {
564 __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
565
568
570 }
571 }
572 }
575
576 return _mm_cvtsi128_si32(sad);
577}
578
579static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data,
580 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
581 uint32_t left, uint32_t right)
582{
583 const int32_t right_border_idx = 3 - right;
584 const int32_t border_idx = left ? left : right_border_idx;
585
586 const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
587 8, 9, 10, 11, 12, 13, 14, 15);
588
590 const int32_t leftoff = border_idx_negative | left;
591
592 // Dualword (ie. line) base indexes, ie. the edges the lines read will be
593 // clamped towards
594 const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4,
595 8, 8, 8, 8, 12, 12, 12, 12);
596
599
601
604
606
607 const int32_t height_fourline_groups = height & ~3;
608 const int32_t height_residual_lines = height & 3;
609
611 int32_t y;
612 for (y = 0; y < height_fourline_groups; y += 4) {
615
616 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride), 1);
617 b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1);
618 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride), 2);
619 b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2);
620 a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride), 3);
621 b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3);
622
626 }
628 for (; y < height; y++) {
631
635 }
636 }
639
640 return _mm_cvtsi128_si32(sad);
641}
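The elided border_idx_negative value used to form leftoff above is, judging by the comments in the functions below, an all-ones word when border_idx is negative and zero otherwise; OR-ing it with left then redirects the load offset to -1 whenever the whole vector lies outside the frame, without a branch. One way to express that, as a sketch (the exact expression in the elided line may differ):

static INLINE int32_t leftoff_sketch(int32_t border_idx, uint32_t left)
{
  // Arithmetic right shift on the usual two's-complement targets:
  // -1 if border_idx < 0, otherwise 0.
  const int32_t border_idx_negative = border_idx >> 31;
  // -1 | anything == -1, so a fully-outside block loads from offset -1;
  // otherwise the offset is simply "left".
  return border_idx_negative | (int32_t)left;
}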
642
643static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data,
644 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
645 uint32_t left, uint32_t right)
646{
647 // right is the number of overhanging pixels in the vector, so it has to be
648 // handled this way to produce the index of last valid (border) pixel
649 const int32_t right_border_idx = 7 - right;
650 const int32_t border_idx = left ? left : right_border_idx;
651
652 const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
653 8, 9, 10, 11, 12, 13, 14, 15);
654
655 // Quadword (ie. line) base indexes, ie. the edges the lines read will be
656 // clamped towards; higher qword (lower line) bytes tend towards 8 and lower
657 // qword (higher line) bytes towards 0
658 const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
659 8, 8, 8, 8, 8, 8, 8, 8);
660
661 // Dirty hack alert! If right == block_width (ie. the entire vector is
662 // outside the frame), move the block offset one pixel to the left (so
663 // that the leftmost pixel in vector is actually the valid border pixel
664 // from which we want to extrapolate), and use an epol mask that will
665 // simply stretch the pixel all over the vector.
666 //
667 // To avoid a branch here:
668 // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
670 const int32_t leftoff = border_idx_negative | left;
671
674
676
677 // If we're straddling the left border, right_border_idx is 7 and the first
678 // operation does nothing. If right border, left is 0 and the second
679 // operation does nothing.
682
683 // If right == 8 (we're completely outside the frame), right_border_idx is
684 // -1 and so is mask1. Clamp negative values to qwbaseid and as discussed
685 // earlier, adjust the load offset instead to load the "-1'st" pixels and
686 // using qwbaseids as the shuffle mask, broadcast it all over the rows.
688
689 const int32_t height_fourline_groups = height & ~3;
690 const int32_t height_residual_lines = height & 3;
691
693 int32_t y;
694 for (y = 0; y < height_fourline_groups; y += 4) {
699
700 a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride));
701 b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff));
702 a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride));
703 b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff));
704
705 c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride));
706 d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff));
707 c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride));
708 d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff));
709
714
717
722 }
724 for (; y < height; y++) {
727
729
732 }
733 }
736 return _mm_cvtsi128_si32(sad);
737}
738
739/*
740 * left and right measure how many pixels of one horizontal scanline will be
741 * outside either the left or the right screen border. For blocks straddling
742 * the left border, read the scanlines starting from the left border instead,
743 * and use the extrapolation mask to essentially move the pixels right while
744 * copying the left border pixel to the vector positions that logically point
745 * outside of the buffer.
746 *
747 * For blocks straddling the right border, just read over the right border,
748 * and extrapolate all pixels beyond the border idx to copy the value of the
749 * border pixel. An exception is right == width (leftmost reference pixel is
750 * one place right from the right border, it's ugly because the pixel to
751 * extrapolate from is located at relative X offset -1), abuse the left border
752 * aligning functionality instead to actually read starting from the valid
753 * border pixel, and use a suitable mask to fill all the other pixels with
754 * that value.
755 */
756static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data,
757 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
758 const uint32_t left, const uint32_t right)
759{
760 // right is the number of overhanging pixels in the vector, so it has to be
761 // handled this way to produce the index of last valid (border) pixel
762 const int32_t right_border_idx = 15 - right;
763 const int32_t border_idx = left ? left : right_border_idx;
764
765 const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
766 8, 9, 10, 11, 12, 13, 14, 15);
768
769 // Dirty hack alert! If right == block_width (ie. the entire vector is
770 // outside the frame), move the block offset one pixel to the left (so
771 // that the leftmost pixel in vector is actually the valid border pixel
772 // from which we want to extrapolate), and use an epol mask that will
773 // simply stretch the pixel all over the vector.
774 //
775 // To avoid a branch here:
776 // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
778 const int32_t leftoff = border_idx_negative | left;
779
782
783 // If we're straddling the left border, right_border_idx is 15 and the first
784 // operation does nothing. If right border, left is 0 and the second
785 // operation does nothing.
788
789 // If right == 16 (we're completely outside the frame), right_border_idx is
790 // -1 and so is mask1. Clamp negative values to zero and as discussed
791 // earlier, adjust the load offset instead to load the "-1'st" pixel and
792 // using an all-zero shuffle mask, broadcast it all over the vector.
794
795 const int32_t height_fourline_groups = height & ~3;
796 const int32_t height_residual_lines = height & 3;
797
799 int32_t y;
800 for (y = 0; y < height_fourline_groups; y += 4) {
801 __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
809
814
819
824 }
826 for (; y < height; y++) {
827 __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
832 }
833 }
836 return _mm_cvtsi128_si32(sad);
837}
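The block comment above hor_sad_sse41_w16 describes the border handling as a clamp: any reference x position outside the valid range [left, 15 - right] reads the nearest border pixel instead. A scalar model of that behaviour for the 16-wide case (an illustrative sketch; it assumes the clamped offsets always land on addressable reference pixels, including the offset -1 case discussed above):

static uint32_t hor_sad_w16_scalar_model(const uint8_t *pic_data, const uint8_t *ref_data,
                                         int32_t height, uint32_t pic_stride,
                                         uint32_t ref_stride, uint32_t left, uint32_t right)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    const uint8_t *pic_row = pic_data + y * pic_stride;
    const uint8_t *ref_row = ref_data + y * ref_stride;
    for (int32_t x = 0; x < 16; x++) {
      // Clamp the reference x coordinate to the last valid pixel on each side
      int32_t xr = x;
      if (xr < (int32_t)left)       xr = (int32_t)left;
      if (xr > 15 - (int32_t)right) xr = 15 - (int32_t)right;
      int32_t d = (int32_t)pic_row[x] - (int32_t)ref_row[xr];
      sad += (d < 0) ? (uint32_t)(-d) : (uint32_t)d;
    }
  }
  return sad;
}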
838
839static uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
840 int32_t width, int32_t height, uint32_t pic_stride,
841 uint32_t ref_stride, uint32_t left, uint32_t right)
842{
844
845 const size_t vec_width = 16;
846 const size_t vecwid_bitmask = 15;
847 const size_t vec_width_log2 = 4;
848
849 const int32_t height_fourline_groups = height & ~3;
850 const int32_t height_residual_lines = height & 3;
851
852 const __m128i rights = _mm_set1_epi8((uint8_t)right);
853 const __m128i blk_widths = _mm_set1_epi8((uint8_t)width);
855 const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
856
860 if (left) {
865 left_offset = left;
866 border_off = left;
867 invec_lstart = 0;
869 invec_linc = 1;
870 is_left_bm = -1;
871 } else {
872 inside_vecs = ((width - right) + vecwid_bitmask) >> vec_width_log2;
876 left_offset = right - width;
877 border_off = width - 1 - right;
879 invec_lend = -1;
880 invec_linc = -1;
881 is_left_bm = 0;
882 }
884
888
889 // -x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00.
890 // in other words, calculate inverse of left_offsets if is_left is true.
893
896
901
903 int32_t x, y;
904 for (y = 0; y < height_fourline_groups; y += 4) {
909
910 for (x = 0; x < outside_vecs; x++) {
915
918
919 // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc
923
928
933
938 }
941
946
947 for (x = invec_lstart; x != invec_lend; x += invec_linc) {
948 __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
956
961
966
972
977
982
987
992 }
993 }
995 for (; y < height; y++) {
997 for (x = 0; x < outside_vecs; x++) {
999
1002
1003 // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc
1008
1011 }
1014
1016 for (x = invec_lstart; x != invec_lend; x += invec_linc) {
1017 __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
1019
1022
1029
1030 old_b = b_shifted;
1031
1034 }
1035 }
1036 }
1039 return _mm_cvtsi128_si32(sad);
1040}
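The arithmetic noted in the comment inside the function above ("-x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff", while "x == (x ^ 0x00) - 0x00") is the standard mask-based conditional negation: XOR with an all-ones byte and subtract the mask to negate, or XOR with zero and subtract zero to keep the value. A minimal vector sketch of the identity (hypothetical helper, not taken from the elided lines):

static INLINE __m128i conditional_negate_epi8_sketch(__m128i x, __m128i negate_mask)
{
  // negate_mask bytes are either 0x00 (keep the byte) or 0xff (negate it):
  // (x ^ 0xff) - 0xff == ~x + 1 == -x, while (x ^ 0x00) - 0x00 == x.
  return _mm_sub_epi8(_mm_xor_si128(x, negate_mask), negate_mask);
}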
1041
1042#endif // KVZ_BIT_DEPTH == 8
1043
1044#endif