Imported Upstream version 0.148.2708+git86b7198
Rico Tzschichholz
7 years ago
1252 | 1252 | ret |
1253 | 1253 | endfunc |
1254 | 1254 | |
1255 | function x264_plane_copy_neon, export=1 | |
1255 | function x264_plane_copy_core_neon, export=1 | |
1256 | 1256 | add x8, x4, #15 |
1257 | 1257 | and x4, x8, #~15 |
1258 | 1258 | sub x1, x1, x4 |
1351 | 1351 | ret |
1352 | 1352 | endfunc |
1353 | 1353 | |
1354 | function x264_plane_copy_interleave_neon, export=1 | |
1354 | function x264_plane_copy_interleave_core_neon, export=1 | |
1355 | 1355 | add w9, w6, #15 |
1356 | 1356 | and w9, w9, #0xfffffff0 |
1357 | 1357 | sub x1, x1, x9, lsl #1 |
1666 | 1666 | b.gt 1b |
1667 | 1667 | ret |
1668 | 1668 | endfunc |
1669 | ||
1670 | // void mbtree_fix8_pack( int16_t *dst, float *src, int count ) | |
1671 | function x264_mbtree_fix8_pack_neon, export=1 | |
1672 | subs w3, w2, #8 | |
1673 | b.lt 2f | |
1674 | 1: | |
1675 | subs w3, w3, #8 | |
1676 | ld1 {v0.4s,v1.4s}, [x1], #32 | |
1677 | fcvtzs v0.4s, v0.4s, #8 | |
1678 | fcvtzs v1.4s, v1.4s, #8 | |
1679 | sqxtn v2.4h, v0.4s | |
1680 | sqxtn2 v2.8h, v1.4s | |
1681 | rev16 v3.16b, v2.16b | |
1682 | st1 {v3.8h}, [x0], #16 | |
1683 | b.ge 1b | |
1684 | 2: | |
1685 | adds w3, w3, #8 | |
1686 | b.eq 4f | |
1687 | 3: | |
1688 | subs w3, w3, #1 | |
1689 | ldr s0, [x1], #4 | |
1690 | fcvtzs w4, s0, #8 | |
1691 | rev16 w5, w4 | |
1692 | strh w5, [x0], #2 | |
1693 | b.gt 3b | |
1694 | 4: | |
1695 | ret | |
1696 | endfunc | |
1697 | ||
1698 | // void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) | |
1699 | function x264_mbtree_fix8_unpack_neon, export=1 | |
1700 | subs w3, w2, #8 | |
1701 | b.lt 2f | |
1702 | 1: | |
1703 | subs w3, w3, #8 | |
1704 | ld1 {v0.8h}, [x1], #16 | |
1705 | rev16 v1.16b, v0.16b | |
1706 | sxtl v2.4s, v1.4h | |
1707 | sxtl2 v3.4s, v1.8h | |
1708 | scvtf v4.4s, v2.4s, #8 | |
1709 | scvtf v5.4s, v3.4s, #8 | |
1710 | st1 {v4.4s,v5.4s}, [x0], #32 | |
1711 | b.ge 1b | |
1712 | 2: | |
1713 | adds w3, w3, #8 | |
1714 | b.eq 4f | |
1715 | 3: | |
1716 | subs w3, w3, #1 | |
1717 | ldrh w4, [x1], #2 | |
1718 | rev16 w5, w4 | |
1719 | sxth w6, w5 | |
1720 | scvtf s0, w6, #8 | |
1721 | str s0, [x0], #4 | |
1722 | b.gt 3b | |
1723 | 4: | |
1724 | ret | |
1725 | endfunc |
48 | 48 | void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); |
49 | 49 | void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); |
50 | 50 | |
51 | void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, | |
52 | pixel *src, intptr_t i_src, int w, int h ); | |
51 | void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, | |
52 | pixel *src, intptr_t i_src, int w, int h ); | |
53 | 53 | void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, |
54 | 54 | pixel *dstv, intptr_t i_dstv, |
55 | 55 | pixel *src, intptr_t i_src, int w, int h ); |
57 | 57 | pixel *dstb, intptr_t i_dstb, |
58 | 58 | pixel *dstc, intptr_t i_dstc, |
59 | 59 | pixel *src, intptr_t i_src, int pw, int w, int h ); |
60 | void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, | |
61 | pixel *srcu, intptr_t i_srcu, | |
62 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
60 | void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, | |
61 | pixel *srcu, intptr_t i_srcu, | |
62 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
63 | 63 | |
64 | 64 | void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); |
65 | 65 | void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); |
99 | 99 | |
100 | 100 | void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); |
101 | 101 | |
102 | void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count ); | |
103 | void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); | |
104 | ||
102 | 105 | #if !HIGH_BIT_DEPTH |
103 | 106 | static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) |
104 | 107 | { |
202 | 205 | void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, |
203 | 206 | uint8_t *src, intptr_t stride, int width, |
204 | 207 | int height, int16_t *buf ); |
208 | ||
209 | PLANE_COPY(16, neon) | |
210 | PLANE_INTERLEAVE(neon) | |
205 | 211 | #endif // !HIGH_BIT_DEPTH |
206 | 212 | |
207 | 213 | PROPAGATE_LIST(neon) |
261 | 267 | |
262 | 268 | pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; |
263 | 269 | pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon; |
270 | pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; | |
271 | pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; | |
264 | 272 | |
265 | 273 | pf->memcpy_aligned = x264_memcpy_aligned_neon; |
266 | 274 | pf->memzero_aligned = x264_memzero_aligned_neon; |
1467 | 1467 | bx lr |
1468 | 1468 | endfunc |
1469 | 1469 | |
1470 | function x264_plane_copy_neon | |
1470 | function x264_plane_copy_core_neon | |
1471 | 1471 | push {r4,lr} |
1472 | 1472 | ldr r4, [sp, #8] |
1473 | 1473 | ldr lr, [sp, #12] |
1576 | 1576 | pop {r4-r8, r10, r11, pc} |
1577 | 1577 | endfunc |
1578 | 1578 | |
1579 | function x264_plane_copy_interleave_neon | |
1579 | function x264_plane_copy_interleave_core_neon | |
1580 | 1580 | push {r4-r7, lr} |
1581 | 1581 | ldrd r6, r7, [sp, #28] |
1582 | 1582 | ldrd r4, r5, [sp, #20] |
1603 | 1603 | pop {r4-r7, pc} |
1604 | 1604 | endfunc |
1605 | 1605 | |
1606 | function x264_plane_copy_swap_neon | |
1606 | function x264_plane_copy_swap_core_neon | |
1607 | 1607 | push {r4-r5, lr} |
1608 | 1608 | ldrd r4, r5, [sp, #12] |
1609 | 1609 | add lr, r4, #15 |
1879 | 1879 | bge 8b |
1880 | 1880 | bx lr |
1881 | 1881 | endfunc |
1882 | ||
1883 | @ void mbtree_fix8_pack( int16_t *dst, float *src, int count ) | |
1884 | function x264_mbtree_fix8_pack_neon, export=1 | |
1885 | subs r3, r2, #8 | |
1886 | blt 2f | |
1887 | 1: | |
1888 | subs r3, r3, #8 | |
1889 | vld1.32 {q0,q1}, [r1,:128]! | |
1890 | vcvt.s32.f32 q0, q0, #8 | |
1891 | vcvt.s32.f32 q1, q1, #8 | |
1892 | vqmovn.s32 d4, q0 | |
1893 | vqmovn.s32 d5, q1 | |
1894 | vrev16.8 q3, q2 | |
1895 | vst1.16 {q3}, [r0,:128]! | |
1896 | bge 1b | |
1897 | 2: | |
1898 | adds r3, r3, #8 | |
1899 | bxeq lr | |
1900 | 3: | |
1901 | subs r3, r3, #1 | |
1902 | vld1.32 {d0[0]}, [r1]! | |
1903 | vcvt.s32.f32 s0, s0, #8 | |
1904 | vrev16.8 d0, d0 | |
1905 | vst1.16 {d0[0]}, [r0]! | |
1906 | bgt 3b | |
1907 | ||
1908 | bx lr | |
1909 | endfunc | |
1910 | ||
1911 | @ void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) | |
1912 | function x264_mbtree_fix8_unpack_neon, export=1 | |
1913 | subs r3, r2, #8 | |
1914 | blt 2f | |
1915 | 1: | |
1916 | subs r3, r3, #8 | |
1917 | vld1.16 {q0}, [r1,:128]! | |
1918 | vrev16.8 q1, q0 | |
1919 | vmovl.s16 q0, d2 | |
1920 | vmovl.s16 q1, d3 | |
1921 | vcvt.f32.s32 q0, q0, #8 | |
1922 | vcvt.f32.s32 q1, q1, #8 | |
1923 | vst1.32 {q0,q1}, [r0,:128]! | |
1924 | bge 1b | |
1925 | 2: | |
1926 | adds r3, r3, #8 | |
1927 | bxeq lr | |
1928 | 3: | |
1929 | subs r3, r3, #1 | |
1930 | vld1.16 {d0[0]}, [r1]! | |
1931 | vrev16.8 d0, d0 | |
1932 | vmovl.s16 q0, d0 | |
1933 | vcvt.f32.s32 d0, d0, #8 | |
1934 | vst1.32 {d0[0]}, [r0]! | |
1935 | bgt 3b | |
1936 | ||
1937 | bx lr | |
1938 | endfunc |
47 | 47 | void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); |
48 | 48 | void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); |
49 | 49 | |
50 | void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, | |
51 | pixel *src, intptr_t i_src, int w, int h ); | |
50 | void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, | |
51 | pixel *src, intptr_t i_src, int w, int h ); | |
52 | 52 | void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, |
53 | 53 | pixel *dstv, intptr_t i_dstv, |
54 | 54 | pixel *src, intptr_t i_src, int w, int h ); |
56 | 56 | pixel *dstb, intptr_t i_dstb, |
57 | 57 | pixel *dstc, intptr_t i_dstc, |
58 | 58 | pixel *src, intptr_t i_src, int pw, int w, int h ); |
59 | void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, | |
60 | pixel *srcu, intptr_t i_srcu, | |
61 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
62 | void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, | |
63 | pixel *src, intptr_t i_src, int w, int h ); | |
59 | void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, | |
60 | pixel *srcu, intptr_t i_srcu, | |
61 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
62 | void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst, | |
63 | pixel *src, intptr_t i_src, int w, int h ); | |
64 | 64 | |
65 | 65 | void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); |
66 | 66 | void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); |
108 | 108 | |
109 | 109 | void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); |
110 | 110 | |
111 | void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count ); | |
112 | void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); | |
113 | ||
111 | 114 | #if !HIGH_BIT_DEPTH |
112 | 115 | static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) |
113 | 116 | { |
228 | 231 | src += stride; |
229 | 232 | } |
230 | 233 | } |
234 | ||
235 | PLANE_COPY(16, neon) | |
236 | PLANE_COPY_SWAP(16, neon) | |
237 | PLANE_INTERLEAVE(neon) | |
231 | 238 | #endif // !HIGH_BIT_DEPTH |
232 | 239 | |
233 | 240 | PROPAGATE_LIST(neon) |
290 | 297 | |
291 | 298 | pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; |
292 | 299 | pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon; |
300 | pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; | |
301 | pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; | |
293 | 302 | #endif // !HIGH_BIT_DEPTH |
294 | 303 | |
295 | 304 | // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs |
99 | 99 | }\ |
100 | 100 | } |
101 | 101 | |
102 | void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); | |
103 | ||
104 | #define PLANE_COPY(align, cpu)\ | |
105 | static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ | |
106 | {\ | |
107 | int c_w = (align) / sizeof(pixel) - 1;\ | |
108 | if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ | |
109 | x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ | |
110 | else if( !(w&c_w) )\ | |
111 | x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ | |
112 | else\ | |
113 | {\ | |
114 | if( --h > 0 )\ | |
115 | {\ | |
116 | if( i_src > 0 )\ | |
117 | {\ | |
118 | x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ | |
119 | dst += i_dst * h;\ | |
120 | src += i_src * h;\ | |
121 | }\ | |
122 | else\ | |
123 | x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ | |
124 | }\ | |
125 | /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ | |
126 | memcpy( dst, src, w*sizeof(pixel) );\ | |
127 | }\ | |
128 | } | |
129 | ||
130 | void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); | |
131 | ||
132 | #define PLANE_COPY_SWAP(align, cpu)\ | |
133 | static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ | |
134 | {\ | |
135 | int c_w = (align>>1) / sizeof(pixel) - 1;\ | |
136 | if( !(w&c_w) )\ | |
137 | x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ | |
138 | else if( w > c_w )\ | |
139 | {\ | |
140 | if( --h > 0 )\ | |
141 | {\ | |
142 | if( i_src > 0 )\ | |
143 | {\ | |
144 | x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ | |
145 | dst += i_dst * h;\ | |
146 | src += i_src * h;\ | |
147 | }\ | |
148 | else\ | |
149 | x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ | |
150 | }\ | |
151 | x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ | |
152 | for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ | |
153 | {\ | |
154 | dst[x] = src[x+1];\ | |
155 | dst[x+1] = src[x];\ | |
156 | }\ | |
157 | }\ | |
158 | else\ | |
159 | x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ | |
160 | } | |
161 | ||
162 | void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, | |
163 | pixel *srcu, intptr_t i_srcu, | |
164 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
165 | ||
166 | #define PLANE_INTERLEAVE(cpu) \ | |
167 | static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ | |
168 | pixel *srcu, intptr_t i_srcu,\ | |
169 | pixel *srcv, intptr_t i_srcv, int w, int h )\ | |
170 | {\ | |
171 | int c_w = 16 / sizeof(pixel) - 1;\ | |
172 | if( !(w&c_w) )\ | |
173 | x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ | |
174 | else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ | |
175 | {\ | |
176 | if( --h > 0 )\ | |
177 | {\ | |
178 | if( i_srcu > 0 )\ | |
179 | {\ | |
180 | x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ | |
181 | dst += i_dst * h;\ | |
182 | srcu += i_srcu * h;\ | |
183 | srcv += i_srcv * h;\ | |
184 | }\ | |
185 | else\ | |
186 | x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ | |
187 | }\ | |
188 | x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ | |
189 | }\ | |
190 | else\ | |
191 | x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ | |
192 | } | |
193 | ||
102 | 194 | struct x264_weight_t; |
103 | 195 | typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int ); |
104 | 196 | typedef struct x264_weight_t |
3760 | 3760 | return p_src1; |
3761 | 3761 | } |
3762 | 3762 | } |
3763 | #endif // !HIGH_BIT_DEPTH | |
3763 | 3764 | |
3764 | 3765 | void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf ) |
3765 | 3766 | { |
3767 | #if !HIGH_BIT_DEPTH | |
3766 | 3768 | if( cpu & X264_CPU_MSA ) |
3767 | 3769 | { |
3768 | 3770 | pf->mc_luma = x264_mc_luma_msa; |
3802 | 3804 | pf->memzero_aligned = x264_memzero_aligned_msa; |
3803 | 3805 | pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa; |
3804 | 3806 | } |
3805 | } | |
3806 | #endif | |
3807 | #endif // !HIGH_BIT_DEPTH | |
3808 | } |
87 | 87 | void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); |
88 | 88 | void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); |
89 | 89 | void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); |
90 | void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); | |
91 | 90 | void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); |
92 | 91 | void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); |
93 | void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); | |
94 | 92 | void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, |
95 | 93 | pixel *srcu, intptr_t i_srcu, |
96 | 94 | pixel *srcv, intptr_t i_srcv, int w, int h ); |
100 | 98 | void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, |
101 | 99 | pixel *srcu, intptr_t i_srcu, |
102 | 100 | pixel *srcv, intptr_t i_srcv, int w, int h ); |
103 | void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, | |
104 | pixel *srcu, intptr_t i_srcu, | |
105 | pixel *srcv, intptr_t i_srcv, int w, int h ); | |
106 | 101 | void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu, |
107 | 102 | pixel *dstv, intptr_t i_dstv, |
108 | 103 | pixel *src, intptr_t i_src, int w, int h ); |
492 | 487 | #endif |
493 | 488 | #endif // HIGH_BIT_DEPTH |
494 | 489 | |
495 | #define PLANE_COPY(align, cpu)\ | |
496 | static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ | |
497 | {\ | |
498 | int c_w = (align) / sizeof(pixel) - 1;\ | |
499 | if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ | |
500 | x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ | |
501 | else if( !(w&c_w) )\ | |
502 | x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ | |
503 | else\ | |
504 | {\ | |
505 | if( --h > 0 )\ | |
506 | {\ | |
507 | if( i_src > 0 )\ | |
508 | {\ | |
509 | x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ | |
510 | dst += i_dst * h;\ | |
511 | src += i_src * h;\ | |
512 | }\ | |
513 | else\ | |
514 | x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ | |
515 | }\ | |
516 | /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ | |
517 | memcpy( dst, src, w*sizeof(pixel) );\ | |
518 | }\ | |
519 | } | |
520 | ||
521 | 490 | PLANE_COPY(16, sse) |
522 | 491 | PLANE_COPY(32, avx) |
523 | 492 | |
524 | #define PLANE_COPY_SWAP(align, cpu)\ | |
525 | static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ | |
526 | {\ | |
527 | int c_w = (align>>1) / sizeof(pixel) - 1;\ | |
528 | if( !(w&c_w) )\ | |
529 | x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ | |
530 | else if( w > c_w )\ | |
531 | {\ | |
532 | if( --h > 0 )\ | |
533 | {\ | |
534 | if( i_src > 0 )\ | |
535 | {\ | |
536 | x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ | |
537 | dst += i_dst * h;\ | |
538 | src += i_src * h;\ | |
539 | }\ | |
540 | else\ | |
541 | x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ | |
542 | }\ | |
543 | x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ | |
544 | for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ | |
545 | {\ | |
546 | dst[x] = src[x+1];\ | |
547 | dst[x+1] = src[x];\ | |
548 | }\ | |
549 | }\ | |
550 | else\ | |
551 | x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ | |
552 | } | |
553 | ||
554 | 493 | PLANE_COPY_SWAP(16, ssse3) |
555 | 494 | PLANE_COPY_SWAP(32, avx2) |
556 | ||
557 | #define PLANE_INTERLEAVE(cpu) \ | |
558 | static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ | |
559 | pixel *srcu, intptr_t i_srcu,\ | |
560 | pixel *srcv, intptr_t i_srcv, int w, int h )\ | |
561 | {\ | |
562 | int c_w = 16 / sizeof(pixel) - 1;\ | |
563 | if( !(w&c_w) )\ | |
564 | x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ | |
565 | else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ | |
566 | {\ | |
567 | if( --h > 0 )\ | |
568 | {\ | |
569 | if( i_srcu > 0 )\ | |
570 | {\ | |
571 | x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ | |
572 | dst += i_dst * h;\ | |
573 | srcu += i_srcu * h;\ | |
574 | srcv += i_srcv * h;\ | |
575 | }\ | |
576 | else\ | |
577 | x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ | |
578 | }\ | |
579 | x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ | |
580 | }\ | |
581 | else\ | |
582 | x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ | |
583 | } | |
584 | 495 | |
585 | 496 | PLANE_INTERLEAVE(mmx2) |
586 | 497 | PLANE_INTERLEAVE(sse2) |
34 | 34 | |
35 | 35 | Advanced options: |
36 | 36 | --disable-asm disable platform-specific assembly optimizations |
37 | --enable-lto enable link-time optimization | |
37 | 38 | --enable-debug add -g |
38 | 39 | --enable-gprof add -pg |
39 | 40 | --enable-strip add -s |
345 | 346 | swscale="auto" |
346 | 347 | asm="auto" |
347 | 348 | interlaced="yes" |
349 | lto="no" | |
348 | 350 | debug="no" |
349 | 351 | gprof="no" |
350 | 352 | strip="no" |
451 | 453 | --disable-swscale) |
452 | 454 | swscale="no" |
453 | 455 | ;; |
456 | --enable-lto) | |
457 | lto="auto" | |
458 | ;; | |
454 | 459 | --enable-debug) |
455 | 460 | debug="yes" |
456 | 461 | ;; |
502 | 507 | [ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static." |
503 | 508 | |
504 | 509 | CC="${CC-${cross_prefix}gcc}" |
505 | AR="${AR-${cross_prefix}ar}" | |
506 | RANLIB="${RANLIB-${cross_prefix}ranlib}" | |
507 | 510 | STRIP="${STRIP-${cross_prefix}strip}" |
508 | 511 | INSTALL="${INSTALL-install}" |
512 | PKGCONFIG="${PKGCONFIG-${cross_prefix}pkg-config}" | |
513 | ||
514 | # ar and ranlib doesn't load the LTO plugin by default, prefer the gcc-prefixed wrappers which does. | |
515 | if ${cross_prefix}gcc-ar --version >/dev/null 2>&1; then | |
516 | AR="${AR-${cross_prefix}gcc-ar}" | |
517 | else | |
518 | AR="${AR-${cross_prefix}ar}" | |
519 | fi | |
520 | if ${cross_prefix}gcc-ranlib --version >/dev/null 2>&1; then | |
521 | RANLIB="${RANLIB-${cross_prefix}gcc-ranlib}" | |
522 | else | |
523 | RANLIB="${RANLIB-${cross_prefix}ranlib}" | |
524 | fi | |
509 | 525 | |
510 | 526 | if [ "x$host" = x ]; then |
511 | 527 | host=`${SRCPATH}/config.guess` |
557 | 573 | fi |
558 | 574 | fi |
559 | 575 | |
560 | if [[ "$cc_base" = clang || "$cc_base" = clang[\ .]* ]]; then | |
576 | if [[ "$cc_base" = clang* ]]; then | |
561 | 577 | if cc_check '' -Werror=unknown-warning-option ; then |
562 | 578 | CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" |
563 | 579 | fi |
924 | 940 | |
925 | 941 | if [ "$cli_libx264" = "system" -a "$shared" != "yes" ] ; then |
926 | 942 | [ "$static" = "yes" ] && die "Option --system-libx264 can not be used together with --enable-static" |
927 | if ${cross_prefix}pkg-config --exists x264 2>/dev/null; then | |
928 | X264_LIBS="$(${cross_prefix}pkg-config --libs x264)" | |
929 | X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$(${cross_prefix}pkg-config --variable=includedir x264)}" | |
943 | if $PKGCONFIG --exists x264 2>/dev/null; then | |
944 | X264_LIBS="$($PKGCONFIG --libs x264)" | |
945 | X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$($PKGCONFIG --variable=includedir x264)}" | |
930 | 946 | configure_system_override "$X264_INCLUDE_DIR" || die "Detection of system libx264 configuration failed" |
931 | 947 | else |
932 | 948 | die "Can not find system libx264" |
1010 | 1026 | |
1011 | 1027 | if [ "$swscale" = "auto" ] ; then |
1012 | 1028 | swscale="no" |
1013 | if ${cross_prefix}pkg-config --exists libswscale 2>/dev/null; then | |
1014 | SWSCALE_LIBS="$SWSCALE_LIBS $(${cross_prefix}pkg-config --libs libswscale libavutil)" | |
1015 | SWSCALE_CFLAGS="$SWSCALE_CFLAGS $(${cross_prefix}pkg-config --cflags libswscale libavutil)" | |
1029 | if $PKGCONFIG --exists libswscale 2>/dev/null; then | |
1030 | SWSCALE_LIBS="$SWSCALE_LIBS $($PKGCONFIG --libs libswscale libavutil)" | |
1031 | SWSCALE_CFLAGS="$SWSCALE_CFLAGS $($PKGCONFIG --cflags libswscale libavutil)" | |
1016 | 1032 | fi |
1017 | 1033 | [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil" |
1018 | 1034 | |
1027 | 1043 | |
1028 | 1044 | if [ "$lavf" = "auto" ] ; then |
1029 | 1045 | lavf="no" |
1030 | if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>/dev/null; then | |
1031 | LAVF_LIBS="$LAVF_LIBS $(${cross_prefix}pkg-config --libs libavformat libavcodec libavutil libswscale)" | |
1032 | LAVF_CFLAGS="$LAVF_CFLAGS $(${cross_prefix}pkg-config --cflags libavformat libavcodec libavutil libswscale)" | |
1046 | if $PKGCONFIG --exists libavformat libavcodec libswscale 2>/dev/null; then | |
1047 | LAVF_LIBS="$LAVF_LIBS $($PKGCONFIG --libs libavformat libavcodec libavutil libswscale)" | |
1048 | LAVF_CFLAGS="$LAVF_CFLAGS $($PKGCONFIG --cflags libavformat libavcodec libavutil libswscale)" | |
1033 | 1049 | fi |
1034 | 1050 | if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then |
1035 | 1051 | LAVF_LIBS="-lavformat" |
1051 | 1067 | ffms_major="2"; ffms_minor="21"; ffms_micro="0"; ffms_bump="0" |
1052 | 1068 | ffms="no" |
1053 | 1069 | |
1054 | if ${cross_prefix}pkg-config --exists ffms2 2>/dev/null; then | |
1055 | FFMS2_LIBS="$FFMS2_LIBS $(${cross_prefix}pkg-config --libs ffms2)" | |
1056 | FFMS2_CFLAGS="$FFMS2_CFLAGS $(${cross_prefix}pkg-config --cflags ffms2)" | |
1070 | if $PKGCONFIG --exists ffms2 2>/dev/null; then | |
1071 | FFMS2_LIBS="$FFMS2_LIBS $($PKGCONFIG --libs ffms2)" | |
1072 | FFMS2_CFLAGS="$FFMS2_CFLAGS $($PKGCONFIG --cflags ffms2)" | |
1057 | 1073 | fi |
1058 | 1074 | [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2" |
1059 | 1075 | |
1093 | 1109 | |
1094 | 1110 | if [ "$lsmash" = "auto" ] ; then |
1095 | 1111 | lsmash="no" |
1096 | if ${cross_prefix}pkg-config --exists liblsmash 2>/dev/null; then | |
1097 | LSMASH_LIBS="$LSMASH_LIBS $(${cross_prefix}pkg-config --libs liblsmash)" | |
1098 | LSMASH_CFLAGS="$LSMASH_CFLAGS $(${cross_prefix}pkg-config --cflags liblsmash)" | |
1112 | if $PKGCONFIG --exists liblsmash 2>/dev/null; then | |
1113 | LSMASH_LIBS="$LSMASH_LIBS $($PKGCONFIG --libs liblsmash)" | |
1114 | LSMASH_CFLAGS="$LSMASH_CFLAGS $($PKGCONFIG --cflags liblsmash)" | |
1099 | 1115 | fi |
1100 | 1116 | [ -z "$LSMASH_LIBS" ] && LSMASH_LIBS="-llsmash" |
1101 | 1117 | |
1176 | 1192 | RCFLAGS="$RCFLAGS -DDEBUG" |
1177 | 1193 | else |
1178 | 1194 | CFLAGS="-O3 -ffast-math $CFLAGS" |
1179 | fi | |
1195 | if [ "$lto" = "auto" ] && [ $compiler = GNU ] && cc_check "" "-flto" ; then | |
1196 | lto="yes" | |
1197 | CFLAGS="$CFLAGS -flto" | |
1198 | LDFLAGS="$LDFLAGS -O3 -flto" | |
1199 | fi | |
1200 | fi | |
1201 | [ "$lto" = "auto" ] && lto="no" | |
1180 | 1202 | |
1181 | 1203 | if cc_check '' -fno-tree-vectorize ; then |
1182 | 1204 | CFLAGS="$CFLAGS -fno-tree-vectorize" |
1461 | 1483 | thread: $thread |
1462 | 1484 | opencl: $opencl |
1463 | 1485 | filters: $filters |
1486 | lto: $lto | |
1464 | 1487 | debug: $debug |
1465 | 1488 | gprof: $gprof |
1466 | 1489 | strip: $strip |
195 | 195 | ( csp >= X264_CSP_BGR ? 1 : 0 ) ); |
196 | 196 | sps->vui.b_color_description_present = 0; |
197 | 197 | |
198 | sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 ); | |
199 | sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 15 ? param->vui.i_transfer : 2 ); | |
200 | sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 10 ? param->vui.i_colmatrix : | |
198 | sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 12 ? param->vui.i_colorprim : 2 ); | |
199 | sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 17 ? param->vui.i_transfer : 2 ); | |
200 | sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 11 ? param->vui.i_colmatrix : | |
201 | 201 | ( csp >= X264_CSP_BGR ? 0 : 2 ) ); |
202 | 202 | if( sps->vui.i_colorprim != 2 || |
203 | 203 | sps->vui.i_transfer != 2 || |
27 | 27 | #include <ctype.h> |
28 | 28 | #include "common/common.h" |
29 | 29 | #include "common/cpu.h" |
30 | ||
31 | #ifdef _WIN32 | |
32 | #include <windows.h> | |
33 | #endif | |
30 | 34 | |
31 | 35 | // GCC doesn't align stack variables on ARM, so use .bss |
32 | 36 | #if ARCH_ARM |
0 | 0 | #!/bin/sh |
1 | 1 | # Script modified from upstream source for Debian packaging since packaging |
2 | 2 | # won't include .git repository. |
3 | echo '#define X264_VERSION " r2699 a5e06b9"' | |
4 | echo '#define X264_POINTVER "0.148.2699 a5e06b9"' | |
3 | echo '#define X264_VERSION " r2708 86b7198"' | |
4 | echo '#define X264_POINTVER "0.148.2708 86b7198"' |
844 | 844 | " - %s\n", range_names[0], stringify_names( buf, range_names ) ); |
845 | 845 | H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" |
846 | 846 | " - undef, bt709, bt470m, bt470bg, smpte170m,\n" |
847 | " smpte240m, film, bt2020\n", | |
847 | " smpte240m, film, bt2020, smpte428,\n" | |
848 | " smpte431, smpte432\n", | |
848 | 849 | strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) ); |
849 | 850 | H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n" |
850 | 851 | " - undef, bt709, bt470m, bt470bg, smpte170m,\n" |
851 | 852 | " smpte240m, linear, log100, log316,\n" |
852 | 853 | " iec61966-2-4, bt1361e, iec61966-2-1,\n" |
853 | " bt2020-10, bt2020-12\n", | |
854 | " bt2020-10, bt2020-12, smpte2084, smpte428\n", | |
854 | 855 | strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) ); |
855 | 856 | H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n" |
856 | 857 | " - undef, bt709, fcc, bt470bg, smpte170m,\n" |
857 | " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n", | |
858 | " smpte240m, GBR, YCgCo, bt2020nc, bt2020c,\n" | |
859 | " smpte2085\n", | |
858 | 860 | strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) ); |
859 | 861 | H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n", |
860 | 862 | defaults->vui.i_chroma_loc ); |
208 | 208 | static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; |
209 | 209 | static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; |
210 | 210 | static const char * const x264_fullrange_names[] = { "off", "on", 0 }; |
211 | static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; | |
211 | static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", "smpte428", | |
212 | "smpte431", "smpte432", 0 }; | |
212 | 213 | static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", |
213 | "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; | |
214 | static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; | |
214 | "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", "smpte2084", "smpte428", 0 }; | |
215 | static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", | |
216 | "smpte2085", 0 }; | |
215 | 217 | static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; |
216 | 218 | |
217 | 219 | /* Colorspace type */ |