libde265 / commit 5d0c630
Imported Upstream version 0.8 (Joachim Bauch, 9 years ago)
141 changed files with 35,800 additions and 30,421 deletions.
2121 before_install:
2222 - sh -c "if [ ! -z '$DECODESTREAMS' ]; then sudo add-apt-repository -y ppa:strukturag/libde265; fi"
2323 - sudo apt-get update -qq
24 - sh -c "if [ -z '$HOST' ]; then sudo apt-get install -qq valgrind libsdl-dev; fi"
24 - sh -c "if [ -z '$HOST' ]; then sudo apt-get install -qq valgrind libsdl-dev libqt4-dev libswscale-dev; fi"
2525 - sh -c "if [ ! -z '$HOST' ]; then sudo apt-get install -qq wine; fi"
2626 - sh -c "if [ '$WINE' = 'wine' ]; then sudo apt-get install -qq gcc-mingw-w64-i686 g++-mingw-w64-i686 binutils-mingw-w64-i686 mingw-w64-dev; fi"
2727 - sh -c "if [ '$WINE' = 'wine64' ]; then sudo apt-get install -qq gcc-mingw-w64-x86-64 g++-mingw-w64-x86-64 binutils-mingw-w64-x86-64 mingw-w64-dev; fi"
4242 - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -t 4 -q -c -f 100 ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
4343 - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -q -c -f 100 ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
4444 - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -t 4 -q -c -f 100 ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
45 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE $WINE ./dec265/dec265.exe -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
46 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
47 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE $WINE ./dec265/dec265.exe -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
48 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
45 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
46 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
47 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
48 - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
4949 - sh -c "if [ ! -z '$DECODESTREAMS' ]; then python scripts/decodestreams.py /var/lib/libde265-teststreams; fi"
0 Authors of libde165
0 Authors of libde265
11 See also the files THANKS and ChangeLog
22
33 Dirk Farin <farin@struktur.de>
44 - designed and implemented libde265
5
6 Joachim Bauch <bauch@struktur.de>
7 - bugfixes, optimizations and support for Windows
0 The library `libde265` is distributed under the terms of the GNU Lesser
1 General Public License. The sample applications are distributed under
2 the terms of the GNU General Public License.
3
4 License texts below and in the `COPYING` files of the corresponding
5 subfolders.
6
7 ----------------------------------------------------------------------
8
09 GNU LESSER GENERAL PUBLIC LICENSE
110 Version 3, 29 June 2007
211
162171 apply, that proxy's public statement of acceptance of any version is
163172 permanent authorization for you to choose that version for the
164173 Library.
174
175 ----------------------------------------------------------------------
176
177 GNU GENERAL PUBLIC LICENSE
178 Version 3, 29 June 2007
179
180 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
181 Everyone is permitted to copy and distribute verbatim copies
182 of this license document, but changing it is not allowed.
183
184 Preamble
185
186 The GNU General Public License is a free, copyleft license for
187 software and other kinds of works.
188
189 The licenses for most software and other practical works are designed
190 to take away your freedom to share and change the works. By contrast,
191 the GNU General Public License is intended to guarantee your freedom to
192 share and change all versions of a program--to make sure it remains free
193 software for all its users. We, the Free Software Foundation, use the
194 GNU General Public License for most of our software; it applies also to
195 any other work released this way by its authors. You can apply it to
196 your programs, too.
197
198 When we speak of free software, we are referring to freedom, not
199 price. Our General Public Licenses are designed to make sure that you
200 have the freedom to distribute copies of free software (and charge for
201 them if you wish), that you receive source code or can get it if you
202 want it, that you can change the software or use pieces of it in new
203 free programs, and that you know you can do these things.
204
205 To protect your rights, we need to prevent others from denying you
206 these rights or asking you to surrender the rights. Therefore, you have
207 certain responsibilities if you distribute copies of the software, or if
208 you modify it: responsibilities to respect the freedom of others.
209
210 For example, if you distribute copies of such a program, whether
211 gratis or for a fee, you must pass on to the recipients the same
212 freedoms that you received. You must make sure that they, too, receive
213 or can get the source code. And you must show them these terms so they
214 know their rights.
215
216 Developers that use the GNU GPL protect your rights with two steps:
217 (1) assert copyright on the software, and (2) offer you this License
218 giving you legal permission to copy, distribute and/or modify it.
219
220 For the developers' and authors' protection, the GPL clearly explains
221 that there is no warranty for this free software. For both users' and
222 authors' sake, the GPL requires that modified versions be marked as
223 changed, so that their problems will not be attributed erroneously to
224 authors of previous versions.
225
226 Some devices are designed to deny users access to install or run
227 modified versions of the software inside them, although the manufacturer
228 can do so. This is fundamentally incompatible with the aim of
229 protecting users' freedom to change the software. The systematic
230 pattern of such abuse occurs in the area of products for individuals to
231 use, which is precisely where it is most unacceptable. Therefore, we
232 have designed this version of the GPL to prohibit the practice for those
233 products. If such problems arise substantially in other domains, we
234 stand ready to extend this provision to those domains in future versions
235 of the GPL, as needed to protect the freedom of users.
236
237 Finally, every program is threatened constantly by software patents.
238 States should not allow patents to restrict development and use of
239 software on general-purpose computers, but in those that do, we wish to
240 avoid the special danger that patents applied to a free program could
241 make it effectively proprietary. To prevent this, the GPL assures that
242 patents cannot be used to render the program non-free.
243
244 The precise terms and conditions for copying, distribution and
245 modification follow.
246
247 TERMS AND CONDITIONS
248
249 0. Definitions.
250
251 "This License" refers to version 3 of the GNU General Public License.
252
253 "Copyright" also means copyright-like laws that apply to other kinds of
254 works, such as semiconductor masks.
255
256 "The Program" refers to any copyrightable work licensed under this
257 License. Each licensee is addressed as "you". "Licensees" and
258 "recipients" may be individuals or organizations.
259
260 To "modify" a work means to copy from or adapt all or part of the work
261 in a fashion requiring copyright permission, other than the making of an
262 exact copy. The resulting work is called a "modified version" of the
263 earlier work or a work "based on" the earlier work.
264
265 A "covered work" means either the unmodified Program or a work based
266 on the Program.
267
268 To "propagate" a work means to do anything with it that, without
269 permission, would make you directly or secondarily liable for
270 infringement under applicable copyright law, except executing it on a
271 computer or modifying a private copy. Propagation includes copying,
272 distribution (with or without modification), making available to the
273 public, and in some countries other activities as well.
274
275 To "convey" a work means any kind of propagation that enables other
276 parties to make or receive copies. Mere interaction with a user through
277 a computer network, with no transfer of a copy, is not conveying.
278
279 An interactive user interface displays "Appropriate Legal Notices"
280 to the extent that it includes a convenient and prominently visible
281 feature that (1) displays an appropriate copyright notice, and (2)
282 tells the user that there is no warranty for the work (except to the
283 extent that warranties are provided), that licensees may convey the
284 work under this License, and how to view a copy of this License. If
285 the interface presents a list of user commands or options, such as a
286 menu, a prominent item in the list meets this criterion.
287
288 1. Source Code.
289
290 The "source code" for a work means the preferred form of the work
291 for making modifications to it. "Object code" means any non-source
292 form of a work.
293
294 A "Standard Interface" means an interface that either is an official
295 standard defined by a recognized standards body, or, in the case of
296 interfaces specified for a particular programming language, one that
297 is widely used among developers working in that language.
298
299 The "System Libraries" of an executable work include anything, other
300 than the work as a whole, that (a) is included in the normal form of
301 packaging a Major Component, but which is not part of that Major
302 Component, and (b) serves only to enable use of the work with that
303 Major Component, or to implement a Standard Interface for which an
304 implementation is available to the public in source code form. A
305 "Major Component", in this context, means a major essential component
306 (kernel, window system, and so on) of the specific operating system
307 (if any) on which the executable work runs, or a compiler used to
308 produce the work, or an object code interpreter used to run it.
309
310 The "Corresponding Source" for a work in object code form means all
311 the source code needed to generate, install, and (for an executable
312 work) run the object code and to modify the work, including scripts to
313 control those activities. However, it does not include the work's
314 System Libraries, or general-purpose tools or generally available free
315 programs which are used unmodified in performing those activities but
316 which are not part of the work. For example, Corresponding Source
317 includes interface definition files associated with source files for
318 the work, and the source code for shared libraries and dynamically
319 linked subprograms that the work is specifically designed to require,
320 such as by intimate data communication or control flow between those
321 subprograms and other parts of the work.
322
323 The Corresponding Source need not include anything that users
324 can regenerate automatically from other parts of the Corresponding
325 Source.
326
327 The Corresponding Source for a work in source code form is that
328 same work.
329
330 2. Basic Permissions.
331
332 All rights granted under this License are granted for the term of
333 copyright on the Program, and are irrevocable provided the stated
334 conditions are met. This License explicitly affirms your unlimited
335 permission to run the unmodified Program. The output from running a
336 covered work is covered by this License only if the output, given its
337 content, constitutes a covered work. This License acknowledges your
338 rights of fair use or other equivalent, as provided by copyright law.
339
340 You may make, run and propagate covered works that you do not
341 convey, without conditions so long as your license otherwise remains
342 in force. You may convey covered works to others for the sole purpose
343 of having them make modifications exclusively for you, or provide you
344 with facilities for running those works, provided that you comply with
345 the terms of this License in conveying all material for which you do
346 not control copyright. Those thus making or running the covered works
347 for you must do so exclusively on your behalf, under your direction
348 and control, on terms that prohibit them from making any copies of
349 your copyrighted material outside their relationship with you.
350
351 Conveying under any other circumstances is permitted solely under
352 the conditions stated below. Sublicensing is not allowed; section 10
353 makes it unnecessary.
354
355 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
356
357 No covered work shall be deemed part of an effective technological
358 measure under any applicable law fulfilling obligations under article
359 11 of the WIPO copyright treaty adopted on 20 December 1996, or
360 similar laws prohibiting or restricting circumvention of such
361 measures.
362
363 When you convey a covered work, you waive any legal power to forbid
364 circumvention of technological measures to the extent such circumvention
365 is effected by exercising rights under this License with respect to
366 the covered work, and you disclaim any intention to limit operation or
367 modification of the work as a means of enforcing, against the work's
368 users, your or third parties' legal rights to forbid circumvention of
369 technological measures.
370
371 4. Conveying Verbatim Copies.
372
373 You may convey verbatim copies of the Program's source code as you
374 receive it, in any medium, provided that you conspicuously and
375 appropriately publish on each copy an appropriate copyright notice;
376 keep intact all notices stating that this License and any
377 non-permissive terms added in accord with section 7 apply to the code;
378 keep intact all notices of the absence of any warranty; and give all
379 recipients a copy of this License along with the Program.
380
381 You may charge any price or no price for each copy that you convey,
382 and you may offer support or warranty protection for a fee.
383
384 5. Conveying Modified Source Versions.
385
386 You may convey a work based on the Program, or the modifications to
387 produce it from the Program, in the form of source code under the
388 terms of section 4, provided that you also meet all of these conditions:
389
390 a) The work must carry prominent notices stating that you modified
391 it, and giving a relevant date.
392
393 b) The work must carry prominent notices stating that it is
394 released under this License and any conditions added under section
395 7. This requirement modifies the requirement in section 4 to
396 "keep intact all notices".
397
398 c) You must license the entire work, as a whole, under this
399 License to anyone who comes into possession of a copy. This
400 License will therefore apply, along with any applicable section 7
401 additional terms, to the whole of the work, and all its parts,
402 regardless of how they are packaged. This License gives no
403 permission to license the work in any other way, but it does not
404 invalidate such permission if you have separately received it.
405
406 d) If the work has interactive user interfaces, each must display
407 Appropriate Legal Notices; however, if the Program has interactive
408 interfaces that do not display Appropriate Legal Notices, your
409 work need not make them do so.
410
411 A compilation of a covered work with other separate and independent
412 works, which are not by their nature extensions of the covered work,
413 and which are not combined with it such as to form a larger program,
414 in or on a volume of a storage or distribution medium, is called an
415 "aggregate" if the compilation and its resulting copyright are not
416 used to limit the access or legal rights of the compilation's users
417 beyond what the individual works permit. Inclusion of a covered work
418 in an aggregate does not cause this License to apply to the other
419 parts of the aggregate.
420
421 6. Conveying Non-Source Forms.
422
423 You may convey a covered work in object code form under the terms
424 of sections 4 and 5, provided that you also convey the
425 machine-readable Corresponding Source under the terms of this License,
426 in one of these ways:
427
428 a) Convey the object code in, or embodied in, a physical product
429 (including a physical distribution medium), accompanied by the
430 Corresponding Source fixed on a durable physical medium
431 customarily used for software interchange.
432
433 b) Convey the object code in, or embodied in, a physical product
434 (including a physical distribution medium), accompanied by a
435 written offer, valid for at least three years and valid for as
436 long as you offer spare parts or customer support for that product
437 model, to give anyone who possesses the object code either (1) a
438 copy of the Corresponding Source for all the software in the
439 product that is covered by this License, on a durable physical
440 medium customarily used for software interchange, for a price no
441 more than your reasonable cost of physically performing this
442 conveying of source, or (2) access to copy the
443 Corresponding Source from a network server at no charge.
444
445 c) Convey individual copies of the object code with a copy of the
446 written offer to provide the Corresponding Source. This
447 alternative is allowed only occasionally and noncommercially, and
448 only if you received the object code with such an offer, in accord
449 with subsection 6b.
450
451 d) Convey the object code by offering access from a designated
452 place (gratis or for a charge), and offer equivalent access to the
453 Corresponding Source in the same way through the same place at no
454 further charge. You need not require recipients to copy the
455 Corresponding Source along with the object code. If the place to
456 copy the object code is a network server, the Corresponding Source
457 may be on a different server (operated by you or a third party)
458 that supports equivalent copying facilities, provided you maintain
459 clear directions next to the object code saying where to find the
460 Corresponding Source. Regardless of what server hosts the
461 Corresponding Source, you remain obligated to ensure that it is
462 available for as long as needed to satisfy these requirements.
463
464 e) Convey the object code using peer-to-peer transmission, provided
465 you inform other peers where the object code and Corresponding
466 Source of the work are being offered to the general public at no
467 charge under subsection 6d.
468
469 A separable portion of the object code, whose source code is excluded
470 from the Corresponding Source as a System Library, need not be
471 included in conveying the object code work.
472
473 A "User Product" is either (1) a "consumer product", which means any
474 tangible personal property which is normally used for personal, family,
475 or household purposes, or (2) anything designed or sold for incorporation
476 into a dwelling. In determining whether a product is a consumer product,
477 doubtful cases shall be resolved in favor of coverage. For a particular
478 product received by a particular user, "normally used" refers to a
479 typical or common use of that class of product, regardless of the status
480 of the particular user or of the way in which the particular user
481 actually uses, or expects or is expected to use, the product. A product
482 is a consumer product regardless of whether the product has substantial
483 commercial, industrial or non-consumer uses, unless such uses represent
484 the only significant mode of use of the product.
485
486 "Installation Information" for a User Product means any methods,
487 procedures, authorization keys, or other information required to install
488 and execute modified versions of a covered work in that User Product from
489 a modified version of its Corresponding Source. The information must
490 suffice to ensure that the continued functioning of the modified object
491 code is in no case prevented or interfered with solely because
492 modification has been made.
493
494 If you convey an object code work under this section in, or with, or
495 specifically for use in, a User Product, and the conveying occurs as
496 part of a transaction in which the right of possession and use of the
497 User Product is transferred to the recipient in perpetuity or for a
498 fixed term (regardless of how the transaction is characterized), the
499 Corresponding Source conveyed under this section must be accompanied
500 by the Installation Information. But this requirement does not apply
501 if neither you nor any third party retains the ability to install
502 modified object code on the User Product (for example, the work has
503 been installed in ROM).
504
505 The requirement to provide Installation Information does not include a
506 requirement to continue to provide support service, warranty, or updates
507 for a work that has been modified or installed by the recipient, or for
508 the User Product in which it has been modified or installed. Access to a
509 network may be denied when the modification itself materially and
510 adversely affects the operation of the network or violates the rules and
511 protocols for communication across the network.
512
513 Corresponding Source conveyed, and Installation Information provided,
514 in accord with this section must be in a format that is publicly
515 documented (and with an implementation available to the public in
516 source code form), and must require no special password or key for
517 unpacking, reading or copying.
518
519 7. Additional Terms.
520
521 "Additional permissions" are terms that supplement the terms of this
522 License by making exceptions from one or more of its conditions.
523 Additional permissions that are applicable to the entire Program shall
524 be treated as though they were included in this License, to the extent
525 that they are valid under applicable law. If additional permissions
526 apply only to part of the Program, that part may be used separately
527 under those permissions, but the entire Program remains governed by
528 this License without regard to the additional permissions.
529
530 When you convey a copy of a covered work, you may at your option
531 remove any additional permissions from that copy, or from any part of
532 it. (Additional permissions may be written to require their own
533 removal in certain cases when you modify the work.) You may place
534 additional permissions on material, added by you to a covered work,
535 for which you have or can give appropriate copyright permission.
536
537 Notwithstanding any other provision of this License, for material you
538 add to a covered work, you may (if authorized by the copyright holders of
539 that material) supplement the terms of this License with terms:
540
541 a) Disclaiming warranty or limiting liability differently from the
542 terms of sections 15 and 16 of this License; or
543
544 b) Requiring preservation of specified reasonable legal notices or
545 author attributions in that material or in the Appropriate Legal
546 Notices displayed by works containing it; or
547
548 c) Prohibiting misrepresentation of the origin of that material, or
549 requiring that modified versions of such material be marked in
550 reasonable ways as different from the original version; or
551
552 d) Limiting the use for publicity purposes of names of licensors or
553 authors of the material; or
554
555 e) Declining to grant rights under trademark law for use of some
556 trade names, trademarks, or service marks; or
557
558 f) Requiring indemnification of licensors and authors of that
559 material by anyone who conveys the material (or modified versions of
560 it) with contractual assumptions of liability to the recipient, for
561 any liability that these contractual assumptions directly impose on
562 those licensors and authors.
563
564 All other non-permissive additional terms are considered "further
565 restrictions" within the meaning of section 10. If the Program as you
566 received it, or any part of it, contains a notice stating that it is
567 governed by this License along with a term that is a further
568 restriction, you may remove that term. If a license document contains
569 a further restriction but permits relicensing or conveying under this
570 License, you may add to a covered work material governed by the terms
571 of that license document, provided that the further restriction does
572 not survive such relicensing or conveying.
573
574 If you add terms to a covered work in accord with this section, you
575 must place, in the relevant source files, a statement of the
576 additional terms that apply to those files, or a notice indicating
577 where to find the applicable terms.
578
579 Additional terms, permissive or non-permissive, may be stated in the
580 form of a separately written license, or stated as exceptions;
581 the above requirements apply either way.
582
583 8. Termination.
584
585 You may not propagate or modify a covered work except as expressly
586 provided under this License. Any attempt otherwise to propagate or
587 modify it is void, and will automatically terminate your rights under
588 this License (including any patent licenses granted under the third
589 paragraph of section 11).
590
591 However, if you cease all violation of this License, then your
592 license from a particular copyright holder is reinstated (a)
593 provisionally, unless and until the copyright holder explicitly and
594 finally terminates your license, and (b) permanently, if the copyright
595 holder fails to notify you of the violation by some reasonable means
596 prior to 60 days after the cessation.
597
598 Moreover, your license from a particular copyright holder is
599 reinstated permanently if the copyright holder notifies you of the
600 violation by some reasonable means, this is the first time you have
601 received notice of violation of this License (for any work) from that
602 copyright holder, and you cure the violation prior to 30 days after
603 your receipt of the notice.
604
605 Termination of your rights under this section does not terminate the
606 licenses of parties who have received copies or rights from you under
607 this License. If your rights have been terminated and not permanently
608 reinstated, you do not qualify to receive new licenses for the same
609 material under section 10.
610
611 9. Acceptance Not Required for Having Copies.
612
613 You are not required to accept this License in order to receive or
614 run a copy of the Program. Ancillary propagation of a covered work
615 occurring solely as a consequence of using peer-to-peer transmission
616 to receive a copy likewise does not require acceptance. However,
617 nothing other than this License grants you permission to propagate or
618 modify any covered work. These actions infringe copyright if you do
619 not accept this License. Therefore, by modifying or propagating a
620 covered work, you indicate your acceptance of this License to do so.
621
622 10. Automatic Licensing of Downstream Recipients.
623
624 Each time you convey a covered work, the recipient automatically
625 receives a license from the original licensors, to run, modify and
626 propagate that work, subject to this License. You are not responsible
627 for enforcing compliance by third parties with this License.
628
629 An "entity transaction" is a transaction transferring control of an
630 organization, or substantially all assets of one, or subdividing an
631 organization, or merging organizations. If propagation of a covered
632 work results from an entity transaction, each party to that
633 transaction who receives a copy of the work also receives whatever
634 licenses to the work the party's predecessor in interest had or could
635 give under the previous paragraph, plus a right to possession of the
636 Corresponding Source of the work from the predecessor in interest, if
637 the predecessor has it or can get it with reasonable efforts.
638
639 You may not impose any further restrictions on the exercise of the
640 rights granted or affirmed under this License. For example, you may
641 not impose a license fee, royalty, or other charge for exercise of
642 rights granted under this License, and you may not initiate litigation
643 (including a cross-claim or counterclaim in a lawsuit) alleging that
644 any patent claim is infringed by making, using, selling, offering for
645 sale, or importing the Program or any portion of it.
646
647 11. Patents.
648
649 A "contributor" is a copyright holder who authorizes use under this
650 License of the Program or a work on which the Program is based. The
651 work thus licensed is called the contributor's "contributor version".
652
653 A contributor's "essential patent claims" are all patent claims
654 owned or controlled by the contributor, whether already acquired or
655 hereafter acquired, that would be infringed by some manner, permitted
656 by this License, of making, using, or selling its contributor version,
657 but do not include claims that would be infringed only as a
658 consequence of further modification of the contributor version. For
659 purposes of this definition, "control" includes the right to grant
660 patent sublicenses in a manner consistent with the requirements of
661 this License.
662
663 Each contributor grants you a non-exclusive, worldwide, royalty-free
664 patent license under the contributor's essential patent claims, to
665 make, use, sell, offer for sale, import and otherwise run, modify and
666 propagate the contents of its contributor version.
667
668 In the following three paragraphs, a "patent license" is any express
669 agreement or commitment, however denominated, not to enforce a patent
670 (such as an express permission to practice a patent or covenant not to
671 sue for patent infringement). To "grant" such a patent license to a
672 party means to make such an agreement or commitment not to enforce a
673 patent against the party.
674
675 If you convey a covered work, knowingly relying on a patent license,
676 and the Corresponding Source of the work is not available for anyone
677 to copy, free of charge and under the terms of this License, through a
678 publicly available network server or other readily accessible means,
679 then you must either (1) cause the Corresponding Source to be so
680 available, or (2) arrange to deprive yourself of the benefit of the
681 patent license for this particular work, or (3) arrange, in a manner
682 consistent with the requirements of this License, to extend the patent
683 license to downstream recipients. "Knowingly relying" means you have
684 actual knowledge that, but for the patent license, your conveying the
685 covered work in a country, or your recipient's use of the covered work
686 in a country, would infringe one or more identifiable patents in that
687 country that you have reason to believe are valid.
688
689 If, pursuant to or in connection with a single transaction or
690 arrangement, you convey, or propagate by procuring conveyance of, a
691 covered work, and grant a patent license to some of the parties
692 receiving the covered work authorizing them to use, propagate, modify
693 or convey a specific copy of the covered work, then the patent license
694 you grant is automatically extended to all recipients of the covered
695 work and works based on it.
696
697 A patent license is "discriminatory" if it does not include within
698 the scope of its coverage, prohibits the exercise of, or is
699 conditioned on the non-exercise of one or more of the rights that are
700 specifically granted under this License. You may not convey a covered
701 work if you are a party to an arrangement with a third party that is
702 in the business of distributing software, under which you make payment
703 to the third party based on the extent of your activity of conveying
704 the work, and under which the third party grants, to any of the
705 parties who would receive the covered work from you, a discriminatory
706 patent license (a) in connection with copies of the covered work
707 conveyed by you (or copies made from those copies), or (b) primarily
708 for and in connection with specific products or compilations that
709 contain the covered work, unless you entered into that arrangement,
710 or that patent license was granted, prior to 28 March 2007.
711
712 Nothing in this License shall be construed as excluding or limiting
713 any implied license or other defenses to infringement that may
714 otherwise be available to you under applicable patent law.
715
716 12. No Surrender of Others' Freedom.
717
718 If conditions are imposed on you (whether by court order, agreement or
719 otherwise) that contradict the conditions of this License, they do not
720 excuse you from the conditions of this License. If you cannot convey a
721 covered work so as to satisfy simultaneously your obligations under this
722 License and any other pertinent obligations, then as a consequence you may
723 not convey it at all. For example, if you agree to terms that obligate you
724 to collect a royalty for further conveying from those to whom you convey
725 the Program, the only way you could satisfy both those terms and this
726 License would be to refrain entirely from conveying the Program.
727
728 13. Use with the GNU Affero General Public License.
729
730 Notwithstanding any other provision of this License, you have
731 permission to link or combine any covered work with a work licensed
732 under version 3 of the GNU Affero General Public License into a single
733 combined work, and to convey the resulting work. The terms of this
734 License will continue to apply to the part which is the covered work,
735 but the special requirements of the GNU Affero General Public License,
736 section 13, concerning interaction through a network will apply to the
737 combination as such.
738
739 14. Revised Versions of this License.
740
741 The Free Software Foundation may publish revised and/or new versions of
742 the GNU General Public License from time to time. Such new versions will
743 be similar in spirit to the present version, but may differ in detail to
744 address new problems or concerns.
745
746 Each version is given a distinguishing version number. If the
747 Program specifies that a certain numbered version of the GNU General
748 Public License "or any later version" applies to it, you have the
749 option of following the terms and conditions either of that numbered
750 version or of any later version published by the Free Software
751 Foundation. If the Program does not specify a version number of the
752 GNU General Public License, you may choose any version ever published
753 by the Free Software Foundation.
754
755 If the Program specifies that a proxy can decide which future
756 versions of the GNU General Public License can be used, that proxy's
757 public statement of acceptance of a version permanently authorizes you
758 to choose that version for the Program.
759
760 Later license versions may give you additional or different
761 permissions. However, no additional obligations are imposed on any
762 author or copyright holder as a result of your choosing to follow a
763 later version.
764
765 15. Disclaimer of Warranty.
766
767 THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
768 APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
769 HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
770 OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
771 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
772 PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
773 IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
774 ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
775
776 16. Limitation of Liability.
777
778 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
779 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
780 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
781 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
782 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
783 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
784 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
785 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
786 SUCH DAMAGES.
787
788 17. Interpretation of Sections 15 and 16.
789
790 If the disclaimer of warranty and limitation of liability provided
791 above cannot be given local legal effect according to their terms,
792 reviewing courts shall apply local law that most closely approximates
793 an absolute waiver of all civil liability in connection with the
794 Program, unless a warranty or assumption of liability accompanies a
795 copy of the Program in return for a fee.
796
797 END OF TERMS AND CONDITIONS
798
799 How to Apply These Terms to Your New Programs
800
801 If you develop a new program, and you want it to be of the greatest
802 possible use to the public, the best way to achieve this is to make it
803 free software which everyone can redistribute and change under these terms.
804
805 To do so, attach the following notices to the program. It is safest
806 to attach them to the start of each source file to most effectively
807 state the exclusion of warranty; and each file should have at least
808 the "copyright" line and a pointer to where the full notice is found.
809
810 <one line to give the program's name and a brief idea of what it does.>
811 Copyright (C) <year> <name of author>
812
813 This program is free software: you can redistribute it and/or modify
814 it under the terms of the GNU General Public License as published by
815 the Free Software Foundation, either version 3 of the License, or
816 (at your option) any later version.
817
818 This program is distributed in the hope that it will be useful,
819 but WITHOUT ANY WARRANTY; without even the implied warranty of
820 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
821 GNU General Public License for more details.
822
823 You should have received a copy of the GNU General Public License
824 along with this program. If not, see <http://www.gnu.org/licenses/>.
825
826 Also add information on how to contact you by electronic and paper mail.
827
828 If the program does terminal interaction, make it output a short
829 notice like this when it starts in an interactive mode:
830
831 <program> Copyright (C) <year> <name of author>
832 This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
833 This is free software, and you are welcome to redistribute it
834 under certain conditions; type `show c' for details.
835
836 The hypothetical commands `show w' and `show c' should show the appropriate
837 parts of the General Public License. Of course, your program's commands
838 might be different; for a GUI interface, you would use an "about box".
839
840 You should also get your employer (if you work as a programmer) or school,
841 if any, to sign a "copyright disclaimer" for the program, if necessary.
842 For more information on this, and how to apply and follow the GNU GPL, see
843 <http://www.gnu.org/licenses/>.
844
845 The GNU General Public License does not permit incorporating your program
846 into proprietary programs. If your program is a subroutine library, you
847 may consider it more useful to permit linking proprietary applications with
848 the library. If this is what you want to do, use the GNU Lesser General
849 Public License instead of this License. But first, please read
850 <http://www.gnu.org/philosophy/why-not-lgpl.html>.
851
852 ----------------------------------------------------------------------
0 See https://github.com/strukturag/libde265 for further information.
1111 Basic Installation
1212 ==================
1313
14 Briefly, the shell commands `./configure; make; make install' should
15 configure, build, and install this package. The following
14 Briefly, the shell command `./configure && make && make install'
15 should configure, build, and install this package. The following
1616 more-detailed instructions are generic; see the `README' file for
1717 instructions specific to this package. Some packages provide this
1818 `INSTALL' file but do not implement all of the features documented
1818 build \
1919 Makefile.vc7 \
2020 README.md \
21 libde265.png
21 libde265.png \
22 */COPYING
2223
2324 pkgconfigdir = $(libdir)/pkgconfig
2425 pkgconfig_DATA = libde265.pc
0 # Makefile.in generated by automake 1.13.3 from Makefile.am.
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
11 # @configure_input@
22
33 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
8484 DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
8585 $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
8686 $(top_srcdir)/configure $(am__configure_deps) \
87 $(srcdir)/config.h.in $(srcdir)/libde265.pc.in COPYING TODO \
87 $(srcdir)/config.h.in $(srcdir)/libde265.pc.in COPYING compile \
8888 config.guess config.sub depcomp install-sh missing ltmain.sh
8989 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
9090 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \
304304 SET_MAKE = @SET_MAKE@
305305 SHELL = @SHELL@
306306 STRIP = @STRIP@
307 SWSCALE_CFLAGS = @SWSCALE_CFLAGS@
308 SWSCALE_LIBS = @SWSCALE_LIBS@
307309 VERSION = @VERSION@
308310 VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@
309311 VIDEOGFX_LIBS = @VIDEOGFX_LIBS@
373375 build \
374376 Makefile.vc7 \
375377 README.md \
376 libde265.png
378 libde265.png \
379 */COPYING
377380
378381 pkgconfigdir = $(libdir)/pkgconfig
379382 pkgconfig_DATA = libde265.pc
417420 $(am__aclocal_m4_deps):
418421
419422 config.h: stamp-h1
420 @if test ! -f $@; then rm -f stamp-h1; else :; fi
421 @if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi
423 @test -f $@ || rm -f stamp-h1
424 @test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1
422425
423426 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
424427 @rm -f stamp-h1
650653 $(am__post_remove_distdir)
651654
652655 dist-tarZ: distdir
656 @echo WARNING: "Support for shar distribution archives is" \
657 "deprecated." >&2
658 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2
653659 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
654660 $(am__post_remove_distdir)
655661
656662 dist-shar: distdir
663 @echo WARNING: "Support for distribution archives compressed with" \
664 "legacy program 'compress' is deprecated." >&2
665 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2
657666 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
658667 $(am__post_remove_distdir)
659668
695704 && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
696705 && am__cwd=`pwd` \
697706 && $(am__cd) $(distdir)/_build \
698 && ../configure --srcdir=.. --prefix="$$dc_install_base" \
707 && ../configure \
699708 $(AM_DISTCHECK_CONFIGURE_FLAGS) \
700709 $(DISTCHECK_CONFIGURE_FLAGS) \
710 --srcdir=.. --prefix="$$dc_install_base" \
701711 && $(MAKE) $(AM_MAKEFLAGS) \
702712 && $(MAKE) $(AM_MAKEFLAGS) dvi \
703713 && $(MAKE) $(AM_MAKEFLAGS) check \
0 See https://github.com/strukturag/libde265 for further information.
44 ![libde265](libde265.png)
55
66 libde265 is an open source implementation of the h.265 video codec.
7 It is written from scratch in plain C for simplicity and efficiency.
8 Its simple API makes it easy to integrate it into other software.
7 It is written from scratch and has a plain C API to enable
8 simple integration into other software.
99
1010 libde265 supports WPP and tile-based multithreading and includes SSE optimizations.
11 All features of the Main profile except long-term MC (which is currently unused
12 by the available encoders) are supported.
11 The decoder includes all features of the Main profile and correctly decodes almost all
12 conformance streams (see [[wiki page](https://github.com/strukturag/libde265/wiki/Decoder-conformance)]).
13
14 A list of supported features is available in the [wiki](https://github.com/strukturag/libde265/wiki/Supported-decoding-features).
1315
1416 For latest news check our website at http://www.libde265.org
1517
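The README hunk above advertises a plain C API for easy integration. As a rough illustration of how a caller might drive that API, here is a minimal decode loop sketched against the public `de265.h` interface. It is not part of this diff: the file name, buffer size, command-line handling and printed output are arbitrary choices for the example, error handling is reduced to the bare minimum, and the exact signatures should be checked against the header of the release actually installed.

```c
/* decode_sketch.c -- hypothetical example, not part of the upstream sources.
 *
 * Build (assuming the libde265.pc installed by "make install" is visible to
 * pkg-config):  cc decode_sketch.c $(pkg-config --cflags --libs libde265)
 */
#include <libde265/de265.h>
#include <stdint.h>
#include <stdio.h>

int main(int argc, char** argv)
{
  if (argc < 2) {
    fprintf(stderr, "usage: %s raw-hevc-bitstream\n", argv[0]);
    return 1;
  }

  FILE* fh = fopen(argv[1], "rb");
  if (!fh) { perror("fopen"); return 1; }

  de265_decoder_context* ctx = de265_new_decoder();

  uint8_t buf[4096];
  size_t n;
  int more;

  do {
    /* feed raw Annex-B NAL data into the decoder */
    n = fread(buf, 1, sizeof(buf), fh);
    if (n > 0) de265_push_data(ctx, buf, (int)n, 0 /*PTS*/, NULL /*user data*/);
    else       de265_flush_data(ctx);           /* end of stream reached */

    /* decode whatever is currently buffered */
    more = 1;
    while (more) {
      de265_error err = de265_decode(ctx, &more);
      if (err != DE265_OK) break;               /* e.g. waiting for more input */

      const struct de265_image* img = de265_get_next_picture(ctx);
      if (img) {
        printf("picture %dx%d decoded\n",
               de265_get_image_width(img, 0),
               de265_get_image_height(img, 0));
      }
    }
  } while (n > 0);

  de265_free_decoder(ctx);
  fclose(fh);
  return 0;
}
```

This is essentially the shape of what the bundled dec265 tool does, minus threading, timing and output options.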
6163
6264 - Qt (required for sherlock265),
6365
64 - libvideogfx (required for sherlock265, optional for dec265).
66 - libswscale (required for sherlock265 if libvideogfx is not available).
67
68 - libvideogfx (required for sherlock265 if libswscale is not available,
69 optional for dec265).
6570
6671 Libvideogfx can be obtained from
6772 http://www.dirk-farin.net/software/libvideogfx/index.html
96101
97102 - gstreamer plugin, [source](https://github.com/strukturag/gstreamer-libde265), [binary packages](https://launchpad.net/~strukturag/+archive/libde265).
98103
104 - VLC plugin [source](https://github.com/strukturag/vlc-libde265), [binary packages](https://launchpad.net/~strukturag/+archive/libde265).
105
99106 - Windows DirectShow filters, https://github.com/strukturag/LAVFilters/releases
100107
101108 - ffmpeg fork, https://github.com/farindk/ffmpeg
109
110 - ffmpeg decoder [source](https://github.com/strukturag/libde265-ffmpeg)
111
112 - libde265.js JavaScript decoder [source](https://github.com/strukturag/libde265.js), [demo](https://strukturag.github.io/libde265.js/).
102113
103114
104115 License
105116 =======
106117
107 libde265 is distributed under the terms of the GNU Lesser General Public License.
108 See COPYRIGHT for more details.
118 The library `libde265` is distributed under the terms of the GNU Lesser
119 General Public License. The sample applications are distributed under
120 the terms of the GNU General Public License.
121
122 See `COPYING` for more details.
109123
110124 Copyright (c) 2013-2014 Struktur AG
111125 Contact: Dirk Farin <farin@struktur.de>
TODO (0 additions, 8 deletions)
0 /mnt/temp/dirk/hevc-bbc-bitstreams/ftp.kw.bbc.co.uk/hevc/hm-10.1-anchors/bitstreams/ra_main/SlideShow_1280x720_20_qp22.bin
1 /mnt/temp/dirk/hevc-bbc-bitstreams/ftp.kw.bbc.co.uk/hevc/hm-10.1-anchors/bitstreams/ra_main/SlideShow_1280x720_20_qp27.bin
2 SEI decoded picture MD5 mismatch (POC=57)
3 decoding error: image checksum mismatch
4 /mnt/temp/dirk/hevc-bbc-bitstreams/ftp.kw.bbc.co.uk/hevc/hm-10.1-anchors/bitstreams/ra_main/SlideShow_1280x720_20_qp32.bin
5 /mnt/temp/dirk/hevc-bbc-bitstreams/ftp.kw.bbc.co.uk/hevc/hm-10.1-anchors/bitstreams/ra_main/SlideShow_1280x720_20_qp37.bin
6 SEI decoded picture MD5 mismatch (POC=6)
7 decoding error: image checksum mismatch
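The "SEI decoded picture MD5 mismatch" entries in the removed TODO lines above come from the decoder comparing each reconstructed picture against the hash carried in the stream's SEI messages, which appears to be the check that the `-c` flag of dec265 enables in the CI commands near the top of this diff. A client can turn the same verification on through the parameter API; the sketch below is an assumption based on memory of the 0.8-era `de265.h` and the parameter name should be verified against the actual header.

```c
#include <libde265/de265.h>

/* Hypothetical helper (not from this diff): create a decoder that verifies
 * each reconstructed picture against the MD5/CRC carried in SEI messages.
 * DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH is quoted from memory and should
 * be checked against the installed de265.h. */
static de265_decoder_context* new_checking_decoder(void)
{
  de265_decoder_context* ctx = de265_new_decoder();
  if (ctx) {
    de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, 1);
  }
  return ctx;
}
```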
0 # generated automatically by aclocal 1.13.3 -*- Autoconf -*-
0 # generated automatically by aclocal 1.14.1 -*- Autoconf -*-
11
22 # Copyright (C) 1996-2013 Free Software Foundation, Inc.
33
191191 # generated from the m4 files accompanying Automake X.Y.
192192 # (This private macro should not be called outside this file.)
193193 AC_DEFUN([AM_AUTOMAKE_VERSION],
194 [am__api_version='1.13'
194 [am__api_version='1.14'
195195 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
196196 dnl require some minimum version. Point them to the right macro.
197 m4_if([$1], [1.13.3], [],
197 m4_if([$1], [1.14.1], [],
198198 [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
199199 ])
200200
210210 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
211211 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
212212 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
213 [AM_AUTOMAKE_VERSION([1.13.3])dnl
213 [AM_AUTOMAKE_VERSION([1.14.1])dnl
214214 m4_ifndef([AC_AUTOCONF_VERSION],
215215 [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
216216 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
577577 # This macro actually does too much. Some checks are only needed if
578578 # your package does certain things. But this isn't really a big deal.
579579
580 dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O.
581 m4_define([AC_PROG_CC],
582 m4_defn([AC_PROG_CC])
583 [_AM_PROG_CC_C_O
584 ])
585
580586 # AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
581587 # AM_INIT_AUTOMAKE([OPTIONS])
582588 # -----------------------------------------------
685691 AC_CONFIG_COMMANDS_PRE(dnl
686692 [m4_provide_if([_AM_COMPILER_EXEEXT],
687693 [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl
688 ])
694
695 # POSIX will say in a future version that running "rm -f" with no argument
696 # is OK; and we want to be able to make that assumption in our Makefile
697 # recipes. So use an aggressive probe to check that the usage we want is
698 # actually supported "in the wild" to an acceptable degree.
699 # See automake bug#10828.
700 # To make any issue more visible, cause the running configure to be aborted
701 # by default if the 'rm' program in use doesn't match our expectations; the
702 # user can still override this though.
703 if rm -f && rm -fr && rm -rf; then : OK; else
704 cat >&2 <<'END'
705 Oops!
706
707 Your 'rm' program seems unable to run without file operands specified
708 on the command line, even when the '-f' option is present. This is contrary
709 to the behaviour of most rm programs out there, and not conforming with
710 the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
711
712 Please tell bug-automake@gnu.org about your system, including the value
713 of your $PATH and any error possibly output before this message. This
714 can help us improve future automake versions.
715
716 END
717 if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
718 echo 'Configuration will proceed anyway, since you have set the' >&2
719 echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
720 echo >&2
721 else
722 cat >&2 <<'END'
723 Aborting the configuration process, to ensure you take notice of the issue.
724
725 You can download and install GNU coreutils to get an 'rm' implementation
726 that behaves properly: <http://www.gnu.org/software/coreutils/>.
727
728 If you want to complete the configuration process using your problematic
729 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
730 to "yes", and re-run configure.
731
732 END
733 AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
734 fi
735 fi])
689736
690737 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
691738 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
692739 dnl mangled by Autoconf and run in a shell conditional statement.
693740 m4_define([_AC_COMPILER_EXEEXT],
694741 m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])])
695
696742
697743 # When config.status generates a header, we must update the stamp-h file.
698744 # This file resides in the same directory as the config header
874920 # Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
875921 AC_DEFUN([_AM_IF_OPTION],
876922 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
923
924 # Copyright (C) 1999-2013 Free Software Foundation, Inc.
925 #
926 # This file is free software; the Free Software Foundation
927 # gives unlimited permission to copy and/or distribute it,
928 # with or without modifications, as long as this notice is preserved.
929
930 # _AM_PROG_CC_C_O
931 # ---------------
932 # Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC
933 # to automatically call this.
934 AC_DEFUN([_AM_PROG_CC_C_O],
935 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
936 AC_REQUIRE_AUX_FILE([compile])dnl
937 AC_LANG_PUSH([C])dnl
938 AC_CACHE_CHECK(
939 [whether $CC understands -c and -o together],
940 [am_cv_prog_cc_c_o],
941 [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
942 # Make sure it works both with $CC and with simple cc.
943 # Following AC_PROG_CC_C_O, we do the test twice because some
944 # compilers refuse to overwrite an existing .o file with -o,
945 # though they will create one.
946 am_cv_prog_cc_c_o=yes
947 for am_i in 1 2; do
948 if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
949 && test -f conftest2.$ac_objext; then
950 : OK
951 else
952 am_cv_prog_cc_c_o=no
953 break
954 fi
955 done
956 rm -f core conftest*
957 unset am_i])
958 if test "$am_cv_prog_cc_c_o" != yes; then
959 # Losing compiler, so override with the script.
960 # FIXME: It is wrong to rewrite CC.
961 # But if we don't then we get into trouble of one sort or another.
962 # A longer-term fix would be to have automake use am__CC in this case,
963 # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
964 CC="$am_aux_dir/compile $CC"
965 fi
966 AC_LANG_POP([C])])
967
968 # For backward compatibility.
969 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
970
971 # Copyright (C) 2001-2013 Free Software Foundation, Inc.
972 #
973 # This file is free software; the Free Software Foundation
974 # gives unlimited permission to copy and/or distribute it,
975 # with or without modifications, as long as this notice is preserved.
976
977 # AM_RUN_LOG(COMMAND)
978 # -------------------
979 # Run COMMAND, save the exit status in ac_status, and log it.
980 # (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
981 AC_DEFUN([AM_RUN_LOG],
982 [{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
983 ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
984 ac_status=$?
985 echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
986 (exit $ac_status); }])
877987
878988 # Check to make sure that the build environment is sane. -*- Autoconf -*-
879989
0 #! /bin/sh
1 # Wrapper for compilers which do not understand '-c -o'.
2
3 scriptversion=2012-10-14.11; # UTC
4
5 # Copyright (C) 1999-2013 Free Software Foundation, Inc.
6 # Written by Tom Tromey <tromey@cygnus.com>.
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2, or (at your option)
11 # any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 # As a special exception to the GNU General Public License, if you
22 # distribute this file as part of a program that contains a
23 # configuration script generated by Autoconf, you may include it under
24 # the same distribution terms that you use for the rest of that program.
25
26 # This file is maintained in Automake, please report
27 # bugs to <bug-automake@gnu.org> or send patches to
28 # <automake-patches@gnu.org>.
29
30 nl='
31 '
32
33 # We need space, tab and new line, in precisely that order. Quoting is
34 # there to prevent tools from complaining about whitespace usage.
35 IFS=" "" $nl"
36
37 file_conv=
38
39 # func_file_conv build_file lazy
40 # Convert a $build file to $host form and store it in $file
41 # Currently only supports Windows hosts. If the determined conversion
42 # type is listed in (the comma separated) LAZY, no conversion will
43 # take place.
44 func_file_conv ()
45 {
46 file=$1
47 case $file in
48 / | /[!/]*) # absolute file, and not a UNC file
49 if test -z "$file_conv"; then
50 # lazily determine how to convert abs files
51 case `uname -s` in
52 MINGW*)
53 file_conv=mingw
54 ;;
55 CYGWIN*)
56 file_conv=cygwin
57 ;;
58 *)
59 file_conv=wine
60 ;;
61 esac
62 fi
63 case $file_conv/,$2, in
64 *,$file_conv,*)
65 ;;
66 mingw/*)
67 file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
68 ;;
69 cygwin/*)
70 file=`cygpath -m "$file" || echo "$file"`
71 ;;
72 wine/*)
73 file=`winepath -w "$file" || echo "$file"`
74 ;;
75 esac
76 ;;
77 esac
78 }
79
80 # func_cl_dashL linkdir
81 # Make cl look for libraries in LINKDIR
82 func_cl_dashL ()
83 {
84 func_file_conv "$1"
85 if test -z "$lib_path"; then
86 lib_path=$file
87 else
88 lib_path="$lib_path;$file"
89 fi
90 linker_opts="$linker_opts -LIBPATH:$file"
91 }
92
93 # func_cl_dashl library
94 # Do a library search-path lookup for cl
95 func_cl_dashl ()
96 {
97 lib=$1
98 found=no
99 save_IFS=$IFS
100 IFS=';'
101 for dir in $lib_path $LIB
102 do
103 IFS=$save_IFS
104 if $shared && test -f "$dir/$lib.dll.lib"; then
105 found=yes
106 lib=$dir/$lib.dll.lib
107 break
108 fi
109 if test -f "$dir/$lib.lib"; then
110 found=yes
111 lib=$dir/$lib.lib
112 break
113 fi
114 if test -f "$dir/lib$lib.a"; then
115 found=yes
116 lib=$dir/lib$lib.a
117 break
118 fi
119 done
120 IFS=$save_IFS
121
122 if test "$found" != yes; then
123 lib=$lib.lib
124 fi
125 }
126
127 # func_cl_wrapper cl arg...
128 # Adjust compile command to suit cl
129 func_cl_wrapper ()
130 {
131 # Assume a capable shell
132 lib_path=
133 shared=:
134 linker_opts=
135 for arg
136 do
137 if test -n "$eat"; then
138 eat=
139 else
140 case $1 in
141 -o)
142 # configure might choose to run compile as 'compile cc -o foo foo.c'.
143 eat=1
144 case $2 in
145 *.o | *.[oO][bB][jJ])
146 func_file_conv "$2"
147 set x "$@" -Fo"$file"
148 shift
149 ;;
150 *)
151 func_file_conv "$2"
152 set x "$@" -Fe"$file"
153 shift
154 ;;
155 esac
156 ;;
157 -I)
158 eat=1
159 func_file_conv "$2" mingw
160 set x "$@" -I"$file"
161 shift
162 ;;
163 -I*)
164 func_file_conv "${1#-I}" mingw
165 set x "$@" -I"$file"
166 shift
167 ;;
168 -l)
169 eat=1
170 func_cl_dashl "$2"
171 set x "$@" "$lib"
172 shift
173 ;;
174 -l*)
175 func_cl_dashl "${1#-l}"
176 set x "$@" "$lib"
177 shift
178 ;;
179 -L)
180 eat=1
181 func_cl_dashL "$2"
182 ;;
183 -L*)
184 func_cl_dashL "${1#-L}"
185 ;;
186 -static)
187 shared=false
188 ;;
189 -Wl,*)
190 arg=${1#-Wl,}
191 save_ifs="$IFS"; IFS=','
192 for flag in $arg; do
193 IFS="$save_ifs"
194 linker_opts="$linker_opts $flag"
195 done
196 IFS="$save_ifs"
197 ;;
198 -Xlinker)
199 eat=1
200 linker_opts="$linker_opts $2"
201 ;;
202 -*)
203 set x "$@" "$1"
204 shift
205 ;;
206 *.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
207 func_file_conv "$1"
208 set x "$@" -Tp"$file"
209 shift
210 ;;
211 *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
212 func_file_conv "$1" mingw
213 set x "$@" "$file"
214 shift
215 ;;
216 *)
217 set x "$@" "$1"
218 shift
219 ;;
220 esac
221 fi
222 shift
223 done
224 if test -n "$linker_opts"; then
225 linker_opts="-link$linker_opts"
226 fi
227 exec "$@" $linker_opts
228 exit 1
229 }
230
231 eat=
232
233 case $1 in
234 '')
235 echo "$0: No command. Try '$0 --help' for more information." 1>&2
236 exit 1;
237 ;;
238 -h | --h*)
239 cat <<\EOF
240 Usage: compile [--help] [--version] PROGRAM [ARGS]
241
242 Wrapper for compilers which do not understand '-c -o'.
243 Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
244 arguments, and rename the output as expected.
245
246 If you are trying to build a whole package this is not the
247 right script to run: please start by reading the file 'INSTALL'.
248
249 Report bugs to <bug-automake@gnu.org>.
250 EOF
251 exit $?
252 ;;
253 -v | --v*)
254 echo "compile $scriptversion"
255 exit $?
256 ;;
257 cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
258 func_cl_wrapper "$@" # Doesn't return...
259 ;;
260 esac
261
262 ofile=
263 cfile=
264
265 for arg
266 do
267 if test -n "$eat"; then
268 eat=
269 else
270 case $1 in
271 -o)
272 # configure might choose to run compile as 'compile cc -o foo foo.c'.
273 # So we strip '-o arg' only if arg is an object.
274 eat=1
275 case $2 in
276 *.o | *.obj)
277 ofile=$2
278 ;;
279 *)
280 set x "$@" -o "$2"
281 shift
282 ;;
283 esac
284 ;;
285 *.c)
286 cfile=$1
287 set x "$@" "$1"
288 shift
289 ;;
290 *)
291 set x "$@" "$1"
292 shift
293 ;;
294 esac
295 fi
296 shift
297 done
298
299 if test -z "$ofile" || test -z "$cfile"; then
300 # If no '-o' option was seen then we might have been invoked from a
301 # pattern rule where we don't need one. That is ok -- this is a
302 # normal compilation that the losing compiler can handle. If no
303 # '.c' file was seen then we are probably linking. That is also
304 # ok.
305 exec "$@"
306 fi
307
308 # Name of file we expect compiler to create.
309 cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
310
311 # Create the lock directory.
312 # Note: use '[/\\:.-]' here to ensure that we don't use the same name
313 # that we are using for the .o file. Also, base the name on the expected
314 # object file name, since that is what matters with a parallel build.
315 lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
316 while true; do
317 if mkdir "$lockdir" >/dev/null 2>&1; then
318 break
319 fi
320 sleep 1
321 done
322 # FIXME: race condition here if user kills between mkdir and trap.
323 trap "rmdir '$lockdir'; exit 1" 1 2 15
324
325 # Run the compile.
326 "$@"
327 ret=$?
328
329 if test -f "$cofile"; then
330 test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
331 elif test -f "${cofile}bj"; then
332 test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
333 fi
334
335 rmdir "$lockdir"
336 exit $ret
337
338 # Local Variables:
339 # mode: shell-script
340 # sh-indentation: 2
341 # eval: (add-hook 'write-file-hooks 'time-stamp)
342 # time-stamp-start: "scriptversion="
343 # time-stamp-format: "%:y-%02m-%02d.%02H"
344 # time-stamp-time-zone: "UTC"
345 # time-stamp-end: "; # UTC"
346 # End:
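The newly added 'compile' helper above is what the aclocal.m4 hunk and the regenerated configure point CC at ("$am_aux_dir/compile $CC") whenever the compiler cannot handle '-c' and '-o' together; for MSVC's cl it additionally maps -o/-I/-l/-L/-Wl, style options onto -Fo/-Fe, -I, library lookups, -LIBPATH and -link. A rough usage sketch, with the hypothetical compiler name 'oldcc' standing in for a compiler that rejects '-c -o':

    # 'oldcc' is a placeholder, not a real tool. The wrapper strips
    # '-o sub/foo.o' (only because the argument is an object file),
    # runs 'oldcc -c foo.c' under a lock directory derived from the
    # object name (foo_o.d), then renames the produced foo.o to sub/foo.o.
    ./compile oldcc -c -o sub/foo.o foo.c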
2626 /* Define to 1 if you have the `posix_memalign' function. */
2727 #undef HAVE_POSIX_MEMALIGN
2828
29 /* Whether libSDL was found. */
29 /* Whether libsdl was found. */
3030 #undef HAVE_SDL
3131
3232 /* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
4646
4747 /* Define to 1 if you have the <string.h> header file. */
4848 #undef HAVE_STRING_H
49
50 /* Whether libswscale was found. */
51 #undef HAVE_SWSCALE
4952
5053 /* Define to 1 if you have the <sys/stat.h> header file. */
5154 #undef HAVE_SYS_STAT_H
00 #! /bin/sh
11 # Guess values for system-dependent variables and create Makefiles.
2 # Generated by GNU Autoconf 2.69 for libde265 0.6.
2 # Generated by GNU Autoconf 2.69 for libde265 0.8.
33 #
44 # Report bugs to <farin@struktur.de>.
55 #
589589 # Identity of this package.
590590 PACKAGE_NAME='libde265'
591591 PACKAGE_TARNAME='libde265'
592 PACKAGE_VERSION='0.6'
593 PACKAGE_STRING='libde265 0.6'
592 PACKAGE_VERSION='0.8'
593 PACKAGE_STRING='libde265 0.8'
594594 PACKAGE_BUGREPORT='farin@struktur.de'
595595 PACKAGE_URL=''
596596
597 ac_unique_file="libde265/de265.c"
597 ac_unique_file="libde265/de265.cc"
598598 # Factoring default headers for most tests.
599599 ac_includes_default="\
600600 #include <stdio.h>
643643 QT_CFLAGS
644644 HAVE_SDL_FALSE
645645 HAVE_SDL_TRUE
646 HAVE_SWSCALE_FALSE
647 HAVE_SWSCALE_TRUE
648 HAVE_VIDEOGFX_FALSE
649 HAVE_VIDEOGFX_TRUE
650 SWSCALE_LIBS
651 SWSCALE_CFLAGS
646652 SDL_LIBS
647653 SDL_CFLAGS
648654 VIDEOGFX_LIBS
793799 enable_libtool_lock
794800 enable_dependency_tracking
795801 enable_silent_rules
802 enable_sse
796803 enable_log_error
797804 enable_log_info
798805 enable_log_debug
820827 VIDEOGFX_LIBS
821828 SDL_CFLAGS
822829 SDL_LIBS
830 SWSCALE_CFLAGS
831 SWSCALE_LIBS
823832 QT_CFLAGS
824833 QT_LIBS'
825834
13621371 # Omit some internal or obsolete options to make the list less imposing.
13631372 # This message is too long to be a string in the A/UX 3.1 sh.
13641373 cat <<_ACEOF
1365 \`configure' configures libde265 0.6 to adapt to many kinds of systems.
1374 \`configure' configures libde265 0.8 to adapt to many kinds of systems.
13661375
13671376 Usage: $0 [OPTION]... [VAR=VALUE]...
13681377
14331442
14341443 if test -n "$ac_init_help"; then
14351444 case $ac_init_help in
1436 short | recursive ) echo "Configuration of libde265 0.6:";;
1445 short | recursive ) echo "Configuration of libde265 0.8:";;
14371446 esac
14381447 cat <<\_ACEOF
14391448
14521461 speeds up one-time build
14531462 --enable-silent-rules less verbose build output (undo: "make V=1")
14541463 --disable-silent-rules verbose build output (undo: "make V=0")
1464 --disable-sse disable SSE optimizations (default=no)
14551465 --enable-log-error turn on logging at error level (default=yes)
14561466 --enable-log-info turn on logging at info level (default=no)
14571467 --enable-log-debug turn on logging at debug level (default=no)
14911501 linker flags for VIDEOGFX, overriding pkg-config
14921502 SDL_CFLAGS C compiler flags for SDL, overriding pkg-config
14931503 SDL_LIBS linker flags for SDL, overriding pkg-config
1504 SWSCALE_CFLAGS
1505 C compiler flags for SWSCALE, overriding pkg-config
1506 SWSCALE_LIBS
1507 linker flags for SWSCALE, overriding pkg-config
14941508 QT_CFLAGS C compiler flags for QT, overriding pkg-config
14951509 QT_LIBS linker flags for QT, overriding pkg-config
14961510
15601574 test -n "$ac_init_help" && exit $ac_status
15611575 if $ac_init_version; then
15621576 cat <<\_ACEOF
1563 libde265 configure 0.6
1577 libde265 configure 0.8
15641578 generated by GNU Autoconf 2.69
15651579
15661580 Copyright (C) 2012 Free Software Foundation, Inc.
22342248 This file contains any messages produced by compilers while
22352249 running configure, to aid debugging if configure makes a mistake.
22362250
2237 It was created by libde265 $as_me 0.6, which was
2251 It was created by libde265 $as_me 0.8, which was
22382252 generated by GNU Autoconf 2.69. Invocation command line was
22392253
22402254 $ $0 $@
25862600 ac_config_headers="$ac_config_headers config.h"
25872601
25882602
2589 NUMERIC_VERSION=0x00060000 # Numeric representation of the version
2603 NUMERIC_VERSION=0x00080000 # Numeric representation of the version
25902604
25912605
25922606 LIBDE265_CURRENT=0
2593 LIBDE265_REVISION=5
2607 LIBDE265_REVISION=7
25942608 LIBDE265_AGE=0
25952609
25962610 # ---------------------------------------------------------------------------
28332847
28342848
28352849
2850
2851 # expand $ac_aux_dir to an absolute path
2852 am_aux_dir=`cd $ac_aux_dir && pwd`
28362853
28372854 ac_ext=c
28382855 ac_cpp='$CPP $CPPFLAGS'
36223639 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
36233640 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
36243641 ac_compiler_gnu=$ac_cv_c_compiler_gnu
3642
3643 ac_ext=c
3644 ac_cpp='$CPP $CPPFLAGS'
3645 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
3646 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
3647 ac_compiler_gnu=$ac_cv_c_compiler_gnu
3648 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
3649 $as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
3650 if ${am_cv_prog_cc_c_o+:} false; then :
3651 $as_echo_n "(cached) " >&6
3652 else
3653 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
3654 /* end confdefs.h. */
3655
3656 int
3657 main ()
3658 {
3659
3660 ;
3661 return 0;
3662 }
3663 _ACEOF
3664 # Make sure it works both with $CC and with simple cc.
3665 # Following AC_PROG_CC_C_O, we do the test twice because some
3666 # compilers refuse to overwrite an existing .o file with -o,
3667 # though they will create one.
3668 am_cv_prog_cc_c_o=yes
3669 for am_i in 1 2; do
3670 if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
3671 ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
3672 ac_status=$?
3673 echo "$as_me:$LINENO: \$? = $ac_status" >&5
3674 (exit $ac_status); } \
3675 && test -f conftest2.$ac_objext; then
3676 : OK
3677 else
3678 am_cv_prog_cc_c_o=no
3679 break
3680 fi
3681 done
3682 rm -f core conftest*
3683 unset am_i
3684 fi
3685 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
3686 $as_echo "$am_cv_prog_cc_c_o" >&6; }
3687 if test "$am_cv_prog_cc_c_o" != yes; then
3688 # Losing compiler, so override with the script.
3689 # FIXME: It is wrong to rewrite CC.
3690 # But if we don't then we get into trouble of one sort or another.
3691 # A longer-term fix would be to have automake use am__CC in this case,
3692 # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
3693 CC="$am_aux_dir/compile $CC"
3694 fi
3695 ac_ext=c
3696 ac_cpp='$CPP $CPPFLAGS'
3697 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
3698 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
3699 ac_compiler_gnu=$ac_cv_c_compiler_gnu
3700
36253701
36263702 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for a sed that does not truncate output" >&5
36273703 $as_echo_n "checking for a sed that does not truncate output... " >&6; }
59075983 rm -rf conftest*
59085984 ;;
59095985
5910 x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
5986 x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
59115987 s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
59125988 # Find out which ABI we are using.
59135989 echo 'int i;' > conftest.$ac_ext
59326008 ;;
59336009 esac
59346010 ;;
5935 ppc64-*linux*|powerpc64-*linux*)
6011 powerpc64le-*)
6012 LD="${LD-ld} -m elf32lppclinux"
6013 ;;
6014 powerpc64-*)
59366015 LD="${LD-ld} -m elf32ppclinux"
59376016 ;;
59386017 s390x-*linux*)
59516030 x86_64-*linux*)
59526031 LD="${LD-ld} -m elf_x86_64"
59536032 ;;
5954 ppc*-*linux*|powerpc*-*linux*)
6033 powerpcle-*)
6034 LD="${LD-ld} -m elf64lppc"
6035 ;;
6036 powerpc-*)
59556037 LD="${LD-ld} -m elf64ppc"
59566038 ;;
59576039 s390*-*linux*|s390*-*tpf*)
1509015172 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
1509115173 ac_compiler_gnu=$ac_cv_c_compiler_gnu
1509215174
15175 ac_ext=c
15176 ac_cpp='$CPP $CPPFLAGS'
15177 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
15178 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
15179 ac_compiler_gnu=$ac_cv_c_compiler_gnu
15180 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
15181 $as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
15182 if ${am_cv_prog_cc_c_o+:} false; then :
15183 $as_echo_n "(cached) " >&6
15184 else
15185 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
15186 /* end confdefs.h. */
15187
15188 int
15189 main ()
15190 {
15191
15192 ;
15193 return 0;
15194 }
15195 _ACEOF
15196 # Make sure it works both with $CC and with simple cc.
15197 # Following AC_PROG_CC_C_O, we do the test twice because some
15198 # compilers refuse to overwrite an existing .o file with -o,
15199 # though they will create one.
15200 am_cv_prog_cc_c_o=yes
15201 for am_i in 1 2; do
15202 if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
15203 ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
15204 ac_status=$?
15205 echo "$as_me:$LINENO: \$? = $ac_status" >&5
15206 (exit $ac_status); } \
15207 && test -f conftest2.$ac_objext; then
15208 : OK
15209 else
15210 am_cv_prog_cc_c_o=no
15211 break
15212 fi
15213 done
15214 rm -f core conftest*
15215 unset am_i
15216 fi
15217 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
15218 $as_echo "$am_cv_prog_cc_c_o" >&6; }
15219 if test "$am_cv_prog_cc_c_o" != yes; then
15220 # Losing compiler, so override with the script.
15221 # FIXME: It is wrong to rewrite CC.
15222 # But if we don't then we get into trouble of one sort or another.
15223 # A longer-term fix would be to have automake use am__CC in this case,
15224 # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
15225 CC="$am_aux_dir/compile $CC"
15226 fi
15227 ac_ext=c
15228 ac_cpp='$CPP $CPPFLAGS'
15229 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
15230 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
15231 ac_compiler_gnu=$ac_cv_c_compiler_gnu
15232
15233
1509315234 # Find a good install program. We prefer a C program (faster),
1509415235 # so one script is as good as another. But avoid the broken or
1509515236 # incompatible versions:
1519615337
1519715338
1519815339 # Initialize automake stuff
15199 am__api_version='1.13'
15340 am__api_version='1.14'
1520015341
1520115342 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5
1520215343 $as_echo_n "checking whether build environment is sane... " >&6; }
1527415415 # By default was `s,x,x', remove it if useless.
1527515416 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
1527615417 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
15277
15278 # expand $ac_aux_dir to an absolute path
15279 am_aux_dir=`cd $ac_aux_dir && pwd`
1528015418
1528115419 if test x"${MISSING+set}" != xset; then
1528215420 case $am_aux_dir in
1561015748
1561115749 # Define the identity of the package.
1561215750 PACKAGE='libde265'
15613 VERSION='0.6'
15751 VERSION='0.8'
1561415752
1561515753
1561615754 cat >>confdefs.h <<_ACEOF
1591716055
1591816056
1591916057
16058 # POSIX will say in a future version that running "rm -f" with no argument
16059 # is OK; and we want to be able to make that assumption in our Makefile
16060 # recipes. So use an aggressive probe to check that the usage we want is
16061 # actually supported "in the wild" to an acceptable degree.
16062 # See automake bug#10828.
16063 # To make any issue more visible, cause the running configure to be aborted
16064 # by default if the 'rm' program in use doesn't match our expectations; the
16065 # user can still override this though.
16066 if rm -f && rm -fr && rm -rf; then : OK; else
16067 cat >&2 <<'END'
16068 Oops!
16069
16070 Your 'rm' program seems unable to run without file operands specified
16071 on the command line, even when the '-f' option is present. This is contrary
16072 to the behaviour of most rm programs out there, and not conforming with
16073 the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
16074
16075 Please tell bug-automake@gnu.org about your system, including the value
16076 of your $PATH and any error possibly output before this message. This
16077 can help us improve future automake versions.
16078
16079 END
16080 if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
16081 echo 'Configuration will proceed anyway, since you have set the' >&2
16082 echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
16083 echo >&2
16084 else
16085 cat >&2 <<'END'
16086 Aborting the configuration process, to ensure you take notice of the issue.
16087
16088 You can download and install GNU coreutils to get an 'rm' implementation
16089 that behaves properly: <http://www.gnu.org/software/coreutils/>.
16090
16091 If you want to complete the configuration process using your problematic
16092 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
16093 to "yes", and re-run configure.
16094
16095 END
16096 as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
16097 fi
16098 fi
1592016099
1592116100 CFLAGS+=" -std=c99"
15922 CXXFLAGS+=" -std=c++0x"
1592316101
1592416102 if test "x$GCC" = "xyes"; then
1592516103 case " $CFLAGS " in
1626716445
1626816446 #AX_EXT
1626916447
16270 case $target_cpu in
16271 powerpc*)
16272 ;;
16273
16274 i[3456]86*|x86_64*|amd64*)
16275
16276 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -msse4.1" >&5
16448 # Check whether --enable-sse was given.
16449 if test "${enable_sse+set}" = set; then :
16450 enableval=$enable_sse; disable_sse=yes
16451 else
16452 disable_sse=no
16453 fi
16454
16455
16456 if eval "test x$disable_sse != xyes"; then
16457 case $target_cpu in
16458 powerpc*)
16459 ;;
16460
16461 i[3456]86*|x86_64*|amd64*)
16462
16463 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -msse4.1" >&5
1627716464 $as_echo_n "checking whether C compiler accepts -msse4.1... " >&6; }
1627816465 if ${ax_cv_check_cflags___msse4_1+:} false; then :
1627916466 $as_echo_n "(cached) " >&6
1630816495 :
1630916496 fi
1631016497
16311 if test x"$ax_cv_support_sse41_ext" = x"yes"; then
16312 # SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
16498 if test x"$ax_cv_support_sse41_ext" = x"yes"; then
16499 # SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
1631316500
1631416501 $as_echo "#define HAVE_SSE4_1 1" >>confdefs.h
1631516502
16316 else
16317 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Your compiler does not support SSE4.1 instructions, can you try another compiler?" >&5
16503 else
16504 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Your compiler does not support SSE4.1 instructions, can you try another compiler?" >&5
1631816505 $as_echo "$as_me: WARNING: Your compiler does not support SSE4.1 instructions, can you try another compiler?" >&2;}
16319 fi
16320 ;;
16321
16322 esac
16323
16506 fi
16507 ;;
16508
16509 esac
16510 fi
1632416511 if test x"$ax_cv_support_sse41_ext" = x"yes"; then
1632516512 ENABLE_SSE_OPT_TRUE=
1632616513 ENABLE_SSE_OPT_FALSE='#'
1634416531 fi
1634516532
1634616533 if eval "test $enable_log_error = yes"; then
16347 CFLAGS+=" -DDE265_LOG_ERROR"
16534 CXXFLAGS+=" -DDE265_LOG_ERROR"
1634816535 fi
1634916536
1635016537 # Check whether --enable-log-info was given.
1635516542 fi
1635616543
1635716544 if eval "test $enable_log_info = yes"; then
16358 CFLAGS+=" -DDE265_LOG_INFO"
16545 CXXFLAGS+=" -DDE265_LOG_INFO"
1635916546 fi
1636016547
1636116548 # Check whether --enable-log-debug was given.
1636616553 fi
1636716554
1636816555 if eval "test $enable_log_debug = yes"; then
16369 CFLAGS+=" -DDE265_LOG_DEBUG"
16556 CXXFLAGS+=" -DDE265_LOG_DEBUG"
1637016557 fi
1637116558
1637216559 # Check whether --enable-log-trace was given.
1637716564 fi
1637816565
1637916566 if eval "test $enable_log_trace = yes"; then
16380 CFLAGS+=" -DDE265_LOG_TRACE"
16567 CXXFLAGS+=" -DDE265_LOG_TRACE"
1638116568 fi
1638216569
1638316570
1657716764 # Put the nasty error message in config.log where it belongs
1657816765 echo "$VIDEOGFX_PKG_ERRORS" >&5
1657916766
16580 enable_sherlock265="no"
16581 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libvideogfx, compilation of sherlock265 will be disabled." >&5
16582 $as_echo "$as_me: WARNING: Did not find libvideogfx, compilation of sherlock265 will be disabled." >&2;}
16767 have_videogfx="no"
1658316768
1658416769 elif test $pkg_failed = untried; then
1658516770 { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
1658616771 $as_echo "no" >&6; }
16587 enable_sherlock265="no"
16588 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libvideogfx, compilation of sherlock265 will be disabled." >&5
16589 $as_echo "$as_me: WARNING: Did not find libvideogfx, compilation of sherlock265 will be disabled." >&2;}
16772 have_videogfx="no"
1659016773
1659116774 else
1659216775 VIDEOGFX_CFLAGS=$pkg_cv_VIDEOGFX_CFLAGS
1659816781
1659916782
1660016783
16601 fi
16602 fi
16603
16604 if eval "test x$enable_dec265 = xyes" || eval "test x$enable_sherlock265 = xyes" ; then
16784 have_videogfx="yes"
16785 fi
16786 fi
16787
16788 if eval "test x$enable_dec265 = xyes" ; then
1660516789
1660616790 pkg_failed=no
1660716791 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SDL" >&5
1666216846 echo "$SDL_PKG_ERRORS" >&5
1666316847
1666416848 have_sdl="no"
16665 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libsdl, video output of dec265 will be disabled." >&5
16666 $as_echo "$as_me: WARNING: Did not find libsdl, video output of dec265 will be disabled." >&2;}
1666716849
1666816850 elif test $pkg_failed = untried; then
1666916851 { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
1667016852 $as_echo "no" >&6; }
1667116853 have_sdl="no"
16672 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libsdl, video output of dec265 will be disabled." >&5
16673 $as_echo "$as_me: WARNING: Did not find libsdl, video output of dec265 will be disabled." >&2;}
1667416854
1667516855 else
1667616856 SDL_CFLAGS=$pkg_cv_SDL_CFLAGS
1668616866 fi
1668716867 fi
1668816868
16689 if test "x$have_sdl" != "xno"; then
16869 if eval "test x$enable_sherlock265 = xyes" && eval "test x$have_videogfx != xyes" ; then
16870
16871 pkg_failed=no
16872 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SWSCALE" >&5
16873 $as_echo_n "checking for SWSCALE... " >&6; }
16874
16875 if test -n "$SWSCALE_CFLAGS"; then
16876 pkg_cv_SWSCALE_CFLAGS="$SWSCALE_CFLAGS"
16877 elif test -n "$PKG_CONFIG"; then
16878 if test -n "$PKG_CONFIG" && \
16879 { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libswscale\""; } >&5
16880 ($PKG_CONFIG --exists --print-errors "libswscale") 2>&5
16881 ac_status=$?
16882 $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
16883 test $ac_status = 0; }; then
16884 pkg_cv_SWSCALE_CFLAGS=`$PKG_CONFIG --cflags "libswscale" 2>/dev/null`
16885 test "x$?" != "x0" && pkg_failed=yes
16886 else
16887 pkg_failed=yes
16888 fi
16889 else
16890 pkg_failed=untried
16891 fi
16892 if test -n "$SWSCALE_LIBS"; then
16893 pkg_cv_SWSCALE_LIBS="$SWSCALE_LIBS"
16894 elif test -n "$PKG_CONFIG"; then
16895 if test -n "$PKG_CONFIG" && \
16896 { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libswscale\""; } >&5
16897 ($PKG_CONFIG --exists --print-errors "libswscale") 2>&5
16898 ac_status=$?
16899 $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
16900 test $ac_status = 0; }; then
16901 pkg_cv_SWSCALE_LIBS=`$PKG_CONFIG --libs "libswscale" 2>/dev/null`
16902 test "x$?" != "x0" && pkg_failed=yes
16903 else
16904 pkg_failed=yes
16905 fi
16906 else
16907 pkg_failed=untried
16908 fi
16909
16910
16911
16912 if test $pkg_failed = yes; then
16913 { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
16914 $as_echo "no" >&6; }
16915
16916 if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
16917 _pkg_short_errors_supported=yes
16918 else
16919 _pkg_short_errors_supported=no
16920 fi
16921 if test $_pkg_short_errors_supported = yes; then
16922 SWSCALE_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libswscale" 2>&1`
16923 else
16924 SWSCALE_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libswscale" 2>&1`
16925 fi
16926 # Put the nasty error message in config.log where it belongs
16927 echo "$SWSCALE_PKG_ERRORS" >&5
16928
16929 have_swscale="no"
16930
16931 elif test $pkg_failed = untried; then
16932 { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
16933 $as_echo "no" >&6; }
16934 have_swscale="no"
16935
16936 else
16937 SWSCALE_CFLAGS=$pkg_cv_SWSCALE_CFLAGS
16938 SWSCALE_LIBS=$pkg_cv_SWSCALE_LIBS
16939 { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
16940 $as_echo "yes" >&6; }
16941
16942 $as_echo "#define HAVE_SWSCALE 1" >>confdefs.h
16943
16944
16945
16946 have_swscale="yes"
16947 fi
16948 fi
16949
16950 if test "x$have_videogfx" = "xyes"; then
16951 HAVE_VIDEOGFX_TRUE=
16952 HAVE_VIDEOGFX_FALSE='#'
16953 else
16954 HAVE_VIDEOGFX_TRUE='#'
16955 HAVE_VIDEOGFX_FALSE=
16956 fi
16957
16958 if test "x$have_swscale" = "xyes"; then
16959 HAVE_SWSCALE_TRUE=
16960 HAVE_SWSCALE_FALSE='#'
16961 else
16962 HAVE_SWSCALE_TRUE='#'
16963 HAVE_SWSCALE_FALSE=
16964 fi
16965
16966 if test "x$have_sdl" = "xyes"; then
1669016967 HAVE_SDL_TRUE=
1669116968 HAVE_SDL_FALSE='#'
1669216969 else
1669416971 HAVE_SDL_FALSE=
1669516972 fi
1669616973
16974
16975 if eval "test $enable_dec265 = yes" && eval "test $have_videogfx != yes" && eval "test $have_sdl != yes" ; then
16976 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libvideogfx or libsdl, video output of dec265 will be disabled." >&5
16977 $as_echo "$as_me: WARNING: Did not find libvideogfx or libsdl, video output of dec265 will be disabled." >&2;}
16978 fi
16979
16980 if eval "test $enable_sherlock265 = yes" && eval "test $have_videogfx != yes" && eval "test $have_swscale != yes" ; then
16981 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Did not find libvideogfx or libswscale, compilation of sherlock265 will be disabled." >&5
16982 $as_echo "$as_me: WARNING: Did not find libvideogfx or libswscale, compilation of sherlock265 will be disabled." >&2;}
16983 enable_sherlock265="no"
16984 fi
1669716985
1669816986 if eval "test $enable_sherlock265 = yes" ; then
1669916987
1680817096
1680917097
1681017098 # --- output configuration results ---
17099
17100 { $as_echo "$as_me:${as_lineno-$LINENO}: ---------------------------------------" >&5
17101 $as_echo "$as_me: ---------------------------------------" >&6;}
17102 { $as_echo "$as_me:${as_lineno-$LINENO}: Building dec265 example: $enable_dec265" >&5
17103 $as_echo "$as_me: Building dec265 example: $enable_dec265" >&6;}
17104 { $as_echo "$as_me:${as_lineno-$LINENO}: Building sherlock265 example: $enable_sherlock265" >&5
17105 $as_echo "$as_me: Building sherlock265 example: $enable_sherlock265" >&6;}
17106 { $as_echo "$as_me:${as_lineno-$LINENO}: ---------------------------------------" >&5
17107 $as_echo "$as_me: ---------------------------------------" >&6;}
1681117108
1681217109 ac_config_files="$ac_config_files Makefile"
1681317110
1696817265 as_fn_error $? "conditional \"ENABLE_SSE_OPT\" was never defined.
1696917266 Usually this means the macro was only invoked conditionally." "$LINENO" 5
1697017267 fi
17268 if test -z "${HAVE_VIDEOGFX_TRUE}" && test -z "${HAVE_VIDEOGFX_FALSE}"; then
17269 as_fn_error $? "conditional \"HAVE_VIDEOGFX\" was never defined.
17270 Usually this means the macro was only invoked conditionally." "$LINENO" 5
17271 fi
17272 if test -z "${HAVE_SWSCALE_TRUE}" && test -z "${HAVE_SWSCALE_FALSE}"; then
17273 as_fn_error $? "conditional \"HAVE_SWSCALE\" was never defined.
17274 Usually this means the macro was only invoked conditionally." "$LINENO" 5
17275 fi
1697117276 if test -z "${HAVE_SDL_TRUE}" && test -z "${HAVE_SDL_FALSE}"; then
1697217277 as_fn_error $? "conditional \"HAVE_SDL\" was never defined.
1697317278 Usually this means the macro was only invoked conditionally." "$LINENO" 5
1737717682 # report actual input values of CONFIG_FILES etc. instead of their
1737817683 # values after options handling.
1737917684 ac_log="
17380 This file was extended by libde265 $as_me 0.6, which was
17685 This file was extended by libde265 $as_me 0.8, which was
1738117686 generated by GNU Autoconf 2.69. Invocation command line was
1738217687
1738317688 CONFIG_FILES = $CONFIG_FILES
1744317748 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
1744417749 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
1744517750 ac_cs_version="\\
17446 libde265 config.status 0.6
17751 libde265 config.status 0.8
1744717752 configured by $0, generated by GNU Autoconf 2.69,
1744817753 with options \\"\$ac_cs_config\\"
1744917754
1948019785 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
1948119786 fi
1948219787
19483
19484
19485 echo "---------------------------------------"
19486 echo "Building dec265 example:" $enable_dec265
19487 echo "Building sherlock265 example:" $enable_sherlock265
19488 echo "---------------------------------------"
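The regenerated configure (now built for version 0.8 with automake 1.14) gains a --disable-sse switch, moves the DE265_LOG_* defines from CFLAGS to CXXFLAGS, and prints its build summary through AC_MSG_NOTICE instead of trailing echo lines. A hedged usage sketch from an unpacked source tree:

    # Disable the SSE4.1 code paths and enable info-level logging; the
    # summary lines below are representative only, actual values depend
    # on which optional libraries pkg-config finds on the system.
    ./configure --disable-sse --enable-log-info
    #   configure: ---------------------------------------
    #   configure: Building dec265 example: yes
    #   configure: Building sherlock265 example: no
    #   configure: ---------------------------------------
    make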
11 # Process this file with autoconf to produce a configure script.
22
33 AC_PREREQ([2.68])
4 AC_INIT([libde265], [0.6], [farin@struktur.de])
5 AC_CONFIG_SRCDIR([libde265/de265.c])
4 AC_INIT([libde265], [0.8], [farin@struktur.de])
5 AC_CONFIG_SRCDIR([libde265/de265.cc])
66 AC_CONFIG_HEADERS([config.h])
77
8 NUMERIC_VERSION=0x00060000 # Numeric representation of the version
8 NUMERIC_VERSION=0x00080000 # Numeric representation of the version
99 AC_SUBST(NUMERIC_VERSION)
1010
1111 LIBDE265_CURRENT=0
12 LIBDE265_REVISION=5
12 LIBDE265_REVISION=7
1313 LIBDE265_AGE=0
1414
1515 # ---------------------------------------------------------------------------
3434 AM_INIT_AUTOMAKE
3535
3636 CFLAGS+=" -std=c99"
37 CXXFLAGS+=" -std=c++0x"
3837
3938 dnl Use -Wall if we have gcc.
4039 changequote(,)dnl
7372
7473 #AX_EXT
7574
76 case $target_cpu in
77 powerpc*)
78 ;;
79
80 i[[3456]]86*|x86_64*|amd64*)
81
82 AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
83 if test x"$ax_cv_support_sse41_ext" = x"yes"; then
84 # SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
85 AC_DEFINE(HAVE_SSE4_1,1,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
86 else
87 AC_MSG_WARN([Your compiler does not support SSE4.1 instructions, can you try another compiler?])
88 fi
89 ;;
90
91 esac
92
75 AC_ARG_ENABLE(sse,
76 [AS_HELP_STRING([--disable-sse],
77 [disable SSE optimizations (default=no)])],
78 [disable_sse=yes],
79 [disable_sse=no])
80
81 if eval "test x$disable_sse != xyes"; then
82 case $target_cpu in
83 powerpc*)
84 ;;
85
86 i[[3456]]86*|x86_64*|amd64*)
87
88 AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
89 if test x"$ax_cv_support_sse41_ext" = x"yes"; then
90 # SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
91 AC_DEFINE(HAVE_SSE4_1,1,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
92 else
93 AC_MSG_WARN([Your compiler does not support SSE4.1 instructions, can you try another compiler?])
94 fi
95 ;;
96
97 esac
98 fi
9399 AM_CONDITIONAL([ENABLE_SSE_OPT], [test x"$ax_cv_support_sse41_ext" = x"yes"])
94100
95101 # CFLAGS+=$SIMD_FLAGS
104110 [enable_log_error=$enableval],
105111 [enable_log_error=yes])
106112 if eval "test $enable_log_error = yes"; then
107 CFLAGS+=" -DDE265_LOG_ERROR"
113 CXXFLAGS+=" -DDE265_LOG_ERROR"
108114 fi
109115
110116 AC_ARG_ENABLE(log-info,
113119 [enable_log_info=$enableval],
114120 [enable_log_info=no])
115121 if eval "test $enable_log_info = yes"; then
116 CFLAGS+=" -DDE265_LOG_INFO"
122 CXXFLAGS+=" -DDE265_LOG_INFO"
117123 fi
118124
119125 AC_ARG_ENABLE(log-debug,
122128 [enable_log_debug=$enableval],
123129 [enable_log_debug=no])
124130 if eval "test $enable_log_debug = yes"; then
125 CFLAGS+=" -DDE265_LOG_DEBUG"
131 CXXFLAGS+=" -DDE265_LOG_DEBUG"
126132 fi
127133
128134 AC_ARG_ENABLE(log-trace,
131137 [enable_log_trace=$enableval],
132138 [enable_log_trace=no])
133139 if eval "test $enable_log_trace = yes"; then
134 CFLAGS+=" -DDE265_LOG_TRACE"
140 CXXFLAGS+=" -DDE265_LOG_TRACE"
135141 fi
136142
137143
147153 PKG_CHECK_MODULES([VIDEOGFX], [libvideogfx],
148154 [AC_DEFINE([HAVE_VIDEOGFX], [1], [Whether libvideogfx was found.])
149155 AC_SUBST(VIDEOGFX_CFLAGS)
150 AC_SUBST(VIDEOGFX_LIBS)],
151 [enable_sherlock265="no"
152 AC_MSG_WARN([Did not find libvideogfx, compilation of sherlock265 will be disabled.])]
156 AC_SUBST(VIDEOGFX_LIBS)
157 have_videogfx="yes"],
158 [have_videogfx="no"]
153159 )
154160 fi
155161
156 if eval "test x$enable_dec265 = xyes" || eval "test x$enable_sherlock265 = xyes" ; then
162 if eval "test x$enable_dec265 = xyes" ; then
157163 PKG_CHECK_MODULES([SDL], [sdl],
158 [AC_DEFINE([HAVE_SDL], [1], [Whether libSDL was found.])
164 [AC_DEFINE([HAVE_SDL], [1], [Whether libsdl was found.])
159165 AC_SUBST(SDL_CFLAGS)
160166 AC_SUBST(SDL_LIBS)
161167 have_sdl="yes"],
162 [have_sdl="no"
163 AC_MSG_WARN([Did not find libsdl, video output of dec265 will be disabled.])]
168 [have_sdl="no"]
164169 )
165170 fi
166171
167 AM_CONDITIONAL([HAVE_SDL], [test "x$have_sdl" != "xno"])
172 if eval "test x$enable_sherlock265 = xyes" && eval "test x$have_videogfx != xyes" ; then
173 PKG_CHECK_MODULES([SWSCALE], [libswscale],
174 [AC_DEFINE([HAVE_SWSCALE], [1], [Whether libswscale was found.])
175 AC_SUBST(SWSCALE_CFLAGS)
176 AC_SUBST(SWSCALE_LIBS)
177 have_swscale="yes"],
178 [have_swscale="no"]
179 )
180 fi
181
182 AM_CONDITIONAL([HAVE_VIDEOGFX], [test "x$have_videogfx" = "xyes"])
183 AM_CONDITIONAL([HAVE_SWSCALE], [test "x$have_swscale" = "xyes"])
184 AM_CONDITIONAL([HAVE_SDL], [test "x$have_sdl" = "xyes"])
185
186 if eval "test $enable_dec265 = yes" && eval "test $have_videogfx != yes" && eval "test $have_sdl != yes" ; then
187 AC_MSG_WARN([Did not find libvideogfx or libsdl, video output of dec265 will be disabled.])
188 fi
189
190 if eval "test $enable_sherlock265 = yes" && eval "test $have_videogfx != yes" && eval "test $have_swscale != yes" ; then
191 AC_MSG_WARN([Did not find libvideogfx or libswscale, compilation of sherlock265 will be disabled.])
192 enable_sherlock265="no"
193 fi
168194
169195 if eval "test $enable_sherlock265 = yes" ; then
170196 PKG_CHECK_MODULES([QT], [QtCore QtGui])
175201
176202
177203 # --- output configuration results ---
204
205 AC_MSG_NOTICE([---------------------------------------])
206 AC_MSG_NOTICE([Building dec265 example: $enable_dec265])
207 AC_MSG_NOTICE([Building sherlock265 example: $enable_sherlock265])
208 AC_MSG_NOTICE([---------------------------------------])
178209
179210 AC_CONFIG_FILES([Makefile])
180211 AC_CONFIG_FILES([libde265/Makefile])
184215 AC_CONFIG_FILES([sherlock265/Makefile])
185216 AC_CONFIG_FILES([libde265.pc])
186217 AC_OUTPUT
187
188
189 echo "---------------------------------------"
190 echo "Building dec265 example:" $enable_dec265
191 echo "Building sherlock265 example:" $enable_sherlock265
192 echo "---------------------------------------"
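In configure.ac the dependency logic is reorganised: the libvideogfx and SDL probes now only record have_videogfx/have_sdl, libswscale is probed as a fallback for sherlock265 when libvideogfx is missing, and the warnings about disabled video output or a disabled sherlock265 build are emitted once after all probes have run. A small sketch of checking the same optional packages by hand with pkg-config (package names taken from the PKG_CHECK_MODULES calls above):

    # Reports the installed version of each optional dependency, or 'not found'.
    for pkg in libvideogfx sdl libswscale QtCore QtGui; do
        if pkg-config --exists "$pkg"; then
            echo "$pkg: $(pkg-config --modversion "$pkg")"
        else
            echo "$pkg: not found"
        fi
    done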
0 GNU GENERAL PUBLIC LICENSE
1 Version 3, 29 June 2007
2
3 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
4 Everyone is permitted to copy and distribute verbatim copies
5 of this license document, but changing it is not allowed.
6
7 Preamble
8
9 The GNU General Public License is a free, copyleft license for
10 software and other kinds of works.
11
12 The licenses for most software and other practical works are designed
13 to take away your freedom to share and change the works. By contrast,
14 the GNU General Public License is intended to guarantee your freedom to
15 share and change all versions of a program--to make sure it remains free
16 software for all its users. We, the Free Software Foundation, use the
17 GNU General Public License for most of our software; it applies also to
18 any other work released this way by its authors. You can apply it to
19 your programs, too.
20
21 When we speak of free software, we are referring to freedom, not
22 price. Our General Public Licenses are designed to make sure that you
23 have the freedom to distribute copies of free software (and charge for
24 them if you wish), that you receive source code or can get it if you
25 want it, that you can change the software or use pieces of it in new
26 free programs, and that you know you can do these things.
27
28 To protect your rights, we need to prevent others from denying you
29 these rights or asking you to surrender the rights. Therefore, you have
30 certain responsibilities if you distribute copies of the software, or if
31 you modify it: responsibilities to respect the freedom of others.
32
33 For example, if you distribute copies of such a program, whether
34 gratis or for a fee, you must pass on to the recipients the same
35 freedoms that you received. You must make sure that they, too, receive
36 or can get the source code. And you must show them these terms so they
37 know their rights.
38
39 Developers that use the GNU GPL protect your rights with two steps:
40 (1) assert copyright on the software, and (2) offer you this License
41 giving you legal permission to copy, distribute and/or modify it.
42
43 For the developers' and authors' protection, the GPL clearly explains
44 that there is no warranty for this free software. For both users' and
45 authors' sake, the GPL requires that modified versions be marked as
46 changed, so that their problems will not be attributed erroneously to
47 authors of previous versions.
48
49 Some devices are designed to deny users access to install or run
50 modified versions of the software inside them, although the manufacturer
51 can do so. This is fundamentally incompatible with the aim of
52 protecting users' freedom to change the software. The systematic
53 pattern of such abuse occurs in the area of products for individuals to
54 use, which is precisely where it is most unacceptable. Therefore, we
55 have designed this version of the GPL to prohibit the practice for those
56 products. If such problems arise substantially in other domains, we
57 stand ready to extend this provision to those domains in future versions
58 of the GPL, as needed to protect the freedom of users.
59
60 Finally, every program is threatened constantly by software patents.
61 States should not allow patents to restrict development and use of
62 software on general-purpose computers, but in those that do, we wish to
63 avoid the special danger that patents applied to a free program could
64 make it effectively proprietary. To prevent this, the GPL assures that
65 patents cannot be used to render the program non-free.
66
67 The precise terms and conditions for copying, distribution and
68 modification follow.
69
70 TERMS AND CONDITIONS
71
72 0. Definitions.
73
74 "This License" refers to version 3 of the GNU General Public License.
75
76 "Copyright" also means copyright-like laws that apply to other kinds of
77 works, such as semiconductor masks.
78
79 "The Program" refers to any copyrightable work licensed under this
80 License. Each licensee is addressed as "you". "Licensees" and
81 "recipients" may be individuals or organizations.
82
83 To "modify" a work means to copy from or adapt all or part of the work
84 in a fashion requiring copyright permission, other than the making of an
85 exact copy. The resulting work is called a "modified version" of the
86 earlier work or a work "based on" the earlier work.
87
88 A "covered work" means either the unmodified Program or a work based
89 on the Program.
90
91 To "propagate" a work means to do anything with it that, without
92 permission, would make you directly or secondarily liable for
93 infringement under applicable copyright law, except executing it on a
94 computer or modifying a private copy. Propagation includes copying,
95 distribution (with or without modification), making available to the
96 public, and in some countries other activities as well.
97
98 To "convey" a work means any kind of propagation that enables other
99 parties to make or receive copies. Mere interaction with a user through
100 a computer network, with no transfer of a copy, is not conveying.
101
102 An interactive user interface displays "Appropriate Legal Notices"
103 to the extent that it includes a convenient and prominently visible
104 feature that (1) displays an appropriate copyright notice, and (2)
105 tells the user that there is no warranty for the work (except to the
106 extent that warranties are provided), that licensees may convey the
107 work under this License, and how to view a copy of this License. If
108 the interface presents a list of user commands or options, such as a
109 menu, a prominent item in the list meets this criterion.
110
111 1. Source Code.
112
113 The "source code" for a work means the preferred form of the work
114 for making modifications to it. "Object code" means any non-source
115 form of a work.
116
117 A "Standard Interface" means an interface that either is an official
118 standard defined by a recognized standards body, or, in the case of
119 interfaces specified for a particular programming language, one that
120 is widely used among developers working in that language.
121
122 The "System Libraries" of an executable work include anything, other
123 than the work as a whole, that (a) is included in the normal form of
124 packaging a Major Component, but which is not part of that Major
125 Component, and (b) serves only to enable use of the work with that
126 Major Component, or to implement a Standard Interface for which an
127 implementation is available to the public in source code form. A
128 "Major Component", in this context, means a major essential component
129 (kernel, window system, and so on) of the specific operating system
130 (if any) on which the executable work runs, or a compiler used to
131 produce the work, or an object code interpreter used to run it.
132
133 The "Corresponding Source" for a work in object code form means all
134 the source code needed to generate, install, and (for an executable
135 work) run the object code and to modify the work, including scripts to
136 control those activities. However, it does not include the work's
137 System Libraries, or general-purpose tools or generally available free
138 programs which are used unmodified in performing those activities but
139 which are not part of the work. For example, Corresponding Source
140 includes interface definition files associated with source files for
141 the work, and the source code for shared libraries and dynamically
142 linked subprograms that the work is specifically designed to require,
143 such as by intimate data communication or control flow between those
144 subprograms and other parts of the work.
145
146 The Corresponding Source need not include anything that users
147 can regenerate automatically from other parts of the Corresponding
148 Source.
149
150 The Corresponding Source for a work in source code form is that
151 same work.
152
153 2. Basic Permissions.
154
155 All rights granted under this License are granted for the term of
156 copyright on the Program, and are irrevocable provided the stated
157 conditions are met. This License explicitly affirms your unlimited
158 permission to run the unmodified Program. The output from running a
159 covered work is covered by this License only if the output, given its
160 content, constitutes a covered work. This License acknowledges your
161 rights of fair use or other equivalent, as provided by copyright law.
162
163 You may make, run and propagate covered works that you do not
164 convey, without conditions so long as your license otherwise remains
165 in force. You may convey covered works to others for the sole purpose
166 of having them make modifications exclusively for you, or provide you
167 with facilities for running those works, provided that you comply with
168 the terms of this License in conveying all material for which you do
169 not control copyright. Those thus making or running the covered works
170 for you must do so exclusively on your behalf, under your direction
171 and control, on terms that prohibit them from making any copies of
172 your copyrighted material outside their relationship with you.
173
174 Conveying under any other circumstances is permitted solely under
175 the conditions stated below. Sublicensing is not allowed; section 10
176 makes it unnecessary.
177
178 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
179
180 No covered work shall be deemed part of an effective technological
181 measure under any applicable law fulfilling obligations under article
182 11 of the WIPO copyright treaty adopted on 20 December 1996, or
183 similar laws prohibiting or restricting circumvention of such
184 measures.
185
186 When you convey a covered work, you waive any legal power to forbid
187 circumvention of technological measures to the extent such circumvention
188 is effected by exercising rights under this License with respect to
189 the covered work, and you disclaim any intention to limit operation or
190 modification of the work as a means of enforcing, against the work's
191 users, your or third parties' legal rights to forbid circumvention of
192 technological measures.
193
194 4. Conveying Verbatim Copies.
195
196 You may convey verbatim copies of the Program's source code as you
197 receive it, in any medium, provided that you conspicuously and
198 appropriately publish on each copy an appropriate copyright notice;
199 keep intact all notices stating that this License and any
200 non-permissive terms added in accord with section 7 apply to the code;
201 keep intact all notices of the absence of any warranty; and give all
202 recipients a copy of this License along with the Program.
203
204 You may charge any price or no price for each copy that you convey,
205 and you may offer support or warranty protection for a fee.
206
207 5. Conveying Modified Source Versions.
208
209 You may convey a work based on the Program, or the modifications to
210 produce it from the Program, in the form of source code under the
211 terms of section 4, provided that you also meet all of these conditions:
212
213 a) The work must carry prominent notices stating that you modified
214 it, and giving a relevant date.
215
216 b) The work must carry prominent notices stating that it is
217 released under this License and any conditions added under section
218 7. This requirement modifies the requirement in section 4 to
219 "keep intact all notices".
220
221 c) You must license the entire work, as a whole, under this
222 License to anyone who comes into possession of a copy. This
223 License will therefore apply, along with any applicable section 7
224 additional terms, to the whole of the work, and all its parts,
225 regardless of how they are packaged. This License gives no
226 permission to license the work in any other way, but it does not
227 invalidate such permission if you have separately received it.
228
229 d) If the work has interactive user interfaces, each must display
230 Appropriate Legal Notices; however, if the Program has interactive
231 interfaces that do not display Appropriate Legal Notices, your
232 work need not make them do so.
233
234 A compilation of a covered work with other separate and independent
235 works, which are not by their nature extensions of the covered work,
236 and which are not combined with it such as to form a larger program,
237 in or on a volume of a storage or distribution medium, is called an
238 "aggregate" if the compilation and its resulting copyright are not
239 used to limit the access or legal rights of the compilation's users
240 beyond what the individual works permit. Inclusion of a covered work
241 in an aggregate does not cause this License to apply to the other
242 parts of the aggregate.
243
244 6. Conveying Non-Source Forms.
245
246 You may convey a covered work in object code form under the terms
247 of sections 4 and 5, provided that you also convey the
248 machine-readable Corresponding Source under the terms of this License,
249 in one of these ways:
250
251 a) Convey the object code in, or embodied in, a physical product
252 (including a physical distribution medium), accompanied by the
253 Corresponding Source fixed on a durable physical medium
254 customarily used for software interchange.
255
256 b) Convey the object code in, or embodied in, a physical product
257 (including a physical distribution medium), accompanied by a
258 written offer, valid for at least three years and valid for as
259 long as you offer spare parts or customer support for that product
260 model, to give anyone who possesses the object code either (1) a
261 copy of the Corresponding Source for all the software in the
262 product that is covered by this License, on a durable physical
263 medium customarily used for software interchange, for a price no
264 more than your reasonable cost of physically performing this
265 conveying of source, or (2) access to copy the
266 Corresponding Source from a network server at no charge.
267
268 c) Convey individual copies of the object code with a copy of the
269 written offer to provide the Corresponding Source. This
270 alternative is allowed only occasionally and noncommercially, and
271 only if you received the object code with such an offer, in accord
272 with subsection 6b.
273
274 d) Convey the object code by offering access from a designated
275 place (gratis or for a charge), and offer equivalent access to the
276 Corresponding Source in the same way through the same place at no
277 further charge. You need not require recipients to copy the
278 Corresponding Source along with the object code. If the place to
279 copy the object code is a network server, the Corresponding Source
280 may be on a different server (operated by you or a third party)
281 that supports equivalent copying facilities, provided you maintain
282 clear directions next to the object code saying where to find the
283 Corresponding Source. Regardless of what server hosts the
284 Corresponding Source, you remain obligated to ensure that it is
285 available for as long as needed to satisfy these requirements.
286
287 e) Convey the object code using peer-to-peer transmission, provided
288 you inform other peers where the object code and Corresponding
289 Source of the work are being offered to the general public at no
290 charge under subsection 6d.
291
292 A separable portion of the object code, whose source code is excluded
293 from the Corresponding Source as a System Library, need not be
294 included in conveying the object code work.
295
296 A "User Product" is either (1) a "consumer product", which means any
297 tangible personal property which is normally used for personal, family,
298 or household purposes, or (2) anything designed or sold for incorporation
299 into a dwelling. In determining whether a product is a consumer product,
300 doubtful cases shall be resolved in favor of coverage. For a particular
301 product received by a particular user, "normally used" refers to a
302 typical or common use of that class of product, regardless of the status
303 of the particular user or of the way in which the particular user
304 actually uses, or expects or is expected to use, the product. A product
305 is a consumer product regardless of whether the product has substantial
306 commercial, industrial or non-consumer uses, unless such uses represent
307 the only significant mode of use of the product.
308
309 "Installation Information" for a User Product means any methods,
310 procedures, authorization keys, or other information required to install
311 and execute modified versions of a covered work in that User Product from
312 a modified version of its Corresponding Source. The information must
313 suffice to ensure that the continued functioning of the modified object
314 code is in no case prevented or interfered with solely because
315 modification has been made.
316
317 If you convey an object code work under this section in, or with, or
318 specifically for use in, a User Product, and the conveying occurs as
319 part of a transaction in which the right of possession and use of the
320 User Product is transferred to the recipient in perpetuity or for a
321 fixed term (regardless of how the transaction is characterized), the
322 Corresponding Source conveyed under this section must be accompanied
323 by the Installation Information. But this requirement does not apply
324 if neither you nor any third party retains the ability to install
325 modified object code on the User Product (for example, the work has
326 been installed in ROM).
327
328 The requirement to provide Installation Information does not include a
329 requirement to continue to provide support service, warranty, or updates
330 for a work that has been modified or installed by the recipient, or for
331 the User Product in which it has been modified or installed. Access to a
332 network may be denied when the modification itself materially and
333 adversely affects the operation of the network or violates the rules and
334 protocols for communication across the network.
335
336 Corresponding Source conveyed, and Installation Information provided,
337 in accord with this section must be in a format that is publicly
338 documented (and with an implementation available to the public in
339 source code form), and must require no special password or key for
340 unpacking, reading or copying.
341
342 7. Additional Terms.
343
344 "Additional permissions" are terms that supplement the terms of this
345 License by making exceptions from one or more of its conditions.
346 Additional permissions that are applicable to the entire Program shall
347 be treated as though they were included in this License, to the extent
348 that they are valid under applicable law. If additional permissions
349 apply only to part of the Program, that part may be used separately
350 under those permissions, but the entire Program remains governed by
351 this License without regard to the additional permissions.
352
353 When you convey a copy of a covered work, you may at your option
354 remove any additional permissions from that copy, or from any part of
355 it. (Additional permissions may be written to require their own
356 removal in certain cases when you modify the work.) You may place
357 additional permissions on material, added by you to a covered work,
358 for which you have or can give appropriate copyright permission.
359
360 Notwithstanding any other provision of this License, for material you
361 add to a covered work, you may (if authorized by the copyright holders of
362 that material) supplement the terms of this License with terms:
363
364 a) Disclaiming warranty or limiting liability differently from the
365 terms of sections 15 and 16 of this License; or
366
367 b) Requiring preservation of specified reasonable legal notices or
368 author attributions in that material or in the Appropriate Legal
369 Notices displayed by works containing it; or
370
371 c) Prohibiting misrepresentation of the origin of that material, or
372 requiring that modified versions of such material be marked in
373 reasonable ways as different from the original version; or
374
375 d) Limiting the use for publicity purposes of names of licensors or
376 authors of the material; or
377
378 e) Declining to grant rights under trademark law for use of some
379 trade names, trademarks, or service marks; or
380
381 f) Requiring indemnification of licensors and authors of that
382 material by anyone who conveys the material (or modified versions of
383 it) with contractual assumptions of liability to the recipient, for
384 any liability that these contractual assumptions directly impose on
385 those licensors and authors.
386
387 All other non-permissive additional terms are considered "further
388 restrictions" within the meaning of section 10. If the Program as you
389 received it, or any part of it, contains a notice stating that it is
390 governed by this License along with a term that is a further
391 restriction, you may remove that term. If a license document contains
392 a further restriction but permits relicensing or conveying under this
393 License, you may add to a covered work material governed by the terms
394 of that license document, provided that the further restriction does
395 not survive such relicensing or conveying.
396
397 If you add terms to a covered work in accord with this section, you
398 must place, in the relevant source files, a statement of the
399 additional terms that apply to those files, or a notice indicating
400 where to find the applicable terms.
401
402 Additional terms, permissive or non-permissive, may be stated in the
403 form of a separately written license, or stated as exceptions;
404 the above requirements apply either way.
405
406 8. Termination.
407
408 You may not propagate or modify a covered work except as expressly
409 provided under this License. Any attempt otherwise to propagate or
410 modify it is void, and will automatically terminate your rights under
411 this License (including any patent licenses granted under the third
412 paragraph of section 11).
413
414 However, if you cease all violation of this License, then your
415 license from a particular copyright holder is reinstated (a)
416 provisionally, unless and until the copyright holder explicitly and
417 finally terminates your license, and (b) permanently, if the copyright
418 holder fails to notify you of the violation by some reasonable means
419 prior to 60 days after the cessation.
420
421 Moreover, your license from a particular copyright holder is
422 reinstated permanently if the copyright holder notifies you of the
423 violation by some reasonable means, this is the first time you have
424 received notice of violation of this License (for any work) from that
425 copyright holder, and you cure the violation prior to 30 days after
426 your receipt of the notice.
427
428 Termination of your rights under this section does not terminate the
429 licenses of parties who have received copies or rights from you under
430 this License. If your rights have been terminated and not permanently
431 reinstated, you do not qualify to receive new licenses for the same
432 material under section 10.
433
434 9. Acceptance Not Required for Having Copies.
435
436 You are not required to accept this License in order to receive or
437 run a copy of the Program. Ancillary propagation of a covered work
438 occurring solely as a consequence of using peer-to-peer transmission
439 to receive a copy likewise does not require acceptance. However,
440 nothing other than this License grants you permission to propagate or
441 modify any covered work. These actions infringe copyright if you do
442 not accept this License. Therefore, by modifying or propagating a
443 covered work, you indicate your acceptance of this License to do so.
444
445 10. Automatic Licensing of Downstream Recipients.
446
447 Each time you convey a covered work, the recipient automatically
448 receives a license from the original licensors, to run, modify and
449 propagate that work, subject to this License. You are not responsible
450 for enforcing compliance by third parties with this License.
451
452 An "entity transaction" is a transaction transferring control of an
453 organization, or substantially all assets of one, or subdividing an
454 organization, or merging organizations. If propagation of a covered
455 work results from an entity transaction, each party to that
456 transaction who receives a copy of the work also receives whatever
457 licenses to the work the party's predecessor in interest had or could
458 give under the previous paragraph, plus a right to possession of the
459 Corresponding Source of the work from the predecessor in interest, if
460 the predecessor has it or can get it with reasonable efforts.
461
462 You may not impose any further restrictions on the exercise of the
463 rights granted or affirmed under this License. For example, you may
464 not impose a license fee, royalty, or other charge for exercise of
465 rights granted under this License, and you may not initiate litigation
466 (including a cross-claim or counterclaim in a lawsuit) alleging that
467 any patent claim is infringed by making, using, selling, offering for
468 sale, or importing the Program or any portion of it.
469
470 11. Patents.
471
472 A "contributor" is a copyright holder who authorizes use under this
473 License of the Program or a work on which the Program is based. The
474 work thus licensed is called the contributor's "contributor version".
475
476 A contributor's "essential patent claims" are all patent claims
477 owned or controlled by the contributor, whether already acquired or
478 hereafter acquired, that would be infringed by some manner, permitted
479 by this License, of making, using, or selling its contributor version,
480 but do not include claims that would be infringed only as a
481 consequence of further modification of the contributor version. For
482 purposes of this definition, "control" includes the right to grant
483 patent sublicenses in a manner consistent with the requirements of
484 this License.
485
486 Each contributor grants you a non-exclusive, worldwide, royalty-free
487 patent license under the contributor's essential patent claims, to
488 make, use, sell, offer for sale, import and otherwise run, modify and
489 propagate the contents of its contributor version.
490
491 In the following three paragraphs, a "patent license" is any express
492 agreement or commitment, however denominated, not to enforce a patent
493 (such as an express permission to practice a patent or covenant not to
494 sue for patent infringement). To "grant" such a patent license to a
495 party means to make such an agreement or commitment not to enforce a
496 patent against the party.
497
498 If you convey a covered work, knowingly relying on a patent license,
499 and the Corresponding Source of the work is not available for anyone
500 to copy, free of charge and under the terms of this License, through a
501 publicly available network server or other readily accessible means,
502 then you must either (1) cause the Corresponding Source to be so
503 available, or (2) arrange to deprive yourself of the benefit of the
504 patent license for this particular work, or (3) arrange, in a manner
505 consistent with the requirements of this License, to extend the patent
506 license to downstream recipients. "Knowingly relying" means you have
507 actual knowledge that, but for the patent license, your conveying the
508 covered work in a country, or your recipient's use of the covered work
509 in a country, would infringe one or more identifiable patents in that
510 country that you have reason to believe are valid.
511
512 If, pursuant to or in connection with a single transaction or
513 arrangement, you convey, or propagate by procuring conveyance of, a
514 covered work, and grant a patent license to some of the parties
515 receiving the covered work authorizing them to use, propagate, modify
516 or convey a specific copy of the covered work, then the patent license
517 you grant is automatically extended to all recipients of the covered
518 work and works based on it.
519
520 A patent license is "discriminatory" if it does not include within
521 the scope of its coverage, prohibits the exercise of, or is
522 conditioned on the non-exercise of one or more of the rights that are
523 specifically granted under this License. You may not convey a covered
524 work if you are a party to an arrangement with a third party that is
525 in the business of distributing software, under which you make payment
526 to the third party based on the extent of your activity of conveying
527 the work, and under which the third party grants, to any of the
528 parties who would receive the covered work from you, a discriminatory
529 patent license (a) in connection with copies of the covered work
530 conveyed by you (or copies made from those copies), or (b) primarily
531 for and in connection with specific products or compilations that
532 contain the covered work, unless you entered into that arrangement,
533 or that patent license was granted, prior to 28 March 2007.
534
535 Nothing in this License shall be construed as excluding or limiting
536 any implied license or other defenses to infringement that may
537 otherwise be available to you under applicable patent law.
538
539 12. No Surrender of Others' Freedom.
540
541 If conditions are imposed on you (whether by court order, agreement or
542 otherwise) that contradict the conditions of this License, they do not
543 excuse you from the conditions of this License. If you cannot convey a
544 covered work so as to satisfy simultaneously your obligations under this
545 License and any other pertinent obligations, then as a consequence you may
546 not convey it at all. For example, if you agree to terms that obligate you
547 to collect a royalty for further conveying from those to whom you convey
548 the Program, the only way you could satisfy both those terms and this
549 License would be to refrain entirely from conveying the Program.
550
551 13. Use with the GNU Affero General Public License.
552
553 Notwithstanding any other provision of this License, you have
554 permission to link or combine any covered work with a work licensed
555 under version 3 of the GNU Affero General Public License into a single
556 combined work, and to convey the resulting work. The terms of this
557 License will continue to apply to the part which is the covered work,
558 but the special requirements of the GNU Affero General Public License,
559 section 13, concerning interaction through a network will apply to the
560 combination as such.
561
562 14. Revised Versions of this License.
563
564 The Free Software Foundation may publish revised and/or new versions of
565 the GNU General Public License from time to time. Such new versions will
566 be similar in spirit to the present version, but may differ in detail to
567 address new problems or concerns.
568
569 Each version is given a distinguishing version number. If the
570 Program specifies that a certain numbered version of the GNU General
571 Public License "or any later version" applies to it, you have the
572 option of following the terms and conditions either of that numbered
573 version or of any later version published by the Free Software
574 Foundation. If the Program does not specify a version number of the
575 GNU General Public License, you may choose any version ever published
576 by the Free Software Foundation.
577
578 If the Program specifies that a proxy can decide which future
579 versions of the GNU General Public License can be used, that proxy's
580 public statement of acceptance of a version permanently authorizes you
581 to choose that version for the Program.
582
583 Later license versions may give you additional or different
584 permissions. However, no additional obligations are imposed on any
585 author or copyright holder as a result of your choosing to follow a
586 later version.
587
588 15. Disclaimer of Warranty.
589
590 THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
591 APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
592 HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
593 OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
594 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
595 PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
596 IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
597 ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
598
599 16. Limitation of Liability.
600
601 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
602 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
603 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
604 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
605 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
606 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
607 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
608 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
609 SUCH DAMAGES.
610
611 17. Interpretation of Sections 15 and 16.
612
613 If the disclaimer of warranty and limitation of liability provided
614 above cannot be given local legal effect according to their terms,
615 reviewing courts shall apply local law that most closely approximates
616 an absolute waiver of all civil liability in connection with the
617 Program, unless a warranty or assumption of liability accompanies a
618 copy of the Program in return for a fee.
619
620 END OF TERMS AND CONDITIONS
621
622 How to Apply These Terms to Your New Programs
623
624 If you develop a new program, and you want it to be of the greatest
625 possible use to the public, the best way to achieve this is to make it
626 free software which everyone can redistribute and change under these terms.
627
628 To do so, attach the following notices to the program. It is safest
629 to attach them to the start of each source file to most effectively
630 state the exclusion of warranty; and each file should have at least
631 the "copyright" line and a pointer to where the full notice is found.
632
633 <one line to give the program's name and a brief idea of what it does.>
634 Copyright (C) <year> <name of author>
635
636 This program is free software: you can redistribute it and/or modify
637 it under the terms of the GNU General Public License as published by
638 the Free Software Foundation, either version 3 of the License, or
639 (at your option) any later version.
640
641 This program is distributed in the hope that it will be useful,
642 but WITHOUT ANY WARRANTY; without even the implied warranty of
643 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
644 GNU General Public License for more details.
645
646 You should have received a copy of the GNU General Public License
647 along with this program. If not, see <http://www.gnu.org/licenses/>.
648
649 Also add information on how to contact you by electronic and paper mail.
650
651 If the program does terminal interaction, make it output a short
652 notice like this when it starts in an interactive mode:
653
654 <program> Copyright (C) <year> <name of author>
655 This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
656 This is free software, and you are welcome to redistribute it
657 under certain conditions; type `show c' for details.
658
659 The hypothetical commands `show w' and `show c' should show the appropriate
660 parts of the General Public License. Of course, your program's commands
661 might be different; for a GUI interface, you would use an "about box".
662
663 You should also get your employer (if you work as a programmer) or school,
664 if any, to sign a "copyright disclaimer" for the program, if necessary.
665 For more information on this, and how to apply and follow the GNU GPL, see
666 <http://www.gnu.org/licenses/>.
667
668 The GNU General Public License does not permit incorporating your program
669 into proprietary programs. If your program is a subroutine library, you
670 may consider it more useful to permit linking proprietary applications with
671 the library. If this is what you want to do, use the GNU Lesser General
672 Public License instead of this License. But first, please read
673 <http://www.gnu.org/philosophy/why-not-lgpl.html>.
33 AM_CPPFLAGS = -I../libde265
44
55 dec265_DEPENDENCIES = ../libde265/libde265.la
6 dec265_CXXFLAGS = $(VIDEOGFX_CFLAGS) $(SDL_CFLAGS)
7 dec265_LDFLAGS = $(VIDEOGFX_LIBS) $(SDL_LIBS)
6 dec265_CXXFLAGS =
7 dec265_LDFLAGS =
88 dec265_LDADD = ../libde265/libde265.la -lstdc++
99 dec265_SOURCES = dec265.cc
1010
11 if HAVE_VIDEOGFX
12 dec265_CXXFLAGS += $(VIDEOGFX_CFLAGS)
13 dec265_LDFLAGS += $(VIDEOGFX_LIBS)
14 endif
15
1116 if HAVE_SDL
17 dec265_CXXFLAGS += $(SDL_CFLAGS)
18 dec265_LDFLAGS += $(SDL_LIBS)
1219 dec265_SOURCES += sdl.cc sdl.hh
1320 endif
1421
0 # Makefile.in generated by automake 1.13.3 from Makefile.am.
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
11 # @configure_input@
22
33 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
7979 host_triplet = @host@
8080 target_triplet = @target@
8181 bin_PROGRAMS = dec265$(EXEEXT)
82 @HAVE_SDL_TRUE@am__append_1 = sdl.cc sdl.hh
83 @MINGW_TRUE@am__append_2 = -static-libgcc -static-libstdc++
82 @HAVE_VIDEOGFX_TRUE@am__append_1 = $(VIDEOGFX_CFLAGS)
83 @HAVE_VIDEOGFX_TRUE@am__append_2 = $(VIDEOGFX_LIBS)
84 @HAVE_SDL_TRUE@am__append_3 = $(SDL_CFLAGS)
85 @HAVE_SDL_TRUE@am__append_4 = $(SDL_LIBS)
86 @HAVE_SDL_TRUE@am__append_5 = sdl.cc sdl.hh
87 @MINGW_TRUE@am__append_6 = -static-libgcc -static-libstdc++
8488 subdir = dec265
8589 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
86 $(top_srcdir)/depcomp
90 $(top_srcdir)/depcomp COPYING
8791 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
8892 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \
8993 $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
264268 SET_MAKE = @SET_MAKE@
265269 SHELL = @SHELL@
266270 STRIP = @STRIP@
271 SWSCALE_CFLAGS = @SWSCALE_CFLAGS@
272 SWSCALE_LIBS = @SWSCALE_LIBS@
267273 VERSION = @VERSION@
268274 VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@
269275 VIDEOGFX_LIBS = @VIDEOGFX_LIBS@
326332 top_srcdir = @top_srcdir@
327333 AM_CPPFLAGS = -I../libde265
328334 dec265_DEPENDENCIES = ../libde265/libde265.la
329 dec265_CXXFLAGS = $(VIDEOGFX_CFLAGS) $(SDL_CFLAGS)
330 dec265_LDFLAGS = $(VIDEOGFX_LIBS) $(SDL_LIBS) $(am__append_2)
335 dec265_CXXFLAGS = $(am__append_1) $(am__append_3)
336 dec265_LDFLAGS = $(am__append_2) $(am__append_4) $(am__append_6)
331337 dec265_LDADD = ../libde265/libde265.la -lstdc++
332 dec265_SOURCES = dec265.cc $(am__append_1)
338 dec265_SOURCES = dec265.cc $(am__append_5)
333339 EXTRA_DIST = Makefile.vc7 \
334340 CMakeLists.txt \
335341 ../extra/getopt.c \
55 LINK=link /nologo /subsystem:console
66 DEFINES=/DWIN32
77
8 CFLAGS=$(CFLAGS) /MT /Ob2 /Oi /W4
8 CFLAGS=$(CFLAGS) /MT /Ob2 /Oi /W4 /EHsc
99 CFLAGS=$(CFLAGS) $(DEFINES)
1010
1111 OBJS=\
2121 .c.obj:
2222 $(CC) /c $*.c /Fo$*.obj $(CFLAGS)
2323
24 .cc.obj:
25 $(CC) /c $*.cc /Fo$*.obj $(CFLAGS)
26
2427 dec265.exe: $(OBJS) ..\libde265\libde265.lib
2528 $(LINK) /out:dec265.exe $** ..\libde265\libde265.lib
2629
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "dec265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of dec265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * dec265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * dec265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with dec265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #define DO_MEMORY_LOGGING 0
3535 #ifndef _MSC_VER
3636 #include <sys/time.h>
3737 #include <unistd.h>
38 extern "C" {
39 #include "libde265/decctx.h"
40 }
41 #else
42 // VS2008 didn't support C99, compile everything as C++
43 #include "libde265/decctx.h"
4438 #endif
4539
4640 #if HAVE_VIDEOGFX
5044
5145 #if HAVE_SDL
5246 #include "sdl.hh"
53 #endif
54
55 extern "C" {
56 #include "libde265/threads.h"
57 }
58
59
60 #ifndef _MSC_VER
61 extern "C" {
62 void showMotionProfile();
63 void showIntraPredictionProfile();
64 void showTransformProfile();
65 }
66 #else
67 void showMotionProfile();
68 void showIntraPredictionProfile();
69 void showTransformProfile();
7047 #endif
7148
7249
7754 bool nal_input=false;
7855 bool quiet=false;
7956 bool check_hash=false;
80 bool show_profile=false;
8157 bool show_help=false;
8258 bool dump_headers=false;
8359 bool write_yuv=false;
8864 uint32_t max_frames=UINT32_MAX;
8965 bool write_bytestream=false;
9066 const char *bytestream_filename;
67 int highestTID = 100;
9168 int verbosity=0;
69 int disable_deblocking=0;
70 int disable_sao=0;
9271
9372 static struct option long_options[] = {
9473 {"quiet", no_argument, 0, 'q' },
10483 {"help", no_argument, 0, 'h' },
10584 {"noaccel", no_argument, 0, '0' },
10685 {"write-bytestream", required_argument,0, 'B' },
86 {"highest-TID", required_argument, 0, 'T' },
10787 {"verbose", no_argument, 0, 'v' },
88 {"disable-deblocking", no_argument, &disable_deblocking, 1 },
89 {"disable-sao", no_argument, &disable_sao, 1 },
10890 {0, 0, 0, 0 }
10991 };
92
93
94
95 static void write_picture(const de265_image* img)
96 {
97 static FILE* fh = NULL;
98 if (fh==NULL) { fh = fopen(output_filename, "wb"); }
99
100
101
102 for (int c=0;c<3;c++) {
103 int stride;
104 const uint8_t* p = de265_get_image_plane(img, c, &stride);
105 int width = de265_get_image_width(img,c);
106
107 for (int y=0;y<de265_get_image_height(img,c);y++) {
108 fwrite(p + y*stride, width, 1, fh);
109 }
110 }
111
112 fflush(fh);
113 }
110114
111115
112116
148152 }
149153
150154 win.Display(visu);
151 //win.WaitForKeypress();
155 win.WaitForKeypress();
152156 }
153157 #endif
154158
189193 height = de265_get_image_height(img,0);
190194
191195 framecnt++;
192 //printf("SHOW POC: %d / PTS: %ld\n",img->PicOrderCntVal, img->pts);
196 //printf("SHOW POC: %d / PTS: %ld / integrity: %d\n",img->PicOrderCntVal, img->pts, img->integrity);
197
198
199 if (0) {
200 const char* nal_unit_name;
201 int nuh_layer_id;
202 int nuh_temporal_id;
203 de265_get_image_NAL_header(img, NULL, &nal_unit_name, &nuh_layer_id, &nuh_temporal_id);
204
205 printf("NAL: %s layer:%d temporal:%d\n",nal_unit_name, nuh_layer_id, nuh_temporal_id);
206 }
207
193208
194209 if (!quiet) {
195210 #if HAVE_SDL && HAVE_VIDEOGFX
196 if (output_with_videogfx) {
211 if (output_with_videogfx) {
197212 display_image(img);
198213 } else {
199214 stop = display_sdl(img);
288303 while (1) {
289304 int option_index = 0;
290305
291 int c = getopt_long(argc, argv, "qt:chpf:o:dLB:n0v"
306 int c = getopt_long(argc, argv, "qt:chf:o:dLB:n0vT:"
292307 #if HAVE_VIDEOGFX && HAVE_SDL
293308 "V"
294309 #endif
300315 case 'q': quiet=true; break;
301316 case 't': nThreads=atoi(optarg); break;
302317 case 'c': check_hash=true; break;
303 case 'p': show_profile=true; break;
304318 case 'f': max_frames=atoi(optarg); break;
305 case 'o': write_yuv=true; output_filename=optarg;
306 set_output_filename(output_filename);
307 break;
319 case 'o': write_yuv=true; output_filename=optarg; break;
308320 case 'h': show_help=true; break;
309321 case 'd': dump_headers=true; break;
310322 case 'n': nal_input=true; break;
312324 case 'L': logging=false; break;
313325 case '0': no_acceleration=true; break;
314326 case 'B': write_bytestream=true; bytestream_filename=optarg; break;
327 case 'T': highestTID=atoi(optarg); break;
315328 case 'v': verbosity++; break;
316329 }
317330 }
327340 fprintf(stderr," -t, --threads N set number of worker threads (0 - no threading)\n");
328341 fprintf(stderr," -c, --check-hash perform hash check\n");
329342 fprintf(stderr," -n, --nal input is a stream with 4-byte length prefixed NAL units\n");
330 fprintf(stderr," -p, --profile show coding mode usage profile\n");
331343 fprintf(stderr," -f, --frames N set number of frames to process\n");
332344 fprintf(stderr," -o, --output write YUV reconstruction\n");
333345 fprintf(stderr," -d, --dump dump headers\n");
337349 fprintf(stderr," -0, --noaccel do not use any accelerated code (SSE)\n");
338350 fprintf(stderr," -L, --no-logging disable logging\n");
339351 fprintf(stderr," -B, --write-bytestream FILENAME write raw bytestream (from NAL input)\n");
352 fprintf(stderr," -T, --highest-TID select highest temporal sublayer to decode\n");
353 fprintf(stderr," --disable-deblocking disable deblocking filter\n");
354 fprintf(stderr," --disable-sao disable sample-adaptive offset filter\n");
340355 fprintf(stderr," -h, --help show help\n");
341356
342357 exit(show_help ? 0 : 5);
347362
348363 de265_decoder_context* ctx = de265_new_decoder();
349364
350 if (argc>=3) {
351 if (nThreads>0) {
352 err = de265_start_worker_threads(ctx, nThreads);
353 }
354 }
355
356365 de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, check_hash);
366 de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES, false);
367
368 de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_DISABLE_DEBLOCKING, disable_deblocking);
369 de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_DISABLE_SAO, disable_sao);
357370
358371 if (dump_headers) {
359372 de265_set_parameter_int(ctx, DE265_DECODER_PARAM_DUMP_SPS_HEADERS, 1);
372385
373386 de265_set_verbosity(verbosity);
374387
388
389 if (argc>=3) {
390 if (nThreads>0) {
391 err = de265_start_worker_threads(ctx, nThreads);
392 }
393 }
394
395 de265_set_limit_TID(ctx, highestTID);
396
397
398
375399 FILE* fh = fopen(argv[optind], "rb");
376400 if (fh==NULL) {
377401 fprintf(stderr,"cannot open file %s!\n", argv[1]);
393417
394418 while (!stop)
395419 {
420 //tid = (framecnt/1000) & 1;
421 //de265_set_limit_TID(ctx, tid);
422
396423 if (nal_input) {
397424 uint8_t len[4];
398425 int n = fread(len,1,4,fh);
425452 }
426453
427454 pos+=n;
455
456 if (0) { // fake skipping
457 if (pos>1000000) {
458 printf("RESET\n");
459 de265_reset(ctx);
460 pos=0;
461
462 fseek(fh,-200000,SEEK_CUR);
463 }
464 }
428465 }
429466
430467 // printf("pending data: %d\n", de265_get_number_of_input_bytes_pending(ctx));
496533 width,height,framecnt/secs);
497534
498535
499 if (show_profile) {
500 showMotionProfile();
501 showIntraPredictionProfile();
502 showTransformProfile();
503 }
504
505536 return err==DE265_OK ? 0 : 10;
506537 }
0
1 #include "sdl.hh"
2
3
4 bool SDL_YUV_Display::init(int frame_width, int frame_height)
5 {
6 // reduce image size to a multiple of 8 (apparently required by YUV overlay)
7
8 frame_width &= ~7;
9 frame_height &= ~7;
10
11
12 if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) < 0 ) {
13 printf("SDL_Init() failed: %s\n", SDL_GetError( ) );
14 SDL_Quit();
15 return false;
16 }
17
18 const SDL_VideoInfo* info = SDL_GetVideoInfo();
19 if( !info ) {
20 printf("SDL_GetVideoInfo() failed: %s\n", SDL_GetError() );
21 SDL_Quit();
22 return false;
23 }
24
25 Uint8 bpp = info->vfmt->BitsPerPixel;
26
27 Uint32 vflags;
28 if (info->hw_available)
29 vflags = SDL_HWSURFACE;
30 else
31 vflags = SDL_SWSURFACE;
32
33 // set window title
34 const char *window_title = "SDL YUV display";
35 SDL_WM_SetCaption(window_title, NULL);
36
37 mScreen = SDL_SetVideoMode(frame_width, frame_height, bpp, vflags);
38 if (mScreen == NULL) {
39 printf("SDL: Couldn't set video mode to %dx%d,%d bpp: %s",
40 frame_width, frame_height, bpp, SDL_GetError());
41 SDL_Quit();
42 return false;
43 }
44
45 mYUVOverlay = SDL_CreateYUVOverlay(frame_width, frame_height, SDL_YV12_OVERLAY, mScreen);
46 if (mYUVOverlay == NULL ) {
47 printf("SDL: Couldn't create SDL YUV overlay: %s",SDL_GetError());
48 SDL_Quit();
49 return false;
50 }
51
52 rect.x = 0;
53 rect.y = 0;
54 rect.w = frame_width;
55 rect.h = frame_height;
56
57 mWindowOpen=true;
58
59 return true;
60 }
61
62 void SDL_YUV_Display::display(const unsigned char *Y,
63 const unsigned char *U,
64 const unsigned char *V,
65 int stride, int chroma_stride)
66 {
67 if (!mWindowOpen) return;
68 if (SDL_LockYUVOverlay(mYUVOverlay) < 0) return;
69
70 if (stride == rect.w && chroma_stride == rect.w/2) {
71
72 // fast copy
73
74 memcpy(mYUVOverlay->pixels[0], Y, rect.w * rect.h);
75 memcpy(mYUVOverlay->pixels[1], V, rect.w * rect.h / 4);
76 memcpy(mYUVOverlay->pixels[2], U, rect.w * rect.h / 4);
77 }
78 else {
79 // copy line by line, because sizes are different
80
81 for (int y=0;y<rect.h;y++)
82 {
83 memcpy(mYUVOverlay->pixels[0]+y*rect.w, Y+stride*y, rect.w);
84 }
85
86 for (int y=0;y<rect.h/2;y++)
87 {
88 memcpy(mYUVOverlay->pixels[2]+y*rect.w/2, U+chroma_stride*y, rect.w/2);
89 memcpy(mYUVOverlay->pixels[1]+y*rect.w/2, V+chroma_stride*y, rect.w/2);
90 }
91 }
92
93 SDL_UnlockYUVOverlay(mYUVOverlay);
94
95 SDL_DisplayYUVOverlay(mYUVOverlay, &rect);
96 }
97
98 bool SDL_YUV_Display::doQuit() const
99 {
100 SDL_Event event;
101 while (SDL_PollEvent(&event)) {
102 if (event.type == SDL_QUIT) {
103 return true;
104 }
105 }
106
107 return false;
108 }
109
110 void SDL_YUV_Display::close()
111 {
112 SDL_FreeYUVOverlay(mYUVOverlay);
113 SDL_Quit();
114
115 mWindowOpen=false;
116 }
0 /*
1 * libde265 example application "dec265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of dec265, an example application using libde265.
5 *
6 * dec265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * dec265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with dec265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sdl.hh"
21
22
23 bool SDL_YUV_Display::init(int frame_width, int frame_height)
24 {
25 // reduce image size to a multiple of 8 (apparently required by YUV overlay)
26
27 frame_width &= ~7;
28 frame_height &= ~7;
29
30
31 if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) < 0 ) {
32 printf("SDL_Init() failed: %s\n", SDL_GetError( ) );
33 SDL_Quit();
34 return false;
35 }
36
37 const SDL_VideoInfo* info = SDL_GetVideoInfo();
38 if( !info ) {
39 printf("SDL_GetVideoInfo() failed: %s\n", SDL_GetError() );
40 SDL_Quit();
41 return false;
42 }
43
44 Uint8 bpp = info->vfmt->BitsPerPixel;
45
46 Uint32 vflags;
47 if (info->hw_available)
48 vflags = SDL_HWSURFACE;
49 else
50 vflags = SDL_SWSURFACE;
51
52 // set window title
53 const char *window_title = "SDL YUV display";
54 SDL_WM_SetCaption(window_title, NULL);
55
56 mScreen = SDL_SetVideoMode(frame_width, frame_height, bpp, vflags);
57 if (mScreen == NULL) {
58 printf("SDL: Couldn't set video mode to %dx%d,%d bpp: %s",
59 frame_width, frame_height, bpp, SDL_GetError());
60 SDL_Quit();
61 return false;
62 }
63
64 mYUVOverlay = SDL_CreateYUVOverlay(frame_width, frame_height, SDL_YV12_OVERLAY, mScreen);
65 if (mYUVOverlay == NULL ) {
66 printf("SDL: Couldn't create SDL YUV overlay: %s",SDL_GetError());
67 SDL_Quit();
68 return false;
69 }
70
71 rect.x = 0;
72 rect.y = 0;
73 rect.w = frame_width;
74 rect.h = frame_height;
75
76 mWindowOpen=true;
77
78 return true;
79 }
80
81 void SDL_YUV_Display::display(const unsigned char *Y,
82 const unsigned char *U,
83 const unsigned char *V,
84 int stride, int chroma_stride)
85 {
86 if (!mWindowOpen) return;
87 if (SDL_LockYUVOverlay(mYUVOverlay) < 0) return;
88
89 if (stride == rect.w && chroma_stride == rect.w/2) {
90
91 // fast copy
92
93 memcpy(mYUVOverlay->pixels[0], Y, rect.w * rect.h);
94 memcpy(mYUVOverlay->pixels[1], V, rect.w * rect.h / 4);
95 memcpy(mYUVOverlay->pixels[2], U, rect.w * rect.h / 4);
96 }
97 else {
98 // copy line by line, because sizes are different
99
100 for (int y=0;y<rect.h;y++)
101 {
102 memcpy(mYUVOverlay->pixels[0]+y*rect.w, Y+stride*y, rect.w);
103 }
104
105 for (int y=0;y<rect.h/2;y++)
106 {
107 memcpy(mYUVOverlay->pixels[2]+y*rect.w/2, U+chroma_stride*y, rect.w/2);
108 memcpy(mYUVOverlay->pixels[1]+y*rect.w/2, V+chroma_stride*y, rect.w/2);
109 }
110 }
111
112 SDL_UnlockYUVOverlay(mYUVOverlay);
113
114 SDL_DisplayYUVOverlay(mYUVOverlay, &rect);
115 }
116
117 bool SDL_YUV_Display::doQuit() const
118 {
119 SDL_Event event;
120 while (SDL_PollEvent(&event)) {
121 if (event.type == SDL_QUIT) {
122 return true;
123 }
124 }
125
126 return false;
127 }
128
129 void SDL_YUV_Display::close()
130 {
131 SDL_FreeYUVOverlay(mYUVOverlay);
132 SDL_Quit();
133
134 mWindowOpen=false;
135 }
0
1 #include <SDL.h>
2
3
4 class SDL_YUV_Display
5 {
6 public:
7
8 bool init(int frame_width, int frame_height);
9 void display(const unsigned char *Y, const unsigned char *U, const unsigned char *V,
10 int stride, int chroma_stride);
11 void close();
12
13 bool doQuit() const;
14
15 bool isOpen() const { return mWindowOpen; }
16
17 private:
18 SDL_Surface *mScreen;
19 SDL_Overlay *mYUVOverlay;
20 SDL_Rect rect;
21 bool mWindowOpen;
22 };
0 /*
1 * libde265 example application "dec265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of dec265, an example application using libde265.
5 *
6 * dec265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * dec265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with dec265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <SDL.h>
21
22
23 class SDL_YUV_Display
24 {
25 public:
26
27 bool init(int frame_width, int frame_height);
28 void display(const unsigned char *Y, const unsigned char *U, const unsigned char *V,
29 int stride, int chroma_stride);
30 void close();
31
32 bool doQuit() const;
33
34 bool isOpen() const { return mWindowOpen; }
35
36 private:
37 SDL_Surface *mScreen;
38 SDL_Overlay *mYUVOverlay;
39 SDL_Rect rect;
40 bool mWindowOpen;
41 };
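
For orientation, here is a minimal, hypothetical usage sketch of the SDL_YUV_Display class declared above (SDL 1.2 assumed). The flat gray 4:2:0 test pattern and the main() wrapper are illustrative only and are not part of the dec265 sources; in dec265 the Y/U/V pointers and strides come from de265_get_image_plane() instead.

    // hypothetical driver for SDL_YUV_Display; not part of the dec265 sources
    #include <vector>
    #include "sdl.hh"

    int main()
    {
      const int w = 352, h = 288;                 // 4:2:0 frame, multiple of 8
      std::vector<unsigned char> Y(w*h, 128);     // luma plane: mid-gray
      std::vector<unsigned char> U(w*h/4, 128);   // chroma planes: neutral
      std::vector<unsigned char> V(w*h/4, 128);

      SDL_YUV_Display win;
      if (!win.init(w, h)) return 1;

      while (!win.doQuit()) {
        // strides equal the plane widths, so display() takes its fast-copy path
        win.display(&Y[0], &U[0], &V[0], w, w/2);
        SDL_Delay(40);                            // roughly 25 fps
      }

      win.close();
      return 0;
    }
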
0 /*
1 * Copyright (c) 1987, 1993, 1994, 1996
2 * The Regents of the University of California. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * 3. All advertising materials mentioning features or use of this software
13 * must display the following acknowledgement:
14 * This product includes software developed by the University of
15 * California, Berkeley and its contributors.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
032 #ifndef __GETOPT_H__
133 #define __GETOPT_H__
234
4747 // allows us to optimize the code if we're just signaling.
4848 } win32_cond_t;
4949
50 #ifdef __cplusplus
51 extern "C" {
52 #endif
53
5054 int win32_cond_init(win32_cond_t *cv);
5155 int win32_cond_destroy(win32_cond_t *cv);
5256 int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex);
5357 int win32_cond_signal(win32_cond_t *cv);
5458 int win32_cond_broadcast(win32_cond_t *cv);
5559
60 #ifdef __cplusplus
61 }
5662 #endif
63
64 #endif
0 GNU LESSER GENERAL PUBLIC LICENSE
1 Version 3, 29 June 2007
2
3 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
4 Everyone is permitted to copy and distribute verbatim copies
5 of this license document, but changing it is not allowed.
6
7
8 This version of the GNU Lesser General Public License incorporates
9 the terms and conditions of version 3 of the GNU General Public
10 License, supplemented by the additional permissions listed below.
11
12 0. Additional Definitions.
13
14 As used herein, "this License" refers to version 3 of the GNU Lesser
15 General Public License, and the "GNU GPL" refers to version 3 of the GNU
16 General Public License.
17
18 "The Library" refers to a covered work governed by this License,
19 other than an Application or a Combined Work as defined below.
20
21 An "Application" is any work that makes use of an interface provided
22 by the Library, but which is not otherwise based on the Library.
23 Defining a subclass of a class defined by the Library is deemed a mode
24 of using an interface provided by the Library.
25
26 A "Combined Work" is a work produced by combining or linking an
27 Application with the Library. The particular version of the Library
28 with which the Combined Work was made is also called the "Linked
29 Version".
30
31 The "Minimal Corresponding Source" for a Combined Work means the
32 Corresponding Source for the Combined Work, excluding any source code
33 for portions of the Combined Work that, considered in isolation, are
34 based on the Application, and not on the Linked Version.
35
36 The "Corresponding Application Code" for a Combined Work means the
37 object code and/or source code for the Application, including any data
38 and utility programs needed for reproducing the Combined Work from the
39 Application, but excluding the System Libraries of the Combined Work.
40
41 1. Exception to Section 3 of the GNU GPL.
42
43 You may convey a covered work under sections 3 and 4 of this License
44 without being bound by section 3 of the GNU GPL.
45
46 2. Conveying Modified Versions.
47
48 If you modify a copy of the Library, and, in your modifications, a
49 facility refers to a function or data to be supplied by an Application
50 that uses the facility (other than as an argument passed when the
51 facility is invoked), then you may convey a copy of the modified
52 version:
53
54 a) under this License, provided that you make a good faith effort to
55 ensure that, in the event an Application does not supply the
56 function or data, the facility still operates, and performs
57 whatever part of its purpose remains meaningful, or
58
59 b) under the GNU GPL, with none of the additional permissions of
60 this License applicable to that copy.
61
62 3. Object Code Incorporating Material from Library Header Files.
63
64 The object code form of an Application may incorporate material from
65 a header file that is part of the Library. You may convey such object
66 code under terms of your choice, provided that, if the incorporated
67 material is not limited to numerical parameters, data structure
68 layouts and accessors, or small macros, inline functions and templates
69 (ten or fewer lines in length), you do both of the following:
70
71 a) Give prominent notice with each copy of the object code that the
72 Library is used in it and that the Library and its use are
73 covered by this License.
74
75 b) Accompany the object code with a copy of the GNU GPL and this license
76 document.
77
78 4. Combined Works.
79
80 You may convey a Combined Work under terms of your choice that,
81 taken together, effectively do not restrict modification of the
82 portions of the Library contained in the Combined Work and reverse
83 engineering for debugging such modifications, if you also do each of
84 the following:
85
86 a) Give prominent notice with each copy of the Combined Work that
87 the Library is used in it and that the Library and its use are
88 covered by this License.
89
90 b) Accompany the Combined Work with a copy of the GNU GPL and this license
91 document.
92
93 c) For a Combined Work that displays copyright notices during
94 execution, include the copyright notice for the Library among
95 these notices, as well as a reference directing the user to the
96 copies of the GNU GPL and this license document.
97
98 d) Do one of the following:
99
100 0) Convey the Minimal Corresponding Source under the terms of this
101 License, and the Corresponding Application Code in a form
102 suitable for, and under terms that permit, the user to
103 recombine or relink the Application with a modified version of
104 the Linked Version to produce a modified Combined Work, in the
105 manner specified by section 6 of the GNU GPL for conveying
106 Corresponding Source.
107
108 1) Use a suitable shared library mechanism for linking with the
109 Library. A suitable mechanism is one that (a) uses at run time
110 a copy of the Library already present on the user's computer
111 system, and (b) will operate properly with a modified version
112 of the Library that is interface-compatible with the Linked
113 Version.
114
115 e) Provide Installation Information, but only if you would otherwise
116 be required to provide such information under section 6 of the
117 GNU GPL, and only to the extent that such information is
118 necessary to install and execute a modified version of the
119 Combined Work produced by recombining or relinking the
120 Application with a modified version of the Linked Version. (If
121 you use option 4d0, the Installation Information must accompany
122 the Minimal Corresponding Source and Corresponding Application
123 Code. If you use option 4d1, you must provide the Installation
124 Information in the manner specified by section 6 of the GNU GPL
125 for conveying Corresponding Source.)
126
127 5. Combined Libraries.
128
129 You may place library facilities that are a work based on the
130 Library side by side in a single library together with other library
131 facilities that are not Applications and are not covered by this
132 License, and convey such a combined library under terms of your
133 choice, if you do both of the following:
134
135 a) Accompany the combined library with a copy of the same work based
136 on the Library, uncombined with any other library facilities,
137 conveyed under the terms of this License.
138
139 b) Give prominent notice with the combined library that part of it
140 is a work based on the Library, and explaining where to find the
141 accompanying uncombined form of the same work.
142
143 6. Revised Versions of the GNU Lesser General Public License.
144
145 The Free Software Foundation may publish revised and/or new versions
146 of the GNU Lesser General Public License from time to time. Such new
147 versions will be similar in spirit to the present version, but may
148 differ in detail to address new problems or concerns.
149
150 Each version is given a distinguishing version number. If the
151 Library as you received it specifies that a certain numbered version
152 of the GNU Lesser General Public License "or any later version"
153 applies to it, you have the option of following the terms and
154 conditions either of that published version or of any later version
155 published by the Free Software Foundation. If the Library as you
156 received it does not specify a version number of the GNU Lesser
157 General Public License, you may choose any version of the GNU Lesser
158 General Public License ever published by the Free Software Foundation.
159
160 If the Library as you received it specifies that a proxy can decide
161 whether future versions of the GNU Lesser General Public License shall
162 apply, that proxy's public statement of acceptance of any version is
163 permanent authorization for you to choose that version for the
164 Library.
66
77 libde265_la_CPPFLAGS =
88
9 libde265_la_LDFLAGS = -version-info $(LIBDE265_CURRENT):$(LIBDE265_REVISION):$(LIBDE265_AGE)
9 libde265_la_LDFLAGS = -version-info $(LIBDE265_CURRENT):$(LIBDE265_REVISION):$(LIBDE265_AGE) \
10 -export-symbols-regex ^de265_
11 libde265_la_LIBADD = -lstdc++
1012
1113 libde265_la_SOURCES = \
12 bitstream.c \
13 cabac.c \
14 de265.c \
15 deblock.c \
16 decctx.c \
17 image.c \
18 intrapred.c \
19 md5.c \
20 nal.c \
21 pps.c \
22 transform.c \
23 refpic.c \
24 sao.c \
25 scan.c \
26 sei.c \
27 slice.c \
28 sps.c \
29 util.c \
30 vps.c \
14 bitstream.cc \
15 cabac.cc \
16 de265.cc \
17 deblock.cc \
18 decctx.cc \
19 nal-parser.cc \
20 nal-parser.h \
21 dpb.cc \
22 dpb.h \
23 image.cc \
24 intrapred.cc \
25 md5.cc \
26 nal.cc \
27 pps.cc \
28 transform.cc \
29 refpic.cc \
30 sao.cc \
31 scan.cc \
32 sei.cc \
33 slice.cc \
34 sps.cc \
35 util.cc \
36 vps.cc \
3137 bitstream.h \
3238 cabac.h \
3339 deblock.h \
3743 md5.h \
3844 nal.h \
3945 pps.h \
40 pps_func.h \
4146 transform.h \
4247 refpic.h \
4348 sao.h \
4449 scan.h \
4550 sei.h \
4651 slice.h \
47 slice_func.h \
4852 sps.h \
49 sps_func.h \
5053 util.h \
5154 vps.h \
52 motion.c motion.h motion_func.h \
53 threads.c threads.h \
55 motion.cc motion.h \
56 threads.cc threads.h \
57 visualize.cc visualize.h \
5458 acceleration.h \
55 fallback.c fallback.h fallback-motion.c fallback-motion.h \
56 fallback-dct.h fallback-dct.c
59 fallback.cc fallback.h fallback-motion.cc fallback-motion.h \
60 fallback-dct.h fallback-dct.cc
5761
5862 if ENABLE_SSE_OPT
5963 SUBDIRS = x86
60 libde265_la_LIBADD = x86/libde265_x86.la
64 libde265_la_LIBADD += x86/libde265_x86.la
6165 endif
6266
6367 if MINGW
0 # Makefile.in generated by automake 1.13.3 from Makefile.am.
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
11 # @configure_input@
22
33 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
7979 build_triplet = @build@
8080 host_triplet = @host@
8181 target_triplet = @target@
82 @MINGW_TRUE@am__append_1 = ../extra/win32cond.c ../extra/win32cond.h
83 @MINGW_TRUE@am__append_2 = -no-undefined -static-libgcc -static-libstdc++
82 @ENABLE_SSE_OPT_TRUE@am__append_1 = x86/libde265_x86.la
83 @MINGW_TRUE@am__append_2 = ../extra/win32cond.c ../extra/win32cond.h
84 @MINGW_TRUE@am__append_3 = -no-undefined -static-libgcc -static-libstdc++
8485 subdir = libde265
8586 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
8687 $(srcdir)/de265-version.h.in $(top_srcdir)/depcomp \
87 $(libde265_la_HEADERS)
88 $(libde265_la_HEADERS) COPYING
8889 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
8990 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \
9091 $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
126127 }
127128 am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libde265_ladir)"
128129 LTLIBRARIES = $(lib_LTLIBRARIES)
129 @ENABLE_SSE_OPT_TRUE@libde265_la_DEPENDENCIES = x86/libde265_x86.la
130 am__libde265_la_SOURCES_DIST = bitstream.c cabac.c de265.c deblock.c \
131 decctx.c image.c intrapred.c md5.c nal.c pps.c transform.c \
132 refpic.c sao.c scan.c sei.c slice.c sps.c util.c vps.c \
130 libde265_la_DEPENDENCIES = $(am__append_1)
131 am__libde265_la_SOURCES_DIST = bitstream.cc cabac.cc de265.cc \
132 deblock.cc decctx.cc nal-parser.cc nal-parser.h dpb.cc dpb.h \
133 image.cc intrapred.cc md5.cc nal.cc pps.cc transform.cc \
134 refpic.cc sao.cc scan.cc sei.cc slice.cc sps.cc util.cc vps.cc \
133135 bitstream.h cabac.h deblock.h decctx.h image.h intrapred.h \
134 md5.h nal.h pps.h pps_func.h transform.h refpic.h sao.h scan.h \
135 sei.h slice.h slice_func.h sps.h sps_func.h util.h vps.h \
136 motion.c motion.h motion_func.h threads.c threads.h \
137 acceleration.h fallback.c fallback.h fallback-motion.c \
138 fallback-motion.h fallback-dct.h fallback-dct.c \
139 ../extra/win32cond.c ../extra/win32cond.h
136 md5.h nal.h pps.h transform.h refpic.h sao.h scan.h sei.h \
137 slice.h sps.h util.h vps.h motion.cc motion.h threads.cc \
138 threads.h visualize.cc visualize.h acceleration.h fallback.cc \
139 fallback.h fallback-motion.cc fallback-motion.h fallback-dct.h \
140 fallback-dct.cc ../extra/win32cond.c ../extra/win32cond.h
140141 am__dirstamp = $(am__leading_dot)dirstamp
141142 @MINGW_TRUE@am__objects_1 = ../extra/libde265_la-win32cond.lo
142143 am_libde265_la_OBJECTS = libde265_la-bitstream.lo libde265_la-cabac.lo \
143144 libde265_la-de265.lo libde265_la-deblock.lo \
144 libde265_la-decctx.lo libde265_la-image.lo \
145 libde265_la-decctx.lo libde265_la-nal-parser.lo \
146 libde265_la-dpb.lo libde265_la-image.lo \
145147 libde265_la-intrapred.lo libde265_la-md5.lo libde265_la-nal.lo \
146148 libde265_la-pps.lo libde265_la-transform.lo \
147149 libde265_la-refpic.lo libde265_la-sao.lo libde265_la-scan.lo \
148150 libde265_la-sei.lo libde265_la-slice.lo libde265_la-sps.lo \
149151 libde265_la-util.lo libde265_la-vps.lo libde265_la-motion.lo \
150 libde265_la-threads.lo libde265_la-fallback.lo \
151 libde265_la-fallback-motion.lo libde265_la-fallback-dct.lo \
152 $(am__objects_1)
152 libde265_la-threads.lo libde265_la-visualize.lo \
153 libde265_la-fallback.lo libde265_la-fallback-motion.lo \
154 libde265_la-fallback-dct.lo $(am__objects_1)
153155 libde265_la_OBJECTS = $(am_libde265_la_OBJECTS)
154156 AM_V_lt = $(am__v_lt_@AM_V@)
155157 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
156158 am__v_lt_0 = --silent
157159 am__v_lt_1 =
158 libde265_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
159 $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
160 $(libde265_la_LDFLAGS) $(LDFLAGS) -o $@
160 libde265_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
161 $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
162 $(CXXFLAGS) $(libde265_la_LDFLAGS) $(LDFLAGS) -o $@
161163 AM_V_P = $(am__v_P_@AM_V@)
162164 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
163165 am__v_P_0 = false
192194 am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
193195 am__v_CCLD_0 = @echo " CCLD " $@;
194196 am__v_CCLD_1 =
197 CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
198 $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
199 LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
200 $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \
201 $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
202 $(AM_CXXFLAGS) $(CXXFLAGS)
203 AM_V_CXX = $(am__v_CXX_@AM_V@)
204 am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@)
205 am__v_CXX_0 = @echo " CXX " $@;
206 am__v_CXX_1 =
207 CXXLD = $(CXX)
208 CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
209 $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
210 $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
211 AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
212 am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
213 am__v_CXXLD_0 = @echo " CXXLD " $@;
214 am__v_CXXLD_1 =
195215 SOURCES = $(libde265_la_SOURCES)
196216 DIST_SOURCES = $(am__libde265_la_SOURCES_DIST)
197217 RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
338358 SET_MAKE = @SET_MAKE@
339359 SHELL = @SHELL@
340360 STRIP = @STRIP@
361 SWSCALE_CFLAGS = @SWSCALE_CFLAGS@
362 SWSCALE_LIBS = @SWSCALE_LIBS@
341363 VERSION = @VERSION@
342364 VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@
343365 VIDEOGFX_LIBS = @VIDEOGFX_LIBS@
406428 libde265_la_CPPFLAGS =
407429 libde265_la_LDFLAGS = -version-info \
408430 $(LIBDE265_CURRENT):$(LIBDE265_REVISION):$(LIBDE265_AGE) \
409 $(am__append_2)
410 libde265_la_SOURCES = bitstream.c cabac.c de265.c deblock.c decctx.c \
411 image.c intrapred.c md5.c nal.c pps.c transform.c refpic.c \
412 sao.c scan.c sei.c slice.c sps.c util.c vps.c bitstream.h \
413 cabac.h deblock.h decctx.h image.h intrapred.h md5.h nal.h \
414 pps.h pps_func.h transform.h refpic.h sao.h scan.h sei.h \
415 slice.h slice_func.h sps.h sps_func.h util.h vps.h motion.c \
416 motion.h motion_func.h threads.c threads.h acceleration.h \
417 fallback.c fallback.h fallback-motion.c fallback-motion.h \
418 fallback-dct.h fallback-dct.c $(am__append_1)
431 -export-symbols-regex ^de265_ $(am__append_3)
432 libde265_la_LIBADD = -lstdc++ $(am__append_1)
433 libde265_la_SOURCES = bitstream.cc cabac.cc de265.cc deblock.cc \
434 decctx.cc nal-parser.cc nal-parser.h dpb.cc dpb.h image.cc \
435 intrapred.cc md5.cc nal.cc pps.cc transform.cc refpic.cc \
436 sao.cc scan.cc sei.cc slice.cc sps.cc util.cc vps.cc \
437 bitstream.h cabac.h deblock.h decctx.h image.h intrapred.h \
438 md5.h nal.h pps.h transform.h refpic.h sao.h scan.h sei.h \
439 slice.h sps.h util.h vps.h motion.cc motion.h threads.cc \
440 threads.h visualize.cc visualize.h acceleration.h fallback.cc \
441 fallback.h fallback-motion.cc fallback-motion.h fallback-dct.h \
442 fallback-dct.cc $(am__append_2)
419443 @ENABLE_SSE_OPT_TRUE@SUBDIRS = x86
420 @ENABLE_SSE_OPT_TRUE@libde265_la_LIBADD = x86/libde265_x86.la
421444 EXTRA_DIST = Makefile.vc7 \
422445 ../extra/stdbool.h \
423446 ../extra/stdint.h
429452 all: all-recursive
430453
431454 .SUFFIXES:
432 .SUFFIXES: .c .lo .o .obj
455 .SUFFIXES: .c .cc .lo .o .obj
433456 $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
434457 @for dep in $?; do \
435458 case '$(am__configure_deps)' in \
507530 ../extra/$(DEPDIR)/$(am__dirstamp)
508531
509532 libde265.la: $(libde265_la_OBJECTS) $(libde265_la_DEPENDENCIES) $(EXTRA_libde265_la_DEPENDENCIES)
510 $(AM_V_CCLD)$(libde265_la_LINK) -rpath $(libdir) $(libde265_la_OBJECTS) $(libde265_la_LIBADD) $(LIBS)
533 $(AM_V_CXXLD)$(libde265_la_LINK) -rpath $(libdir) $(libde265_la_OBJECTS) $(libde265_la_LIBADD) $(LIBS)
511534
512535 mostlyclean-compile:
513536 -rm -f *.$(OBJEXT)
523546 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-de265.Plo@am__quote@
524547 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-deblock.Plo@am__quote@
525548 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-decctx.Plo@am__quote@
549 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-dpb.Plo@am__quote@
526550 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback-dct.Plo@am__quote@
527551 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback-motion.Plo@am__quote@
528552 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback.Plo@am__quote@
530554 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-intrapred.Plo@am__quote@
531555 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-md5.Plo@am__quote@
532556 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-motion.Plo@am__quote@
557 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-nal-parser.Plo@am__quote@
533558 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-nal.Plo@am__quote@
534559 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-pps.Plo@am__quote@
535560 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-refpic.Plo@am__quote@
541566 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-threads.Plo@am__quote@
542567 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-transform.Plo@am__quote@
543568 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-util.Plo@am__quote@
569 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-visualize.Plo@am__quote@
544570 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-vps.Plo@am__quote@
545571
546572 .c.o:
567593 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
568594 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
569595
570 libde265_la-bitstream.lo: bitstream.c
571 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-bitstream.lo -MD -MP -MF $(DEPDIR)/libde265_la-bitstream.Tpo -c -o libde265_la-bitstream.lo `test -f 'bitstream.c' || echo '$(srcdir)/'`bitstream.c
572 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-bitstream.Tpo $(DEPDIR)/libde265_la-bitstream.Plo
573 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='bitstream.c' object='libde265_la-bitstream.lo' libtool=yes @AMDEPBACKSLASH@
574 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
575 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-bitstream.lo `test -f 'bitstream.c' || echo '$(srcdir)/'`bitstream.c
576
577 libde265_la-cabac.lo: cabac.c
578 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-cabac.lo -MD -MP -MF $(DEPDIR)/libde265_la-cabac.Tpo -c -o libde265_la-cabac.lo `test -f 'cabac.c' || echo '$(srcdir)/'`cabac.c
579 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-cabac.Tpo $(DEPDIR)/libde265_la-cabac.Plo
580 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='cabac.c' object='libde265_la-cabac.lo' libtool=yes @AMDEPBACKSLASH@
581 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
582 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-cabac.lo `test -f 'cabac.c' || echo '$(srcdir)/'`cabac.c
583
584 libde265_la-de265.lo: de265.c
585 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-de265.lo -MD -MP -MF $(DEPDIR)/libde265_la-de265.Tpo -c -o libde265_la-de265.lo `test -f 'de265.c' || echo '$(srcdir)/'`de265.c
586 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-de265.Tpo $(DEPDIR)/libde265_la-de265.Plo
587 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='de265.c' object='libde265_la-de265.lo' libtool=yes @AMDEPBACKSLASH@
588 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
589 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-de265.lo `test -f 'de265.c' || echo '$(srcdir)/'`de265.c
590
591 libde265_la-deblock.lo: deblock.c
592 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-deblock.lo -MD -MP -MF $(DEPDIR)/libde265_la-deblock.Tpo -c -o libde265_la-deblock.lo `test -f 'deblock.c' || echo '$(srcdir)/'`deblock.c
593 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-deblock.Tpo $(DEPDIR)/libde265_la-deblock.Plo
594 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='deblock.c' object='libde265_la-deblock.lo' libtool=yes @AMDEPBACKSLASH@
595 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
596 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-deblock.lo `test -f 'deblock.c' || echo '$(srcdir)/'`deblock.c
597
598 libde265_la-decctx.lo: decctx.c
599 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-decctx.lo -MD -MP -MF $(DEPDIR)/libde265_la-decctx.Tpo -c -o libde265_la-decctx.lo `test -f 'decctx.c' || echo '$(srcdir)/'`decctx.c
600 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-decctx.Tpo $(DEPDIR)/libde265_la-decctx.Plo
601 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='decctx.c' object='libde265_la-decctx.lo' libtool=yes @AMDEPBACKSLASH@
602 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
603 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-decctx.lo `test -f 'decctx.c' || echo '$(srcdir)/'`decctx.c
604
605 libde265_la-image.lo: image.c
606 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-image.lo -MD -MP -MF $(DEPDIR)/libde265_la-image.Tpo -c -o libde265_la-image.lo `test -f 'image.c' || echo '$(srcdir)/'`image.c
607 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-image.Tpo $(DEPDIR)/libde265_la-image.Plo
608 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='image.c' object='libde265_la-image.lo' libtool=yes @AMDEPBACKSLASH@
609 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
610 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-image.lo `test -f 'image.c' || echo '$(srcdir)/'`image.c
611
612 libde265_la-intrapred.lo: intrapred.c
613 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-intrapred.lo -MD -MP -MF $(DEPDIR)/libde265_la-intrapred.Tpo -c -o libde265_la-intrapred.lo `test -f 'intrapred.c' || echo '$(srcdir)/'`intrapred.c
614 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-intrapred.Tpo $(DEPDIR)/libde265_la-intrapred.Plo
615 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='intrapred.c' object='libde265_la-intrapred.lo' libtool=yes @AMDEPBACKSLASH@
616 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
617 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-intrapred.lo `test -f 'intrapred.c' || echo '$(srcdir)/'`intrapred.c
618
619 libde265_la-md5.lo: md5.c
620 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-md5.lo -MD -MP -MF $(DEPDIR)/libde265_la-md5.Tpo -c -o libde265_la-md5.lo `test -f 'md5.c' || echo '$(srcdir)/'`md5.c
621 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-md5.Tpo $(DEPDIR)/libde265_la-md5.Plo
622 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='md5.c' object='libde265_la-md5.lo' libtool=yes @AMDEPBACKSLASH@
623 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
624 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-md5.lo `test -f 'md5.c' || echo '$(srcdir)/'`md5.c
625
626 libde265_la-nal.lo: nal.c
627 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-nal.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal.Tpo -c -o libde265_la-nal.lo `test -f 'nal.c' || echo '$(srcdir)/'`nal.c
628 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal.Tpo $(DEPDIR)/libde265_la-nal.Plo
629 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='nal.c' object='libde265_la-nal.lo' libtool=yes @AMDEPBACKSLASH@
630 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
631 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-nal.lo `test -f 'nal.c' || echo '$(srcdir)/'`nal.c
632
633 libde265_la-pps.lo: pps.c
634 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-pps.lo -MD -MP -MF $(DEPDIR)/libde265_la-pps.Tpo -c -o libde265_la-pps.lo `test -f 'pps.c' || echo '$(srcdir)/'`pps.c
635 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-pps.Tpo $(DEPDIR)/libde265_la-pps.Plo
636 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='pps.c' object='libde265_la-pps.lo' libtool=yes @AMDEPBACKSLASH@
637 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
638 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-pps.lo `test -f 'pps.c' || echo '$(srcdir)/'`pps.c
639
640 libde265_la-transform.lo: transform.c
641 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-transform.lo -MD -MP -MF $(DEPDIR)/libde265_la-transform.Tpo -c -o libde265_la-transform.lo `test -f 'transform.c' || echo '$(srcdir)/'`transform.c
642 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-transform.Tpo $(DEPDIR)/libde265_la-transform.Plo
643 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='transform.c' object='libde265_la-transform.lo' libtool=yes @AMDEPBACKSLASH@
644 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
645 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-transform.lo `test -f 'transform.c' || echo '$(srcdir)/'`transform.c
646
647 libde265_la-refpic.lo: refpic.c
648 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-refpic.lo -MD -MP -MF $(DEPDIR)/libde265_la-refpic.Tpo -c -o libde265_la-refpic.lo `test -f 'refpic.c' || echo '$(srcdir)/'`refpic.c
649 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-refpic.Tpo $(DEPDIR)/libde265_la-refpic.Plo
650 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='refpic.c' object='libde265_la-refpic.lo' libtool=yes @AMDEPBACKSLASH@
651 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
652 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-refpic.lo `test -f 'refpic.c' || echo '$(srcdir)/'`refpic.c
653
654 libde265_la-sao.lo: sao.c
655 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-sao.lo -MD -MP -MF $(DEPDIR)/libde265_la-sao.Tpo -c -o libde265_la-sao.lo `test -f 'sao.c' || echo '$(srcdir)/'`sao.c
656 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sao.Tpo $(DEPDIR)/libde265_la-sao.Plo
657 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sao.c' object='libde265_la-sao.lo' libtool=yes @AMDEPBACKSLASH@
658 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
659 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-sao.lo `test -f 'sao.c' || echo '$(srcdir)/'`sao.c
660
661 libde265_la-scan.lo: scan.c
662 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-scan.lo -MD -MP -MF $(DEPDIR)/libde265_la-scan.Tpo -c -o libde265_la-scan.lo `test -f 'scan.c' || echo '$(srcdir)/'`scan.c
663 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-scan.Tpo $(DEPDIR)/libde265_la-scan.Plo
664 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='scan.c' object='libde265_la-scan.lo' libtool=yes @AMDEPBACKSLASH@
665 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
666 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-scan.lo `test -f 'scan.c' || echo '$(srcdir)/'`scan.c
667
668 libde265_la-sei.lo: sei.c
669 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-sei.lo -MD -MP -MF $(DEPDIR)/libde265_la-sei.Tpo -c -o libde265_la-sei.lo `test -f 'sei.c' || echo '$(srcdir)/'`sei.c
670 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sei.Tpo $(DEPDIR)/libde265_la-sei.Plo
671 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sei.c' object='libde265_la-sei.lo' libtool=yes @AMDEPBACKSLASH@
672 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
673 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-sei.lo `test -f 'sei.c' || echo '$(srcdir)/'`sei.c
674
675 libde265_la-slice.lo: slice.c
676 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-slice.lo -MD -MP -MF $(DEPDIR)/libde265_la-slice.Tpo -c -o libde265_la-slice.lo `test -f 'slice.c' || echo '$(srcdir)/'`slice.c
677 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-slice.Tpo $(DEPDIR)/libde265_la-slice.Plo
678 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='slice.c' object='libde265_la-slice.lo' libtool=yes @AMDEPBACKSLASH@
679 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
680 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-slice.lo `test -f 'slice.c' || echo '$(srcdir)/'`slice.c
681
682 libde265_la-sps.lo: sps.c
683 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-sps.lo -MD -MP -MF $(DEPDIR)/libde265_la-sps.Tpo -c -o libde265_la-sps.lo `test -f 'sps.c' || echo '$(srcdir)/'`sps.c
684 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sps.Tpo $(DEPDIR)/libde265_la-sps.Plo
685 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sps.c' object='libde265_la-sps.lo' libtool=yes @AMDEPBACKSLASH@
686 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
687 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-sps.lo `test -f 'sps.c' || echo '$(srcdir)/'`sps.c
688
689 libde265_la-util.lo: util.c
690 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-util.lo -MD -MP -MF $(DEPDIR)/libde265_la-util.Tpo -c -o libde265_la-util.lo `test -f 'util.c' || echo '$(srcdir)/'`util.c
691 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-util.Tpo $(DEPDIR)/libde265_la-util.Plo
692 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='util.c' object='libde265_la-util.lo' libtool=yes @AMDEPBACKSLASH@
693 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
694 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-util.lo `test -f 'util.c' || echo '$(srcdir)/'`util.c
695
696 libde265_la-vps.lo: vps.c
697 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-vps.lo -MD -MP -MF $(DEPDIR)/libde265_la-vps.Tpo -c -o libde265_la-vps.lo `test -f 'vps.c' || echo '$(srcdir)/'`vps.c
698 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-vps.Tpo $(DEPDIR)/libde265_la-vps.Plo
699 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='vps.c' object='libde265_la-vps.lo' libtool=yes @AMDEPBACKSLASH@
700 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
701 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-vps.lo `test -f 'vps.c' || echo '$(srcdir)/'`vps.c
702
703 libde265_la-motion.lo: motion.c
704 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-motion.Tpo -c -o libde265_la-motion.lo `test -f 'motion.c' || echo '$(srcdir)/'`motion.c
705 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-motion.Tpo $(DEPDIR)/libde265_la-motion.Plo
706 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='motion.c' object='libde265_la-motion.lo' libtool=yes @AMDEPBACKSLASH@
707 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
708 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-motion.lo `test -f 'motion.c' || echo '$(srcdir)/'`motion.c
709
710 libde265_la-threads.lo: threads.c
711 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-threads.lo -MD -MP -MF $(DEPDIR)/libde265_la-threads.Tpo -c -o libde265_la-threads.lo `test -f 'threads.c' || echo '$(srcdir)/'`threads.c
712 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-threads.Tpo $(DEPDIR)/libde265_la-threads.Plo
713 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='threads.c' object='libde265_la-threads.lo' libtool=yes @AMDEPBACKSLASH@
714 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
715 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-threads.lo `test -f 'threads.c' || echo '$(srcdir)/'`threads.c
716
717 libde265_la-fallback.lo: fallback.c
718 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-fallback.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback.Tpo -c -o libde265_la-fallback.lo `test -f 'fallback.c' || echo '$(srcdir)/'`fallback.c
719 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback.Tpo $(DEPDIR)/libde265_la-fallback.Plo
720 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='fallback.c' object='libde265_la-fallback.lo' libtool=yes @AMDEPBACKSLASH@
721 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
722 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-fallback.lo `test -f 'fallback.c' || echo '$(srcdir)/'`fallback.c
723
724 libde265_la-fallback-motion.lo: fallback-motion.c
725 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-fallback-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-motion.Tpo -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.c' || echo '$(srcdir)/'`fallback-motion.c
726 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-motion.Tpo $(DEPDIR)/libde265_la-fallback-motion.Plo
727 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='fallback-motion.c' object='libde265_la-fallback-motion.lo' libtool=yes @AMDEPBACKSLASH@
728 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
729 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.c' || echo '$(srcdir)/'`fallback-motion.c
730
731 libde265_la-fallback-dct.lo: fallback-dct.c
732 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libde265_la-fallback-dct.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-dct.Tpo -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.c' || echo '$(srcdir)/'`fallback-dct.c
733 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-dct.Tpo $(DEPDIR)/libde265_la-fallback-dct.Plo
734 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='fallback-dct.c' object='libde265_la-fallback-dct.lo' libtool=yes @AMDEPBACKSLASH@
735 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
736 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.c' || echo '$(srcdir)/'`fallback-dct.c
737
738596 ../extra/libde265_la-win32cond.lo: ../extra/win32cond.c
739597 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ../extra/libde265_la-win32cond.lo -MD -MP -MF ../extra/$(DEPDIR)/libde265_la-win32cond.Tpo -c -o ../extra/libde265_la-win32cond.lo `test -f '../extra/win32cond.c' || echo '$(srcdir)/'`../extra/win32cond.c
740598 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) ../extra/$(DEPDIR)/libde265_la-win32cond.Tpo ../extra/$(DEPDIR)/libde265_la-win32cond.Plo
741599 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='../extra/win32cond.c' object='../extra/libde265_la-win32cond.lo' libtool=yes @AMDEPBACKSLASH@
742600 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
743601 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ../extra/libde265_la-win32cond.lo `test -f '../extra/win32cond.c' || echo '$(srcdir)/'`../extra/win32cond.c
602
603 .cc.o:
604 @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
605 @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
606 @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po
607 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
608 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
609 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
610
611 .cc.obj:
612 @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
613 @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
614 @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po
615 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
616 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
617 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
618
619 .cc.lo:
620 @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
621 @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
622 @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo
623 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
624 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
625 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
626
627 libde265_la-bitstream.lo: bitstream.cc
628 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-bitstream.lo -MD -MP -MF $(DEPDIR)/libde265_la-bitstream.Tpo -c -o libde265_la-bitstream.lo `test -f 'bitstream.cc' || echo '$(srcdir)/'`bitstream.cc
629 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-bitstream.Tpo $(DEPDIR)/libde265_la-bitstream.Plo
630 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='bitstream.cc' object='libde265_la-bitstream.lo' libtool=yes @AMDEPBACKSLASH@
631 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
632 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-bitstream.lo `test -f 'bitstream.cc' || echo '$(srcdir)/'`bitstream.cc
633
634 libde265_la-cabac.lo: cabac.cc
635 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-cabac.lo -MD -MP -MF $(DEPDIR)/libde265_la-cabac.Tpo -c -o libde265_la-cabac.lo `test -f 'cabac.cc' || echo '$(srcdir)/'`cabac.cc
636 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-cabac.Tpo $(DEPDIR)/libde265_la-cabac.Plo
637 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='cabac.cc' object='libde265_la-cabac.lo' libtool=yes @AMDEPBACKSLASH@
638 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
639 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-cabac.lo `test -f 'cabac.cc' || echo '$(srcdir)/'`cabac.cc
640
641 libde265_la-de265.lo: de265.cc
642 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-de265.lo -MD -MP -MF $(DEPDIR)/libde265_la-de265.Tpo -c -o libde265_la-de265.lo `test -f 'de265.cc' || echo '$(srcdir)/'`de265.cc
643 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-de265.Tpo $(DEPDIR)/libde265_la-de265.Plo
644 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='de265.cc' object='libde265_la-de265.lo' libtool=yes @AMDEPBACKSLASH@
645 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
646 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-de265.lo `test -f 'de265.cc' || echo '$(srcdir)/'`de265.cc
647
648 libde265_la-deblock.lo: deblock.cc
649 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-deblock.lo -MD -MP -MF $(DEPDIR)/libde265_la-deblock.Tpo -c -o libde265_la-deblock.lo `test -f 'deblock.cc' || echo '$(srcdir)/'`deblock.cc
650 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-deblock.Tpo $(DEPDIR)/libde265_la-deblock.Plo
651 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='deblock.cc' object='libde265_la-deblock.lo' libtool=yes @AMDEPBACKSLASH@
652 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
653 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-deblock.lo `test -f 'deblock.cc' || echo '$(srcdir)/'`deblock.cc
654
655 libde265_la-decctx.lo: decctx.cc
656 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-decctx.lo -MD -MP -MF $(DEPDIR)/libde265_la-decctx.Tpo -c -o libde265_la-decctx.lo `test -f 'decctx.cc' || echo '$(srcdir)/'`decctx.cc
657 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-decctx.Tpo $(DEPDIR)/libde265_la-decctx.Plo
658 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='decctx.cc' object='libde265_la-decctx.lo' libtool=yes @AMDEPBACKSLASH@
659 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
660 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-decctx.lo `test -f 'decctx.cc' || echo '$(srcdir)/'`decctx.cc
661
662 libde265_la-nal-parser.lo: nal-parser.cc
663 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-nal-parser.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal-parser.Tpo -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
664 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal-parser.Tpo $(DEPDIR)/libde265_la-nal-parser.Plo
665 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='nal-parser.cc' object='libde265_la-nal-parser.lo' libtool=yes @AMDEPBACKSLASH@
666 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
667 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
668
669 libde265_la-dpb.lo: dpb.cc
670 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-dpb.lo -MD -MP -MF $(DEPDIR)/libde265_la-dpb.Tpo -c -o libde265_la-dpb.lo `test -f 'dpb.cc' || echo '$(srcdir)/'`dpb.cc
671 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-dpb.Tpo $(DEPDIR)/libde265_la-dpb.Plo
672 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='dpb.cc' object='libde265_la-dpb.lo' libtool=yes @AMDEPBACKSLASH@
673 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
674 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-dpb.lo `test -f 'dpb.cc' || echo '$(srcdir)/'`dpb.cc
675
676 libde265_la-image.lo: image.cc
677 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-image.lo -MD -MP -MF $(DEPDIR)/libde265_la-image.Tpo -c -o libde265_la-image.lo `test -f 'image.cc' || echo '$(srcdir)/'`image.cc
678 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-image.Tpo $(DEPDIR)/libde265_la-image.Plo
679 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='image.cc' object='libde265_la-image.lo' libtool=yes @AMDEPBACKSLASH@
680 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
681 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-image.lo `test -f 'image.cc' || echo '$(srcdir)/'`image.cc
682
683 libde265_la-intrapred.lo: intrapred.cc
684 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-intrapred.lo -MD -MP -MF $(DEPDIR)/libde265_la-intrapred.Tpo -c -o libde265_la-intrapred.lo `test -f 'intrapred.cc' || echo '$(srcdir)/'`intrapred.cc
685 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-intrapred.Tpo $(DEPDIR)/libde265_la-intrapred.Plo
686 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='intrapred.cc' object='libde265_la-intrapred.lo' libtool=yes @AMDEPBACKSLASH@
687 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
688 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-intrapred.lo `test -f 'intrapred.cc' || echo '$(srcdir)/'`intrapred.cc
689
690 libde265_la-md5.lo: md5.cc
691 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-md5.lo -MD -MP -MF $(DEPDIR)/libde265_la-md5.Tpo -c -o libde265_la-md5.lo `test -f 'md5.cc' || echo '$(srcdir)/'`md5.cc
692 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-md5.Tpo $(DEPDIR)/libde265_la-md5.Plo
693 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='md5.cc' object='libde265_la-md5.lo' libtool=yes @AMDEPBACKSLASH@
694 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
695 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-md5.lo `test -f 'md5.cc' || echo '$(srcdir)/'`md5.cc
696
697 libde265_la-nal.lo: nal.cc
698 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-nal.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal.Tpo -c -o libde265_la-nal.lo `test -f 'nal.cc' || echo '$(srcdir)/'`nal.cc
699 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal.Tpo $(DEPDIR)/libde265_la-nal.Plo
700 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='nal.cc' object='libde265_la-nal.lo' libtool=yes @AMDEPBACKSLASH@
701 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
702 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-nal.lo `test -f 'nal.cc' || echo '$(srcdir)/'`nal.cc
703
704 libde265_la-pps.lo: pps.cc
705 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-pps.lo -MD -MP -MF $(DEPDIR)/libde265_la-pps.Tpo -c -o libde265_la-pps.lo `test -f 'pps.cc' || echo '$(srcdir)/'`pps.cc
706 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-pps.Tpo $(DEPDIR)/libde265_la-pps.Plo
707 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='pps.cc' object='libde265_la-pps.lo' libtool=yes @AMDEPBACKSLASH@
708 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
709 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-pps.lo `test -f 'pps.cc' || echo '$(srcdir)/'`pps.cc
710
711 libde265_la-transform.lo: transform.cc
712 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-transform.lo -MD -MP -MF $(DEPDIR)/libde265_la-transform.Tpo -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
713 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-transform.Tpo $(DEPDIR)/libde265_la-transform.Plo
714 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='transform.cc' object='libde265_la-transform.lo' libtool=yes @AMDEPBACKSLASH@
715 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
716 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
717
718 libde265_la-refpic.lo: refpic.cc
719 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-refpic.lo -MD -MP -MF $(DEPDIR)/libde265_la-refpic.Tpo -c -o libde265_la-refpic.lo `test -f 'refpic.cc' || echo '$(srcdir)/'`refpic.cc
720 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-refpic.Tpo $(DEPDIR)/libde265_la-refpic.Plo
721 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='refpic.cc' object='libde265_la-refpic.lo' libtool=yes @AMDEPBACKSLASH@
722 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
723 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-refpic.lo `test -f 'refpic.cc' || echo '$(srcdir)/'`refpic.cc
724
725 libde265_la-sao.lo: sao.cc
726 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-sao.lo -MD -MP -MF $(DEPDIR)/libde265_la-sao.Tpo -c -o libde265_la-sao.lo `test -f 'sao.cc' || echo '$(srcdir)/'`sao.cc
727 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sao.Tpo $(DEPDIR)/libde265_la-sao.Plo
728 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sao.cc' object='libde265_la-sao.lo' libtool=yes @AMDEPBACKSLASH@
729 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
730 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-sao.lo `test -f 'sao.cc' || echo '$(srcdir)/'`sao.cc
731
732 libde265_la-scan.lo: scan.cc
733 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-scan.lo -MD -MP -MF $(DEPDIR)/libde265_la-scan.Tpo -c -o libde265_la-scan.lo `test -f 'scan.cc' || echo '$(srcdir)/'`scan.cc
734 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-scan.Tpo $(DEPDIR)/libde265_la-scan.Plo
735 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='scan.cc' object='libde265_la-scan.lo' libtool=yes @AMDEPBACKSLASH@
736 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
737 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-scan.lo `test -f 'scan.cc' || echo '$(srcdir)/'`scan.cc
738
739 libde265_la-sei.lo: sei.cc
740 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-sei.lo -MD -MP -MF $(DEPDIR)/libde265_la-sei.Tpo -c -o libde265_la-sei.lo `test -f 'sei.cc' || echo '$(srcdir)/'`sei.cc
741 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sei.Tpo $(DEPDIR)/libde265_la-sei.Plo
742 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sei.cc' object='libde265_la-sei.lo' libtool=yes @AMDEPBACKSLASH@
743 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
744 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-sei.lo `test -f 'sei.cc' || echo '$(srcdir)/'`sei.cc
745
746 libde265_la-slice.lo: slice.cc
747 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-slice.lo -MD -MP -MF $(DEPDIR)/libde265_la-slice.Tpo -c -o libde265_la-slice.lo `test -f 'slice.cc' || echo '$(srcdir)/'`slice.cc
748 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-slice.Tpo $(DEPDIR)/libde265_la-slice.Plo
749 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='slice.cc' object='libde265_la-slice.lo' libtool=yes @AMDEPBACKSLASH@
750 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
751 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-slice.lo `test -f 'slice.cc' || echo '$(srcdir)/'`slice.cc
752
753 libde265_la-sps.lo: sps.cc
754 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-sps.lo -MD -MP -MF $(DEPDIR)/libde265_la-sps.Tpo -c -o libde265_la-sps.lo `test -f 'sps.cc' || echo '$(srcdir)/'`sps.cc
755 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-sps.Tpo $(DEPDIR)/libde265_la-sps.Plo
756 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sps.cc' object='libde265_la-sps.lo' libtool=yes @AMDEPBACKSLASH@
757 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
758 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-sps.lo `test -f 'sps.cc' || echo '$(srcdir)/'`sps.cc
759
760 libde265_la-util.lo: util.cc
761 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-util.lo -MD -MP -MF $(DEPDIR)/libde265_la-util.Tpo -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
762 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-util.Tpo $(DEPDIR)/libde265_la-util.Plo
763 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='util.cc' object='libde265_la-util.lo' libtool=yes @AMDEPBACKSLASH@
764 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
765 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
766
767 libde265_la-vps.lo: vps.cc
768 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-vps.lo -MD -MP -MF $(DEPDIR)/libde265_la-vps.Tpo -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
769 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-vps.Tpo $(DEPDIR)/libde265_la-vps.Plo
770 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='vps.cc' object='libde265_la-vps.lo' libtool=yes @AMDEPBACKSLASH@
771 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
772 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
773
774 libde265_la-motion.lo: motion.cc
775 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-motion.Tpo -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
776 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-motion.Tpo $(DEPDIR)/libde265_la-motion.Plo
777 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='motion.cc' object='libde265_la-motion.lo' libtool=yes @AMDEPBACKSLASH@
778 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
779 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
780
781 libde265_la-threads.lo: threads.cc
782 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-threads.lo -MD -MP -MF $(DEPDIR)/libde265_la-threads.Tpo -c -o libde265_la-threads.lo `test -f 'threads.cc' || echo '$(srcdir)/'`threads.cc
783 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-threads.Tpo $(DEPDIR)/libde265_la-threads.Plo
784 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='threads.cc' object='libde265_la-threads.lo' libtool=yes @AMDEPBACKSLASH@
785 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
786 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-threads.lo `test -f 'threads.cc' || echo '$(srcdir)/'`threads.cc
787
788 libde265_la-visualize.lo: visualize.cc
789 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-visualize.lo -MD -MP -MF $(DEPDIR)/libde265_la-visualize.Tpo -c -o libde265_la-visualize.lo `test -f 'visualize.cc' || echo '$(srcdir)/'`visualize.cc
790 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-visualize.Tpo $(DEPDIR)/libde265_la-visualize.Plo
791 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='visualize.cc' object='libde265_la-visualize.lo' libtool=yes @AMDEPBACKSLASH@
792 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
793 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-visualize.lo `test -f 'visualize.cc' || echo '$(srcdir)/'`visualize.cc
794
795 libde265_la-fallback.lo: fallback.cc
796 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback.Tpo -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
797 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback.Tpo $(DEPDIR)/libde265_la-fallback.Plo
798 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='fallback.cc' object='libde265_la-fallback.lo' libtool=yes @AMDEPBACKSLASH@
799 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
800 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
801
802 libde265_la-fallback-motion.lo: fallback-motion.cc
803 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-motion.Tpo -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
804 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-motion.Tpo $(DEPDIR)/libde265_la-fallback-motion.Plo
805 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='fallback-motion.cc' object='libde265_la-fallback-motion.lo' libtool=yes @AMDEPBACKSLASH@
806 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
807 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
808
809 libde265_la-fallback-dct.lo: fallback-dct.cc
810 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-dct.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-dct.Tpo -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
811 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-dct.Tpo $(DEPDIR)/libde265_la-fallback-dct.Plo
812 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='fallback-dct.cc' object='libde265_la-fallback-dct.lo' libtool=yes @AMDEPBACKSLASH@
813 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
814 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
744815
745816 mostlyclean-libtool:
746817 -rm -f *.lo
33 CFLAGS=/I..\extra /I.. /I.
44 CC=cl /nologo
55 LINK=link /nologo /subsystem:console
6 DEFINES=/DWIN32 /D_WIN32_WINNT=0x0400 /DNDEBUG /DLIBDE265_EXPORTS /D_CRT_SECURE_NO_WARNINGS /DHAVE_SSE4_1
6 DEFINES=/DWIN32 /D_WIN32_WINNT=0x0400 /DNDEBUG /DLIBDE265_EXPORTS /D_CRT_SECURE_NO_WARNINGS /DHAVE_SSE4_1 /DNOMINMAX
77
8 CFLAGS=$(CFLAGS) /MT /Ox /Ob2 /Oi /TP /W4 /GL
8 CFLAGS=$(CFLAGS) /MT /Ox /Ob2 /Oi /TP /W4 /GL /EHsc
99
1010 # type conversion, possible loss of data
1111 CFLAGS=$(CFLAGS) /wd4244
1515 CFLAGS=$(CFLAGS) /wd4189
1616 # unreferenced local function has been removed
1717 CFLAGS=$(CFLAGS) /wd4505
18 # padded structures
19 CFLAGS=$(CFLAGS) /wd4324
20 # conversion signed/unsigned
21 CFLAGS=$(CFLAGS) /wd4245
22 # comparison signed/unsigned
23 CFLAGS=$(CFLAGS) /wd4018 /wd4389
24 # possible loss of data with return
25 CFLAGS=$(CFLAGS) /wd4267
26 # forcing value to bool (performance warning)
27 CFLAGS=$(CFLAGS) /wd4800
1828
1929 CFLAGS=$(CFLAGS) $(DEFINES)
2030
2434 de265.obj \
2535 deblock.obj \
2636 decctx.obj \
37 dpb.obj \
2738 fallback-dct.obj \
2839 fallback-motion.obj \
2940 fallback.obj \
3243 md5.obj \
3344 motion.obj \
3445 nal.obj \
46 nal-parser.obj \
3547 pps.obj \
3648 refpic.obj \
3749 sao.obj \
4254 threads.obj \
4355 transform.obj \
4456 util.obj \
57 visualize.obj \
4558 vps.obj \
4659 x86\sse.obj \
4760 x86\sse-dct.obj \
5366 .c.obj:
5467 $(CC) /c $*.c /Fo$*.obj $(CFLAGS)
5568
69 .cc.obj:
70 $(CC) /c $*.cc /Fo$*.obj $(CFLAGS)
71
5672 libde265.dll: $(OBJS)
5773 $(LINK) /dll /out:libde265.dll $**
5874
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
019
120 #ifndef DE265_ACCELERATION_H
221 #define DE265_ACCELERATION_H
+0
-227
libde265/bitstream.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "bitstream.h"
21 #include "de265.h"
22
23 #include <stdlib.h>
24 #include <string.h>
25 #include <assert.h>
26
27
28 void rbsp_buffer_init(rbsp_buffer* buffer)
29 {
30 buffer->data = NULL;
31 buffer->size = 0;
32 buffer->capacity = 0;
33 }
34
35
36 void rbsp_buffer_resize(rbsp_buffer* buffer, int new_size)
37 {
38 if (buffer->capacity < new_size) {
39 unsigned char* newbuffer = (unsigned char*)malloc(new_size);
40
41 if (buffer->data != NULL) {
42 memcpy(newbuffer, buffer->data, buffer->size);
43 free(buffer->data);
44 }
45
46 buffer->data = newbuffer;
47 buffer->capacity = new_size;
48 }
49 }
50
51
52 void rbsp_buffer_free(rbsp_buffer* buffer)
53 {
54 if (buffer->data != NULL) {
55 free(buffer->data);
56
57 buffer->data = NULL;
58 buffer->size = 0;
59 buffer->capacity = 0;
60 }
61 }
62
63
64 void rbsp_buffer_append(rbsp_buffer* buffer, const unsigned char* data, int n)
65 {
66 rbsp_buffer_resize(buffer, buffer->size + n);
67 memcpy(buffer->data + buffer->size, data, n);
68 buffer->size += n;
69 }
70
71
72 void rbsp_buffer_pop(rbsp_buffer* buffer, int n)
73 {
74 memmove(buffer->data,
75 buffer->data + n,
76 buffer->size - n);
77 buffer->size -= n;
78 }
79
80
81
82 void bitreader_init(bitreader* br, rbsp_buffer* buffer)
83 {
84 br->data = buffer->data;
85 br->bytes_remaining = buffer->size;
86
87 br->nextbits=0;
88 br->nextbits_cnt=0;
89
90 bitreader_refill(br);
91 }
92
93 void bitreader_refill(bitreader* br)
94 {
95 int shift = 64-br->nextbits_cnt;
96
97 while (shift >= 8 && br->bytes_remaining) {
98 uint64_t newval = *br->data++;
99 br->bytes_remaining--;
100
101 shift -= 8;
102 newval <<= shift;
103 br->nextbits |= newval;
104 }
105
106 br->nextbits_cnt = 64-shift;
107 }
108
109 int get_bits(bitreader* br, int n)
110 {
111 if (br->nextbits_cnt < n) {
112 bitreader_refill(br);
113 }
114
115 uint64_t val = br->nextbits;
116 val >>= 64-n;
117
118 br->nextbits <<= n;
119 br->nextbits_cnt -= n;
120
121 return val;
122 }
123
124 int get_bits_fast(bitreader* br, int n)
125 {
126 assert(br->nextbits_cnt >= n);
127
128 uint64_t val = br->nextbits;
129 val >>= 64-n;
130
131 br->nextbits <<= n;
132 br->nextbits_cnt -= n;
133
134 return val;
135 }
136
137 int peek_bits(bitreader* br, int n)
138 {
139 if (br->nextbits_cnt < n) {
140 bitreader_refill(br);
141 }
142
143 uint64_t val = br->nextbits;
144 val >>= 64-n;
145
146 return val;
147 }
148
149 void skip_bits(bitreader* br, int n)
150 {
151 if (br->nextbits_cnt < n) {
152 bitreader_refill(br);
153 }
154
155 br->nextbits <<= n;
156 br->nextbits_cnt -= n;
157 }
158
159 void skip_bits_fast(bitreader* br, int n)
160 {
161 br->nextbits <<= n;
162 br->nextbits_cnt -= n;
163 }
164
165 void skip_to_byte_boundary(bitreader* br)
166 {
167 int nskip = (br->nextbits_cnt & 7);
168
169 br->nextbits <<= nskip;
170 br->nextbits_cnt -= nskip;
171 }
172
173 void prepare_for_CABAC(bitreader* br)
174 {
175 skip_to_byte_boundary(br);
176
177 int rewind = br->nextbits_cnt/8;
178 br->data -= rewind;
179 br->bytes_remaining += rewind;
180 br->nextbits = 0;
181 br->nextbits_cnt = 0;
182 }
183
184 int get_uvlc(bitreader* br)
185 {
186 int num_zeros=0;
187
188 while (get_bits(br,1)==0) {
189 num_zeros++;
190
191 if (num_zeros > MAX_UVLC_LEADING_ZEROS) { return UVLC_ERROR; }
192 }
193
194 int offset = 0;
195 if (num_zeros != 0) {
196 offset = get_bits(br, num_zeros);
197 return offset + (1<<num_zeros)-1;
198 } else {
199 return 0;
200 }
201 }
202
203 int get_svlc(bitreader* br)
204 {
205 int v = get_uvlc(br);
206 if (v==0) return v;
207 if (v==UVLC_ERROR) return UVLC_ERROR;
208
209 bool negative = ((v&1)==0);
210 return negative ? -v/2 : (v+1)/2;
211 }
212
213 bool check_rbsp_trailing_bits(bitreader* br)
214 {
215 int stop_bit = get_bits(br,1);
216 assert(stop_bit==1);
217
218 while (br->nextbits_cnt>0 || br->bytes_remaining>0) {
219 int filler = get_bits(br,1);
220 if (filler!=0) {
221 return false;
222 }
223 }
224
225 return true;
226 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "bitstream.h"
21 #include "de265.h"
22
23 #include <stdlib.h>
24 #include <string.h>
25 #include <assert.h>
26
27
28
29 void bitreader_init(bitreader* br, unsigned char* buffer, int len)
30 {
31 br->data = buffer;
32 br->bytes_remaining = len;
33
34 br->nextbits=0;
35 br->nextbits_cnt=0;
36
37 bitreader_refill(br);
38 }
39
40 void bitreader_refill(bitreader* br)
41 {
42 int shift = 64-br->nextbits_cnt;
43
44 while (shift >= 8 && br->bytes_remaining) {
45 uint64_t newval = *br->data++;
46 br->bytes_remaining--;
47
48 shift -= 8;
49 newval <<= shift;
50 br->nextbits |= newval;
51 }
52
53 br->nextbits_cnt = 64-shift;
54 }
55
56 int get_bits(bitreader* br, int n)
57 {
58 if (br->nextbits_cnt < n) {
59 bitreader_refill(br);
60 }
61
62 uint64_t val = br->nextbits;
63 val >>= 64-n;
64
65 br->nextbits <<= n;
66 br->nextbits_cnt -= n;
67
68 return val;
69 }
70
71 int get_bits_fast(bitreader* br, int n)
72 {
73 assert(br->nextbits_cnt >= n);
74
75 uint64_t val = br->nextbits;
76 val >>= 64-n;
77
78 br->nextbits <<= n;
79 br->nextbits_cnt -= n;
80
81 return val;
82 }
83
84 int peek_bits(bitreader* br, int n)
85 {
86 if (br->nextbits_cnt < n) {
87 bitreader_refill(br);
88 }
89
90 uint64_t val = br->nextbits;
91 val >>= 64-n;
92
93 return val;
94 }
95
96 void skip_bits(bitreader* br, int n)
97 {
98 if (br->nextbits_cnt < n) {
99 bitreader_refill(br);
100 }
101
102 br->nextbits <<= n;
103 br->nextbits_cnt -= n;
104 }
105
106 void skip_bits_fast(bitreader* br, int n)
107 {
108 br->nextbits <<= n;
109 br->nextbits_cnt -= n;
110 }
111
112 void skip_to_byte_boundary(bitreader* br)
113 {
114 int nskip = (br->nextbits_cnt & 7);
115
116 br->nextbits <<= nskip;
117 br->nextbits_cnt -= nskip;
118 }
119
120 void prepare_for_CABAC(bitreader* br)
121 {
122 skip_to_byte_boundary(br);
123
124 int rewind = br->nextbits_cnt/8;
125 br->data -= rewind;
126 br->bytes_remaining += rewind;
127 br->nextbits = 0;
128 br->nextbits_cnt = 0;
129 }
130
131 int get_uvlc(bitreader* br)
132 {
133 int num_zeros=0;
134
135 while (get_bits(br,1)==0) {
136 num_zeros++;
137
138 if (num_zeros > MAX_UVLC_LEADING_ZEROS) { return UVLC_ERROR; }
139 }
140
141 int offset = 0;
142 if (num_zeros != 0) {
143 offset = get_bits(br, num_zeros);
144 return offset + (1<<num_zeros)-1;
145 } else {
146 return 0;
147 }
148 }
149
150 int get_svlc(bitreader* br)
151 {
152 int v = get_uvlc(br);
153 if (v==0) return v;
154 if (v==UVLC_ERROR) return UVLC_ERROR;
155
156 bool negative = ((v&1)==0);
157 return negative ? -v/2 : (v+1)/2;
158 }
159
160 bool check_rbsp_trailing_bits(bitreader* br)
161 {
162 int stop_bit = get_bits(br,1);
163 assert(stop_bit==1);
164
165 while (br->nextbits_cnt>0 || br->bytes_remaining>0) {
166 int filler = get_bits(br,1);
167 if (filler!=0) {
168 return false;
169 }
170 }
171
172 return true;
173 }
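
The get_uvlc()/get_svlc() routines above implement the Exp-Golomb codes (ue(v)/se(v)) used throughout the HEVC headers: count leading zero bits, read that many suffix bits, and return suffix + 2^n - 1. The following is a hedged, self-contained sketch of the same mapping decoded from a plain byte array; every name in it is invented for the illustration and it is not part of libde265.

/* Minimal ue(v) decoder sketch, mirroring get_uvlc() above. */
#include <stdio.h>
#include <stdint.h>

static int read_bit(const uint8_t* buf, int* pos)
{
  int bit = (buf[*pos >> 3] >> (7 - (*pos & 7))) & 1;  /* MSB-first bit access */
  (*pos)++;
  return bit;
}

static int read_uvlc(const uint8_t* buf, int* pos)
{
  int zeros = 0;
  while (read_bit(buf, pos) == 0) zeros++;        /* count leading zeros */

  int offset = 0;
  for (int i = 0; i < zeros; i++)                 /* read 'zeros' suffix bits */
    offset = (offset << 1) | read_bit(buf, pos);

  return offset + (1 << zeros) - 1;               /* ue(v) = 2^n - 1 + suffix */
}

int main(void)
{
  const uint8_t data[] = { 0xA2, 0x80 };          /* bits: 1 | 010 | 00101 ... */
  int pos = 0;
  for (int i = 0; i < 3; i++)
    printf("%d\n", read_uvlc(data, &pos));        /* prints 0, 1, 4 */
  return 0;
}

The signed variant in get_svlc() then maps the unsigned code v = 0, 1, 2, 3, 4, ... to 0, 1, -1, 2, -2, ... by alternating the sign on parity.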
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
3434 #define MAX_UVLC_LEADING_ZEROS 20
3535 #define UVLC_ERROR -99999
3636
37 typedef struct {
38 unsigned char* data;
39 int size;
40 int capacity;
41 } rbsp_buffer;
42
43 void rbsp_buffer_init(rbsp_buffer* buffer);
44 void rbsp_buffer_resize(rbsp_buffer* buffer, int new_size);
45 void rbsp_buffer_free(rbsp_buffer* buffer);
46 void rbsp_buffer_append(rbsp_buffer* buffer, const unsigned char* data, int n);
47 void rbsp_buffer_pop(rbsp_buffer* buffer, int n);
48
49
50
5137
5238 typedef struct {
5339 uint8_t* data;
5743 int nextbits_cnt;
5844 } bitreader;
5945
60 void bitreader_init(bitreader*, rbsp_buffer*);
46 void bitreader_init(bitreader*, unsigned char* buffer, int len);
6147 void bitreader_refill(bitreader*); // refill to at least 56+1 bits
6248 int next_bit(bitreader*);
6349 int next_bit_norefill(bitreader*);
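
The interface change above replaces the rbsp_buffer-based initializer with bitreader_init(bitreader*, unsigned char* buffer, int len). A hedged usage sketch of the new signature follows; the wrapper function and variable names are illustrative only, while bitreader_init(), get_bits() and get_uvlc() are the routines from the bitstream files above.

#include "libde265/bitstream.h"

/* Illustrative only: read one flag bit and one Exp-Golomb value from a raw
 * RBSP payload using the new pointer+length initializer. */
static int read_example(unsigned char* rbsp, int len)
{
  bitreader br;
  bitreader_init(&br, rbsp, len);   /* new signature: buffer pointer + length */

  int flag  = get_bits(&br, 1);     /* single flag bit */
  int value = get_uvlc(&br);        /* ue(v) value */

  return flag ? value : -1;
}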
+0
-439
libde265/cabac.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "cabac.h"
21 #include "util.h"
22
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <assert.h>
26
27
28 static const uint8_t LPS_table[64][4] =
29 {
30 { 128, 176, 208, 240},
31 { 128, 167, 197, 227},
32 { 128, 158, 187, 216},
33 { 123, 150, 178, 205},
34 { 116, 142, 169, 195},
35 { 111, 135, 160, 185},
36 { 105, 128, 152, 175},
37 { 100, 122, 144, 166},
38 { 95, 116, 137, 158},
39 { 90, 110, 130, 150},
40 { 85, 104, 123, 142},
41 { 81, 99, 117, 135},
42 { 77, 94, 111, 128},
43 { 73, 89, 105, 122},
44 { 69, 85, 100, 116},
45 { 66, 80, 95, 110},
46 { 62, 76, 90, 104},
47 { 59, 72, 86, 99},
48 { 56, 69, 81, 94},
49 { 53, 65, 77, 89},
50 { 51, 62, 73, 85},
51 { 48, 59, 69, 80},
52 { 46, 56, 66, 76},
53 { 43, 53, 63, 72},
54 { 41, 50, 59, 69},
55 { 39, 48, 56, 65},
56 { 37, 45, 54, 62},
57 { 35, 43, 51, 59},
58 { 33, 41, 48, 56},
59 { 32, 39, 46, 53},
60 { 30, 37, 43, 50},
61 { 29, 35, 41, 48},
62 { 27, 33, 39, 45},
63 { 26, 31, 37, 43},
64 { 24, 30, 35, 41},
65 { 23, 28, 33, 39},
66 { 22, 27, 32, 37},
67 { 21, 26, 30, 35},
68 { 20, 24, 29, 33},
69 { 19, 23, 27, 31},
70 { 18, 22, 26, 30},
71 { 17, 21, 25, 28},
72 { 16, 20, 23, 27},
73 { 15, 19, 22, 25},
74 { 14, 18, 21, 24},
75 { 14, 17, 20, 23},
76 { 13, 16, 19, 22},
77 { 12, 15, 18, 21},
78 { 12, 14, 17, 20},
79 { 11, 14, 16, 19},
80 { 11, 13, 15, 18},
81 { 10, 12, 15, 17},
82 { 10, 12, 14, 16},
83 { 9, 11, 13, 15},
84 { 9, 11, 12, 14},
85 { 8, 10, 12, 14},
86 { 8, 9, 11, 13},
87 { 7, 9, 11, 12},
88 { 7, 9, 10, 12},
89 { 7, 8, 10, 11},
90 { 6, 8, 9, 11},
91 { 6, 7, 9, 10},
92 { 6, 7, 8, 9},
93 { 2, 2, 2, 2}
94 };
95
96 static const uint8_t renorm_table[32] =
97 {
98 6, 5, 4, 4,
99 3, 3, 3, 3,
100 2, 2, 2, 2,
101 2, 2, 2, 2,
102 1, 1, 1, 1,
103 1, 1, 1, 1,
104 1, 1, 1, 1,
105 1, 1, 1, 1
106 };
107
108 static const uint8_t next_state_MPS[64] =
109 {
110 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,
111 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,
112 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,
113 49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63
114 };
115
116 static const uint8_t next_state_LPS[64] =
117 {
118 0,0,1,2,2,4,4,5,6,7,8,9,9,11,11,12,
119 13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24,
120 24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33,
121 33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63
122 };
123
124
125
126
127
128 #ifdef DE265_LOG_TRACE
129 int logcnt=1;
130 #endif
131
132 void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length)
133 {
134 decoder->bitstream_start = bitstream;
135 decoder->bitstream_curr = bitstream;
136 decoder->bitstream_end = bitstream+length;
137 }
138
139 void init_CABAC_decoder_2(CABAC_decoder* decoder)
140 {
141 int length = decoder->bitstream_end - decoder->bitstream_curr;
142
143 decoder->range = 510;
144 decoder->bits_needed = 8;
145
146 decoder->value = 0;
147
148 if (length>0) { decoder->value = (*decoder->bitstream_curr++) << 8; decoder->bits_needed-=8; }
149 if (length>1) { decoder->value |= (*decoder->bitstream_curr++); decoder->bits_needed-=8; }
150
151 logtrace(LogCABAC,"[%3d] init_CABAC_decode_2 r:%x v:%x\n", logcnt, decoder->range, decoder->value);
152 }
153
154
155 //#include <sys/types.h>
156 //#include <signal.h>
157
158 int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
159 {
160 //if (logcnt >= 1100000) { enablelog(); }
161
162 // if (logcnt==400068770) { raise(SIGINT); }
163
164 logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,decoder->range, decoder->value, model->state);
165
166 //assert(decoder->range>=0x100);
167
168 int decoded_bit;
169 int LPS = LPS_table[model->state][ ( decoder->range >> 6 ) - 4 ];
170 decoder->range -= LPS;
171
172 uint32_t scaled_range = decoder->range << 7;
173
174 logtrace(LogCABAC,"[%3d] sr:%x v:%x\n",logcnt,scaled_range, decoder->value);
175
176 if (decoder->value < scaled_range)
177 {
178 logtrace(LogCABAC,"[%3d] MPS\n",logcnt);
179
180 // MPS path
181
182 decoded_bit = model->MPSbit;
183 model->state = next_state_MPS[model->state];
184
185 if (scaled_range < ( 256 << 7 ) )
186 {
187 // scaled range, highest bit (15) not set
188
189 decoder->range = scaled_range >> 6; // shift range by one bit
190 decoder->value <<= 1; // shift value by one bit
191 decoder->bits_needed++;
192
193 if (decoder->bits_needed == 0)
194 {
195 decoder->bits_needed = -8;
196 if (decoder->bitstream_curr != decoder->bitstream_end)
197 { decoder->value |= *decoder->bitstream_curr++; }
198 }
199 }
200 }
201 else
202 {
203 logtrace(LogCABAC,"[%3d] LPS\n",logcnt);
204
205 // LPS path
206
207 int num_bits = renorm_table[ LPS >> 3 ];
208 decoder->value = (decoder->value - scaled_range);
209
210 decoder->value <<= num_bits;
211 decoder->range = LPS << num_bits; /* this is always >= 0x100 except for state 63,
212 but state 63 is never used */
213 decoded_bit = 1 - model->MPSbit;
214
215 if (model->state==0) { model->MPSbit = 1-model->MPSbit; }
216 model->state = next_state_LPS[model->state];
217
218 decoder->bits_needed += num_bits;
219
220 if (decoder->bits_needed >= 0)
221 {
222 logtrace(LogCABAC,"bits_needed: %d\n", decoder->bits_needed);
223 if (decoder->bitstream_curr != decoder->bitstream_end)
224 { decoder->value |= (*decoder->bitstream_curr++) << decoder->bits_needed; }
225
226 decoder->bits_needed -= 8;
227 }
228 }
229
230 logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, decoded_bit, decoder->range, decoder->value);
231 #ifdef DE265_LOG_TRACE
232 logcnt++;
233 #endif
234
235 //assert(decoder->range>=0x100);
236
237 return decoded_bit;
238 }
239
240 int decode_CABAC_term_bit(CABAC_decoder* decoder)
241 {
242 decoder->range -= 2;
243 uint32_t scaledRange = decoder->range << 7;
244
245 if (decoder->value >= scaledRange)
246 {
247 return 1;
248 }
249 else
250 {
251 // there is a while loop in the standard, but it will always be executed only once
252
253 if (scaledRange < (256<<7))
254 {
255 decoder->range = scaledRange >> 6;
256 decoder->value *= 2;
257
258 decoder->bits_needed++;
259 if (decoder->bits_needed==0)
260 {
261 decoder->bits_needed = -8;
262
263 if (decoder->bitstream_curr != decoder->bitstream_end) {
264 decoder->value += (*decoder->bitstream_curr++);
265 }
266 }
267 }
268
269 return 0;
270 }
271 }
272
273
274
275 int decode_CABAC_bypass(CABAC_decoder* decoder)
276 {
277 logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,decoder->range, decoder->value);
278
279 //assert(decoder->range>=0x100);
280
281 decoder->value <<= 1;
282 decoder->bits_needed++;
283
284 if (decoder->bits_needed >= 0)
285 {
286 //assert(decoder->bits_needed==0);
287
288 decoder->bits_needed = -8;
289 decoder->value |= *decoder->bitstream_curr++;
290 }
291
292 int bit;
293 uint32_t scaled_range = decoder->range << 7;
294 if (decoder->value >= scaled_range)
295 {
296 decoder->value -= scaled_range;
297 bit=1;
298 }
299 else
300 {
301 bit=0;
302 }
303
304 logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, bit, decoder->range, decoder->value);
305 #ifdef DE265_LOG_TRACE
306 logcnt++;
307 #endif
308
309 //assert(decoder->range>=0x100);
310
311 return bit;
312 }
313
314
315 int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax)
316 {
317 for (int i=0;i<cMax;i++)
318 {
319 int bit = decode_CABAC_bypass(decoder);
320 if (bit==0)
321 return i;
322 }
323
324 return cMax;
325 }
326
327 int decode_CABAC_TU(CABAC_decoder* decoder, int cMax, context_model* model)
328 {
329 for (int i=0;i<cMax;i++)
330 {
331 int bit = decode_CABAC_bit(decoder,model);
332 if (bit==0)
333 return i;
334 }
335
336 return cMax;
337 }
338
339
340 int decode_CABAC_FL_bypass_parallel(CABAC_decoder* decoder, int nBits)
341 {
342 logtrace(LogCABAC,"[%3d] bypass group r:%x v:%x\n",logcnt,decoder->range, decoder->value);
343
344 decoder->value <<= nBits;
345 decoder->bits_needed+=nBits;
346
347 if (decoder->bits_needed >= 0)
348 {
349 int input = *decoder->bitstream_curr++;
350 input <<= decoder->bits_needed;
351
352 decoder->bits_needed -= 8;
353 decoder->value |= input;
354 }
355
356 uint32_t scaled_range = decoder->range << 7;
357 int value = decoder->value / scaled_range;
358 if (unlikely(value>=(1<<nBits))) { value=(1<<nBits)-1; } // may happen with broken bitstreams
359 decoder->value -= value * scaled_range;
360
361 logtrace(LogCABAC,"[%3d] -> value %d r:%x v:%x\n", logcnt+nBits-1,
362 value, decoder->range, decoder->value);
363 #ifdef DE265_LOG_TRACE
364 logcnt+=nBits;
365 #endif
366
367 //assert(decoder->range>=0x100);
368
369 return value;
370 }
371
372
373 int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits)
374 {
375 int value=0;
376
377
378 if (likely(nBits<=8)) {
379 if (nBits==0) {
380 return 0;
381 }
382 // we could use decode_CABAC_bypass() for a single bit, but this seems to be slower
383 #if 0
384 else if (nBits==1) {
385 value = decode_CABAC_bypass(decoder);
386 }
387 #endif
388 else {
389 value = decode_CABAC_FL_bypass_parallel(decoder,nBits);
390 }
391 }
392 else {
393 value = decode_CABAC_FL_bypass_parallel(decoder,8);
394 nBits-=8;
395
396 while (nBits--) {
397 value <<= 1;
398 value |= decode_CABAC_bypass(decoder);
399 }
400 }
401
402 logtrace(LogCABAC," -> FL: %d\n", value);
403
404 return value;
405 }
406
407 int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax)
408 {
409 int prefix = decode_CABAC_TU_bypass(decoder, cTRMax>>cRiceParam);
410 if (prefix==4) { // TODO check: constant 4 only works for coefficient decoding
411 return cTRMax;
412 }
413
414 int suffix = decode_CABAC_FL_bypass(decoder, cRiceParam);
415
416 return (prefix << cRiceParam) | suffix;
417 }
418
419 int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k)
420 {
421 int base=0;
422 int n=k;
423
424 for (;;)
425 {
426 int bit = decode_CABAC_bypass(decoder);
427 if (bit==0)
428 break;
429 else {
430 base += 1<<n;
431 n++;
432 }
433 }
434
435 int suffix = decode_CABAC_FL_bypass(decoder, n);
436 return base + suffix;
437 }
438
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "cabac.h"
21 #include "util.h"
22
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <assert.h>
26
27
28 static const uint8_t LPS_table[64][4] =
29 {
30 { 128, 176, 208, 240},
31 { 128, 167, 197, 227},
32 { 128, 158, 187, 216},
33 { 123, 150, 178, 205},
34 { 116, 142, 169, 195},
35 { 111, 135, 160, 185},
36 { 105, 128, 152, 175},
37 { 100, 122, 144, 166},
38 { 95, 116, 137, 158},
39 { 90, 110, 130, 150},
40 { 85, 104, 123, 142},
41 { 81, 99, 117, 135},
42 { 77, 94, 111, 128},
43 { 73, 89, 105, 122},
44 { 69, 85, 100, 116},
45 { 66, 80, 95, 110},
46 { 62, 76, 90, 104},
47 { 59, 72, 86, 99},
48 { 56, 69, 81, 94},
49 { 53, 65, 77, 89},
50 { 51, 62, 73, 85},
51 { 48, 59, 69, 80},
52 { 46, 56, 66, 76},
53 { 43, 53, 63, 72},
54 { 41, 50, 59, 69},
55 { 39, 48, 56, 65},
56 { 37, 45, 54, 62},
57 { 35, 43, 51, 59},
58 { 33, 41, 48, 56},
59 { 32, 39, 46, 53},
60 { 30, 37, 43, 50},
61 { 29, 35, 41, 48},
62 { 27, 33, 39, 45},
63 { 26, 31, 37, 43},
64 { 24, 30, 35, 41},
65 { 23, 28, 33, 39},
66 { 22, 27, 32, 37},
67 { 21, 26, 30, 35},
68 { 20, 24, 29, 33},
69 { 19, 23, 27, 31},
70 { 18, 22, 26, 30},
71 { 17, 21, 25, 28},
72 { 16, 20, 23, 27},
73 { 15, 19, 22, 25},
74 { 14, 18, 21, 24},
75 { 14, 17, 20, 23},
76 { 13, 16, 19, 22},
77 { 12, 15, 18, 21},
78 { 12, 14, 17, 20},
79 { 11, 14, 16, 19},
80 { 11, 13, 15, 18},
81 { 10, 12, 15, 17},
82 { 10, 12, 14, 16},
83 { 9, 11, 13, 15},
84 { 9, 11, 12, 14},
85 { 8, 10, 12, 14},
86 { 8, 9, 11, 13},
87 { 7, 9, 11, 12},
88 { 7, 9, 10, 12},
89 { 7, 8, 10, 11},
90 { 6, 8, 9, 11},
91 { 6, 7, 9, 10},
92 { 6, 7, 8, 9},
93 { 2, 2, 2, 2}
94 };
95
96 static const uint8_t renorm_table[32] =
97 {
98 6, 5, 4, 4,
99 3, 3, 3, 3,
100 2, 2, 2, 2,
101 2, 2, 2, 2,
102 1, 1, 1, 1,
103 1, 1, 1, 1,
104 1, 1, 1, 1,
105 1, 1, 1, 1
106 };
107
108 static const uint8_t next_state_MPS[64] =
109 {
110 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,
111 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,
112 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,
113 49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63
114 };
115
116 static const uint8_t next_state_LPS[64] =
117 {
118 0,0,1,2,2,4,4,5,6,7,8,9,9,11,11,12,
119 13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24,
120 24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33,
121 33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63
122 };
123
124
125
126
127
128 #ifdef DE265_LOG_TRACE
129 int logcnt=1;
130 #endif
131
132 void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length)
133 {
134 decoder->bitstream_start = bitstream;
135 decoder->bitstream_curr = bitstream;
136 decoder->bitstream_end = bitstream+length;
137 }
138
139 void init_CABAC_decoder_2(CABAC_decoder* decoder)
140 {
141 int length = decoder->bitstream_end - decoder->bitstream_curr;
142
143 decoder->range = 510;
144 decoder->bits_needed = 8;
145
146 decoder->value = 0;
147
148 if (length>0) { decoder->value = (*decoder->bitstream_curr++) << 8; decoder->bits_needed-=8; }
149 if (length>1) { decoder->value |= (*decoder->bitstream_curr++); decoder->bits_needed-=8; }
150
151 logtrace(LogCABAC,"[%3d] init_CABAC_decode_2 r:%x v:%x\n", logcnt, decoder->range, decoder->value);
152 }
153
154
155 //#include <sys/types.h>
156 //#include <signal.h>
157
158 int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
159 {
160 //if (logcnt >= 1100000) { enablelog(); }
161
162 // if (logcnt==400068770) { raise(SIGINT); }
163
164 logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,decoder->range, decoder->value, model->state);
165
166 //assert(decoder->range>=0x100);
167
168 int decoded_bit;
169 int LPS = LPS_table[model->state][ ( decoder->range >> 6 ) - 4 ];
170 decoder->range -= LPS;
171
172 uint32_t scaled_range = decoder->range << 7;
173
174 logtrace(LogCABAC,"[%3d] sr:%x v:%x\n",logcnt,scaled_range, decoder->value);
175
176 if (decoder->value < scaled_range)
177 {
178 logtrace(LogCABAC,"[%3d] MPS\n",logcnt);
179
180 // MPS path
181
182 decoded_bit = model->MPSbit;
183 model->state = next_state_MPS[model->state];
184
185 if (scaled_range < ( 256 << 7 ) )
186 {
187 // scaled range, highest bit (15) not set
188
189 decoder->range = scaled_range >> 6; // shift range by one bit
190 decoder->value <<= 1; // shift value by one bit
191 decoder->bits_needed++;
192
193 if (decoder->bits_needed == 0)
194 {
195 decoder->bits_needed = -8;
196 if (decoder->bitstream_curr != decoder->bitstream_end)
197 { decoder->value |= *decoder->bitstream_curr++; }
198 }
199 }
200 }
201 else
202 {
203 logtrace(LogCABAC,"[%3d] LPS\n",logcnt);
204
205 // LPS path
206
207 int num_bits = renorm_table[ LPS >> 3 ];
208 decoder->value = (decoder->value - scaled_range);
209
210 decoder->value <<= num_bits;
211 decoder->range = LPS << num_bits; /* this is always >= 0x100 except for state 63,
212 but state 63 is never used */
213 decoded_bit = 1 - model->MPSbit;
214
215 if (model->state==0) { model->MPSbit = 1-model->MPSbit; }
216 model->state = next_state_LPS[model->state];
217
218 decoder->bits_needed += num_bits;
219
220 if (decoder->bits_needed >= 0)
221 {
222 logtrace(LogCABAC,"bits_needed: %d\n", decoder->bits_needed);
223 if (decoder->bitstream_curr != decoder->bitstream_end)
224 { decoder->value |= (*decoder->bitstream_curr++) << decoder->bits_needed; }
225
226 decoder->bits_needed -= 8;
227 }
228 }
229
230 logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, decoded_bit, decoder->range, decoder->value);
231 #ifdef DE265_LOG_TRACE
232 logcnt++;
233 #endif
234
235 //assert(decoder->range>=0x100);
236
237 return decoded_bit;
238 }
239
240 int decode_CABAC_term_bit(CABAC_decoder* decoder)
241 {
242 decoder->range -= 2;
243 uint32_t scaledRange = decoder->range << 7;
244
245 if (decoder->value >= scaledRange)
246 {
247 return 1;
248 }
249 else
250 {
251 // there is a while loop in the standard, but it will always be executed only once
252
253 if (scaledRange < (256<<7))
254 {
255 decoder->range = scaledRange >> 6;
256 decoder->value *= 2;
257
258 decoder->bits_needed++;
259 if (decoder->bits_needed==0)
260 {
261 decoder->bits_needed = -8;
262
263 if (decoder->bitstream_curr != decoder->bitstream_end) {
264 decoder->value += (*decoder->bitstream_curr++);
265 }
266 }
267 }
268
269 return 0;
270 }
271 }
272
273
274
275 int decode_CABAC_bypass(CABAC_decoder* decoder)
276 {
277 logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,decoder->range, decoder->value);
278
279 //assert(decoder->range>=0x100);
280
281 decoder->value <<= 1;
282 decoder->bits_needed++;
283
284 if (decoder->bits_needed >= 0)
285 {
286 //assert(decoder->bits_needed==0);
287
288 decoder->bits_needed = -8;
289 decoder->value |= *decoder->bitstream_curr++;
290 }
291
292 int bit;
293 uint32_t scaled_range = decoder->range << 7;
294 if (decoder->value >= scaled_range)
295 {
296 decoder->value -= scaled_range;
297 bit=1;
298 }
299 else
300 {
301 bit=0;
302 }
303
304 logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, bit, decoder->range, decoder->value);
305 #ifdef DE265_LOG_TRACE
306 logcnt++;
307 #endif
308
309 //assert(decoder->range>=0x100);
310
311 return bit;
312 }
313
314
315 int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax)
316 {
317 for (int i=0;i<cMax;i++)
318 {
319 int bit = decode_CABAC_bypass(decoder);
320 if (bit==0)
321 return i;
322 }
323
324 return cMax;
325 }
326
327 int decode_CABAC_TU(CABAC_decoder* decoder, int cMax, context_model* model)
328 {
329 for (int i=0;i<cMax;i++)
330 {
331 int bit = decode_CABAC_bit(decoder,model);
332 if (bit==0)
333 return i;
334 }
335
336 return cMax;
337 }
338
339
340 int decode_CABAC_FL_bypass_parallel(CABAC_decoder* decoder, int nBits)
341 {
342 logtrace(LogCABAC,"[%3d] bypass group r:%x v:%x\n",logcnt,decoder->range, decoder->value);
343
344 decoder->value <<= nBits;
345 decoder->bits_needed+=nBits;
346
347 if (decoder->bits_needed >= 0)
348 {
349 int input = *decoder->bitstream_curr++;
350 input <<= decoder->bits_needed;
351
352 decoder->bits_needed -= 8;
353 decoder->value |= input;
354 }
355
356 uint32_t scaled_range = decoder->range << 7;
357 int value = decoder->value / scaled_range;
358 if (unlikely(value>=(1<<nBits))) { value=(1<<nBits)-1; } // may happen with broken bitstreams
359 decoder->value -= value * scaled_range;
360
361 logtrace(LogCABAC,"[%3d] -> value %d r:%x v:%x\n", logcnt+nBits-1,
362 value, decoder->range, decoder->value);
363 #ifdef DE265_LOG_TRACE
364 logcnt+=nBits;
365 #endif
366
367 //assert(decoder->range>=0x100);
368
369 return value;
370 }
371
372
373 int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits)
374 {
375 int value=0;
376
377
378 if (likely(nBits<=8)) {
379 if (nBits==0) {
380 return 0;
381 }
382 // we could use decode_CABAC_bypass() for a single bit, but this seems to be slower
383 #if 0
384 else if (nBits==1) {
385 value = decode_CABAC_bypass(decoder);
386 }
387 #endif
388 else {
389 value = decode_CABAC_FL_bypass_parallel(decoder,nBits);
390 }
391 }
392 else {
393 value = decode_CABAC_FL_bypass_parallel(decoder,8);
394 nBits-=8;
395
396 while (nBits--) {
397 value <<= 1;
398 value |= decode_CABAC_bypass(decoder);
399 }
400 }
401
402 logtrace(LogCABAC," -> FL: %d\n", value);
403
404 return value;
405 }
406
407 int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax)
408 {
409 int prefix = decode_CABAC_TU_bypass(decoder, cTRMax>>cRiceParam);
410 if (prefix==4) { // TODO check: constant 4 only works for coefficient decoding
411 return cTRMax;
412 }
413
414 int suffix = decode_CABAC_FL_bypass(decoder, cRiceParam);
415
416 return (prefix << cRiceParam) | suffix;
417 }
418
419 int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k)
420 {
421 int base=0;
422 int n=k;
423
424 for (;;)
425 {
426 int bit = decode_CABAC_bypass(decoder);
427 if (bit==0)
428 break;
429 else {
430 base += 1<<n;
431 n++;
432 }
433 }
434
435 int suffix = decode_CABAC_FL_bypass(decoder, n);
436 return base + suffix;
437 }
438
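
decode_CABAC_TR_bypass() above recombines a truncated-unary prefix with a Rice (fixed-length) suffix as (prefix << cRiceParam) | suffix. Below is a self-contained sketch of that binarization on plain bin arrays, ignoring the prefix truncation at cMax and the arithmetic-coding bypass layer; it is illustrative code, not libde265's.

#include <stdio.h>

static int encode_tr(int v, int k, int* bins)        /* returns number of bins */
{
  int n = 0;
  for (int i = 0; i < (v >> k); i++) bins[n++] = 1;  /* unary prefix of ones */
  bins[n++] = 0;                                     /* prefix terminator */
  for (int i = k - 1; i >= 0; i--)                   /* k-bit suffix, MSB first */
    bins[n++] = (v >> i) & 1;
  return n;
}

static int decode_tr(const int* bins, int k)
{
  int prefix = 0, pos = 0;
  while (bins[pos++] == 1) prefix++;                 /* count prefix ones */

  int suffix = 0;
  for (int i = 0; i < k; i++)                        /* read k suffix bits */
    suffix = (suffix << 1) | bins[pos++];

  return (prefix << k) | suffix;                     /* same formula as above */
}

int main(void)
{
  int bins[64];
  for (int v = 0; v < 12; v++) {
    encode_tr(v, 2, bins);                           /* Rice parameter k = 2 */
    printf("%d -> %d\n", v, decode_tr(bins, 2));     /* round-trips each value */
  }
  return 0;
}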
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
020 /* de265-version.h
121 *
222 * This file was generated by autoconf when libde265 was built.
727 #define LIBDE265_VERSION_H
828
929 /* Numeric representation of the version */
10 #define LIBDE265_NUMERIC_VERSION 0x00060000
30 #define LIBDE265_NUMERIC_VERSION 0x00080000
1131
1232 /* Version string */
13 #define LIBDE265_VERSION "0.6"
33 #define LIBDE265_VERSION "0.8"
1434
1535 #endif
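
The bump above raises the numeric version to 0x00080000 and the version string to "0.8". As a hedged sketch (not part of the diff), an application could check both the headers it compiles against and the library it actually loads, using de265_get_version()/de265_get_version_number() from de265.c further below; the include paths assume an installed libde265.

#include <libde265/de265.h>
#include <libde265/de265-version.h>
#include <stdio.h>

int main(void)
{
  /* compile-time check against the headers used for the build */
#if LIBDE265_NUMERIC_VERSION < 0x00080000
#error "libde265 headers older than 0.8"
#endif

  /* run-time check against the shared library that was loaded */
  printf("libde265 %s (0x%08x)\n",
         de265_get_version(), (unsigned int)de265_get_version_number());
  return 0;
}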
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
020 /* de265-version.h
121 *
222 * This file was generated by autoconf when libde265 was built.
+0
-1124
libde265/de265.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #define DEBUG_INSERT_STREAM_ERRORS 0
21
22
23 #include "de265.h"
24 #include "decctx.h"
25 #include "slice_func.h"
26 #include "pps_func.h"
27 #include "sps_func.h"
28 #include "util.h"
29 #include "scan.h"
30 #include "image.h"
31 #include "sei.h"
32
33 #include <assert.h>
34 #include <string.h>
35 #include <stdlib.h>
36
37
38 de265_error de265_decode_NAL(de265_decoder_context* de265ctx, NAL_unit* nal);
39
40 // TODO: should be in some vps.c related header
41 de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps);
42
43
44 LIBDE265_API const char *de265_get_version(void)
45 {
46 return (LIBDE265_VERSION);
47 }
48
49 LIBDE265_API uint32_t de265_get_version_number(void)
50 {
51 return (LIBDE265_NUMERIC_VERSION);
52 }
53
54 LIBDE265_API const char* de265_get_error_text(de265_error err)
55 {
56 switch (err) {
57 case DE265_OK: return "no error";
58 case DE265_ERROR_NO_SUCH_FILE: return "no such file";
59 //case DE265_ERROR_NO_STARTCODE: return "no startcode found";
60 case DE265_ERROR_EOF: return "end of file";
61 case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds";
62 case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch";
63 case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area";
64 case DE265_ERROR_OUT_OF_MEMORY: return "out of memory";
65 case DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE: return "coded parameter out of range";
66 case DE265_ERROR_IMAGE_BUFFER_FULL: return "DPB/output queue full";
67 case DE265_ERROR_CANNOT_START_THREADPOOL: return "cannot start decoding threads";
68 case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed";
69 case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized)";
70
71 case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED:
72 return "internal error: maximum number of thread contexts exceeded";
73 case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED:
74 return "internal error: maximum number of slices exceeded";
75 //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED:
76 //return "scaling list not implemented";
77 case DE265_ERROR_WAITING_FOR_INPUT_DATA:
78 return "no more input data, decoder stalled";
79 case DE265_ERROR_CANNOT_PROCESS_SEI:
80 return "SEI data cannot be processed";
81
82 case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING:
83 return "Cannot run decoder multi-threaded because stream does not support WPP";
84 case DE265_WARNING_WARNING_BUFFER_FULL:
85 return "Too many warnings queued";
86 case DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT:
87 return "Premature end of slice segment";
88 case DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET:
89 return "Incorrect entry-point offset";
90 case DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA:
91 return "CTB outside of image area (concealing stream error...)";
92 case DE265_WARNING_SPS_HEADER_INVALID:
93 return "sps header invalid";
94 case DE265_WARNING_PPS_HEADER_INVALID:
95 return "pps header invalid";
96 case DE265_WARNING_SLICEHEADER_INVALID:
97 return "slice header invalid";
98 case DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING:
99 return "impossible motion vector scaling";
100 case DE265_WARNING_NONEXISTING_PPS_REFERENCED:
101 return "non-existing PPS referenced";
102 case DE265_WARNING_NONEXISTING_SPS_REFERENCED:
103 return "non-existing SPS referenced";
104 case DE265_WARNING_BOTH_PREDFLAGS_ZERO:
105 return "both predFlags[] are zero in MC";
106 case DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED:
107 return "non-existing reference picture accessed";
108 case DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ:
109 return "numMV_P != numMV_Q in deblocking";
110 case DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE:
111 return "number of short-term ref-pic-sets out of range";
112 case DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE:
113 return "short-term ref-pic-set index out of range";
114 case DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST:
115 return "faulty reference picture list";
116 case DE265_WARNING_EOSS_BIT_NOT_SET:
117 return "end_of_sub_stream_one_bit not set to 1 when it should be";
118 case DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED:
119 return "maximum number of reference pictures exceeded";
120 case DE265_WARNING_INVALID_CHROMA_FORMAT:
121 return "invalid chroma format in SPS header";
122 case DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID:
123 return "slice segment address invalid";
124 case DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO:
125 return "dependent slice with address 0";
126 case DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM:
127 return "number of threads limited to maximum amount";
128 case DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER:
129 return "non-existing long-term reference candidate specified in slice header";
130
131 default: return "unknown error";
132 }
133 }
134
135 LIBDE265_API int de265_isOK(de265_error err)
136 {
137 return err == DE265_OK || err >= 1000;
138 }
139
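
de265_isOK() above accepts DE265_OK and every code of 1000 or higher, i.e. the warning codes are treated as non-fatal. A minimal, hypothetical helper (not from the diff) showing how a caller could route hard errors and warnings through de265_get_error_text():

#include <libde265/de265.h>
#include <stdio.h>

static void report(de265_error err)
{
  if (!de265_isOK(err)) {
    fprintf(stderr, "error: %s\n", de265_get_error_text(err));    /* fatal */
  } else if (err != DE265_OK) {
    fprintf(stderr, "warning: %s\n", de265_get_error_text(err));  /* non-fatal */
  }
}

int main(void)
{
  report(DE265_OK);                            /* prints nothing */
  report(DE265_ERROR_WAITING_FOR_INPUT_DATA);  /* reported as an error */
  return 0;
}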
140
141
142 ALIGNED_8(static de265_sync_int de265_init_count) = 0;
143
144 LIBDE265_API de265_error de265_init()
145 {
146 int cnt = de265_sync_add_and_fetch(&de265_init_count,1);
147 if (cnt>1) {
148 // we are not the first -> already initialized
149
150 return DE265_OK;
151 }
152
153
154 // do initializations
155
156 init_scan_orders();
157
158 if (!alloc_and_init_significant_coeff_ctxIdx_lookupTable()) {
159 de265_sync_sub_and_fetch(&de265_init_count,1);
160 return DE265_ERROR_LIBRARY_INITIALIZATION_FAILED;
161 }
162
163 return DE265_OK;
164 }
165
166 LIBDE265_API de265_error de265_free()
167 {
168 int cnt = de265_sync_sub_and_fetch(&de265_init_count,1);
169 if (cnt<0) {
170 de265_sync_add_and_fetch(&de265_init_count,1);
171 return DE265_ERROR_LIBRARY_NOT_INITIALIZED;
172 }
173
174 if (cnt==0) {
175 free_significant_coeff_ctxIdx_lookupTable();
176 }
177
178 return DE265_OK;
179 }
180
181
182 LIBDE265_API de265_decoder_context* de265_new_decoder()
183 {
184 de265_error init_err = de265_init();
185 if (init_err != DE265_OK) {
186 return NULL;
187 }
188
189 decoder_context* ctx = (decoder_context*)calloc(sizeof(decoder_context),1);
190 if (!ctx) {
191 de265_free();
192 return NULL;
193 }
194
195 init_decoder_context(ctx);
196
197 return (de265_decoder_context*)ctx;
198 }
199
200
201 LIBDE265_API de265_error de265_free_decoder(de265_decoder_context* de265ctx)
202 {
203 decoder_context* ctx = (decoder_context*)de265ctx;
204
205 if (ctx->num_worker_threads>0) {
206 //flush_thread_pool(&ctx->thread_pool);
207 stop_thread_pool(&ctx->thread_pool);
208 }
209
210 free_decoder_context(ctx);
211 free(de265ctx);
212
213 return de265_free();
214 }
215
216
217 LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context* de265ctx, int number_of_threads)
218 {
219 decoder_context* ctx = (decoder_context*)de265ctx;
220
221 if (number_of_threads > MAX_THREADS) {
222 number_of_threads = MAX_THREADS;
223 }
224
225 ctx->num_worker_threads = number_of_threads;
226
227 if (number_of_threads>0) {
228 de265_error err = start_thread_pool(&ctx->thread_pool, number_of_threads);
229 if (de265_isOK(err)) {
230 err = DE265_OK;
231 }
232 return err;
233 }
234 else {
235 return DE265_OK;
236 }
237 }
238
239
240 void nal_insert_skipped_byte(NAL_unit* nal, int pos)
241 {
242 if (nal->max_skipped_bytes == nal->num_skipped_bytes) {
243 if (nal->max_skipped_bytes == 0) {
244 nal->max_skipped_bytes = DE265_SKIPPED_BYTES_INITIAL_SIZE;
245 } else {
246 nal->max_skipped_bytes <<= 2;
247 }
248
249 // TODO: handle case where realloc fails
250 nal->skipped_bytes = (int *)realloc(nal->skipped_bytes,
251 nal->max_skipped_bytes * sizeof(int));
252 }
253
254 nal->skipped_bytes[nal->num_skipped_bytes] = pos;
255 nal->num_skipped_bytes++;
256 }
257
258
259 #ifndef LIBDE265_DISABLE_DEPRECATED
260 LIBDE265_API de265_error de265_decode_data(de265_decoder_context* de265ctx,
261 const void* data8, int len)
262 {
263 //decoder_context* ctx = (decoder_context*)de265ctx;
264 de265_error err;
265 if (len > 0) {
266 err = de265_push_data(de265ctx, data8, len, 0, NULL);
267 } else {
268 err = de265_flush_data(de265ctx);
269 }
270 if (err != DE265_OK) {
271 return err;
272 }
273
274 int more = 0;
275 do {
276 err = de265_decode(de265ctx, &more);
277 if (err != DE265_OK) {
278 more = 0;
279 }
280
281 switch (err) {
282 case DE265_ERROR_WAITING_FOR_INPUT_DATA:
283 // ignore error (didn't exist in 0.4 and before)
284 err = DE265_OK;
285 break;
286 default:
287 break;
288 }
289 } while (more);
290 return err;
291 }
292 #endif
293
294 LIBDE265_API de265_error de265_push_data(de265_decoder_context* de265ctx,
295 const void* data8, int len,
296 de265_PTS pts, void* user_data)
297 {
298 decoder_context* ctx = (decoder_context*)de265ctx;
299 uint8_t* data = (uint8_t*)data8;
300
301 if (ctx->pending_input_NAL == NULL) {
302 ctx->pending_input_NAL = alloc_NAL_unit(ctx, len+3, DE265_SKIPPED_BYTES_INITIAL_SIZE);
303 ctx->pending_input_NAL->pts = pts;
304 ctx->pending_input_NAL->user_data = user_data;
305 }
306
307 NAL_unit* nal = ctx->pending_input_NAL; // shortcut
308
309 // Resize output buffer so that complete input would fit.
310 // We add 3, because in the worst case 3 extra bytes are created for an input byte.
311 rbsp_buffer_resize(&nal->nal_data, nal->nal_data.size + len + 3);
312
313 unsigned char* out = nal->nal_data.data + nal->nal_data.size;
314
315 for (int i=0;i<len;i++) {
316 /*
317 printf("state=%d input=%02x (%p) (output size: %d)\n",ctx->input_push_state, *data, data,
318 out - ctx->nal_data.data);
319 */
320
321 switch (ctx->input_push_state) {
322 case 0:
323 case 1:
324 if (*data == 0) { ctx->input_push_state++; }
325 else { ctx->input_push_state=0; }
326 break;
327 case 2:
328 if (*data == 1) { ctx->input_push_state=3; nal->num_skipped_bytes=0; }
329 else if (*data == 0) { } // *out++ = 0; }
330 else { ctx->input_push_state=0; }
331 break;
332 case 3:
333 *out++ = *data;
334 ctx->input_push_state = 4;
335 break;
336 case 4:
337 *out++ = *data;
338 ctx->input_push_state = 5;
339 break;
340
341 case 5:
342 if (*data==0) { ctx->input_push_state=6; }
343 else { *out++ = *data; }
344 break;
345
346 case 6:
347 if (*data==0) { ctx->input_push_state=7; }
348 else {
349 *out++ = 0;
350 *out++ = *data;
351 ctx->input_push_state=5;
352 }
353 break;
354
355 case 7:
356 if (*data==0) { *out++ = 0; }
357 else if (*data==3) {
358 *out++ = 0; *out++ = 0; ctx->input_push_state=5;
359
360 // remember which byte we removed
361 nal_insert_skipped_byte(nal, (out - nal->nal_data.data) + nal->num_skipped_bytes);
362 }
363 else if (*data==1) {
364
365 #if DEBUG_INSERT_STREAM_ERRORS
366 if ((rand()%100)<90 && ctx->nal_data.size>0) {
367 int pos = rand()%ctx->nal_data.size;
368 int bit = rand()%8;
369 nal->nal_data.data[pos] ^= 1<<bit;
370
371 //printf("inserted error...\n");
372 }
373 #endif
374
375 nal->nal_data.size = out - nal->nal_data.data;
376
377 // push this NAL decoder queue
378 push_to_NAL_queue(ctx, nal);
379
380
381 // initialize new, empty NAL unit
382
383 ctx->pending_input_NAL = alloc_NAL_unit(ctx, len+3, DE265_SKIPPED_BYTES_INITIAL_SIZE);
384 ctx->pending_input_NAL->pts = pts;
385 nal = ctx->pending_input_NAL;
386 out = nal->nal_data.data;
387
388 ctx->input_push_state=3;
389 nal->num_skipped_bytes=0;
390 }
391 else {
392 *out++ = 0;
393 *out++ = 0;
394 *out++ = *data;
395
396 ctx->input_push_state=5;
397 }
398 break;
399 }
400
401 data++;
402 }
403
404 nal->nal_data.size = out - nal->nal_data.data;
405 return DE265_OK;
406 }
407
408
409 void remove_stuffing_bytes(NAL_unit* nal)
410 {
411 uint8_t* p = nal->nal_data.data;
412
413 for (int i=0;i<nal->nal_data.size-2;i++)
414 {
415 #if 0
416 for (int k=i;k<i+64;k++)
417 if (i*0+k<nal->nal_data.size) {
418 printf("%c%02x", (k==i) ? '[':' ', nal->nal_data.data[k]);
419 }
420 printf("\n");
421 #endif
422
423 if (p[2]!=3 && p[2]!=0) {
424 // fast forward 3 bytes (2+1)
425 p+=2;
426 i+=2;
427 }
428 else {
429 if (p[0]==0 && p[1]==0 && p[2]==3) {
430 //printf("SKIP NAL @ %d\n",i+2+nal->num_skipped_bytes);
431 nal_insert_skipped_byte(nal, i+2 + nal->num_skipped_bytes);
432
433 memmove(p+2, p+3, nal->nal_data.size-i-3);
434 nal->nal_data.size--;
435
436 p++;
437 i++;
438 }
439 }
440
441 p++;
442 }
443 }
444
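
Both the byte-stream state machine in de265_push_data() above and remove_stuffing_bytes() strip the 00 00 03 emulation-prevention pattern from NAL data before the payload is parsed. The following standalone sketch shows only that unescaping step; it is illustrative and differs from the libde265 code, which works in place and records the skipped byte positions for later entry-point correction.

#include <stdio.h>
#include <stdint.h>

/* Copy 'in' to 'out', dropping every 0x03 that directly follows 00 00. */
static int unescape_rbsp(const uint8_t* in, int len, uint8_t* out)
{
  int n = 0;
  for (int i = 0; i < len; i++) {
    if (i >= 2 && in[i] == 3 && in[i-1] == 0 && in[i-2] == 0)
      continue;                              /* emulation-prevention byte */
    out[n++] = in[i];
  }
  return n;                                  /* number of unescaped bytes */
}

int main(void)
{
  const uint8_t nal[] = { 0x42, 0x01, 0x00, 0x00, 0x03, 0x02, 0x7f };
  uint8_t rbsp[sizeof(nal)];
  int n = unescape_rbsp(nal, (int)sizeof(nal), rbsp);
  for (int i = 0; i < n; i++) printf("%02x ", rbsp[i]);   /* 42 01 00 00 02 7f */
  printf("\n");
  return 0;
}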
445
446 LIBDE265_API de265_error de265_push_NAL(de265_decoder_context* de265ctx,
447 const void* data8, int len,
448 de265_PTS pts, void* user_data)
449 {
450 decoder_context* ctx = (decoder_context*)de265ctx;
451 uint8_t* data = (uint8_t*)data8;
452
453 // Cannot use byte-stream input and NAL input at the same time.
454 assert(ctx->pending_input_NAL == NULL);
455
456 NAL_unit* nal = alloc_NAL_unit(ctx, len, DE265_SKIPPED_BYTES_INITIAL_SIZE);
457 rbsp_buffer_resize(&nal->nal_data, len);
458 nal->nal_data.size = len;
459 nal->pts = pts;
460 nal->user_data = user_data;
461 memcpy(nal->nal_data.data, data, len);
462
463 remove_stuffing_bytes(nal);
464
465 push_to_NAL_queue(ctx, nal);
466
467 return DE265_OK;
468 }
469
470
471 LIBDE265_API de265_error de265_decode(de265_decoder_context* de265ctx, int* more)
472 {
473 decoder_context* ctx = (decoder_context*)de265ctx;
474
475 // if the stream has ended, and no more NALs are to be decoded, flush all pictures
476
477 if (ctx->NAL_queue_len == 0 && ctx->end_of_stream) {
478 if (more) { *more=0; } // 0 if no more pictures in queue
479
480 push_current_picture_to_output_queue(ctx);
481
482 while (ctx->reorder_output_queue_length>0) {
483 flush_next_picture_from_reorder_buffer(ctx);
484 if (more) { *more=1; }
485 }
486
487 return DE265_OK;
488 }
489
490
491 // if NAL-queue is empty, we need more data
492 // -> input stalled
493
494 if (ctx->NAL_queue_len == 0) {
495 if (more) { *more=1; }
496
497 return DE265_ERROR_WAITING_FOR_INPUT_DATA;
498 }
499
500
501 // when there are no free image buffers in the DPB, pause decoding
502 // -> output stalled
503
504 if (!has_free_dpb_picture(ctx, false)) {
505 if (more) *more = 1;
506 return DE265_ERROR_IMAGE_BUFFER_FULL;
507 }
508
509
510 // decode one NAL from the queue
511
512 NAL_unit* nal = pop_from_NAL_queue(ctx);
513 assert(nal);
514 de265_error err = de265_decode_NAL(de265ctx, nal);
515 free_NAL_unit(ctx,nal);
516
517 if (more) {
518 // decoding error is assumed to be unrecoverable
519 *more = (err==DE265_OK);
520 }
521
522 return err;
523 }
524
525
526 LIBDE265_API de265_error de265_flush_data(de265_decoder_context* de265ctx)
527 {
528 decoder_context* ctx = (decoder_context*)de265ctx;
529
530 if (ctx->pending_input_NAL) {
531 NAL_unit* nal = ctx->pending_input_NAL;
532 uint8_t null[2] = { 0,0 };
533
534 // append bytes that are implied by the push state
535
536 if (ctx->input_push_state==6) { rbsp_buffer_append(&nal->nal_data,null,1); }
537 if (ctx->input_push_state==7) { rbsp_buffer_append(&nal->nal_data,null,2); }
538
539
540 // only push the NAL if it contains at least the NAL header
541
542 if (ctx->input_push_state>=5) {
543 push_to_NAL_queue(ctx, nal);
544 ctx->pending_input_NAL = NULL;
545 }
546
547 ctx->input_push_state = 0;
548 }
549
550 ctx->end_of_stream = true;
551
552 return DE265_OK;
553 }
554
555
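
The deprecated de265_decode_data() wrapper above already spells out the intended calling pattern: push raw bytes, then call de265_decode() until it reports that it is waiting for input. A hedged end-to-end sketch of that loop against the public API follows; the file name is a placeholder, and picture retrieval plus detailed error handling are omitted.

#include <libde265/de265.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
  de265_decoder_context* ctx = de265_new_decoder();
  FILE* fh = fopen("stream.bin", "rb");              /* placeholder input file */
  uint8_t buf[4096];
  size_t n;

  while ((n = fread(buf, 1, sizeof(buf), fh)) > 0) {
    de265_push_data(ctx, buf, (int)n, 0, NULL);      /* feed byte-stream data */

    int more = 1;
    while (more) {                                   /* decode what is available */
      de265_error err = de265_decode(ctx, &more);
      if (err == DE265_ERROR_WAITING_FOR_INPUT_DATA) break;
      if (!de265_isOK(err)) more = 0;
    }
  }

  de265_flush_data(ctx);                             /* mark end of stream */

  int more = 1;
  while (more) {                                     /* drain buffered pictures */
    if (de265_decode(ctx, &more) != DE265_OK) break;
  }

  fclose(fh);
  de265_free_decoder(ctx);
  return 0;
}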
556 void init_thread_context(thread_context* tctx)
557 {
558 // zero scrap memory for coefficient blocks
559 memset(tctx->_coeffBuf, 0, sizeof(tctx->_coeffBuf));
560
561 tctx->currentQG_x = -1;
562 tctx->currentQG_y = -1;
563
564 tctx->inUse = true;
565 }
566
567
568 extern void thread_decode_CTB_row(void* d);
569 extern void thread_decode_slice_segment(void* d);
570
571
572 void add_task_decode_CTB_row(decoder_context* ctx, int thread_id, bool initCABAC)
573 {
574 thread_task task;
575 task.task_id = 0; // no ID
576 task.task_cmd = THREAD_TASK_DECODE_CTB_ROW;
577 task.work_routine = thread_decode_CTB_row;
578 task.data.task_ctb_row.ctx = ctx;
579 task.data.task_ctb_row.initCABAC = initCABAC;
580 task.data.task_ctb_row.thread_context_id = thread_id;
581 add_task(&ctx->thread_pool, &task);
582 }
583
584
585 void add_task_decode_slice_segment(decoder_context* ctx, int thread_id)
586 {
587 thread_task task;
588 task.task_id = 0; // no ID
589 task.task_cmd = THREAD_TASK_DECODE_SLICE_SEGMENT;
590 task.work_routine = thread_decode_slice_segment;
591 task.data.task_ctb_row.ctx = ctx;
592 task.data.task_ctb_row.thread_context_id = thread_id;
593 add_task(&ctx->thread_pool, &task);
594 }
595
596
597 de265_error de265_decode_NAL(de265_decoder_context* de265ctx, NAL_unit* nal)
598 {
599 decoder_context* ctx = (decoder_context*)de265ctx;
600 rbsp_buffer* data = &nal->nal_data;
601
602 de265_error err = DE265_OK;
603
604 bitreader reader;
605 bitreader_init(&reader, data);
606
607 nal_header nal_hdr;
608 nal_read_header(&reader, &nal_hdr);
609 process_nal_hdr(ctx, &nal_hdr);
610
611 loginfo(LogHighlevel,"NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n",
612 data->data[0], data->data[1],
613 get_NAL_name(nal_hdr.nal_unit_type),
614 nal_hdr.nuh_temporal_id);
615
616 if (nal_hdr.nal_unit_type<32) {
617 logdebug(LogHeaders,"---> read slice segment header\n");
618
619 //printf("-------- slice header --------\n");
620
621 int sliceIndex = get_next_slice_index(ctx);
622 if (sliceIndex<0) {
623 add_warning(ctx,DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED, true);
624 return DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED;
625 }
626
627 slice_segment_header* hdr = &ctx->slice[sliceIndex];
628 bool continueDecoding;
629 err = read_slice_segment_header(&reader,hdr,ctx, &continueDecoding);
630 if (!continueDecoding) {
631 return err;
632 }
633 else {
634 hdr->slice_index = sliceIndex;
635
636 if (ctx->param_slice_headers_fd>=0) {
637 dump_slice_segment_header(hdr, ctx, ctx->param_slice_headers_fd);
638 }
639
640 if (process_slice_segment_header(ctx, hdr, &err, nal->pts, nal->user_data) == false)
641 {
642 ctx->img->integrity = INTEGRITY_NOT_DECODED;
643 return err;
644 }
645
646 skip_bits(&reader,1); // TODO: why?
647 prepare_for_CABAC(&reader);
648
649
650 // modify entry_point_offsets
651
652 int headerLength = reader.data - data->data;
653 for (int i=0;i<nal->num_skipped_bytes;i++)
654 {
655 nal->skipped_bytes[i] -= headerLength;
656 }
657
658 for (int i=0;i<hdr->num_entry_point_offsets;i++) {
659 for (int k=nal->num_skipped_bytes-1;k>=0;k--)
660 if (nal->skipped_bytes[k] <= hdr->entry_point_offset[i]) {
661 hdr->entry_point_offset[i] -= k+1;
662 break;
663 }
664 }
665
666
667 const pic_parameter_set* pps = ctx->current_pps;
668 int ctbsWidth = ctx->current_sps->PicWidthInCtbsY;
669
670 int nRows = hdr->num_entry_point_offsets +1;
671
672 bool use_WPP = (ctx->num_worker_threads > 0 &&
673 ctx->current_pps->entropy_coding_sync_enabled_flag);
674
675 bool use_tiles = (ctx->num_worker_threads > 0 &&
676 ctx->current_pps->tiles_enabled_flag);
677
678 if (use_WPP && use_tiles) {
679 //add_warning(ctx, DE265_WARNING_STREAMS_APPLIES_TILES_AND_WPP, true);
680 }
681
682 if (ctx->num_worker_threads > 0 &&
683 ctx->current_pps->entropy_coding_sync_enabled_flag == false &&
684 ctx->current_pps->tiles_enabled_flag == false) {
685
686 // TODO: new error should be: no WPP and no Tiles ...
687 add_warning(ctx, DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING, true);
688 }
689
690 if (!use_WPP && !use_tiles) {
691 // --- single threaded decoding ---
692
693 #if 0
694 int thread_context_idx = get_next_thread_context_index(ctx);
695 if (thread_context_idx<0) {
696 assert(false); // TODO
697 }
698 #else
699 int thread_context_idx=0;
700 #endif
701
702 thread_context* tctx = &ctx->thread_context[thread_context_idx];
703
704 init_thread_context(tctx);
705
706 init_CABAC_decoder(&tctx->cabac_decoder,
707 reader.data,
708 reader.bytes_remaining);
709
710 tctx->shdr = hdr;
711 tctx->decctx = ctx;
712 tctx->CtbAddrInTS = pps->CtbAddrRStoTS[hdr->slice_segment_address];
713
714 // fixed context 0
715 if ((err=read_slice_segment_data(ctx, tctx)) != DE265_OK)
716 { return err; }
717 }
718 else if (use_tiles && !use_WPP) {
719 int nTiles = nRows; // TODO: rename 'nRows'
720
721 if (nTiles > MAX_THREAD_CONTEXTS) {
722 return DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED;
723 }
724
725 assert(nTiles == pps->num_tile_columns * pps->num_tile_rows); // TODO: handle other cases
726
727 assert(ctx->img->tasks_pending == 0);
728 increase_pending_tasks(ctx->img, nTiles);
729
730 for (int ty=0;ty<pps->num_tile_rows;ty++)
731 for (int tx=0;tx<pps->num_tile_columns;tx++) {
732 int tile = tx + ty*pps->num_tile_columns;
733
734 // set thread context
735
736 ctx->thread_context[tile].shdr = hdr;
737 ctx->thread_context[tile].decctx = ctx;
738
739 ctx->thread_context[tile].CtbAddrInTS = pps->CtbAddrRStoTS[pps->colBd[tx] + pps->rowBd[ty]*ctbsWidth];
740
741
742 // init CABAC
743
744 int dataStartIndex;
745 if (tile==0) { dataStartIndex=0; }
746 else { dataStartIndex=hdr->entry_point_offset[tile-1]; }
747
748 int dataEnd;
749 if (tile==nRows-1) dataEnd = reader.bytes_remaining;
750 else dataEnd = hdr->entry_point_offset[tile];
751
752 init_thread_context(&ctx->thread_context[tile]);
753
754 init_CABAC_decoder(&ctx->thread_context[tile].cabac_decoder,
755 &reader.data[dataStartIndex],
756 dataEnd-dataStartIndex);
757 }
758
759 // add tasks
760
761 for (int i=0;i<nTiles;i++) {
762 add_task_decode_slice_segment(ctx, i);
763 }
764
765 wait_for_completion(ctx->img);
766 }
767 else {
768 if (nRows > MAX_THREAD_CONTEXTS) {
769 return DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED;
770 }
771
772 assert(ctx->img->tasks_pending == 0);
773 increase_pending_tasks(ctx->img, nRows);
774
775 //printf("-------- decode --------\n");
776
777
778 for (int y=0;y<nRows;y++) {
779
780 // set thread context
781
782 for (int x=0;x<ctbsWidth;x++) {
783 ctx->img->ctb_info[x+y*ctbsWidth].thread_context_id = y; // TODO: shouldn't be hardcoded
784 }
785
786 ctx->thread_context[y].shdr = hdr;
787 ctx->thread_context[y].decctx = ctx;
788 ctx->thread_context[y].CtbAddrInTS = pps->CtbAddrRStoTS[0 + y*ctbsWidth];
789
790
791 // init CABAC
792
793 int dataStartIndex;
794 if (y==0) { dataStartIndex=0; }
795 else { dataStartIndex=hdr->entry_point_offset[y-1]; }
796
797 int dataEnd;
798 if (y==nRows-1) dataEnd = reader.bytes_remaining;
799 else dataEnd = hdr->entry_point_offset[y];
800
801 init_thread_context(&ctx->thread_context[y]);
802
803 init_CABAC_decoder(&ctx->thread_context[y].cabac_decoder,
804 &reader.data[dataStartIndex],
805 dataEnd-dataStartIndex);
806 }
807
808 // add tasks
809
810 for (int y=0;y<nRows;y++) {
811 add_task_decode_CTB_row(ctx, y, y==0);
812 }
813
814 wait_for_completion(ctx->img);
815 }
816 }
817 }
818 else switch (nal_hdr.nal_unit_type) {
819 case NAL_UNIT_VPS_NUT:
820 {
821 logdebug(LogHeaders,"---> read VPS\n");
822
823 video_parameter_set vps;
824 err=read_vps(ctx,&reader,&vps);
825 if (err != DE265_OK) {
826 break;
827 }
828
829 if (ctx->param_vps_headers_fd>=0) {
830 dump_vps(&vps, ctx->param_vps_headers_fd);
831 }
832
833 process_vps(ctx, &vps);
834 }
835 break;
836
837 case NAL_UNIT_SPS_NUT:
838 {
839 logdebug(LogHeaders,"----> read SPS\n");
840
841 seq_parameter_set sps;
842 init_sps(&sps);
843
844 if ((err=read_sps(ctx, &reader,&sps)) != DE265_OK) {
845 break;
846 }
847
848 if (ctx->param_sps_headers_fd>=0) {
849 dump_sps(&sps, ctx->param_sps_headers_fd);
850 }
851
852 process_sps(ctx, &sps);
853 }
854 break;
855
856 case NAL_UNIT_PPS_NUT:
857 {
858 logdebug(LogHeaders,"----> read PPS\n");
859
860 pic_parameter_set pps;
861
862 init_pps(&pps);
863 bool success = read_pps(&reader,&pps,ctx);
864
865 if (ctx->param_pps_headers_fd>=0) {
866 dump_pps(&pps, ctx->param_pps_headers_fd);
867 }
868
869 if (success) {
870 process_pps(ctx,&pps);
871 }
872 }
873 break;
874
875 case NAL_UNIT_PREFIX_SEI_NUT:
876 case NAL_UNIT_SUFFIX_SEI_NUT:
877 logdebug(LogHeaders,"----> read SEI\n");
878
879 sei_message sei;
880
881 push_current_picture_to_output_queue(ctx);
882
883 if (read_sei(&reader,&sei, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT, ctx)) {
884 dump_sei(&sei, ctx);
885
886 err = process_sei(&sei, ctx);
887 }
888 break;
889
890 case NAL_UNIT_EOS_NUT:
891 ctx->FirstAfterEndOfSequenceNAL = true;
892 break;
893 }
894
895 return err;
896 }
897
898
899 LIBDE265_API void de265_reset(de265_decoder_context* de265ctx)
900 {
901 decoder_context* ctx = (decoder_context*)de265ctx;
902
903 int num_worker_threads = ctx->num_worker_threads;
904 if (num_worker_threads>0) {
905 //flush_thread_pool(&ctx->thread_pool);
906 stop_thread_pool(&ctx->thread_pool);
907 }
908
909 // TODO: maybe we can do things better here
910
911 free_decoder_context(ctx);
912 init_decoder_context(ctx);
913 if (num_worker_threads>0) {
914 // TODO: need error checking
915 de265_start_worker_threads(de265ctx, num_worker_threads);
916 }
917 }
918
919
920 LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context* de265ctx)
921 {
922 const struct de265_image* img = de265_peek_next_picture(de265ctx);
923 if (img) {
924 de265_release_next_picture(de265ctx);
925 }
926
927 return img;
928 }
929
930
931 LIBDE265_API const struct de265_image* de265_peek_next_picture(de265_decoder_context* de265ctx)
932 {
933 decoder_context* ctx = (decoder_context*)de265ctx;
934
935 return ctx->image_output_queue[0];
936 }
937
938
939 LIBDE265_API void de265_release_next_picture(de265_decoder_context* de265ctx)
940 {
941 decoder_context* ctx = (decoder_context*)de265ctx;
942
943 // no active output picture -> ignore release request
944
945 if (ctx->image_output_queue_length==0) { return; }
946
947
948 loginfo(LogDPB, "release DPB with POC=%d\n",ctx->image_output_queue[0]->PicOrderCntVal);
949
950 ctx->image_output_queue[0]->PicOutputFlag = false;
951 cleanup_image(ctx, ctx->image_output_queue[0]);
952
953 // pop output queue
954
955 for (int i=1;i<ctx->image_output_queue_length;i++)
956 {
957 ctx->image_output_queue[i-1] = ctx->image_output_queue[i];
958 }
959
960 ctx->image_output_queue_length--;
961
962 ctx->image_output_queue[ ctx->image_output_queue_length ] = NULL;
963
964
965 loginfo(LogDPB, "DPB output queue: ");
966 for (int i=0;i<ctx->image_output_queue_length;i++) {
967 loginfo(LogDPB, "*%d ", ctx->image_output_queue[i]->PicOrderCntVal);
968 }
969 loginfo(LogDPB,"*\n");
970 }
971
972
973 LIBDE265_API de265_error de265_get_warning(de265_decoder_context* de265ctx)
974 {
975 decoder_context* ctx = (decoder_context*)de265ctx;
976
977 return get_warning(ctx);
978 }
979
980 LIBDE265_API void de265_set_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param, int value)
981 {
982 decoder_context* ctx = (decoder_context*)de265ctx;
983
984 switch (param)
985 {
986 case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH:
987 ctx->param_sei_check_hash = !!value;
988 break;
989
990 default:
991 assert(false);
992 break;
993 }
994 }
995
996
997 LIBDE265_API void de265_set_parameter_int(de265_decoder_context* de265ctx, enum de265_param param, int value)
998 {
999 decoder_context* ctx = (decoder_context*)de265ctx;
1000
1001 switch (param)
1002 {
1003 case DE265_DECODER_PARAM_DUMP_SPS_HEADERS:
1004 ctx->param_sps_headers_fd = value;
1005 break;
1006
1007 case DE265_DECODER_PARAM_DUMP_VPS_HEADERS:
1008 ctx->param_vps_headers_fd = value;
1009 break;
1010
1011 case DE265_DECODER_PARAM_DUMP_PPS_HEADERS:
1012 ctx->param_pps_headers_fd = value;
1013 break;
1014
1015 case DE265_DECODER_PARAM_DUMP_SLICE_HEADERS:
1016 ctx->param_slice_headers_fd = value;
1017 break;
1018
1019 case DE265_DECODER_PARAM_ACCELERATION_CODE:
1020 set_acceleration_functions(ctx, (enum de265_acceleration)value);
1021 break;
1022
1023 default:
1024 assert(false);
1025 break;
1026 }
1027 }
1028
1029
1030
1031
1032 LIBDE265_API int de265_get_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param)
1033 {
1034 decoder_context* ctx = (decoder_context*)de265ctx;
1035
1036 switch (param)
1037 {
1038 case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH:
1039 return ctx->param_sei_check_hash;
1040 break;
1041
1042 default:
1043 assert(false);
1044 return false;
1045 }
1046 }
1047
1048
1049 LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context* de265ctx)
1050 {
1051 decoder_context* ctx = (decoder_context*)de265ctx;
1052
1053 int size = ctx->nBytes_in_NAL_queue;
1054 if (ctx->pending_input_NAL) { size += ctx->pending_input_NAL->nal_data.size; }
1055 return size;
1056 }
1057
1058
1059 LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context* de265ctx)
1060 {
1061 decoder_context* ctx = (decoder_context*)de265ctx;
1062
1063 int size = ctx->NAL_queue_len;
1064 if (ctx->pending_input_NAL) { size++; }
1065
1066 return size;
1067 }
1068
1069
1070 LIBDE265_API int de265_get_image_width(const struct de265_image* img,int channel)
1071 {
1072 switch (channel) {
1073 case 0:
1074 return img->width_confwin;
1075 case 1:
1076 case 2:
1077 return img->chroma_width_confwin;
1078 default:
1079 return 0;
1080 }
1081 }
1082
1083 LIBDE265_API int de265_get_image_height(const struct de265_image* img,int channel)
1084 {
1085 switch (channel) {
1086 case 0:
1087 return img->height_confwin;
1088 case 1:
1089 case 2:
1090 return img->chroma_height_confwin;
1091 default:
1092 return 0;
1093 }
1094 }
1095
1096 LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img)
1097 {
1098 return img->chroma_format;
1099 }
1100
1101 LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int channel, int* stride)
1102 {
1103 uint8_t* data;
1104
1105 switch (channel) {
1106 case 0: data = img->y_confwin; if (stride) *stride = img->stride; break;
1107 case 1: data = img->cb_confwin; if (stride) *stride = img->chroma_stride; break;
1108 case 2: data = img->cr_confwin; if (stride) *stride = img->chroma_stride; break;
1109 default: data = NULL; if (stride) *stride = 0; break;
1110 }
1111
1112 return data;
1113 }
1114
1115 LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image* img)
1116 {
1117 return img->pts;
1118 }
1119
1120 LIBDE265_API void* de265_get_image_user_data(const struct de265_image* img)
1121 {
1122 return img->user_data;
1123 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #define DEBUG_INSERT_STREAM_ERRORS 0
21
22
23 #include "de265.h"
24 #include "decctx.h"
25 #include "util.h"
26 #include "scan.h"
27 #include "image.h"
28 #include "sei.h"
29
30 #include <assert.h>
31 #include <string.h>
32 #include <stdlib.h>
33
34
35 // TODO: should be in some vps.c related header
36 de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps);
37
38 extern "C" {
39 LIBDE265_API const char *de265_get_version(void)
40 {
41 return (LIBDE265_VERSION);
42 }
43
44 LIBDE265_API uint32_t de265_get_version_number(void)
45 {
46 return (LIBDE265_NUMERIC_VERSION);
47 }
48
49 LIBDE265_API const char* de265_get_error_text(de265_error err)
50 {
51 switch (err) {
52 case DE265_OK: return "no error";
53 case DE265_ERROR_NO_SUCH_FILE: return "no such file";
54 //case DE265_ERROR_NO_STARTCODE: return "no startcode found";
55 case DE265_ERROR_EOF: return "end of file";
56 case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds";
57 case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch";
58 case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area";
59 case DE265_ERROR_OUT_OF_MEMORY: return "out of memory";
60 case DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE: return "coded parameter out of range";
61 case DE265_ERROR_IMAGE_BUFFER_FULL: return "DPB/output queue full";
62 case DE265_ERROR_CANNOT_START_THREADPOOL: return "cannot start decoding threads";
63 case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed";
64 case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized)";
65
66 case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED:
67 return "internal error: maximum number of thread contexts exceeded";
68 case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED:
69 return "internal error: maximum number of slices exceeded";
70 //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED:
71 //return "scaling list not implemented";
72 case DE265_ERROR_WAITING_FOR_INPUT_DATA:
73 return "no more input data, decoder stalled";
74 case DE265_ERROR_CANNOT_PROCESS_SEI:
75 return "SEI data cannot be processed";
76
77 case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING:
78 return "Cannot run decoder multi-threaded because stream does not support WPP";
79 case DE265_WARNING_WARNING_BUFFER_FULL:
80 return "Too many warnings queued";
81 case DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT:
82 return "Premature end of slice segment";
83 case DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET:
84 return "Incorrect entry-point offsets";
85 case DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA:
86 return "CTB outside of image area (concealing stream error...)";
87 case DE265_WARNING_SPS_HEADER_INVALID:
88 return "sps header invalid";
89 case DE265_WARNING_PPS_HEADER_INVALID:
90 return "pps header invalid";
91 case DE265_WARNING_SLICEHEADER_INVALID:
92 return "slice header invalid";
93 case DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING:
94 return "impossible motion vector scaling";
95 case DE265_WARNING_NONEXISTING_PPS_REFERENCED:
96 return "non-existing PPS referenced";
97 case DE265_WARNING_NONEXISTING_SPS_REFERENCED:
98 return "non-existing SPS referenced";
99 case DE265_WARNING_BOTH_PREDFLAGS_ZERO:
100 return "both predFlags[] are zero in MC";
101 case DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED:
102 return "non-existing reference picture accessed";
103 case DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ:
104 return "numMV_P != numMV_Q in deblocking";
105 case DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE:
106 return "number of short-term ref-pic-sets out of range";
107 case DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE:
108 return "short-term ref-pic-set index out of range";
109 case DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST:
110 return "faulty reference picture list";
111 case DE265_WARNING_EOSS_BIT_NOT_SET:
112 return "end_of_sub_stream_one_bit not set to 1 when it should be";
113 case DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED:
114 return "maximum number of reference pictures exceeded";
115 case DE265_WARNING_INVALID_CHROMA_FORMAT:
116 return "invalid chroma format in SPS header";
117 case DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID:
118 return "slice segment address invalid";
119 case DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO:
120 return "dependent slice with address 0";
121 case DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM:
122 return "number of threads limited to maximum amount";
123 case DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER:
124 return "non-existing long-term reference candidate specified in slice header";
125 case DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY:
126 return "cannot apply SAO because we ran out of memory";
127 case DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI:
128 return "SPS header missing, cannot decode SEI";
129
130 default: return "unknown error";
131 }
132 }
133
134 LIBDE265_API int de265_isOK(de265_error err)
135 {
136 return err == DE265_OK || err >= 1000;
137 }
138
139
140
141 ALIGNED_8(static de265_sync_int de265_init_count) = 0;
142
143 LIBDE265_API de265_error de265_init()
144 {
145 int cnt = de265_sync_add_and_fetch(&de265_init_count,1);
146 if (cnt>1) {
147 // we are not the first -> already initialized
148
149 return DE265_OK;
150 }
151
152
153 // do initializations
154
155 init_scan_orders();
156
157 if (!alloc_and_init_significant_coeff_ctxIdx_lookupTable()) {
158 de265_sync_sub_and_fetch(&de265_init_count,1);
159 return DE265_ERROR_LIBRARY_INITIALIZATION_FAILED;
160 }
161
162 return DE265_OK;
163 }
164
165 LIBDE265_API de265_error de265_free()
166 {
167 int cnt = de265_sync_sub_and_fetch(&de265_init_count,1);
168 if (cnt<0) {
169 de265_sync_add_and_fetch(&de265_init_count,1);
170 return DE265_ERROR_LIBRARY_NOT_INITIALIZED;
171 }
172
173 if (cnt==0) {
174 free_significant_coeff_ctxIdx_lookupTable();
175 }
176
177 return DE265_OK;
178 }
179
180
181 LIBDE265_API de265_decoder_context* de265_new_decoder()
182 {
183 de265_error init_err = de265_init();
184 if (init_err != DE265_OK) {
185 return NULL;
186 }
187
188 decoder_context* ctx = new decoder_context;
189 if (!ctx) {
190 de265_free();
191 return NULL;
192 }
193
194 return (de265_decoder_context*)ctx;
195 }
196
197
198 LIBDE265_API de265_error de265_free_decoder(de265_decoder_context* de265ctx)
199 {
200 decoder_context* ctx = (decoder_context*)de265ctx;
201
202 ctx->stop_thread_pool();
203
204 delete ctx;
205
206 return de265_free();
207 }
208
209
210 LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context* de265ctx, int number_of_threads)
211 {
212 decoder_context* ctx = (decoder_context*)de265ctx;
213
214 if (number_of_threads > MAX_THREADS) {
215 number_of_threads = MAX_THREADS;
216 }
217
218 if (number_of_threads>0) {
219 de265_error err = ctx->start_thread_pool(number_of_threads);
220 if (de265_isOK(err)) {
221 err = DE265_OK;
222 }
223 return err;
224 }
225 else {
226 return DE265_OK;
227 }
228 }
229
230
231 #ifndef LIBDE265_DISABLE_DEPRECATED
232 LIBDE265_API de265_error de265_decode_data(de265_decoder_context* de265ctx,
233 const void* data8, int len)
234 {
235 //decoder_context* ctx = (decoder_context*)de265ctx;
236 de265_error err;
237 if (len > 0) {
238 err = de265_push_data(de265ctx, data8, len, 0, NULL);
239 } else {
240 err = de265_flush_data(de265ctx);
241 }
242 if (err != DE265_OK) {
243 return err;
244 }
245
246 int more = 0;
247 do {
248 err = de265_decode(de265ctx, &more);
249 if (err != DE265_OK) {
250 more = 0;
251 }
252
253 switch (err) {
254 case DE265_ERROR_WAITING_FOR_INPUT_DATA:
255 // ignore error (didn't exist in 0.4 and before)
256 err = DE265_OK;
257 break;
258 default:
259 break;
260 }
261 } while (more);
262 return err;
263 }
264 #endif
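
/* Annotation (illustrative sketch, not part of the upstream file): the non-deprecated way to
   drive the decoder is to push raw bytes with de265_push_data(), call de265_decode() until no
   more work is pending, and fetch output pictures with de265_get_next_picture().
   'example_decode_blob' is a hypothetical helper added only for this changelog view. */
static de265_error example_decode_blob(const uint8_t* data, int len)
{
  de265_decoder_context* dec = de265_new_decoder();
  if (dec == NULL) { return DE265_ERROR_OUT_OF_MEMORY; }

  de265_push_data(dec, data, len, 0, NULL);   // feed the whole stream at once
  de265_flush_data(dec);                      // tell the decoder that no more input follows

  de265_error err = DE265_OK;
  int more = 1;
  while (more) {
    err = de265_decode(dec, &more);
    if (err != DE265_OK) {
      if (err == DE265_ERROR_WAITING_FOR_INPUT_DATA) { err = DE265_OK; }  // input fully consumed
      break;
    }

    const struct de265_image* img = de265_get_next_picture(dec);
    if (img) {
      // hand the picture to the application here,
      // e.g. via de265_get_image_plane(img, 0, &stride)
    }
  }

  de265_free_decoder(dec);
  return err;
}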
265
266 static void dumpdata(const void* data, int len)
267 {
268 for (int i=0;i<len;i++) {
269 printf("%02x ", ((uint8_t*)data)[i]);
270 }
271 printf("\n");
272 }
273
274
275 LIBDE265_API de265_error de265_push_data(de265_decoder_context* de265ctx,
276 const void* data8, int len,
277 de265_PTS pts, void* user_data)
278 {
279 decoder_context* ctx = (decoder_context*)de265ctx;
280 uint8_t* data = (uint8_t*)data8;
281
282 //printf("push data (size %d)\n",len);
283 //dumpdata(data8,16);
284
285 return ctx->nal_parser.push_data(data,len,pts,user_data);
286 }
287
288
289 LIBDE265_API de265_error de265_push_NAL(de265_decoder_context* de265ctx,
290 const void* data8, int len,
291 de265_PTS pts, void* user_data)
292 {
293 decoder_context* ctx = (decoder_context*)de265ctx;
294 uint8_t* data = (uint8_t*)data8;
295
296 //printf("push NAL (size %d)\n",len);
297 //dumpdata(data8,16);
298
299 return ctx->nal_parser.push_NAL(data,len,pts,user_data);
300 }
301
302
303 LIBDE265_API de265_error de265_decode(de265_decoder_context* de265ctx, int* more)
304 {
305 decoder_context* ctx = (decoder_context*)de265ctx;
306
307 return ctx->decode(more);
308 }
309
310
311 LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context* de265ctx)
312 {
313 decoder_context* ctx = (decoder_context*)de265ctx;
314
315 ctx->nal_parser.flush_data();
316 }
317
318
319 LIBDE265_API de265_error de265_flush_data(de265_decoder_context* de265ctx)
320 {
321 de265_push_end_of_NAL(de265ctx);
322
323 decoder_context* ctx = (decoder_context*)de265ctx;
324
325 ctx->nal_parser.flush_data();
326 ctx->nal_parser.mark_end_of_stream();
327
328 return DE265_OK;
329 }
330
331
332 LIBDE265_API void de265_reset(de265_decoder_context* de265ctx)
333 {
334 decoder_context* ctx = (decoder_context*)de265ctx;
335
336 //printf("--- reset ---\n");
337
338 ctx->reset();
339 }
340
341
342 LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context* de265ctx)
343 {
344 const struct de265_image* img = de265_peek_next_picture(de265ctx);
345 if (img) {
346 de265_release_next_picture(de265ctx);
347 }
348
349 return img;
350 }
351
352
353 LIBDE265_API const struct de265_image* de265_peek_next_picture(de265_decoder_context* de265ctx)
354 {
355 decoder_context* ctx = (decoder_context*)de265ctx;
356
357 if (ctx->num_pictures_in_output_queue()>0) {
358 de265_image* img = ctx->get_next_picture_in_output_queue();
359 return img;
360 }
361 else {
362 return NULL;
363 }
364 }
365
366
367 LIBDE265_API void de265_release_next_picture(de265_decoder_context* de265ctx)
368 {
369 decoder_context* ctx = (decoder_context*)de265ctx;
370
371 // no active output picture -> ignore release request
372
373 if (ctx->num_pictures_in_output_queue()==0) { return; }
374
375 de265_image* next_image = ctx->get_next_picture_in_output_queue();
376
377 loginfo(LogDPB, "release DPB with POC=%d\n",next_image->PicOrderCntVal);
378
379 next_image->PicOutputFlag = false;
380
381 // TODO: actually, we want to release it here, but we cannot without breaking API
382 // compatibility, because get_next_picture calls this immediately. Hence, we release
383 // images while scanning for available slots in the DPB.
384 // if (next_image->can_be_released()) { next_image->release(); }
385
386 // pop output queue
387
388 ctx->pop_next_picture_in_output_queue();
389 }
390
391
392
393 LIBDE265_API int de265_get_highest_TID(de265_decoder_context* de265ctx)
394 {
395 decoder_context* ctx = (decoder_context*)de265ctx;
396 return ctx->get_highest_TID();
397 }
398
399 LIBDE265_API int de265_get_current_TID(de265_decoder_context* de265ctx)
400 {
401 decoder_context* ctx = (decoder_context*)de265ctx;
402 return ctx->get_current_TID();
403 }
404
405 LIBDE265_API void de265_set_limit_TID(de265_decoder_context* de265ctx,int max_tid)
406 {
407 decoder_context* ctx = (decoder_context*)de265ctx;
408 ctx->set_limit_TID(max_tid);
409 }
410
411 LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context* de265ctx,int percent)
412 {
413 decoder_context* ctx = (decoder_context*)de265ctx;
414 ctx->set_framerate_ratio(percent);
415 }
416
417 LIBDE265_API int de265_change_framerate(de265_decoder_context* de265ctx,int more)
418 {
419 decoder_context* ctx = (decoder_context*)de265ctx;
420 return ctx->change_framerate(more);
421 }
422
423
424 LIBDE265_API de265_error de265_get_warning(de265_decoder_context* de265ctx)
425 {
426 decoder_context* ctx = (decoder_context*)de265ctx;
427
428 return ctx->get_warning();
429 }
430
431 LIBDE265_API void de265_set_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param, int value)
432 {
433 decoder_context* ctx = (decoder_context*)de265ctx;
434
435 switch (param)
436 {
437 case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH:
438 ctx->param_sei_check_hash = !!value;
439 break;
440
441 case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES:
442 ctx->param_suppress_faulty_pictures = !!value;
443 break;
444
445 case DE265_DECODER_PARAM_DISABLE_DEBLOCKING:
446 ctx->param_disable_deblocking = !!value;
447 break;
448
449 case DE265_DECODER_PARAM_DISABLE_SAO:
450 ctx->param_disable_sao = !!value;
451 break;
452
453 /*
454 case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT:
455 ctx->param_disable_mc_residual_idct = !!value;
456 break;
457
458 case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT:
459 ctx->param_disable_intra_residual_idct = !!value;
460 break;
461 */
462
463 default:
464 assert(false);
465 break;
466 }
467 }
468
469
470 LIBDE265_API void de265_set_parameter_int(de265_decoder_context* de265ctx, enum de265_param param, int value)
471 {
472 decoder_context* ctx = (decoder_context*)de265ctx;
473
474 switch (param)
475 {
476 case DE265_DECODER_PARAM_DUMP_SPS_HEADERS:
477 ctx->param_sps_headers_fd = value;
478 break;
479
480 case DE265_DECODER_PARAM_DUMP_VPS_HEADERS:
481 ctx->param_vps_headers_fd = value;
482 break;
483
484 case DE265_DECODER_PARAM_DUMP_PPS_HEADERS:
485 ctx->param_pps_headers_fd = value;
486 break;
487
488 case DE265_DECODER_PARAM_DUMP_SLICE_HEADERS:
489 ctx->param_slice_headers_fd = value;
490 break;
491
492 case DE265_DECODER_PARAM_ACCELERATION_CODE:
493 ctx->set_acceleration_functions((enum de265_acceleration)value);
494 break;
495
496 default:
497 assert(false);
498 break;
499 }
500 }
501
502
503
504
505 LIBDE265_API int de265_get_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param)
506 {
507 decoder_context* ctx = (decoder_context*)de265ctx;
508
509 switch (param)
510 {
511 case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH:
512 return ctx->param_sei_check_hash;
513
514 case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES:
515 return ctx->param_suppress_faulty_pictures;
516
517 case DE265_DECODER_PARAM_DISABLE_DEBLOCKING:
518 return ctx->param_disable_deblocking;
519
520 case DE265_DECODER_PARAM_DISABLE_SAO:
521 return ctx->param_disable_sao;
522
523 /*
524 case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT:
525 return ctx->param_disable_mc_residual_idct;
526
527 case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT:
528 return ctx->param_disable_intra_residual_idct;
529 */
530
531 default:
532 assert(false);
533 return false;
534 }
535 }
536
537
538 LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context* de265ctx)
539 {
540 decoder_context* ctx = (decoder_context*)de265ctx;
541
542 return ctx->nal_parser.bytes_in_input_queue();
543 }
544
545
546 LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context* de265ctx)
547 {
548 decoder_context* ctx = (decoder_context*)de265ctx;
549
550 return ctx->nal_parser.number_of_NAL_units_pending();
551 }
552
553
554 LIBDE265_API int de265_get_image_width(const struct de265_image* img,int channel)
555 {
556 switch (channel) {
557 case 0:
558 return img->width_confwin;
559 case 1:
560 case 2:
561 return img->chroma_width_confwin;
562 default:
563 return 0;
564 }
565 }
566
567 LIBDE265_API int de265_get_image_height(const struct de265_image* img,int channel)
568 {
569 switch (channel) {
570 case 0:
571 return img->height_confwin;
572 case 1:
573 case 2:
574 return img->chroma_height_confwin;
575 default:
576 return 0;
577 }
578 }
579
580 LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img)
581 {
582 return img->get_chroma_format();
583 }
584
585 LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int channel, int* stride)
586 {
587 assert(channel>=0 && channel <= 2);
588
589 uint8_t* data = img->pixels_confwin[channel];
590
591 if (stride) *stride = img->get_image_stride(channel);
592
593 return data;
594 }
595
596 LIBDE265_API void *de265_get_image_plane_user_data(const struct de265_image* img, int channel)
597 {
598 assert(channel>=0 && channel <= 2);
599
600 return img->plane_user_data[channel];
601 }
602
603 LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata)
604 {
605 img->set_image_plane(cIdx, (uint8_t*)mem, stride, userdata);
606 }
607
608 LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context* de265ctx,
609 de265_image_allocation* allocfunc,
610 void* userdata)
611 {
612 decoder_context* ctx = (decoder_context*)de265ctx;
613
614 ctx->set_image_allocation_functions(allocfunc, userdata);
615 }
616
617 LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void)
618 {
619 return &de265_image::default_image_allocation;
620 }
621
622 LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image* img)
623 {
624 return img->pts;
625 }
626
627 LIBDE265_API void* de265_get_image_user_data(const struct de265_image* img)
628 {
629 return img->user_data;
630 }
631
632 LIBDE265_API void de265_get_image_NAL_header(const struct de265_image* img,
633 int* nal_unit_type,
634 const char** nal_unit_name,
635 int* nuh_layer_id,
636 int* nuh_temporal_id)
637 {
638 if (nal_unit_type) *nal_unit_type = img->nal_hdr.nal_unit_type;
639 if (nal_unit_name) *nal_unit_name = get_NAL_name(img->nal_hdr.nal_unit_type);
640 if (nuh_layer_id) *nuh_layer_id = img->nal_hdr.nuh_layer_id;
641 if (nuh_temporal_id) *nuh_temporal_id = img->nal_hdr.nuh_temporal_id;
642 }
643 }
644
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
3030 //#define inline static __inline
3131
3232
33 #ifndef __STDC_LIMIT_MACROS
3334 #define __STDC_LIMIT_MACROS 1
35 #endif
3436 #include <stdint.h>
3537
3638 #if defined(_MSC_VER) && !defined(LIBDE265_STATIC_BUILD)
4951 #define LIBDE265_DEPRECATED __declspec(deprecated)
5052 #else
5153 #define LIBDE265_DEPRECATED
54 #endif
55
56 #if defined(_MSC_VER)
57 #define LIBDE265_INLINE __inline
58 #else
59 #define LIBDE265_INLINE inline
5260 #endif
5361
5462 /* === version numbers === */
108116 DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID=1020,
109117 DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO=1021,
110118 DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM=1022,
111 DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023
119 DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023,
120 DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY=1024,
121 DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025
112122 } de265_error;
113123
114124 LIBDE265_API const char* de265_get_error_text(de265_error err);
143153 LIBDE265_API int de265_get_image_height(const struct de265_image*,int channel);
144154 LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image*);
145155 LIBDE265_API const uint8_t* de265_get_image_plane(const struct de265_image*, int channel, int* out_stride);
156 LIBDE265_API void* de265_get_image_plane_user_data(const struct de265_image*, int channel);
146157 LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image*);
147158 LIBDE265_API void* de265_get_image_user_data(const struct de265_image*);
148159
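
/* Annotation (usage sketch, not part of the upstream header): image data is exposed per plane
   together with its stride. E.g., copying the luma plane of a decoded picture into a tightly
   packed buffer 'dst' (a hypothetical application buffer of width*height bytes):

     int stride;
     const uint8_t* y = de265_get_image_plane(img, 0, &stride);
     int w = de265_get_image_width (img, 0);
     int h = de265_get_image_height(img, 0);
     for (int row = 0; row < h; row++) {
       memcpy(dst + row*w, y + row*stride, w);
     }
*/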
160 /* Get NAL-header information of this frame. You can pass in NULL pointers if you
161 do not need this piece of information.
162 */
163 LIBDE265_API void de265_get_image_NAL_header(const struct de265_image*,
164 int* nal_unit_type,
165 const char** nal_unit_name, // textual description of 'nal_unit_type'
166 int* nuh_layer_id,
167 int* nuh_temporal_id);
168
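
/* Annotation (usage sketch, not part of the upstream header): any of the output pointers may be
   NULL, e.g. querying only the numeric NAL-unit type and the temporal id of the NAL unit this
   picture originates from:

     int nal_type, temporal_id;
     de265_get_image_NAL_header(img, &nal_type, NULL, NULL, &temporal_id);
*/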
149169
150170 /* === decoder === */
151171
152 typedef void* de265_decoder_context; // private structure
172 typedef void de265_decoder_context; // private structure
153173
154174
155175
187207 LIBDE265_API de265_error de265_push_data(de265_decoder_context*, const void* data, int length,
188208 de265_PTS pts, void* user_data);
189209
210 /* Indicate that the data passed to de265_push_data so far ends at a NAL-unit boundary.
211 The pending input data is then assembled into a NAL unit and forwarded to the decoder.
212 */
213 LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context*);
214
190215 /* Push a complete NAL unit without startcode into the decoder. The data must still
191216 contain all stuffing-bytes.
192217 This function only pushes data into the decoder, nothing will be decoded.
246271 LIBDE265_API de265_error de265_get_warning(de265_decoder_context*);
247272
248273
274 enum de265_image_format {
275 de265_image_format_mono8 = 1,
276 de265_image_format_YUV420P8 = 2,
277 de265_image_format_YUV422P8 = 3,
278 de265_image_format_YUV444P8 = 4
279 };
280
281 struct de265_image_spec
282 {
283 enum de265_image_format format;
284 int width;
285 int height;
286 int alignment;
287
288 // conformance window
289
290 int crop_left;
291 int crop_right;
292 int crop_top;
293 int crop_bottom;
294
295 int visible_width; // convenience, width - crop_left - crop_right
296 int visible_height; // convenience, height - crop_top - crop_bottom
297 };
298
299 struct de265_image_allocation
300 {
301 int (*get_buffer)(de265_decoder_context* ctx,
302 struct de265_image_spec* spec,
303 struct de265_image* img,
304 void* userdata);
305 void (*release_buffer)(de265_decoder_context* ctx,
306 struct de265_image* img,
307 void* userdata);
308 };
309
310 /* The user data pointer will be given to the get_buffer() and release_buffer() functions
311 in de265_image_allocation. */
312 LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context*,
313 struct de265_image_allocation*,
314 void* userdata);
315 LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void);
316
317 LIBDE265_API void de265_set_image_plane(struct de265_image* img, int cIdx, void* mem, int stride, void *userdata);
318
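/* Annotation (illustrative sketch, not part of the upstream header): decoding into
   application-owned buffers works by registering a de265_image_allocation. The callback names,
   the plain malloc scheme and the assumption that get_buffer() returns non-zero on success are
   hypothetical; alignment and the conformance-window fields of de265_image_spec are ignored
   here for brevity.

     static int my_get_buffer(de265_decoder_context* ctx, struct de265_image_spec* spec,
                              struct de265_image* img, void* userdata)
     {
       for (int c = 0; c < 3; c++) {
         int stride = (c == 0) ? spec->width  : spec->width  / 2;  // assumes YUV420P8
         int height = (c == 0) ? spec->height : spec->height / 2;
         void* mem = malloc((size_t)stride * height);
         if (!mem) { return 0; }
         de265_set_image_plane(img, c, mem, stride, mem);  // keep the pointer as plane user data
       }
       return 1;
     }

     static void my_release_buffer(de265_decoder_context* ctx, struct de265_image* img, void* userdata)
     {
       for (int c = 0; c < 3; c++) {
         free(de265_get_image_plane_user_data(img, c));
       }
     }

     struct de265_image_allocation alloc = { my_get_buffer, my_release_buffer };
     de265_set_image_allocation_functions(ctx, &alloc, NULL);
*/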
319
320 /* --- frame dropping API ---
321
322 To limit decoding to a maximum temporal layer (TID), use de265_set_limit_TID().
323 The maximum layer ID in the stream can be queried with de265_get_highest_TID().
324 Note that the maximum layer ID can change throughout the stream.
325
326 For a fine-grained selection of the frame-rate, use de265_set_framerate_ratio().
327 A percentage of 100% will decode all frames in all temporal layers. A lower percentage
328 will decode approximately this percentage of the frames. Note that this is only accurate if the frames
329 are distributed evenly among the layers. Otherwise, the mapping is non-linear.
330
331 The limit_TID has a higher precedence than framerate_ratio. Hence, setting a higher
332 framerate-ratio will decode at limit_TID without dropping.
333
334 With change_framerate(), the output frame-rate can be increased/decreased to some
335 discrete preferable values. Currently, these are non-dropped decoding at various
336 TID layers.
337 */
338
339 LIBDE265_API int de265_get_highest_TID(de265_decoder_context*); // highest temporal substream to decode
340 LIBDE265_API int de265_get_current_TID(de265_decoder_context*); // currently decoded temporal substream
341
342 LIBDE265_API void de265_set_limit_TID(de265_decoder_context*,int max_tid); // highest temporal substream to decode
343 LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context*,int percent); // percentage of frames to decode (approx)
344 LIBDE265_API int de265_change_framerate(de265_decoder_context*,int more_vs_less); // 1: more, -1: less, returns corresponding framerate_ratio
345
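/* Annotation (usage sketch, not part of the upstream header): e.g. decoding only about every
   other frame, or restricting decoding to the base temporal layer:

     de265_set_framerate_ratio(ctx, 50);            // decode roughly 50% of the frames
     de265_set_limit_TID(ctx, 0);                   // or: decode only temporal layer 0

     int highest = de265_get_highest_TID(ctx);      // layers available in the current stream
     int ratio   = de265_change_framerate(ctx, +1); // step up to the next preset ratio
*/
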
346
249347 /* --- decoding parameters --- */
250348
251349 enum de265_param {
254352 DE265_DECODER_PARAM_DUMP_VPS_HEADERS=2,
255353 DE265_DECODER_PARAM_DUMP_PPS_HEADERS=3,
256354 DE265_DECODER_PARAM_DUMP_SLICE_HEADERS=4,
257 DE265_DECODER_PARAM_ACCELERATION_CODE=5 // (int) enum de265_acceleration, default: AUTO
355 DE265_DECODER_PARAM_ACCELERATION_CODE=5, // (int) enum de265_acceleration, default: AUTO
356 DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES=6, // (bool) do not output frames with decoding errors, default: no (output all images)
357
358 DE265_DECODER_PARAM_DISABLE_DEBLOCKING=7, // (bool) disable deblocking
359 DE265_DECODER_PARAM_DISABLE_SAO=8 // (bool) disable SAO filter
360 //DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT=9, // (bool) disable decoding of IDCT residuals in MC blocks
361 //DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT=10 // (bool) disable decoding of IDCT residuals in intra blocks
258362 };
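
/* Annotation (usage sketch, not part of the upstream header): boolean parameters are set with
   de265_set_parameter_bool(), integer-valued ones (file descriptors, acceleration code) with
   de265_set_parameter_int(). For example, enabling SEI checksum verification and dumping SPS
   headers to stderr (file descriptor 2):

     de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, 1);
     de265_set_parameter_int (ctx, DE265_DECODER_PARAM_DUMP_SPS_HEADERS, 2);
*/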
259363
260364 // sorted such that a large ID includes all optimizations from lower IDs
libde265/deblock.c: 0 addition(s), 1054 deletion(s)
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "deblock.h"
21 #include "util.h"
22 #include "transform.h"
23 #include "de265.h"
24
25 #include <assert.h>
26
27
28
29 // 8.7.2.1 for both EDGE_HOR and EDGE_VER at the same time
30 void markTransformBlockBoundary(decoder_context* ctx, int x0,int y0,
31 int log2TrafoSize,int trafoDepth,
32 int filterLeftCbEdge, int filterTopCbEdge)
33 {
34 logtrace(LogDeblock,"markTransformBlockBoundary(%d,%d, %d,%d, %d,%d)\n",x0,y0,
35 log2TrafoSize,trafoDepth, filterLeftCbEdge,filterTopCbEdge);
36
37 int split_transform = get_split_transform_flag(ctx->img,ctx->current_sps,x0,y0,trafoDepth);
38 if (split_transform) {
39 int x1 = x0 + ((1<<log2TrafoSize)>>1);
40 int y1 = y0 + ((1<<log2TrafoSize)>>1);
41
42 markTransformBlockBoundary(ctx,x0,y0,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, filterTopCbEdge);
43 markTransformBlockBoundary(ctx,x1,y0,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, filterTopCbEdge);
44 markTransformBlockBoundary(ctx,x0,y1,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, DEBLOCK_FLAG_HORIZ);
45 markTransformBlockBoundary(ctx,x1,y1,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, DEBLOCK_FLAG_HORIZ);
46 }
47 else {
48 // VER
49
50 for (int k=0;k<(1<<log2TrafoSize);k+=4) {
51 set_deblk_flags(ctx->img, x0,y0+k, filterLeftCbEdge);
52 }
53
54 // HOR
55
56 for (int k=0;k<(1<<log2TrafoSize);k+=4) {
57 set_deblk_flags(ctx->img, x0+k,y0, filterTopCbEdge);
58 }
59 }
60 }
61
62
63
64 // 8.7.2.2 for both EDGE_HOR and EDGE_VER at the same time
65 void markPredictionBlockBoundary(decoder_context* ctx, int x0,int y0,
66 int log2CbSize,
67 int filterLeftCbEdge, int filterTopCbEdge)
68 {
69 logtrace(LogDeblock,"markPredictionBlockBoundary(%d,%d, %d, %d,%d)\n",x0,y0,
70 log2CbSize, filterLeftCbEdge,filterTopCbEdge);
71
72 enum PartMode partMode = get_PartMode(ctx->img,ctx->current_sps,x0,y0);
73
74 int cbSize = 1<<log2CbSize;
75 int cbSize2 = 1<<(log2CbSize-1);
76 int cbSize4 = 1<<(log2CbSize-2);
77
78 switch (partMode) {
79 case PART_NxN:
80 for (int k=0;k<cbSize;k++) {
81 set_deblk_flags(ctx->img, x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI);
82 set_deblk_flags(ctx->img, x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ);
83 }
84 break;
85
86 case PART_Nx2N:
87 for (int k=0;k<cbSize;k++) {
88 set_deblk_flags(ctx->img, x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI);
89 }
90 break;
91
92 case PART_2NxN:
93 for (int k=0;k<cbSize;k++) {
94 set_deblk_flags(ctx->img, x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ);
95 }
96 break;
97
98 case PART_nLx2N:
99 for (int k=0;k<cbSize;k++) {
100 set_deblk_flags(ctx->img, x0+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI);
101 }
102 break;
103
104 case PART_nRx2N:
105 for (int k=0;k<cbSize;k++) {
106 set_deblk_flags(ctx->img, x0+cbSize2+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI);
107 }
108 break;
109
110 case PART_2NxnU:
111 for (int k=0;k<cbSize;k++) {
112 set_deblk_flags(ctx->img, x0+k,y0+cbSize4, DEBLOCK_PB_EDGE_HORIZ);
113 }
114 break;
115
116 case PART_2NxnD:
117 for (int k=0;k<cbSize;k++) {
118 set_deblk_flags(ctx->img, x0+k,y0+cbSize2+cbSize4, DEBLOCK_PB_EDGE_HORIZ);
119 }
120 break;
121
122 case PART_2Nx2N:
123 // NOP
124 break;
125 }
126 }
127
128
129 char derive_edgeFlags(decoder_context* ctx)
130 {
131 const int minCbSize = ctx->current_sps->MinCbSizeY;
132 char deblocking_enabled=0; // whether deblocking is enabled in some part of the image
133
134 int ctb_mask = (1<<ctx->current_sps->Log2CtbSizeY)-1;
135 int picWidthInCtbs = ctx->current_sps->PicWidthInCtbsY;
136 int ctbshift = ctx->current_sps->Log2CtbSizeY;
137
138 const pic_parameter_set* pps = ctx->current_pps;
139
140 for (int cb_y=0;cb_y<ctx->current_sps->PicHeightInMinCbsY;cb_y++)
141 for (int cb_x=0;cb_x<ctx->current_sps->PicWidthInMinCbsY;cb_x++)
142 {
143 int log2CbSize = get_log2CbSize_cbUnits(ctx->img,ctx->current_sps,cb_x,cb_y);
144 if (log2CbSize==0) {
145 continue;
146 }
147
148 // we are now at the top corner of a CB
149
150 int x0 = cb_x * minCbSize;
151 int y0 = cb_y * minCbSize;
152
153 int x0ctb = x0 >> ctbshift;
154 int y0ctb = y0 >> ctbshift;
155
156 // check whether we should filter this slice
157
158 slice_segment_header* shdr = get_SliceHeader(ctx,x0,y0);
159
160 // check whether to filter left and top edge
161
162 uint8_t filterLeftCbEdge = DEBLOCK_FLAG_VERTI;
163 uint8_t filterTopCbEdge = DEBLOCK_FLAG_HORIZ;
164 if (x0 == 0) filterLeftCbEdge = 0;
165 if (y0 == 0) filterTopCbEdge = 0;
166
167 // check for slice and tile boundaries (8.7.2, step 2 in both processes)
168
169 if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary
170 if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
171 //shdr->slice_index != get_SliceHeaderIndex(ctx->img,ctx->current_sps,x0-1,y0))
172 shdr->SliceAddrRS != get_SliceHeader(ctx,x0-1,y0)->SliceAddrRS)
173 {
174 filterLeftCbEdge = 0;
175 }
176 else if (pps->loop_filter_across_tiles_enabled_flag == 0 &&
177 pps->TileIdRS[ x0ctb +y0ctb*picWidthInCtbs] !=
178 pps->TileIdRS[((x0-1)>>ctbshift)+y0ctb*picWidthInCtbs]) {
179 filterLeftCbEdge = 0;
180 }
181 }
182
183 if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary
184 if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
185 //shdr->slice_index != get_SliceHeaderIndex(ctx->img,ctx->current_sps,x0,y0-1))
186 shdr->SliceAddrRS != get_SliceHeader(ctx,x0,y0-1)->SliceAddrRS)
187 {
188 filterTopCbEdge = 0;
189 }
190 else if (pps->loop_filter_across_tiles_enabled_flag == 0 &&
191 pps->TileIdRS[x0ctb+ y0ctb *picWidthInCtbs] !=
192 pps->TileIdRS[x0ctb+((y0-1)>>ctbshift)*picWidthInCtbs]) {
193 filterTopCbEdge = 0;
194 }
195 }
196
197
198 // mark edges
199
200 if (shdr->slice_deblocking_filter_disabled_flag==0) {
201 deblocking_enabled=1;
202
203 markTransformBlockBoundary(ctx, x0,y0, log2CbSize,0,
204 filterLeftCbEdge, filterTopCbEdge);
205
206 markPredictionBlockBoundary(ctx, x0,y0, log2CbSize,
207 filterLeftCbEdge, filterTopCbEdge);
208 }
209 }
210
211 return deblocking_enabled;
212 }
213
214
215
216 // 8.7.2.3 (both, EDGE_VER and EDGE_HOR)
217 void derive_boundaryStrength(decoder_context* ctx, bool vertical, int yStart,int yEnd,
218 int xStart,int xEnd)
219 {
220 //int stride = ctx->img.stride; TODO: UNUSED
221 int xIncr = vertical ? 2 : 1;
222 int yIncr = vertical ? 1 : 2;
223 int xOffs = vertical ? 1 : 0;
224 int yOffs = vertical ? 0 : 1;
225 int edgeMask = vertical ?
226 (DEBLOCK_FLAG_VERTI | DEBLOCK_PB_EDGE_VERTI) :
227 (DEBLOCK_FLAG_HORIZ | DEBLOCK_PB_EDGE_HORIZ);
228 int transformEdgeMask = vertical ? DEBLOCK_FLAG_VERTI : DEBLOCK_FLAG_HORIZ;
229
230 de265_image* img = ctx->img;
231
232 xEnd = libde265_min(xEnd,img->deblk_width);
233 yEnd = libde265_min(yEnd,img->deblk_height);
234
235 int TUShift = ctx->current_sps->Log2MinTrafoSize;
236 int TUStride= ctx->current_sps->PicWidthInTbsY;
237
238 for (int y=yStart;y<yEnd;y+=yIncr)
239 for (int x=xStart;x<xEnd;x+=xIncr) {
240 int xDi = x*4;
241 int yDi = y*4;
242
243 logtrace(LogDeblock,"%d %d %s = %s\n",xDi,yDi, vertical?"Vertical":"Horizontal",
244 (get_deblk_flags(ctx->img,xDi,yDi) & edgeMask) ? "edge" : "...");
245
246 uint8_t edgeFlags = get_deblk_flags(ctx->img,xDi,yDi);
247
248 if (edgeFlags & edgeMask) {
249 //int p0 = ctx->img.y[(xDi-xOffs)+(yDi-yOffs)*stride]; TODO: UNUSED
250 //int q0 = ctx->img.y[xDi+yDi*stride]; TODO: UNUSED
251
252 bool p_is_intra_pred = (get_pred_mode(ctx->img,ctx->current_sps,xDi-xOffs, yDi-yOffs) == MODE_INTRA);
253 bool q_is_intra_pred = (get_pred_mode(ctx->img,ctx->current_sps,xDi, yDi ) == MODE_INTRA);
254
255 int bS;
256
257 if (p_is_intra_pred || q_is_intra_pred) {
258 bS = 2;
259 }
260 else {
261 // opposing side
262 int xDiOpp = xDi-xOffs;
263 int yDiOpp = yDi-yOffs;
264 //uint8_t edgeFlagsOpp = get_deblk_flags(ctx,xDiOpp,yDiOpp);
265
266 /*
267 if ((edgeFlags & transformEdgeMask) &&
268 (get_nonzero_coefficient(ctx,xDi,yDi) ||
269 get_nonzero_coefficient(ctx,xDiOpp,yDiOpp))) {
270 */
271 if ((edgeFlags & transformEdgeMask) &&
272 (ctx->img->tu_info[(xDi >>TUShift) + (yDi >>TUShift)*TUStride] & TU_FLAG_NONZERO_COEFF ||
273 ctx->img->tu_info[(xDiOpp>>TUShift) + (yDiOpp>>TUShift)*TUStride] & TU_FLAG_NONZERO_COEFF)) {
274 bS = 1;
275 }
276 else {
277
278 bS = 0;
279
280 const PredVectorInfo* mviP = get_mv_info(ctx,xDiOpp,yDiOpp);
281 const PredVectorInfo* mviQ = get_mv_info(ctx,xDi ,yDi);
282
283 slice_segment_header* shdrP = get_SliceHeader(ctx,xDiOpp,yDiOpp);
284 slice_segment_header* shdrQ = get_SliceHeader(ctx,xDi ,yDi);
285
286 int refPicP0 = mviP->predFlag[0] ? shdrP->RefPicList[0][ mviP->refIdx[0] ] : -1;
287 int refPicP1 = mviP->predFlag[1] ? shdrP->RefPicList[1][ mviP->refIdx[1] ] : -1;
288 int refPicQ0 = mviQ->predFlag[0] ? shdrQ->RefPicList[0][ mviQ->refIdx[0] ] : -1;
289 int refPicQ1 = mviQ->predFlag[1] ? shdrQ->RefPicList[1][ mviQ->refIdx[1] ] : -1;
290
291 MotionVector mvP0 = mviP->mv[0]; if (!mviP->predFlag[0]) { mvP0.x=mvP0.y=0; }
292 MotionVector mvP1 = mviP->mv[1]; if (!mviP->predFlag[1]) { mvP1.x=mvP1.y=0; }
293 MotionVector mvQ0 = mviQ->mv[0]; if (!mviQ->predFlag[0]) { mvQ0.x=mvQ0.y=0; }
294 MotionVector mvQ1 = mviQ->mv[1]; if (!mviQ->predFlag[1]) { mvQ1.x=mvQ1.y=0; }
295
296 bool samePics = ((refPicP0==refPicQ0 && refPicP1==refPicQ1) ||
297 (refPicP0==refPicQ1 && refPicP1==refPicQ0));
298
299 if (!samePics) {
300 bS = 1;
301 }
302 else {
303 int numMV_P = mviP->predFlag[0] + mviP->predFlag[1];
304 int numMV_Q = mviQ->predFlag[0] + mviQ->predFlag[1];
305
306 if (numMV_P!=numMV_Q) {
307 add_warning(ctx, DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ, false);
308 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
309 }
310
311 // two different reference pictures or only one reference picture
312 if (refPicP0 != refPicP1) {
313
314 if (refPicP0 == refPicQ0) {
315 if (abs_value(mvP0.x-mvQ0.x) >= 4 ||
316 abs_value(mvP0.y-mvQ0.y) >= 4 ||
317 abs_value(mvP1.x-mvQ1.x) >= 4 ||
318 abs_value(mvP1.y-mvQ1.y) >= 4) {
319 bS = 1;
320 }
321 }
322 else {
323 if (abs_value(mvP0.x-mvQ1.x) >= 4 ||
324 abs_value(mvP0.y-mvQ1.y) >= 4 ||
325 abs_value(mvP1.x-mvQ0.x) >= 4 ||
326 abs_value(mvP1.y-mvQ0.y) >= 4) {
327 bS = 1;
328 }
329 }
330 }
331 else {
332 assert(refPicQ0==refPicQ1);
333
334 if ((abs_value(mvP0.x-mvQ0.x) >= 4 ||
335 abs_value(mvP0.y-mvQ0.y) >= 4 ||
336 abs_value(mvP1.x-mvQ1.x) >= 4 ||
337 abs_value(mvP1.y-mvQ1.y) >= 4)
338 &&
339 (abs_value(mvP0.x-mvQ1.x) >= 4 ||
340 abs_value(mvP0.y-mvQ1.y) >= 4 ||
341 abs_value(mvP1.x-mvQ0.x) >= 4 ||
342 abs_value(mvP1.y-mvQ0.y) >= 4)) {
343 bS = 1;
344 }
345 }
346 }
347
348 /*
349 printf("unimplemented deblocking code for CU at %d;%d\n",xDi,yDi);
350
351 logerror(LogDeblock, "unimplemented code reached (file %s, line %d)\n",
352 __FILE__, __LINE__);
353 */
354 }
355 }
356
357 set_deblk_bS(ctx->img,xDi,yDi, bS);
358 }
359 else {
360 set_deblk_bS(ctx->img,xDi,yDi, 0);
361 }
362 }
363 }
364
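/* Annotation (summary of the derivation above, not part of the upstream source):
     bS = 2  if either side of the edge is intra-predicted;
     bS = 1  if it is a transform-block edge and either side has non-zero coefficients,
             or if the two sides use different sets of reference pictures,
             or if any pair of corresponding motion-vector components differs by 4 or more
             (quarter-pel units, i.e. at least one full pel);
     bS = 0  otherwise (the edge is not filtered).
   Example: two inter blocks predicted from the same picture with MVs (16,0) and (21,0) differ
   by 5 quarter-pel units horizontally, so their common edge gets bS = 1. */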
365
366 void derive_boundaryStrength_CTB(decoder_context* ctx, bool vertical, int xCtb,int yCtb)
367 {
368 int ctbSize = ctx->current_sps->CtbSizeY;
369 int deblkSize = ctbSize/4;
370
371 derive_boundaryStrength(ctx,vertical,
372 yCtb*deblkSize, (yCtb+1)*deblkSize,
373 xCtb*deblkSize, (xCtb+1)*deblkSize);
374 }
375
376
377 static uint8_t table_8_23_beta[52] = {
378 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8,
379 9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36,
380 38,40,42,44,46,48,50,52,54,56,58,60,62,64
381 };
382
383 static uint8_t table_8_23_tc[54] = {
384 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
385 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
386 5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24
387 };
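
/* Annotation (worked example, not part of the upstream source): for qP_L = 32, bS = 1 and zero
   slice beta/tc offsets, Q_beta = 32 gives beta' = table_8_23_beta[32] = 26 and Q_tc = 32 gives
   tc' = table_8_23_tc[32] = 3. At 8-bit luma depth the scaling factor is 1, so beta = 26 and
   tc = 3 are the thresholds used by the filtering decisions below. */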
388
389
390
391 // 8.7.2.4
392 void edge_filtering_luma(decoder_context* ctx, bool vertical,
393 int yStart,int yEnd, int xStart,int xEnd)
394 {
395 const seq_parameter_set* sps = ctx->current_sps;
396
397 //int minCbSize = ctx->current_sps->MinCbSizeY;
398 int xIncr = vertical ? 2 : 1;
399 int yIncr = vertical ? 1 : 2;
400
401 const int stride = ctx->img->stride;
402
403 //printf("-> %d %d\n",yStart,yEnd);
404
405 de265_image* img = ctx->img;
406 int bitDepth_Y = ctx->current_sps->BitDepth_Y;
407
408 xEnd = libde265_min(xEnd,img->deblk_width);
409 yEnd = libde265_min(yEnd,img->deblk_height);
410
411 for (int y=yStart;y<yEnd;y+=yIncr)
412 for (int x=xStart;x<xEnd;x+=xIncr) {
413 int xDi = x*4;
414 int yDi = y*4;
415 int bS = get_deblk_bS(ctx->img, xDi,yDi);
416
417 logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n",
418 img->PicOrderCntVal,vertical ? 'V':'H',xDi,yDi,bS);
419
420 #if 0
421 {
422 uint8_t* ptr = ctx->img->y + stride*yDi + xDi;
423
424 for (int dy=-4;dy<4;dy++) {
425 for (int dx=-4;dx<4;dx++) {
426 printf("%02x ", ptr[dy*stride + dx]);
427 if (dx==-1) printf("| ");
428 }
429 printf("\n");
430 if (dy==-1) printf("-------------------------\n");
431 }
432 }
433 #endif
434
435 #if 0
436 if (!vertical)
437 {
438 uint8_t* ptr = ctx->img->y + stride*yDi + xDi;
439
440 for (int dy=-4;dy<4;dy++) {
441 for (int dx=0;dx<4;dx++) {
442 printf("%02x ", ptr[dy*stride + dx]);
443 if (dx==-1) printf("| ");
444 }
445 printf("\n");
446 if (dy==-1) printf("-------------------------\n");
447 }
448 }
449 #endif
450
451 if (bS>0) {
452
453 // 8.7.2.4.3
454
455 uint8_t* ptr = ctx->img->y + stride*yDi + xDi;
456
457 uint8_t q[4][4], p[4][4];
458 for (int k=0;k<4;k++)
459 for (int i=0;i<4;i++)
460 {
461 if (vertical) {
462 q[k][i] = ptr[ i +k*stride];
463 p[k][i] = ptr[-i-1+k*stride];
464 }
465 else {
466 q[k][i] = ptr[k + i *stride];
467 p[k][i] = ptr[k -(i+1)*stride];
468 }
469 }
470
471 #if 0
472 for (int k=0;k<4;k++)
473 {
474 for (int i=0;i<4;i++)
475 {
476 printf("%02x ", p[k][3-i]);
477 }
478
479 printf("| ");
480
481 for (int i=0;i<4;i++)
482 {
483 printf("%02x ", q[k][i]);
484 }
485 printf("\n");
486 }
487 #endif
488
489
490 int QP_Q = get_QPY(ctx->img, ctx->current_sps, xDi,yDi);
491 int QP_P = (vertical ?
492 get_QPY(ctx->img, ctx->current_sps, xDi-1,yDi) :
493 get_QPY(ctx->img, ctx->current_sps, xDi,yDi-1) );
494 int qP_L = (QP_Q+QP_P+1)>>1;
495
496 logtrace(LogDeblock,"QP: %d & %d -> %d\n",QP_Q,QP_P,qP_L);
497
498 int sliceIndexQ00 = get_SliceHeaderIndex(ctx->img,ctx->current_sps,xDi,yDi);
499 int beta_offset = ctx->slice[sliceIndexQ00].slice_beta_offset;
500 int tc_offset = ctx->slice[sliceIndexQ00].slice_tc_offset;
501
502 int Q_beta = Clip3(0,51, qP_L + beta_offset);
503 int betaPrime = table_8_23_beta[Q_beta];
504 int beta = betaPrime * (1<<(bitDepth_Y - 8));
505
506 int Q_tc = Clip3(0,53, qP_L + 2*(bS-1) + tc_offset);
507 int tcPrime = table_8_23_tc[Q_tc];
508 int tc = tcPrime * (1<<(bitDepth_Y - 8));
509
510 logtrace(LogDeblock,"beta: %d (%d) tc: %d (%d)\n",beta,beta_offset, tc,tc_offset);
511
512 int dE=0, dEp=0, dEq=0;
513
514 if (vertical || !vertical) {
515 int dp0 = abs_value(p[0][2] - 2*p[0][1] + p[0][0]);
516 int dp3 = abs_value(p[3][2] - 2*p[3][1] + p[3][0]);
517 int dq0 = abs_value(q[0][2] - 2*q[0][1] + q[0][0]);
518 int dq3 = abs_value(q[3][2] - 2*q[3][1] + q[3][0]);
519
520 int dpq0 = dp0 + dq0;
521 int dpq3 = dp3 + dq3;
522
523 int dp = dp0 + dp3;
524 int dq = dq0 + dq3;
525 int d = dpq0+ dpq3;
526
527 if (d<beta) {
528 //int dpq = 2*dpq0;
529 bool dSam0 = (2*dpq0 < (beta>>2) &&
530 abs_value(p[0][3]-p[0][0])+abs_value(q[0][0]-q[0][3]) < (beta>>3) &&
531 abs_value(p[0][0]-q[0][0]) < ((5*tc+1)>>1));
532
533 bool dSam3 = (2*dpq3 < (beta>>2) &&
534 abs_value(p[3][3]-p[3][0])+abs_value(q[3][0]-q[3][3]) < (beta>>3) &&
535 abs_value(p[3][0]-q[3][0]) < ((5*tc+1)>>1));
536
537 if (dSam0 && dSam3) {
538 dE=2;
539 }
540 else {
541 dE=1;
542 }
543
544 if (dp < ((beta + (beta>>1))>>3)) { dEp=1; }
545 if (dq < ((beta + (beta>>1))>>3)) { dEq=1; }
546
547 logtrace(LogDeblock,"dE:%d dEp:%d dEq:%d\n",dE,dEp,dEq);
548 }
549 }
550 else {
551 // TODO
552 assert(0);
553 }
554
555
556 // 8.7.2.4.4
557
558 if (dE != 0) {
559 bool filterP = true;
560 bool filterQ = true;
561
562 if (vertical) {
563 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,xDi-1,yDi)) filterP=false;
564 if (get_cu_transquant_bypass(img,sps,xDi-1,yDi)) filterP=false;
565
566 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,xDi,yDi)) filterQ=false;
567 if (get_cu_transquant_bypass(img,sps,xDi,yDi)) filterQ=false;
568 }
569 else {
570 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,xDi,yDi-1)) filterP=false;
571 if (get_cu_transquant_bypass(img,sps,xDi,yDi-1)) filterP=false;
572
573 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,xDi,yDi)) filterQ=false;
574 if (get_cu_transquant_bypass(img,sps,xDi,yDi)) filterQ=false;
575 }
576
577 for (int k=0;k<4;k++) {
578 //int nDp,nDq;
579
580 logtrace(LogDeblock,"line:%d\n",k);
581
582 const uint8_t p0 = p[k][0];
583 const uint8_t p1 = p[k][1];
584 const uint8_t p2 = p[k][2];
585 const uint8_t p3 = p[k][3];
586 const uint8_t q0 = q[k][0];
587 const uint8_t q1 = q[k][1];
588 const uint8_t q2 = q[k][2];
589 const uint8_t q3 = q[k][3];
590
591 if (dE==2) {
592 // strong filtering
593
594 //nDp=nDq=3;
595
596 uint8_t pnew[3],qnew[3];
597 pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3);
598 pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2);
599 pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
600 qnew[0] = Clip3(q0-2*tc,q0+2*tc, (p1+2*p0+2*q0+2*q1+q2+4)>>3);
601 qnew[1] = Clip3(q1-2*tc,q1+2*tc, (p0+q0+q1+q2+2)>>2);
602 qnew[2] = Clip3(q2-2*tc,q2+2*tc, (p0+q0+q1+3*q2+2*q3+4)>>3);
603
604 logtrace(LogDeblock,"strong filtering\n");
605
606 if (vertical) {
607 for (int i=0;i<3;i++) {
608 if (filterP) { ptr[-i-1+k*stride] = pnew[i]; }
609 if (filterQ) { ptr[ i + k*stride] = qnew[i]; }
610 }
611
612 // ptr[-1+k*stride] = ptr[ 0+k*stride] = 200;
613 }
614 else {
615 for (int i=0;i<3;i++) {
616 if (filterP) { ptr[ k -(i+1)*stride] = pnew[i]; }
617 if (filterQ) { ptr[ k + i *stride] = qnew[i]; }
618 }
619 }
620 }
621 else {
622 // weak filtering
623
624 //nDp=nDq=0;
625
626 int delta = (9*(q0-p0) - 3*(q1-p1) + 8)>>4;
627 logtrace(LogDeblock,"delta=%d, tc=%d\n",delta,tc);
628
629 if (abs_value(delta) < tc*10) {
630
631 delta = Clip3(-tc,tc,delta);
632 logtrace(LogDeblock," deblk + %d;%d [%02x->%02x] - %d;%d [%02x->%02x] delta:%d\n",
633 vertical ? xDi-1 : xDi+k,
634 vertical ? yDi+k : yDi-1, p0,Clip1_8bit(p0+delta),
635 vertical ? xDi : xDi+k,
636 vertical ? yDi+k : yDi, q0,Clip1_8bit(q0-delta),
637 delta);
638
639 if (vertical) {
640 if (filterP) { ptr[-0-1+k*stride] = Clip1_8bit(p0+delta); }
641 if (filterQ) { ptr[ 0 +k*stride] = Clip1_8bit(q0-delta); }
642 }
643 else {
644 if (filterP) { ptr[ k -1*stride] = Clip1_8bit(p0+delta); }
645 if (filterQ) { ptr[ k +0*stride] = Clip1_8bit(q0-delta); }
646 }
647
648 //ptr[ 0+k*stride] = 200;
649
650 if (dEp==1 && filterP) {
651 int delta_p = Clip3(-(tc>>1), tc>>1, (((p2+p0+1)>>1)-p1+delta)>>1);
652
653 logtrace(LogDeblock," deblk dEp %d;%d delta:%d\n",
654 vertical ? xDi-2 : xDi+k,
655 vertical ? yDi+k : yDi-2,
656 delta_p);
657
658 if (vertical) { ptr[-1-1+k*stride] = Clip1_8bit(p1+delta_p); }
659 else { ptr[ k -2*stride] = Clip1_8bit(p1+delta_p); }
660 }
661
662 if (dEq==1 && filterQ) {
663 int delta_q = Clip3(-(tc>>1), tc>>1, (((q2+q0+1)>>1)-q1-delta)>>1);
664
665 logtrace(LogDeblock," deblk dEq %d;%d delta:%d\n",
666 vertical ? xDi+1 : xDi+k,
667 vertical ? yDi+k : yDi+1,
668 delta_q);
669
670 if (vertical) { ptr[ 1 +k*stride] = Clip1_8bit(q1+delta_q); }
671 else { ptr[ k +1*stride] = Clip1_8bit(q1+delta_q); }
672 }
673
674 //nDp = dEp+1;
675 //nDq = dEq+1;
676
677 //logtrace(LogDeblock,"weak filtering (%d:%d)\n",nDp,nDq);
678 }
679 }
680 }
681 }
682 }
683 }
684 }
685
686
687 void edge_filtering_luma_CTB(decoder_context* ctx, bool vertical, int xCtb,int yCtb)
688 {
689 int ctbSize = ctx->current_sps->CtbSizeY;
690 int deblkSize = ctbSize/4;
691
692 edge_filtering_luma(ctx,vertical,
693 yCtb*deblkSize, (yCtb+1)*deblkSize,
694 xCtb*deblkSize, (xCtb+1)*deblkSize);
695 }
696
697
698
699
700 // 8.7.2.4
701 void edge_filtering_chroma(decoder_context* ctx, bool vertical, int yStart,int yEnd,
702 int xStart,int xEnd)
703 {
704 //int minCbSize = ctx->current_sps->MinCbSizeY;
705 int xIncr = vertical ? 4 : 2;
706 int yIncr = vertical ? 2 : 4;
707
708 de265_image* img = ctx->img;
709 seq_parameter_set* sps = ctx->current_sps;
710
711 const int stride = img->chroma_stride;
712
713 xEnd = libde265_min(xEnd,img->deblk_width);
714 yEnd = libde265_min(yEnd,img->deblk_height);
715
716 for (int y=yStart;y<yEnd;y+=yIncr)
717 for (int x=xStart;x<xEnd;x+=xIncr) {
718 int xDi = x*2;
719 int yDi = y*2;
720 int bS = get_deblk_bS(ctx->img, 2*xDi,2*yDi);
721
722 if (bS>1) {
723 // 8.7.2.4.5
724
725 for (int cplane=0;cplane<2;cplane++) {
726 int cQpPicOffset = (cplane==0 ?
727 ctx->current_pps->pic_cb_qp_offset :
728 ctx->current_pps->pic_cr_qp_offset);
729
730 uint8_t* ptr = (cplane==0 ? ctx->img->cb : ctx->img->cr);
731 ptr += stride*yDi + xDi;
732
733 uint8_t p[2][4];
734 uint8_t q[2][4];
735
736 logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi);
737
738 for (int i=0;i<2;i++)
739 for (int k=0;k<4;k++)
740 {
741 if (vertical) {
742 q[i][k] = ptr[ i +k*stride];
743 p[i][k] = ptr[-i-1+k*stride];
744 }
745 else {
746 q[i][k] = ptr[k + i *stride];
747 p[i][k] = ptr[k -(i+1)*stride];
748 }
749 }
750
751 #if 0
752 for (int k=0;k<4;k++)
753 {
754 for (int i=0;i<2;i++)
755 {
756 printf("%02x ", p[1-i][k]);
757 }
758
759 printf("| ");
760
761 for (int i=0;i<2;i++)
762 {
763 printf("%02x ", q[i][k]);
764 }
765 printf("\n");
766 }
767 #endif
768
769 int QP_Q = get_QPY(ctx->img, ctx->current_sps, 2*xDi,2*yDi);
770 int QP_P = (vertical ?
771 get_QPY(ctx->img,ctx->current_sps, 2*xDi-1,2*yDi) :
772 get_QPY(ctx->img,ctx->current_sps, 2*xDi,2*yDi-1));
773 int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset;
774 int QP_C = table8_22(qP_i);
775
776 //printf("POC=%d\n",ctx->img->PicOrderCntVal);
777 logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d (QP_C=%d)\n",
778 2*xDi,2*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C);
779
780 int sliceIndexQ00 = get_SliceHeaderIndex(ctx->img,ctx->current_sps,2*xDi,2*yDi);
781 //int tc_offset = ctx->current_pps->tc_offset;
782 int tc_offset = ctx->slice[sliceIndexQ00].slice_tc_offset;
783
784 int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset);
785
786 int tcPrime = table_8_23_tc[Q];
787 int tc = tcPrime * (1<<(ctx->current_sps->BitDepth_C - 8));
788
789 logtrace(LogDeblock,"tc_offset=%d Q=%d tc'=%d tc=%d\n",tc_offset,Q,tcPrime,tc);
790
791 if (vertical) {
792 bool filterP = true;
793 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,2*xDi-1,2*yDi)) filterP=false;
794 if (get_cu_transquant_bypass(img,sps,2*xDi-1,2*yDi)) filterP=false;
795
796 bool filterQ = true;
797 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,2*xDi,2*yDi)) filterQ=false;
798 if (get_cu_transquant_bypass(img,sps,2*xDi,2*yDi)) filterQ=false;
799
800
801 for (int k=0;k<4;k++) {
802 int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
803 logtrace(LogDeblock,"delta=%d\n",delta);
804 if (filterP) { ptr[-1+k*stride] = Clip1_8bit(p[0][k]+delta); }
805 if (filterQ) { ptr[ 0+k*stride] = Clip1_8bit(q[0][k]-delta); }
806 }
807 }
808 else {
809 bool filterP = true;
810 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,2*xDi,2*yDi-1)) filterP=false;
811 if (get_cu_transquant_bypass(img,sps,2*xDi,2*yDi-1)) filterP=false;
812
813 bool filterQ = true;
814 if (sps->pcm_loop_filter_disable_flag && get_pcm_flag(img,sps,2*xDi,2*yDi)) filterQ=false;
815 if (get_cu_transquant_bypass(img,sps,2*xDi,2*yDi)) filterQ=false;
816
817 for (int k=0;k<4;k++) {
818 int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
819 if (filterP) { ptr[ k-1*stride] = Clip1_8bit(p[0][k]+delta); }
820 if (filterQ) { ptr[ k+0*stride] = Clip1_8bit(q[0][k]-delta); }
821 }
822 }
823 }
824 }
825 }
826 }
827
828 void edge_filtering_chroma_CTB(decoder_context* ctx, bool vertical, int xCtb,int yCtb)
829 {
830 int ctbSize = ctx->current_sps->CtbSizeY;
831 int deblkSize = ctbSize/4;
832
833 edge_filtering_chroma(ctx,vertical,
834 yCtb*deblkSize, (yCtb+1)*deblkSize,
835 xCtb*deblkSize, (xCtb+1)*deblkSize);
836 }
837
838
839
840
841 static void thread_deblock(void* d)
842 {
843 struct thread_task_deblock* data = (struct thread_task_deblock*)d;
844 struct decoder_context* ctx = data->ctx;
845 de265_image* img = ctx->img;
846
847 int xStart=0;
848 int xEnd = img->deblk_width;
849
850 derive_boundaryStrength(ctx, data->vertical, data->first,data->last, xStart,xEnd);
851 edge_filtering_luma (ctx, data->vertical, data->first,data->last, xStart,xEnd);
852 edge_filtering_chroma (ctx, data->vertical, data->first,data->last, xStart,xEnd);
853
854 decrease_pending_tasks(ctx->img, 1);
855 }
856
857
858 #if 0
859 static void thread_deblock_ctb(void* d)
860 {
861 struct thread_task_deblock* data = (struct thread_task_deblock*)d;
862 struct decoder_context* ctx = data->ctx;
863
864 derive_boundaryStrength_CTB(ctx, data->vertical, data->ctb_x,data->ctb_y);
865 edge_filtering_luma_CTB (ctx, data->vertical, data->ctb_x,data->ctb_y);
866 edge_filtering_chroma_CTB (ctx, data->vertical, data->ctb_x,data->ctb_y);
867
868 decrease_pending_tasks(ctx->img, 1);
869 }
870 #endif
871
872 /*
873 static void thread_deblock_ctb_row(void* d)
874 {
875 struct thread_task_deblock* data = (struct thread_task_deblock*)d;
876 struct decoder_context* ctx = data->ctx;
877
878 for (int x=0;x<ctx->current_sps->PicWidthInCtbsY;x++) {
879 derive_boundaryStrength_CTB(ctx, data->vertical, x,data->ctb_y);
880 edge_filtering_luma_CTB (ctx, data->vertical, x,data->ctb_y);
881 edge_filtering_chroma_CTB (ctx, data->vertical, x,data->ctb_y);
882 }
883 }
884
885 static void thread_deblock_full_ctb_row(void* d)
886 {
887 struct thread_task_deblock* data = (struct thread_task_deblock*)d;
888 struct decoder_context* ctx = data->ctx;
889
890 de265_image* img = ctx->img;
891
892 int ctbSize = ctx->current_sps->CtbSizeY;
893 int deblkSize = ctbSize/4;
894
895 int xStart=0;
896 int xEnd = img->deblk_width;
897
898 int yStart = data->ctb_y *deblkSize;
899 int yEnd = (data->ctb_y+1)*deblkSize;
900
901 derive_boundaryStrength(ctx, data->vertical, yStart,yEnd, xStart,xEnd);
902 edge_filtering_luma (ctx, data->vertical, yStart,yEnd, xStart,xEnd);
903 edge_filtering_chroma (ctx, data->vertical, yStart,yEnd, xStart,xEnd);
904 }
905 */
906
907
908 void apply_deblocking_filter(decoder_context* ctx)
909 {
910 char enabled_deblocking = derive_edgeFlags(ctx);
911
912 de265_image* img = ctx->img;
913
914
915 if (enabled_deblocking)
916 {
917 if (ctx->num_worker_threads==0) { // TMP HACK / TODO / switched off multi-core
918
919 // vertical filtering
920
921 logtrace(LogDeblock,"VERTICAL\n");
922 derive_boundaryStrength(ctx, true ,0,img->deblk_height,0,img->deblk_width);
923 edge_filtering_luma (ctx, true ,0,img->deblk_height,0,img->deblk_width);
924 edge_filtering_chroma (ctx, true ,0,img->deblk_height,0,img->deblk_width);
925
926 #if 0
927 char buf[1000];
928 sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal);
929 write_picture_to_file(ctx->img, buf);
930 #endif
931
932 // horizontal filtering
933
934 logtrace(LogDeblock,"HORIZONTAL\n");
935 derive_boundaryStrength(ctx, false ,0,img->deblk_height,0,img->deblk_width);
936 edge_filtering_luma (ctx, false ,0,img->deblk_height,0,img->deblk_width);
937 edge_filtering_chroma (ctx, false ,0,img->deblk_height,0,img->deblk_width);
938
939 #if 0
940 sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal);
941 write_picture_to_file(ctx->img, buf);
942 #endif
943 }
944 else {
945 #if 1
946 for (int pass=0;pass<2;pass++) {
947
948 thread_task task;
949
950 task.task_id = -1;
951 task.task_cmd = THREAD_TASK_DEBLOCK;
952 task.work_routine = thread_deblock;
953
954 int numStripes= ctx->num_worker_threads * 4; // TODO: what is a good number of stripes?
955 //ctx->thread_pool.tasks_pending = numStripes;
956 increase_pending_tasks(ctx->img, numStripes);
957
958 for (int i=0;i<numStripes;i++)
959 {
960 int ys = i*img->deblk_height/numStripes;
961 int ye = (i+1)*img->deblk_height/numStripes;
962
963 // align stripe boundaries to a multiple of 4, since the division above may cut odd-sized strips
964 ys &= ~3;
965 if (i != numStripes-1) ye &= ~3;
966
967
968 task.data.task_deblock.ctx = ctx;
969 task.data.task_deblock.first = ys;
970 task.data.task_deblock.last = ye;
971 task.data.task_deblock.vertical = (pass==0);
972
973 add_task(&ctx->thread_pool, &task);
974 }
975
976 wait_for_completion(ctx->img);
977 }
978 #endif
979 #if 0
980 for (int pass=0;pass<2;pass++)
981 {
982 thread_task task;
983
984 task.task_id = -1;
985 task.task_cmd = THREAD_TASK_DEBLOCK;
986 task.work_routine = thread_deblock_ctb;
987
988 //ctx->thread_pool.tasks_pending = ctx->current_sps->PicSizeInCtbsY;
989 increase_pending_tasks(ctx->img, ctx->current_sps->PicSizeInCtbsY);
990
991 for (int y=0;y<ctx->current_sps->PicHeightInCtbsY;y++)
992 for (int x=0;x<ctx->current_sps->PicWidthInCtbsY;x++)
993 {
994 task.data.task_deblock.ctx = ctx;
995 task.data.task_deblock.ctb_x = x;
996 task.data.task_deblock.ctb_y = y;
997 task.data.task_deblock.vertical = (pass==0);
998
999 add_task(&ctx->thread_pool, &task);
1000 }
1001
1002 wait_for_completion(ctx->img);
1003 }
1004 #endif
1005 #if 0
1006 for (int pass=0;pass<2;pass++)
1007 {
1008 thread_task task;
1009
1010 task.task_id = -1;
1011 task.task_cmd = THREAD_TASK_DEBLOCK;
1012 task.work_routine = thread_deblock_ctb_row;
1013
1014 ctx->thread_pool.tasks_pending = ctx->current_sps->PicHeightInCtbsY;
1015
1016 for (int y=0;y<ctx->current_sps->PicHeightInCtbsY;y++)
1017 //for (int x=0;x<ctx->current_sps->PicWidthInCtbsY;x++)
1018 {
1019 task.data.task_deblock.ctx = ctx;
1020 //task.data.task_deblock.ctb_x = x;
1021 task.data.task_deblock.ctb_y = y;
1022 task.data.task_deblock.vertical = (pass==0);
1023
1024 add_task(&ctx->thread_pool, &task);
1025 }
1026 }
1027 #endif
1028 #if 0
1029 for (int pass=0;pass<2;pass++)
1030 {
1031 thread_task task;
1032
1033 task.task_id = -1;
1034 task.task_cmd = THREAD_TASK_DEBLOCK;
1035 task.work_routine = thread_deblock_full_ctb_row;
1036
1037 ctx->thread_pool.tasks_pending = ctx->current_sps->PicHeightInCtbsY;
1038
1039 for (int y=0;y<ctx->current_sps->PicHeightInCtbsY;y++)
1040 //for (int x=0;x<ctx->current_sps->PicWidthInCtbsY;x++)
1041 {
1042 task.data.task_deblock.ctx = ctx;
1043 //task.data.task_deblock.ctb_x = x;
1044 task.data.task_deblock.ctb_y = y;
1045 task.data.task_deblock.vertical = (pass==0);
1046
1047 add_task(&ctx->thread_pool, &task);
1048 }
1049 }
1050 #endif
1051 }
1052 }
1053 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "deblock.h"
21 #include "util.h"
22 #include "transform.h"
23 #include "de265.h"
24
25 #include <assert.h>
26
27
28
29 // 8.7.2.1 for both EDGE_HOR and EDGE_VER at the same time
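// Note: the edge flags are recorded on a 4-sample grid (the k+=4 loops below),
// so each marked position stands for one 4-sample edge segment.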
30 void markTransformBlockBoundary(de265_image* img, int x0,int y0,
31 int log2TrafoSize,int trafoDepth,
32 int filterLeftCbEdge, int filterTopCbEdge)
33 {
34 logtrace(LogDeblock,"markTransformBlockBoundary(%d,%d, %d,%d, %d,%d)\n",x0,y0,
35 log2TrafoSize,trafoDepth, filterLeftCbEdge,filterTopCbEdge);
36
37 int split_transform = img->get_split_transform_flag(x0,y0,trafoDepth);
38 if (split_transform) {
39 int x1 = x0 + ((1<<log2TrafoSize)>>1);
40 int y1 = y0 + ((1<<log2TrafoSize)>>1);
41
42 markTransformBlockBoundary(img,x0,y0,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, filterTopCbEdge);
43 markTransformBlockBoundary(img,x1,y0,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, filterTopCbEdge);
44 markTransformBlockBoundary(img,x0,y1,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, DEBLOCK_FLAG_HORIZ);
45 markTransformBlockBoundary(img,x1,y1,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, DEBLOCK_FLAG_HORIZ);
46 }
47 else {
48 // VER
49
50 for (int k=0;k<(1<<log2TrafoSize);k+=4) {
51 img->set_deblk_flags(x0,y0+k, filterLeftCbEdge);
52 }
53
54 // HOR
55
56 for (int k=0;k<(1<<log2TrafoSize);k+=4) {
57 img->set_deblk_flags(x0+k,y0, filterTopCbEdge);
58 }
59 }
60 }
61
62
63
64 // 8.7.2.2 for both EDGE_HOR and EDGE_VER at the same time
65 void markPredictionBlockBoundary(de265_image* img, int x0,int y0,
66 int log2CbSize,
67 int filterLeftCbEdge, int filterTopCbEdge)
68 {
69 logtrace(LogDeblock,"markPredictionBlockBoundary(%d,%d, %d, %d,%d)\n",x0,y0,
70 log2CbSize, filterLeftCbEdge,filterTopCbEdge);
71
72 enum PartMode partMode = img->get_PartMode(x0,y0);
73
74 int cbSize = 1<<log2CbSize;
75 int cbSize2 = 1<<(log2CbSize-1);
76 int cbSize4 = 1<<(log2CbSize-2);
77
78 switch (partMode) {
79 case PART_NxN:
80 for (int k=0;k<cbSize;k++) {
81 img->set_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI);
82 img->set_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ);
83 }
84 break;
85
86 case PART_Nx2N:
87 for (int k=0;k<cbSize;k++) {
88 img->set_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI);
89 }
90 break;
91
92 case PART_2NxN:
93 for (int k=0;k<cbSize;k++) {
94 img->set_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ);
95 }
96 break;
97
98 case PART_nLx2N:
99 for (int k=0;k<cbSize;k++) {
100 img->set_deblk_flags(x0+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI);
101 }
102 break;
103
104 case PART_nRx2N:
105 for (int k=0;k<cbSize;k++) {
106 img->set_deblk_flags(x0+cbSize2+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI);
107 }
108 break;
109
110 case PART_2NxnU:
111 for (int k=0;k<cbSize;k++) {
112 img->set_deblk_flags(x0+k,y0+cbSize4, DEBLOCK_PB_EDGE_HORIZ);
113 }
114 break;
115
116 case PART_2NxnD:
117 for (int k=0;k<cbSize;k++) {
118 img->set_deblk_flags(x0+k,y0+cbSize2+cbSize4, DEBLOCK_PB_EDGE_HORIZ);
119 }
120 break;
121
122 case PART_2Nx2N:
123 // NOP
124 break;
125 }
126 }
127
128
129 bool derive_edgeFlags_CTBRow(de265_image* img, int ctby)
130 {
131 const int minCbSize = img->sps.MinCbSizeY;
132 bool deblocking_enabled=false; // whether deblocking is enabled in some part of the image
133
134 int ctb_mask = (1<<img->sps.Log2CtbSizeY)-1;
135 int picWidthInCtbs = img->sps.PicWidthInCtbsY;
136 int ctbshift = img->sps.Log2CtbSizeY;
137
138 const pic_parameter_set* pps = &img->pps;
139
140
141 int cb_y_start = ( ctby << img->sps.Log2CtbSizeY) >> img->sps.Log2MinCbSizeY;
142 int cb_y_end = ((ctby+1) << img->sps.Log2CtbSizeY) >> img->sps.Log2MinCbSizeY;
143
144 cb_y_end = std::min(cb_y_end, img->sps.PicHeightInMinCbsY);
145
146 for (int cb_y=cb_y_start;cb_y<cb_y_end;cb_y++)
147 for (int cb_x=0;cb_x<img->sps.PicWidthInMinCbsY;cb_x++)
148 {
149 int log2CbSize = img->get_log2CbSize_cbUnits(cb_x,cb_y);
150 if (log2CbSize==0) {
151 continue;
152 }
153
154 // we are now at the top-left corner of a CB
155
156 int x0 = cb_x * minCbSize;
157 int y0 = cb_y * minCbSize;
158
159 int x0ctb = x0 >> ctbshift;
160 int y0ctb = y0 >> ctbshift;
161
162 // check whether we should filter this slice
163
164 slice_segment_header* shdr = img->get_SliceHeader(x0,y0);
165
166 // check whether to filter left and top edge
167
168 uint8_t filterLeftCbEdge = DEBLOCK_FLAG_VERTI;
169 uint8_t filterTopCbEdge = DEBLOCK_FLAG_HORIZ;
170 if (x0 == 0) filterLeftCbEdge = 0;
171 if (y0 == 0) filterTopCbEdge = 0;
172
173 // check for slice and tile boundaries (8.7.2, step 2 in both processes)
174
175 if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary
176 if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
177 shdr->SliceAddrRS != img->get_SliceHeader(x0-1,y0)->SliceAddrRS)
178 {
179 filterLeftCbEdge = 0;
180 }
181 else if (pps->loop_filter_across_tiles_enabled_flag == 0 &&
182 pps->TileIdRS[ x0ctb +y0ctb*picWidthInCtbs] !=
183 pps->TileIdRS[((x0-1)>>ctbshift)+y0ctb*picWidthInCtbs]) {
184 filterLeftCbEdge = 0;
185 }
186 }
187
188 if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary
189 if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
190 shdr->SliceAddrRS != img->get_SliceHeader(x0,y0-1)->SliceAddrRS)
191 {
192 filterTopCbEdge = 0;
193 }
194 else if (pps->loop_filter_across_tiles_enabled_flag == 0 &&
195 pps->TileIdRS[x0ctb+ y0ctb *picWidthInCtbs] !=
196 pps->TileIdRS[x0ctb+((y0-1)>>ctbshift)*picWidthInCtbs]) {
197 filterTopCbEdge = 0;
198 }
199 }
200
201
202 // mark edges
203
204 if (shdr->slice_deblocking_filter_disabled_flag==0) {
205 deblocking_enabled=true;
206
207 markTransformBlockBoundary(img, x0,y0, log2CbSize,0,
208 filterLeftCbEdge, filterTopCbEdge);
209
210 markPredictionBlockBoundary(img, x0,y0, log2CbSize,
211 filterLeftCbEdge, filterTopCbEdge);
212 }
213 }
214
215 return deblocking_enabled;
216 }
217
218
219 bool derive_edgeFlags(de265_image* img)
220 {
221 bool deblocking_enabled=false;
222
223 for (int y=0;y<img->sps.PicHeightInCtbsY;y++) {
224 deblocking_enabled |= derive_edgeFlags_CTBRow(img,y);
225 }
226
227 return deblocking_enabled;
228 }
229
230
231 // 8.7.2.3 (both, EDGE_VER and EDGE_HOR)
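// Summary of the rules implemented below: bS=2 if either side of the edge is
// intra predicted; bS=1 if it is a transform-block edge with non-zero
// coefficients on either side, if the two sides use different reference
// pictures, or if some motion-vector component differs by 4 or more
// (one luma sample in quarter-pel units); otherwise bS=0.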
232 void derive_boundaryStrength(de265_image* img, bool vertical, int yStart,int yEnd,
233 int xStart,int xEnd)
234 {
235 int xIncr = vertical ? 2 : 1;
236 int yIncr = vertical ? 1 : 2;
237 int xOffs = vertical ? 1 : 0;
238 int yOffs = vertical ? 0 : 1;
239 int edgeMask = vertical ?
240 (DEBLOCK_FLAG_VERTI | DEBLOCK_PB_EDGE_VERTI) :
241 (DEBLOCK_FLAG_HORIZ | DEBLOCK_PB_EDGE_HORIZ);
242 int transformEdgeMask = vertical ? DEBLOCK_FLAG_VERTI : DEBLOCK_FLAG_HORIZ;
243
244 xEnd = libde265_min(xEnd,img->get_deblk_width());
245 yEnd = libde265_min(yEnd,img->get_deblk_height());
246
247 int TUShift = img->sps.Log2MinTrafoSize;
248 int TUStride= img->sps.PicWidthInTbsY;
249
250 for (int y=yStart;y<yEnd;y+=yIncr)
251 for (int x=xStart;x<xEnd;x+=xIncr) {
252 int xDi = x<<2;
253 int yDi = y<<2;
254
255 logtrace(LogDeblock,"%d %d %s = %s\n",xDi,yDi, vertical?"Vertical":"Horizontal",
256 (img->get_deblk_flags(xDi,yDi) & edgeMask) ? "edge" : "...");
257
258 uint8_t edgeFlags = img->get_deblk_flags(xDi,yDi);
259
260 if (edgeFlags & edgeMask) {
261 bool p_is_intra_pred = (img->get_pred_mode(xDi-xOffs, yDi-yOffs) == MODE_INTRA);
262 bool q_is_intra_pred = (img->get_pred_mode(xDi, yDi ) == MODE_INTRA);
263
264 int bS;
265
266 if (p_is_intra_pred || q_is_intra_pred) {
267 bS = 2;
268 }
269 else {
270 // opposite side of the edge
271 int xDiOpp = xDi-xOffs;
272 int yDiOpp = yDi-yOffs;
273
274 if ((edgeFlags & transformEdgeMask) &&
275 (img->get_nonzero_coefficient(xDi ,yDi) ||
276 img->get_nonzero_coefficient(xDiOpp,yDiOpp))) {
277 bS = 1;
278 }
279 else {
280
281 bS = 0;
282
283 const PredVectorInfo* mviP = img->get_mv_info(xDiOpp,yDiOpp);
284 const PredVectorInfo* mviQ = img->get_mv_info(xDi ,yDi);
285
286 slice_segment_header* shdrP = img->get_SliceHeader(xDiOpp,yDiOpp);
287 slice_segment_header* shdrQ = img->get_SliceHeader(xDi ,yDi);
288
289 int refPicP0 = mviP->predFlag[0] ? shdrP->RefPicList[0][ mviP->refIdx[0] ] : -1;
290 int refPicP1 = mviP->predFlag[1] ? shdrP->RefPicList[1][ mviP->refIdx[1] ] : -1;
291 int refPicQ0 = mviQ->predFlag[0] ? shdrQ->RefPicList[0][ mviQ->refIdx[0] ] : -1;
292 int refPicQ1 = mviQ->predFlag[1] ? shdrQ->RefPicList[1][ mviQ->refIdx[1] ] : -1;
293
294 bool samePics = ((refPicP0==refPicQ0 && refPicP1==refPicQ1) ||
295 (refPicP0==refPicQ1 && refPicP1==refPicQ0));
296
297 if (!samePics) {
298 bS = 1;
299 }
300 else {
301 MotionVector mvP0 = mviP->mv[0]; if (!mviP->predFlag[0]) { mvP0.x=mvP0.y=0; }
302 MotionVector mvP1 = mviP->mv[1]; if (!mviP->predFlag[1]) { mvP1.x=mvP1.y=0; }
303 MotionVector mvQ0 = mviQ->mv[0]; if (!mviQ->predFlag[0]) { mvQ0.x=mvQ0.y=0; }
304 MotionVector mvQ1 = mviQ->mv[1]; if (!mviQ->predFlag[1]) { mvQ1.x=mvQ1.y=0; }
305
306 int numMV_P = mviP->predFlag[0] + mviP->predFlag[1];
307 int numMV_Q = mviQ->predFlag[0] + mviQ->predFlag[1];
308
309 if (numMV_P!=numMV_Q) {
310 img->decctx->add_warning(DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ, false);
311 img->integrity = INTEGRITY_DECODING_ERRORS;
312 }
313
314 // two different reference pictures or only one reference picture
315 if (refPicP0 != refPicP1) {
316
317 if (refPicP0 == refPicQ0) {
318 if (abs_value(mvP0.x-mvQ0.x) >= 4 ||
319 abs_value(mvP0.y-mvQ0.y) >= 4 ||
320 abs_value(mvP1.x-mvQ1.x) >= 4 ||
321 abs_value(mvP1.y-mvQ1.y) >= 4) {
322 bS = 1;
323 }
324 }
325 else {
326 if (abs_value(mvP0.x-mvQ1.x) >= 4 ||
327 abs_value(mvP0.y-mvQ1.y) >= 4 ||
328 abs_value(mvP1.x-mvQ0.x) >= 4 ||
329 abs_value(mvP1.y-mvQ0.y) >= 4) {
330 bS = 1;
331 }
332 }
333 }
334 else {
335 assert(refPicQ0==refPicQ1);
336
337 if ((abs_value(mvP0.x-mvQ0.x) >= 4 ||
338 abs_value(mvP0.y-mvQ0.y) >= 4 ||
339 abs_value(mvP1.x-mvQ1.x) >= 4 ||
340 abs_value(mvP1.y-mvQ1.y) >= 4)
341 &&
342 (abs_value(mvP0.x-mvQ1.x) >= 4 ||
343 abs_value(mvP0.y-mvQ1.y) >= 4 ||
344 abs_value(mvP1.x-mvQ0.x) >= 4 ||
345 abs_value(mvP1.y-mvQ0.y) >= 4)) {
346 bS = 1;
347 }
348 }
349 }
350
351 /*
352 printf("unimplemented deblocking code for CU at %d;%d\n",xDi,yDi);
353
354 logerror(LogDeblock, "unimplemented code reached (file %s, line %d)\n",
355 __FILE__, __LINE__);
356 */
357 }
358 }
359
360 img->set_deblk_bS(xDi,yDi, bS);
361 }
362 else {
363 img->set_deblk_bS(xDi,yDi, 0);
364 }
365 }
366 }
367
368
369 void derive_boundaryStrength_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
370 {
371 int ctbSize = img->sps.CtbSizeY;
372 int deblkSize = ctbSize/4;
373
374 derive_boundaryStrength(img,vertical,
375 yCtb*deblkSize, (yCtb+1)*deblkSize,
376 xCtb*deblkSize, (xCtb+1)*deblkSize);
377 }
378
379
380 static uint8_t table_8_23_beta[52] = {
381 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8,
382 9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36,
383 38,40,42,44,46,48,50,52,54,56,58,60,62,64
384 };
385
386 static uint8_t table_8_23_tc[54] = {
387 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
388 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
389 5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24
390 };
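// Illustrative lookup (values chosen only for illustration, assuming 8-bit
// video, bS=2 and zero slice beta/tc offsets): for qP_L=32,
//   Q_beta = Clip3(0,51, 32+0)         = 32 -> beta' = table_8_23_beta[32] = 26 -> beta = 26
//   Q_tc   = Clip3(0,53, 32+2*(2-1)+0) = 34 -> tc'   = table_8_23_tc[34]   = 3  -> tc   = 3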
391
392
393
394 // 8.7.2.4
395 void edge_filtering_luma(de265_image* img, bool vertical,
396 int yStart,int yEnd, int xStart,int xEnd)
397 {
398 int xIncr = vertical ? 2 : 1;
399 int yIncr = vertical ? 1 : 2;
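  // x,y count units of 4 luma samples (xDi = x<<2 below); with the increments
  // above, candidate edges lie on an 8-sample grid and each (x,y) position
  // covers one 4-sample edge segment (the k loop further down).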
400
401 const int stride = img->get_image_stride(0);
402
403 int bitDepth_Y = img->sps.BitDepth_Y;
404
405 xEnd = libde265_min(xEnd,img->get_deblk_width());
406 yEnd = libde265_min(yEnd,img->get_deblk_height());
407
408 for (int y=yStart;y<yEnd;y+=yIncr)
409 for (int x=xStart;x<xEnd;x+=xIncr) {
410 int xDi = x<<2;
411 int yDi = y<<2;
412 int bS = img->get_deblk_bS(xDi,yDi);
413
414 logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n",
415 img->PicOrderCntVal,vertical ? 'V':'H',xDi,yDi,bS);
416
417 #if 0
418 {
419 uint8_t* ptr = img->y + stride*yDi + xDi;
420
421 for (int dy=-4;dy<4;dy++) {
422 for (int dx=-4;dx<4;dx++) {
423 printf("%02x ", ptr[dy*stride + dx]);
424 if (dx==-1) printf("| ");
425 }
426 printf("\n");
427 if (dy==-1) printf("-------------------------\n");
428 }
429 }
430 #endif
431
432 #if 0
433 if (!vertical)
434 {
435 uint8_t* ptr = img->y + stride*yDi + xDi;
436
437 for (int dy=-4;dy<4;dy++) {
438 for (int dx=0;dx<4;dx++) {
439 printf("%02x ", ptr[dy*stride + dx]);
440 if (dx==-1) printf("| ");
441 }
442 printf("\n");
443 if (dy==-1) printf("-------------------------\n");
444 }
445 }
446 #endif
447
448 if (bS>0) {
449
450 // 8.7.2.4.3
451
452 uint8_t* ptr = img->get_image_plane_at_pos(0, xDi,yDi);
453
454 uint8_t q[4][4], p[4][4];
455 for (int k=0;k<4;k++)
456 for (int i=0;i<4;i++)
457 {
458 if (vertical) {
459 q[k][i] = ptr[ i +k*stride];
460 p[k][i] = ptr[-i-1+k*stride];
461 }
462 else {
463 q[k][i] = ptr[k + i *stride];
464 p[k][i] = ptr[k -(i+1)*stride];
465 }
466 }
467
468 #if 0
469 for (int k=0;k<4;k++)
470 {
471 for (int i=0;i<4;i++)
472 {
473 printf("%02x ", p[k][3-i]);
474 }
475
476 printf("| ");
477
478 for (int i=0;i<4;i++)
479 {
480 printf("%02x ", q[k][i]);
481 }
482 printf("\n");
483 }
484 #endif
485
486
487 int QP_Q = img->get_QPY(xDi,yDi);
488 int QP_P = (vertical ?
489 img->get_QPY(xDi-1,yDi) :
490 img->get_QPY(xDi,yDi-1) );
491 int qP_L = (QP_Q+QP_P+1)>>1;
492
493 logtrace(LogDeblock,"QP: %d & %d -> %d\n",QP_Q,QP_P,qP_L);
494
495 int sliceIndexQ00 = img->get_SliceHeaderIndex(xDi,yDi);
496 int beta_offset = img->slices[sliceIndexQ00]->slice_beta_offset;
497 int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset;
498
499 int Q_beta = Clip3(0,51, qP_L + beta_offset);
500 int betaPrime = table_8_23_beta[Q_beta];
501 int beta = betaPrime * (1<<(bitDepth_Y - 8));
502
503 int Q_tc = Clip3(0,53, qP_L + 2*(bS-1) + tc_offset);
504 int tcPrime = table_8_23_tc[Q_tc];
505 int tc = tcPrime * (1<<(bitDepth_Y - 8));
506
507 logtrace(LogDeblock,"beta: %d (%d) tc: %d (%d)\n",beta,beta_offset, tc,tc_offset);
508
509 int dE=0, dEp=0, dEq=0;
510
511 if (vertical || !vertical) {
512 int dp0 = abs_value(p[0][2] - 2*p[0][1] + p[0][0]);
513 int dp3 = abs_value(p[3][2] - 2*p[3][1] + p[3][0]);
514 int dq0 = abs_value(q[0][2] - 2*q[0][1] + q[0][0]);
515 int dq3 = abs_value(q[3][2] - 2*q[3][1] + q[3][0]);
516
517 int dpq0 = dp0 + dq0;
518 int dpq3 = dp3 + dq3;
519
520 int dp = dp0 + dp3;
521 int dq = dq0 + dq3;
522 int d = dpq0+ dpq3;
523
524 if (d<beta) {
525 //int dpq = 2*dpq0;
526 bool dSam0 = (2*dpq0 < (beta>>2) &&
527 abs_value(p[0][3]-p[0][0])+abs_value(q[0][0]-q[0][3]) < (beta>>3) &&
528 abs_value(p[0][0]-q[0][0]) < ((5*tc+1)>>1));
529
530 bool dSam3 = (2*dpq3 < (beta>>2) &&
531 abs_value(p[3][3]-p[3][0])+abs_value(q[3][0]-q[3][3]) < (beta>>3) &&
532 abs_value(p[3][0]-q[3][0]) < ((5*tc+1)>>1));
533
534 if (dSam0 && dSam3) {
535 dE=2;
536 }
537 else {
538 dE=1;
539 }
540
541 if (dp < ((beta + (beta>>1))>>3)) { dEp=1; }
542 if (dq < ((beta + (beta>>1))>>3)) { dEq=1; }
543
544 logtrace(LogDeblock,"dE:%d dEp:%d dEq:%d\n",dE,dEp,dEq);
545 }
546 }
547 else {
548 // TODO
549 assert(0);
550 }
551
552
553 // 8.7.2.4.4
554
555 if (dE != 0) {
556 bool filterP = true;
557 bool filterQ = true;
558
559 if (vertical) {
560 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi-1,yDi)) filterP=false;
561 if (img->get_cu_transquant_bypass(xDi-1,yDi)) filterP=false;
562
563 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false;
564 if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false;
565 }
566 else {
567 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi-1)) filterP=false;
568 if (img->get_cu_transquant_bypass(xDi,yDi-1)) filterP=false;
569
570 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false;
571 if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false;
572 }
573
574 for (int k=0;k<4;k++) {
575 //int nDp,nDq;
576
577 logtrace(LogDeblock,"line:%d\n",k);
578
579 const uint8_t p0 = p[k][0];
580 const uint8_t p1 = p[k][1];
581 const uint8_t p2 = p[k][2];
582 const uint8_t p3 = p[k][3];
583 const uint8_t q0 = q[k][0];
584 const uint8_t q1 = q[k][1];
585 const uint8_t q2 = q[k][2];
586 const uint8_t q3 = q[k][3];
587
588 if (dE==2) {
589 // strong filtering
590
591 //nDp=nDq=3;
592
593 uint8_t pnew[3],qnew[3];
594 pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3);
595 pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2);
596 pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
597 qnew[0] = Clip3(q0-2*tc,q0+2*tc, (p1+2*p0+2*q0+2*q1+q2+4)>>3);
598 qnew[1] = Clip3(q1-2*tc,q1+2*tc, (p0+q0+q1+q2+2)>>2);
599 qnew[2] = Clip3(q2-2*tc,q2+2*tc, (p0+q0+q1+3*q2+2*q3+4)>>3);
600
601 logtrace(LogDeblock,"strong filtering\n");
602
603 if (vertical) {
604 for (int i=0;i<3;i++) {
605 if (filterP) { ptr[-i-1+k*stride] = pnew[i]; }
606 if (filterQ) { ptr[ i + k*stride] = qnew[i]; }
607 }
608
609 // ptr[-1+k*stride] = ptr[ 0+k*stride] = 200;
610 }
611 else {
612 for (int i=0;i<3;i++) {
613 if (filterP) { ptr[ k -(i+1)*stride] = pnew[i]; }
614 if (filterQ) { ptr[ k + i *stride] = qnew[i]; }
615 }
616 }
617 }
618 else {
619 // weak filtering
620
621 //nDp=nDq=0;
622
623 int delta = (9*(q0-p0) - 3*(q1-p1) + 8)>>4;
624 logtrace(LogDeblock,"delta=%d, tc=%d\n",delta,tc);
625
626 if (abs_value(delta) < tc*10) {
627
628 delta = Clip3(-tc,tc,delta);
629 logtrace(LogDeblock," deblk + %d;%d [%02x->%02x] - %d;%d [%02x->%02x] delta:%d\n",
630 vertical ? xDi-1 : xDi+k,
631 vertical ? yDi+k : yDi-1, p0,Clip1_8bit(p0+delta),
632 vertical ? xDi : xDi+k,
633 vertical ? yDi+k : yDi, q0,Clip1_8bit(q0-delta),
634 delta);
635
636 if (vertical) {
637 if (filterP) { ptr[-0-1+k*stride] = Clip1_8bit(p0+delta); }
638 if (filterQ) { ptr[ 0 +k*stride] = Clip1_8bit(q0-delta); }
639 }
640 else {
641 if (filterP) { ptr[ k -1*stride] = Clip1_8bit(p0+delta); }
642 if (filterQ) { ptr[ k +0*stride] = Clip1_8bit(q0-delta); }
643 }
644
645 //ptr[ 0+k*stride] = 200;
646
647 if (dEp==1 && filterP) {
648 int delta_p = Clip3(-(tc>>1), tc>>1, (((p2+p0+1)>>1)-p1+delta)>>1);
649
650 logtrace(LogDeblock," deblk dEp %d;%d delta:%d\n",
651 vertical ? xDi-2 : xDi+k,
652 vertical ? yDi+k : yDi-2,
653 delta_p);
654
655 if (vertical) { ptr[-1-1+k*stride] = Clip1_8bit(p1+delta_p); }
656 else { ptr[ k -2*stride] = Clip1_8bit(p1+delta_p); }
657 }
658
659 if (dEq==1 && filterQ) {
660 int delta_q = Clip3(-(tc>>1), tc>>1, (((q2+q0+1)>>1)-q1-delta)>>1);
661
662 logtrace(LogDeblock," deblk dEq %d;%d delta:%d\n",
663 vertical ? xDi+1 : xDi+k,
664 vertical ? yDi+k : yDi+1,
665 delta_q);
666
667 if (vertical) { ptr[ 1 +k*stride] = Clip1_8bit(q1+delta_q); }
668 else { ptr[ k +1*stride] = Clip1_8bit(q1+delta_q); }
669 }
670
671 //nDp = dEp+1;
672 //nDq = dEq+1;
673
674 //logtrace(LogDeblock,"weak filtering (%d:%d)\n",nDp,nDq);
675 }
676 }
677 }
678 }
679 }
680 }
681 }
682
683
684 void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
685 {
686 int ctbSize = img->sps.CtbSizeY;
687 int deblkSize = ctbSize/4;
688
689 edge_filtering_luma(img,vertical,
690 yCtb*deblkSize, (yCtb+1)*deblkSize,
691 xCtb*deblkSize, (xCtb+1)*deblkSize);
692 }
693
694
695
696
697 // 8.7.2.4
698 void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
699 int xStart,int xEnd)
700 {
701 int xIncr = vertical ? 4 : 2;
702 int yIncr = vertical ? 2 : 4;
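  // x,y again count units of 4 luma samples; xDi/yDi are the corresponding
  // chroma sample positions (assuming 4:2:0 subsampling), which is why the
  // boundary-strength map below is indexed with the luma coordinates 2*xDi,2*yDi.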
703
704 const int stride = img->get_image_stride(1);
705
706 xEnd = libde265_min(xEnd,img->get_deblk_width());
707 yEnd = libde265_min(yEnd,img->get_deblk_height());
708
709 for (int y=yStart;y<yEnd;y+=yIncr)
710 for (int x=xStart;x<xEnd;x+=xIncr) {
711 int xDi = x*2;
712 int yDi = y*2;
713 int bS = img->get_deblk_bS(2*xDi,2*yDi);
714
715 if (bS>1) {
716 // 8.7.2.4.5
717
718 for (int cplane=0;cplane<2;cplane++) {
719 int cQpPicOffset = (cplane==0 ?
720 img->pps.pic_cb_qp_offset :
721 img->pps.pic_cr_qp_offset);
722
723 uint8_t* ptr = img->get_image_plane_at_pos(cplane+1, xDi,yDi);
724
725 uint8_t p[2][4];
726 uint8_t q[2][4];
727
728 logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi);
729
730 for (int i=0;i<2;i++)
731 for (int k=0;k<4;k++)
732 {
733 if (vertical) {
734 q[i][k] = ptr[ i +k*stride];
735 p[i][k] = ptr[-i-1+k*stride];
736 }
737 else {
738 q[i][k] = ptr[k + i *stride];
739 p[i][k] = ptr[k -(i+1)*stride];
740 }
741 }
742
743 #if 0
744 for (int k=0;k<4;k++)
745 {
746 for (int i=0;i<2;i++)
747 {
748 printf("%02x ", p[1-i][k]);
749 }
750
751 printf("| ");
752
753 for (int i=0;i<2;i++)
754 {
755 printf("%02x ", q[i][k]);
756 }
757 printf("\n");
758 }
759 #endif
760
761 int QP_Q = img->get_QPY(2*xDi,2*yDi);
762 int QP_P = (vertical ?
763 img->get_QPY(2*xDi-1,2*yDi) :
764 img->get_QPY(2*xDi,2*yDi-1));
765 int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset;
766 int QP_C = table8_22(qP_i);
767
768 //printf("POC=%d\n",ctx->img->PicOrderCntVal);
769 logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d (QP_C=%d)\n",
770 2*xDi,2*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C);
771
772 int sliceIndexQ00 = img->get_SliceHeaderIndex(2*xDi,2*yDi);
773 int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset;
774
775 int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset);
776
777 int tcPrime = table_8_23_tc[Q];
778 int tc = tcPrime * (1<<(img->sps.BitDepth_C - 8));
779
780 logtrace(LogDeblock,"tc_offset=%d Q=%d tc'=%d tc=%d\n",tc_offset,Q,tcPrime,tc);
781
782 if (vertical) {
783 bool filterP = true;
784 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi-1,2*yDi)) filterP=false;
785 if (img->get_cu_transquant_bypass(2*xDi-1,2*yDi)) filterP=false;
786
787 bool filterQ = true;
788 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi)) filterQ=false;
789 if (img->get_cu_transquant_bypass(2*xDi,2*yDi)) filterQ=false;
790
791
792 for (int k=0;k<4;k++) {
793 int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
794 logtrace(LogDeblock,"delta=%d\n",delta);
795 if (filterP) { ptr[-1+k*stride] = Clip1_8bit(p[0][k]+delta); }
796 if (filterQ) { ptr[ 0+k*stride] = Clip1_8bit(q[0][k]-delta); }
797 }
798 }
799 else {
800 bool filterP = true;
801 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi-1)) filterP=false;
802 if (img->get_cu_transquant_bypass(2*xDi,2*yDi-1)) filterP=false;
803
804 bool filterQ = true;
805 if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi)) filterQ=false;
806 if (img->get_cu_transquant_bypass(2*xDi,2*yDi)) filterQ=false;
807
808 for (int k=0;k<4;k++) {
809 int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
810 if (filterP) { ptr[ k-1*stride] = Clip1_8bit(p[0][k]+delta); }
811 if (filterQ) { ptr[ k+0*stride] = Clip1_8bit(q[0][k]-delta); }
812 }
813 }
814 }
815 }
816 }
817 }
818
819 void edge_filtering_chroma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
820 {
821 int ctbSize = img->sps.CtbSizeY;
822 int deblkSize = ctbSize/4;
823
824 edge_filtering_chroma(img,vertical,
825 yCtb*deblkSize, (yCtb+1)*deblkSize,
826 xCtb*deblkSize, (xCtb+1)*deblkSize);
827 }
828
829
830
831 class thread_task_deblock_CTBRow : public thread_task
832 {
833 public:
834 struct de265_image* img;
835 int ctb_y;
836 bool vertical;
837
838 virtual void work();
839 };
840
841
842 void thread_task_deblock_CTBRow::work()
843 {
844 state = Running;
845 img->thread_run();
846
847 int xStart=0;
848 int xEnd = img->get_deblk_width();
849
850 int ctbSize = img->sps.CtbSizeY;
851 int deblkSize = ctbSize/4;
852
853 int first = ctb_y * deblkSize;
854 int last = (ctb_y+1) * deblkSize;
855 if (last > img->get_deblk_height()) {
856 last = img->get_deblk_height();
857 }
858
859 int finalProgress = CTB_PROGRESS_DEBLK_V;
860 if (!vertical) finalProgress = CTB_PROGRESS_DEBLK_H;
861
862 int rightCtb = img->sps.PicWidthInCtbsY-1;
863
864 if (vertical) {
865 // pass 1: vertical
866
867 int CtbRow = std::min(ctb_y+1 , img->sps.PicHeightInCtbsY-1);
868 img->wait_for_progress(this, rightCtb,CtbRow, CTB_PROGRESS_PREFILTER);
869 }
870 else {
871 // pass 2: horizontal
872
873 if (ctb_y>0) {
874 img->wait_for_progress(this, rightCtb,ctb_y-1, CTB_PROGRESS_DEBLK_V);
875 }
876
877 img->wait_for_progress(this, rightCtb,ctb_y, CTB_PROGRESS_DEBLK_V);
878
879 if (ctb_y+1<img->sps.PicHeightInCtbsY) {
880 img->wait_for_progress(this, rightCtb,ctb_y+1, CTB_PROGRESS_DEBLK_V);
881 }
882 }
883
884 //printf("deblock %d to %d orientation: %d\n",first,last,vertical);
885
886 bool deblocking_enabled;
887
888 // first pass: check edge flags and whether we have to deblock
889 if (vertical) {
890 deblocking_enabled = derive_edgeFlags_CTBRow(img, ctb_y);
891
892 //for (int x=0;x<=rightCtb;x++) {
893 int x=0; img->set_CtbDeblockFlag(x,ctb_y, deblocking_enabled);
894 //}
895 }
896 else {
897 int x=0; deblocking_enabled=img->get_CtbDeblockFlag(x,ctb_y);
898 }
899
900 if (deblocking_enabled) {
901 derive_boundaryStrength(img, vertical, first,last, xStart,xEnd);
902 edge_filtering_luma (img, vertical, first,last, xStart,xEnd);
903 edge_filtering_chroma (img, vertical, first,last, xStart,xEnd);
904 }
905
906 for (int x=0;x<=rightCtb;x++) {
907 const int CtbWidth = img->sps.PicWidthInCtbsY;
908 img->ctb_progress[x+ctb_y*CtbWidth].set_progress(finalProgress);
909 }
910
911 state = Finished;
912 img->thread_finishes();
913 }
914
915
916 void add_deblocking_tasks(image_unit* imgunit)
917 {
918 de265_image* img = imgunit->img;
919 decoder_context* ctx = img->decctx;
920
921 int nRows = img->sps.PicHeightInCtbsY;
922
923 int n=0;
924 img->thread_start(nRows*2);
925
926 for (int pass=0;pass<2;pass++)
927 {
928 for (int y=0;y<img->sps.PicHeightInCtbsY;y++)
929 {
930 thread_task_deblock_CTBRow* task = new thread_task_deblock_CTBRow;
931
932 task->img = img;
933 task->ctb_y = y;
934 task->vertical = (pass==0);
935
936 imgunit->tasks.push_back(task);
937 add_task(&ctx->thread_pool, task);
938 n++;
939 }
940 }
941 }
942
943
944 void apply_deblocking_filter(de265_image* img) // decoder_context* ctx)
945 {
946 decoder_context* ctx = img->decctx;
947
948 char enabled_deblocking = derive_edgeFlags(img);
949
950 if (enabled_deblocking)
951 {
952 // vertical filtering
953
954 logtrace(LogDeblock,"VERTICAL\n");
955 derive_boundaryStrength(img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
956 edge_filtering_luma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
957 edge_filtering_chroma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
958
959 #if 0
960 char buf[1000];
961 sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal);
962 write_picture_to_file(ctx->img, buf);
963 #endif
964
965 // horizontal filtering
966
967 logtrace(LogDeblock,"HORIZONTAL\n");
968 derive_boundaryStrength(img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
969 edge_filtering_luma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
970 edge_filtering_chroma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
971
972 #if 0
973 sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal);
974 write_picture_to_file(ctx->img, buf);
975 #endif
976 }
977 }
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2222
2323 #include "libde265/decctx.h"
2424
25 void apply_deblocking_filter(decoder_context* ctx);
25 void add_deblocking_tasks(image_unit* imgunit);
26 void apply_deblocking_filter(de265_image* img); //decoder_context* ctx);
2627
2728 #endif
+0
-1768
libde265/decctx.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "decctx.h"
21 #include "util.h"
22 #include "pps_func.h"
23 #include "sps_func.h"
24 #include "sao.h"
25 #include "sei.h"
26 #include "deblock.h"
27
28 #include <string.h>
29 #include <assert.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <math.h>
33
34 #include "fallback.h"
35
36 #ifdef HAVE_CONFIG_H
37 #include "config.h"
38 #endif
39
40 #ifdef HAVE_SSE4_1
41 #include "x86/sse.h"
42 #endif
43
44 #define SAVE_INTERMEDIATE_IMAGES 0
45
46
47 void init_decoder_context(decoder_context* ctx)
48 {
49 memset(ctx, 0, sizeof(decoder_context));
50
51 // --- parameters ---
52
53 ctx->param_sei_check_hash = false;
54 ctx->param_HighestTid = 999; // unlimited
55 ctx->param_conceal_stream_errors = true;
56
57 // --- processing ---
58
59 set_acceleration_functions(ctx,de265_acceleration_AUTO);
60
61 ctx->param_sps_headers_fd = -1;
62 ctx->param_vps_headers_fd = -1;
63 ctx->param_pps_headers_fd = -1;
64 ctx->param_slice_headers_fd = -1;
65
66 // --- internal data ---
67
68 for (int i=0;i<DE265_MAX_SPS_SETS;i++) {
69 init_sps(&ctx->sps[i]);
70 }
71
72 for (int i=0;i<DE265_DPB_SIZE;i++) {
73 de265_init_image(&ctx->dpb[i]);
74 }
75
76 ctx->first_decoded_picture = true;
77 //ctx->FirstAfterEndOfSequenceNAL = true;
78 //ctx->last_RAP_picture_NAL_type = NAL_UNIT_UNDEFINED;
79
80 //de265_init_image(&ctx->coeff);
81
82 // --- decoded picture buffer ---
83
84 ctx->current_image_poc_lsb = -1; // any invalid number
85
86 for (int i=0;i<MAX_THREAD_CONTEXTS;i++) {
87 ctx->thread_context[i].coeffBuf = (int16_t *) &ctx->thread_context[i]._coeffBuf;
88 // some compilers/linkers don't align struct members correctly,
89 // adjust if necessary
90 int offset = (uintptr_t) ctx->thread_context[i].coeffBuf & 0x0f;
91 if (offset != 0) {
92 ctx->thread_context[i].coeffBuf = (int16_t *) (((uint8_t *)ctx->thread_context[i].coeffBuf) + (16-offset));
93 }
94 }
95 }
96
97
98 void set_acceleration_functions(decoder_context* ctx, enum de265_acceleration l)
99 {
100 // fill scalar functions first (so that function table is completely filled)
101
102 init_acceleration_functions_fallback(&ctx->acceleration);
103
104
105 // override functions with optimized variants
106
107 #ifdef HAVE_SSE4_1
108 if (l>=de265_acceleration_SSE) {
109 init_acceleration_functions_sse(&ctx->acceleration);
110 }
111 #endif
112 }
113
114
115 NAL_unit* alloc_NAL_unit(decoder_context* ctx, int size, int skipped_size)
116 {
117 NAL_unit* nal;
118
119 // --- get NAL-unit object ---
120
121 if (ctx->NAL_free_list == NULL ||
122 ctx->NAL_free_list_len==0) {
123 nal = (NAL_unit*)calloc( sizeof(NAL_unit),1 );
124 rbsp_buffer_init(&nal->nal_data);
125 }
126 else {
127 ctx->NAL_free_list_len--;
128 nal = ctx->NAL_free_list[ctx->NAL_free_list_len];
129 }
130
131
132 // --- allocate skipped-bytes set ---
133
134 if (skipped_size>0 && skipped_size>nal->max_skipped_bytes) {
135 nal->skipped_bytes = (int*)realloc( nal->skipped_bytes, skipped_size*sizeof(int) );
136 nal->max_skipped_bytes = skipped_size;
137 }
138
139 nal->num_skipped_bytes = 0;
140 nal->nal_data.size = 0;
141 rbsp_buffer_resize(&nal->nal_data, size);
142
143 return nal;
144 }
145
146 void free_NAL_unit(decoder_context* ctx, NAL_unit* nal)
147 {
148 // --- allocate free list if not already there ---
149
150 if (ctx->NAL_free_list == NULL) {
151 ctx->NAL_free_list_size = DE265_NAL_FREE_LIST_SIZE;
152 ctx->NAL_free_list = (NAL_unit**)malloc( ctx->NAL_free_list_size * sizeof(NAL_unit*) );
153 }
154
155
156 // --- put into free-list if not full ---
157
158 if (ctx->NAL_free_list_len < ctx->NAL_free_list_size) {
159 ctx->NAL_free_list[ ctx->NAL_free_list_len ] = nal;
160 ctx->NAL_free_list_len++;
161 }
162 else {
163 rbsp_buffer_free(&nal->nal_data);
164 free(nal->skipped_bytes);
165 free(nal);
166 }
167 }
168
169 NAL_unit* pop_from_NAL_queue(decoder_context* ctx)
170 {
171 if (ctx->NAL_queue_len==0) {
172 return NULL;
173 }
174 else {
175 assert(ctx->NAL_queue != NULL);
176 ctx->NAL_queue_len--;
177
178 NAL_unit* nal = ctx->NAL_queue[0];
179 memmove(ctx->NAL_queue, ctx->NAL_queue+1, sizeof(NAL_unit*)* ctx->NAL_queue_len);
180
181 ctx->nBytes_in_NAL_queue -= nal->nal_data.size;
182
183 return nal;
184 }
185 }
186
187 void push_to_NAL_queue(decoder_context* ctx,NAL_unit* nal)
188 {
189 if (ctx->NAL_queue == NULL ||
190 ctx->NAL_queue_len == ctx->NAL_queue_size) {
191 ctx->NAL_queue_size += 10;
192 ctx->NAL_queue = (NAL_unit**)realloc(ctx->NAL_queue,
193 sizeof(NAL_unit*) * ctx->NAL_queue_size);
194 }
195
196 ctx->NAL_queue[ ctx->NAL_queue_len ] = nal;
197 ctx->NAL_queue_len++;
198
199 ctx->nBytes_in_NAL_queue += nal->nal_data.size;
200 }
201
202
203 void free_decoder_context(decoder_context* ctx)
204 {
205 // --- free NAL queues ---
206
207 // empty NAL queue
208
209 NAL_unit* nal;
210 while ( (nal = pop_from_NAL_queue(ctx)) ) {
211 free_NAL_unit(ctx,nal);
212 }
213
214 // free the pending input NAL
215
216 if (ctx->pending_input_NAL != NULL) {
217 free_NAL_unit(ctx, ctx->pending_input_NAL);
218 }
219
220 // free all NALs in free-list
221
222 for (int i=0;i<ctx->NAL_free_list_len;i++)
223 {
224 rbsp_buffer_free(&ctx->NAL_free_list[i]->nal_data);
225 free(ctx->NAL_free_list[i]->skipped_bytes);
226 free(ctx->NAL_free_list[i]);
227 }
228
229 // remove lists themselves
230
231 free(ctx->NAL_queue);
232 free(ctx->NAL_free_list);
233
234
235 for (int i=0;i<DE265_MAX_SPS_SETS;i++) {
236 free_sps(&ctx->sps[i]);
237 }
238
239 for (int i=0;i<DE265_DPB_SIZE;i++) {
240 de265_free_image(&ctx->dpb[i]);
241 }
242
243 for (int i=0;i<DE265_MAX_PPS_SETS;i++) {
244 free_pps(&ctx->pps[i]);
245 }
246 }
247
248
249 void reset_decoder_context_for_new_picture(decoder_context* ctx)
250 {
251 }
252
253 void prepare_new_picture(decoder_context* ctx)
254 {
255 prepare_image_for_decoding(ctx->img);
256
257 // initialize threading tasks (TODO: move this to picture initialization)
258 }
259
260
261 void process_nal_hdr(decoder_context* ctx, nal_header* nal)
262 {
263 ctx->nal_unit_type = nal->nal_unit_type;
264
265 ctx->IdrPicFlag = (nal->nal_unit_type == NAL_UNIT_IDR_W_RADL ||
266 nal->nal_unit_type == NAL_UNIT_IDR_N_LP);
267
268 ctx->RapPicFlag = (nal->nal_unit_type >= 16 &&
269 nal->nal_unit_type <= 23);
270 }
271
272
273 void process_vps(decoder_context* ctx, video_parameter_set* vps)
274 {
275 memcpy(&ctx->vps[ vps->video_parameter_set_id ], vps, sizeof(video_parameter_set));
276 }
277
278
279 void process_sps(decoder_context* ctx, seq_parameter_set* sps)
280 {
281 push_current_picture_to_output_queue(ctx);
282
283 move_sps(&ctx->sps[ sps->seq_parameter_set_id ], sps);
284 ctx->HighestTid = libde265_min(sps->sps_max_sub_layers-1, ctx->param_HighestTid);
285 }
286
287
288 void process_pps(decoder_context* ctx, pic_parameter_set* pps)
289 {
290 push_current_picture_to_output_queue(ctx);
291
292 free_pps(&ctx->pps[ (int)pps->pic_parameter_set_id ]);
293 memcpy(&ctx->pps[ (int)pps->pic_parameter_set_id ], pps, sizeof(pic_parameter_set));
294 }
295
296
297 seq_parameter_set* get_sps(decoder_context* ctx, int id)
298 {
299 if (ctx->sps[id].sps_read==false) {
300 logerror(LogHeaders, "SPS %d has not been read\n", id);
301 return NULL;
302 }
303
304 return &ctx->sps[id];
305 }
306
307
308 /* The returned index rotates through [0;DE265_MAX_SLICES) and is not reset at each new picture.
309 Returns -1 if no more slice data structure available.
310 */
311 int get_next_slice_index(decoder_context* ctx)
312 {
313 for (int i=0;i<DE265_MAX_SLICES;i++) {
314 if (ctx->slice[i].inUse == false) {
315 return i;
316 }
317 }
318
319 // TODO: make this dynamic, increase storage when completely full
320
321 return -1;
322 }
323
324
325 /* The returned index rotates through [0;MAX_THREAD_CONTEXTS) and is not reset at each new picture.
326 Returns -1 if no more context data structure available.
327 */
328 int get_next_thread_context_index(decoder_context* ctx)
329 {
330 for (int i=0;i<MAX_THREAD_CONTEXTS;i++) {
331 if (ctx->thread_context[i].inUse == false) {
332 return i;
333 }
334 }
335
336 // TODO: make this dynamic, increase storage when completely full
337
338 return -1;
339 }
340
341
342 static void log_dpb_content(const decoder_context* ctx)
343 {
344 for (int i=0;i<DE265_DPB_SIZE;i++) {
345 loginfo(LogHighlevel, " DPB %d: POC=%d %s %s\n", i, ctx->dpb[i].PicOrderCntVal,
346 ctx->dpb[i].PicState == UnusedForReference ? "unused" :
347 ctx->dpb[i].PicState == UsedForShortTermReference ? "short-term" : "long-term",
348 ctx->dpb[i].PicOutputFlag ? "output" : "---");
349 }
350 }
351
352
353 /* 8.3.1
354 */
355 void process_picture_order_count(decoder_context* ctx, slice_segment_header* hdr)
356 {
357 loginfo(LogHeaders,"POC computation. lsb:%d prev.pic.lsb:%d msb:%d\n",
358 hdr->slice_pic_order_cnt_lsb,
359 ctx->prevPicOrderCntLsb,
360 ctx->PicOrderCntMsb);
361
362 if (isIRAP(ctx->nal_unit_type) &&
363 ctx->NoRaslOutputFlag)
364 {
365 ctx->PicOrderCntMsb=0;
366
367
368 // flush all images from reorder buffer
369
370 while (ctx->reorder_output_queue_length>0) {
371 flush_next_picture_from_reorder_buffer(ctx);
372 }
373 }
374 else
375 {
376 int MaxPicOrderCntLsb = ctx->current_sps->MaxPicOrderCntLsb;
377
378 if ((hdr->slice_pic_order_cnt_lsb < ctx->prevPicOrderCntLsb) &&
379 (ctx->prevPicOrderCntLsb - hdr->slice_pic_order_cnt_lsb) >= MaxPicOrderCntLsb/2) {
380 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb + MaxPicOrderCntLsb;
381 }
382 else if ((hdr->slice_pic_order_cnt_lsb > ctx->prevPicOrderCntLsb) &&
383 (hdr->slice_pic_order_cnt_lsb - ctx->prevPicOrderCntLsb) > MaxPicOrderCntLsb/2) {
384 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb - MaxPicOrderCntLsb;
385 }
386 else {
387 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb;
388 }
389 }
390
391 ctx->img->PicOrderCntVal = ctx->PicOrderCntMsb + hdr->slice_pic_order_cnt_lsb;
392 ctx->img->picture_order_cnt_lsb = hdr->slice_pic_order_cnt_lsb;
393
394 loginfo(LogHeaders,"POC computation. new msb:%d POC=%d\n",
395 ctx->PicOrderCntMsb,
396 ctx->img->PicOrderCntVal);
397
398 if (1 /* TemporalID==0 */ && // TODO
399 (isReferenceNALU(ctx->nal_unit_type) &&
400 (!isRASL(ctx->nal_unit_type) && !isRADL(ctx->nal_unit_type))) &&
401 1 /* sub-layer non-reference picture */) // TODO
402 {
403 loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n");
404
405 ctx->prevPicOrderCntLsb = hdr->slice_pic_order_cnt_lsb;
406 ctx->prevPicOrderCntMsb = ctx->PicOrderCntMsb;
407 }
408 }
409
410
411 bool has_free_dpb_picture(const decoder_context* ctx, bool high_priority)
412 {
413 int nImages = high_priority ? DE265_DPB_SIZE : DE265_DPB_OUTPUT_IMAGES;
414
415 for (int i=0;i<nImages;i++) {
416 if (ctx->dpb[i].PicOutputFlag==false && ctx->dpb[i].PicState == UnusedForReference) {
417 return true;
418 }
419 }
420
421 return false;
422 }
423
424
425 static int DPB_index_of_picture_with_POC(decoder_context* ctx, int poc)
426 {
427 logdebug(LogHeaders,"DPB_index_of_picture_with_POC POC=%d\n",poc);
428
429 //log_dpb_content(ctx);
430 //loginfo(LogDPB,"searching for short-term reference POC=%d\n",poc);
431
432 for (int k=0;k<DE265_DPB_SIZE;k++) {
433 if (ctx->dpb[k].PicOrderCntVal == poc &&
434 ctx->dpb[k].PicState != UnusedForReference) {
435 return k;
436 }
437 }
438
439 return -1;
440 }
441
442
443 static int DPB_index_of_picture_with_LSB(decoder_context* ctx, int lsb)
444 {
445 logdebug(LogHeaders,"get access to picture with LSB %d from DPB\n",lsb);
446
447 for (int k=0;k<DE265_DPB_SIZE;k++) {
448 if (ctx->dpb[k].picture_order_cnt_lsb == lsb &&
449 ctx->dpb[k].PicState != UnusedForReference) {
450 return k;
451 }
452 }
453
454 return -1;
455 }
456
457
458 /* 8.3.3.2
459 Returns DPB index of the generated picture.
460 */
461 int generate_unavailable_reference_picture(decoder_context* ctx, const seq_parameter_set* sps,
462 int POC, bool longTerm)
463 {
464 assert(has_free_dpb_picture(ctx, true));
465
466 //printf("generate_unavailable_reference_picture(%d,%d)\n",POC,longTerm);
467
468 int idx = initialize_new_DPB_image(ctx, ctx->current_sps);
469 assert(idx>=0);
470 //printf("-> fill with unavailable POC %d\n",POC);
471
472 de265_image* img = &ctx->dpb[idx];
473 assert(img->border==0);
474
475 memset( img->y - img->border, 1<<(sps->BitDepth_Y-1), img->stride * img->height );
476 memset( img->cb- img->border, 1<<(sps->BitDepth_C-1), img->chroma_stride * img->chroma_height );
477 memset( img->cr- img->border, 1<<(sps->BitDepth_C-1), img->chroma_stride * img->chroma_height );
478
479 for (int i=0;i<img->cb_info_size;i++)
480 { img->cb_info[i].PredMode = MODE_INTRA; }
481
482
483 img->PicOrderCntVal = POC;
484 img->picture_order_cnt_lsb = POC & (sps->MaxPicOrderCntLsb-1);
485 img->PicOutputFlag = false;
486 img->PicState = (longTerm ? UsedForLongTermReference : UsedForShortTermReference);
487 img->integrity = INTEGRITY_UNAVAILABLE_REFERENCE;
488 /*
489 int w = sps->pic_width_in_luma_samples;
490 int h = sps->pic_height_in_luma_samples;
491 de265_alloc_image(ctx->img, w,h, chroma, sps);
492 QQQ
493 */
494
495 return idx;
496 }
497
498
499 /* 8.3.2 invoked once per picture
500
501 This function will mark pictures in the DPB as 'unused' or 'used for long-term reference'
502 */
503 void process_reference_picture_set(decoder_context* ctx, slice_segment_header* hdr)
504 {
505 if (isIRAP(ctx->nal_unit_type) && ctx->NoRaslOutputFlag) {
506
507 int currentPOC = ctx->img->PicOrderCntVal;
508
509 // reset DPB
510
511 /* The standard says: "When the current picture is an IRAP picture with NoRaslOutputFlag
512 equal to 1, all reference pictures currently in the DPB (if any) are marked as
513 "unused for reference".
514
515 This seems to be wrong as it also throws out the first CRA picture in a stream like
516 RAP_A (decoding order: CRA,POC=64, RASL,POC=60). Removing only the pictures with
517 lower POCs seems to be compliant to the reference decoder.
518 */
519
520 for (int i=0;i<DE265_DPB_SIZE;i++) {
521 if (ctx->dpb[i].PicState != UnusedForReference &&
522 ctx->dpb[i].PicOrderCntVal < currentPOC) {
523 ctx->dpb[i].PicState = UnusedForReference;
524
525 cleanup_image(ctx, &ctx->dpb[i]);
526 }
527 }
528 }
529
530
531 if (isIDR(ctx->nal_unit_type)) {
532
533 // clear all reference pictures
534
535 ctx->NumPocStCurrBefore = 0;
536 ctx->NumPocStCurrAfter = 0;
537 ctx->NumPocStFoll = 0;
538 ctx->NumPocLtCurr = 0;
539 ctx->NumPocLtFoll = 0;
540 }
541 else {
542 const ref_pic_set* rps = hdr->CurrRps;
543
544 // (8-98)
545
546 int i,j,k;
547
548 // scan ref-pic-set for smaller POCs and fill into PocStCurrBefore / PocStFoll
549
550 for (i=0, j=0, k=0;
551 i<rps->NumNegativePics;
552 i++)
553 {
554 if (rps->UsedByCurrPicS0[i]) {
555 ctx->PocStCurrBefore[j++] = ctx->img->PicOrderCntVal + rps->DeltaPocS0[i];
556 //printf("PocStCurrBefore = %d\n",ctx->PocStCurrBefore[j-1]);
557 }
558 else {
559 ctx->PocStFoll[k++] = ctx->img->PicOrderCntVal + rps->DeltaPocS0[i];
560 }
561 }
562
563 ctx->NumPocStCurrBefore = j;
564
565
566 // scan ref-pic-set for larger POCs and fill into PocStCurrAfter / PocStFoll
567
568 for (i=0, j=0;
569 i<rps->NumPositivePics;
570 i++)
571 {
572 if (rps->UsedByCurrPicS1[i]) {
573 ctx->PocStCurrAfter[j++] = ctx->img->PicOrderCntVal + rps->DeltaPocS1[i];
574 //printf("PocStCurrAfter = %d\n",ctx->PocStCurrAfter[j-1]);
575 }
576 else {
577 ctx->PocStFoll[k++] = ctx->img->PicOrderCntVal + rps->DeltaPocS1[i];
578 }
579 }
580
581 ctx->NumPocStCurrAfter = j;
582 ctx->NumPocStFoll = k;
583
584
585 // find used / future long-term references
586
587 for (i=0, j=0, k=0;
588 i<ctx->current_sps->num_long_term_ref_pics_sps + hdr->num_long_term_pics;
589 i++)
590 {
591 int pocLt = ctx->PocLsbLt[i];
592
593 if (hdr->delta_poc_msb_present_flag[i]) {
594 int currentPictureMSB = ctx->img->PicOrderCntVal - hdr->slice_pic_order_cnt_lsb;
595 pocLt += currentPictureMSB
596 - ctx->DeltaPocMsbCycleLt[i] * ctx->current_sps->MaxPicOrderCntLsb;
597 }
598
599 if (ctx->UsedByCurrPicLt[i]) {
600 ctx->PocLtCurr[j] = pocLt;
601 ctx->CurrDeltaPocMsbPresentFlag[j] = hdr->delta_poc_msb_present_flag[i];
602 j++;
603 }
604 else {
605 ctx->PocLtFoll[k] = pocLt;
606 ctx->FollDeltaPocMsbPresentFlag[k] = hdr->delta_poc_msb_present_flag[i];
607 k++;
608 }
609 }
610
611 ctx->NumPocLtCurr = j;
612 ctx->NumPocLtFoll = k;
613 }
614
615
616 // (old 8-99) / (new 8-106)
617 // 1.
618
619 bool picInAnyList[DE265_DPB_SIZE];
620 memset(picInAnyList,0, DE265_DPB_SIZE*sizeof(bool));
621
622
623 for (int i=0;i<ctx->NumPocLtCurr;i++) {
624 if (!ctx->CurrDeltaPocMsbPresentFlag[i]) {
625 int k = DPB_index_of_picture_with_LSB(ctx, ctx->PocLtCurr[i]);
626
627 ctx->RefPicSetLtCurr[i] = k; // -1 == "no reference picture"
628 if (k>=0) picInAnyList[k]=true;
629 else {
630 // TODO, CHECK: is it ok that we generate a picture with POC = LSB (PocLtCurr)
631 // We do not know the correct MSB
632 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
633 ctx->PocLtCurr[i], true);
634 ctx->RefPicSetLtCurr[i] = concealedPicture;
635 picInAnyList[concealedPicture]=true;
636 }
637 }
638 else {
639 int k = DPB_index_of_picture_with_POC(ctx, ctx->PocLtCurr[i]);
640
641 ctx->RefPicSetLtCurr[i] = k; // -1 == "no reference picture"
642 if (k>=0) picInAnyList[k]=true;
643 else {
644 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
645 ctx->PocLtCurr[i], true);
646 ctx->RefPicSetLtCurr[i] = concealedPicture;
647 picInAnyList[concealedPicture]=true;
648 }
649 }
650 }
651
652 for (int i=0;i<ctx->NumPocLtFoll;i++) {
653 if (!ctx->FollDeltaPocMsbPresentFlag[i]) {
654 int k = DPB_index_of_picture_with_LSB(ctx, ctx->PocLtFoll[i]);
655
656 ctx->RefPicSetLtFoll[i] = k; // -1 == "no reference picture"
657 if (k>=0) picInAnyList[k]=true;
658 else {
659 // TODO, CHECK: is it OK that we generate a picture with POC = LSB (PocLtFoll)?
660 // We do not know the correct MSB.
661 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
662 ctx->PocLtFoll[i], true);
663 ctx->RefPicSetLtFoll[i] = concealedPicture;
664 picInAnyList[concealedPicture]=true;
665 }
666 }
667 else {
668 int k = DPB_index_of_picture_with_POC(ctx, ctx->PocLtFoll[i]);
669
670 ctx->RefPicSetLtFoll[i] = k; // -1 == "no reference picture"
671 if (k>=0) picInAnyList[k]=true;
672 else {
673 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
674 ctx->PocLtFoll[i], true);
675 ctx->RefPicSetLtFoll[i] = concealedPicture;
676 picInAnyList[concealedPicture]=true;
677 }
678 }
679 }
680
681
682 // 2. Mark all pictures in RefPicSetLtCurr / RefPicSetLtFoll as UsedForLongTermReference
683
684 for (int i=0;i<ctx->NumPocLtCurr;i++) {
685 ctx->dpb[ ctx->RefPicSetLtCurr[i] ].PicState = UsedForLongTermReference;
686 }
687
688 for (int i=0;i<ctx->NumPocLtFoll;i++) {
689 ctx->dpb[ ctx->RefPicSetLtFoll[i] ].PicState = UsedForLongTermReference;
690 }
691
692
693 // 3.
694
695 for (int i=0;i<ctx->NumPocStCurrBefore;i++) {
696 int k = DPB_index_of_picture_with_POC(ctx, ctx->PocStCurrBefore[i]);
697
698 //printf("st curr before, poc=%d -> idx=%d\n",ctx->PocStCurrBefore[i], k);
699
700 ctx->RefPicSetStCurrBefore[i] = k; // -1 == "no reference picture"
701 if (k>=0) picInAnyList[k]=true;
702 else {
703 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
704 ctx->PocStCurrBefore[i], false);
705 ctx->RefPicSetStCurrBefore[i] = concealedPicture;
706 picInAnyList[concealedPicture]=true;
707
708 //printf(" concealed: %d\n", concealedPicture);
709 }
710 }
711
712 for (int i=0;i<ctx->NumPocStCurrAfter;i++) {
713 int k = DPB_index_of_picture_with_POC(ctx, ctx->PocStCurrAfter[i]);
714
715 //printf("st curr after, poc=%d -> idx=%d\n",ctx->PocStCurrAfter[i], k);
716
717 ctx->RefPicSetStCurrAfter[i] = k; // -1 == "no reference picture"
718 if (k>=0) picInAnyList[k]=true;
719 else {
720 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
721 ctx->PocStCurrAfter[i], false);
722 ctx->RefPicSetStCurrAfter[i] = concealedPicture;
723 picInAnyList[concealedPicture]=true;
724
725 //printf(" concealed: %d\n", concealedPicture);
726 }
727 }
728
729 for (int i=0;i<ctx->NumPocStFoll;i++) {
730 int k = DPB_index_of_picture_with_POC(ctx, ctx->PocStFoll[i]);
731 // if (k<0) { assert(false); } // IGNORE
732
733 ctx->RefPicSetStFoll[i] = k; // -1 == "no reference picture"
734 if (k>=0) picInAnyList[k]=true;
735 }
736
737 // 4. any picture that is not marked for reference is put into the "UnusedForReference" state
738
739 for (int i=0;i<DE265_DPB_SIZE;i++)
740 if (!picInAnyList[i] && // no reference
741 &ctx->dpb[i] != ctx->img) // not the current picture
742 {
743 if (ctx->dpb[i].PicState != UnusedForReference) {
744 ctx->dpb[i].PicState = UnusedForReference;
745
746 cleanup_image(ctx, &ctx->dpb[i]);
747 }
748 }
749 }
750
751
752 // 8.3.3
753 /*
754 void generate_unavailable_reference_pictures(decoder_context* ctx, slice_segment_header* hdr)
755 {
756 for (int i=0;i<ctx->NumPocStCurrBefore;i++) {
757 if (ctx->RefPicSetStCurrBefore[i] < 0) {
758 //int idx = generate_unavailable_picture(ctx,ctx->current_sps,
759 }
760 }
761
762 for (int i=0;i<ctx->NumPocStCurrAfter;i++) {
763 if (ctx->RefPicSetStCurrAfter[i] < 0) {
764 //int idx = initialize_new_DPB_image(ctx, ctx->current_sps);
765 }
766 }
767 }
768 */
769
770 // 8.3.4
771 // Returns whether we can continue decoding (or whether there is a severe error).
772 /* Called at the beginning of each slice.
773
774 Constructs
775 - the RefPicList[2][], containing indices into the DPB, and
776 - the RefPicList_POC[2][], containing POCs.
777 Additionally, LongTermRefPic[2][] is set to true for entries that are long-term references.
778 */
779 bool construct_reference_picture_lists(decoder_context* ctx, slice_segment_header* hdr)
780 {
781 int NumPocTotalCurr = hdr->CurrRps->NumPocTotalCurr;
782 int NumRpsCurrTempList0 = libde265_max(hdr->num_ref_idx_l0_active, NumPocTotalCurr);
783
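  // the temporary list must hold at least num_ref_idx_l0_active entries; the loop below cycles through the reference subsets, so entries repeat when there are fewer reference pictures than active indices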
784 // TODO: fold code for both lists together
785
786 int RefPicListTemp0[DE265_DPB_SIZE]; // TODO: what would be the correct maximum ?
787 int RefPicListTemp1[DE265_DPB_SIZE]; // TODO: what would be the correct maximum ?
788 char isLongTerm[2][DE265_DPB_SIZE];
789
790 memset(isLongTerm,0,2*DE265_DPB_SIZE);
791
792 /* --- Fill RefPicListTmp0 with reference pictures in this order:
793 1) short term, past POC
794 2) short term, future POC
795 3) long term
796 */
797
798 int rIdx=0;
799 while (rIdx < NumRpsCurrTempList0) {
800 for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList0; rIdx++,i++)
801 RefPicListTemp0[rIdx] = ctx->RefPicSetStCurrBefore[i];
802
803 for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList0; rIdx++,i++)
804 RefPicListTemp0[rIdx] = ctx->RefPicSetStCurrAfter[i];
805
806 for (int i=0;i<ctx->NumPocLtCurr && rIdx<NumRpsCurrTempList0; rIdx++,i++) {
807 RefPicListTemp0[rIdx] = ctx->RefPicSetLtCurr[i];
808 isLongTerm[0][rIdx] = true;
809 }
810
811 // This check is to prevent an endless loop when no images are added above.
812 if (rIdx==0) {
813 add_warning(ctx, DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST, false);
814 return false;
815 }
816 }
817
818 if (hdr->num_ref_idx_l0_active > 15) {
819 add_warning(ctx, DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
820 return false;
821 }
822
823 for (rIdx=0; rIdx<hdr->num_ref_idx_l0_active; rIdx++) {
824 int idx = hdr->ref_pic_list_modification_flag_l0 ? hdr->list_entry_l0[rIdx] : rIdx;
825
826 hdr->RefPicList[0][rIdx] = RefPicListTemp0[idx];
827 hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx];
828
829 // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector)
830 hdr->RefPicList_POC[0][rIdx] = ctx->dpb[ hdr->RefPicList[0][rIdx] ].PicOrderCntVal;
831 }
832
833
834 /* --- Fill RefPicListTmp1 with reference pictures in this order:
835 1) short term, future POC
836 2) short term, past POC
837 3) long term
838 */
839
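  // list 1 is only constructed for B slices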
840 if (hdr->slice_type == SLICE_TYPE_B) {
841 int NumRpsCurrTempList1 = libde265_max(hdr->num_ref_idx_l1_active, NumPocTotalCurr);
842
843 int rIdx=0;
844 while (rIdx < NumRpsCurrTempList1) {
845 for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList1; rIdx++,i++)
846 RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrAfter[i];
847
848 for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList1; rIdx++,i++)
849 RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrBefore[i];
850
851 for (int i=0;i<ctx->NumPocLtCurr && rIdx<NumRpsCurrTempList1; rIdx++,i++) {
852 RefPicListTemp1[rIdx] = ctx->RefPicSetLtCurr[i];
853 isLongTerm[1][rIdx] = true;
854 }
855 }
856
857 assert(hdr->num_ref_idx_l1_active <= 15);
858 for (rIdx=0; rIdx<hdr->num_ref_idx_l1_active; rIdx++) {
859 int idx = hdr->ref_pic_list_modification_flag_l1 ? hdr->list_entry_l1[rIdx] : rIdx;
860
861 hdr->RefPicList[1][rIdx] = RefPicListTemp1[idx];
862 hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx];
863
864 // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector)
865 hdr->RefPicList_POC[1][rIdx] = ctx->dpb[ hdr->RefPicList[1][rIdx] ].PicOrderCntVal;
866 }
867 }
868
869
870 // show reference picture lists
871
872 loginfo(LogHeaders,"RefPicList[0] =");
873 for (rIdx=0; rIdx<hdr->num_ref_idx_l0_active; rIdx++) {
874 loginfo(LogHeaders,"* [%d]=%d",
875 hdr->RefPicList[0][rIdx],
876 ctx->dpb[hdr->RefPicList[0][rIdx]].PicOrderCntVal
877 );
878 }
879 loginfo(LogHeaders,"*\n");
880
881 loginfo(LogHeaders,"RefPicList[1] =");
882 for (rIdx=0; rIdx<hdr->num_ref_idx_l1_active; rIdx++) {
883 loginfo(LogHeaders,"* [%d]=%d",
884 hdr->RefPicList[1][rIdx],
885 ctx->dpb[hdr->RefPicList[1][rIdx]].PicOrderCntVal
886 );
887 }
888 loginfo(LogHeaders,"*\n");
889
890 return true;
891 }
892
893
894
895 void flush_next_picture_from_reorder_buffer(decoder_context* ctx)
896 {
897 assert(ctx->reorder_output_queue_length>0);
898
899 // search for picture in reorder buffer with minimum POC
900
901 int minPOC = ctx->reorder_output_queue[0]->PicOrderCntVal;
902 int minIdx = 0;
903 for (int i=1;i<ctx->reorder_output_queue_length;i++)
904 {
905 if (ctx->reorder_output_queue[i]->PicOrderCntVal < minPOC) {
906 minPOC = ctx->reorder_output_queue[i]->PicOrderCntVal;
907 minIdx = i;
908 }
909 }
910
911
912 // put image into output queue
913
914 assert(ctx->image_output_queue_length < DE265_DPB_SIZE);
915 ctx->image_output_queue[ ctx->image_output_queue_length ] = ctx->reorder_output_queue[minIdx];
916 ctx->image_output_queue_length++;
917
918
919 // remove image from reorder buffer
920
921 for (int i=minIdx+1; i<ctx->reorder_output_queue_length; i++) {
922 ctx->reorder_output_queue[i-1] = ctx->reorder_output_queue[i];
923 }
924 ctx->reorder_output_queue_length--;
925 }
926
927
928 void cleanup_image(decoder_context* ctx, de265_image* img)
929 {
930 if (img->PicState != UnusedForReference) { return; } // still required for reference
931 if (img->PicOutputFlag) { return; } // required for output
932
933 if (img->sps==NULL) { return; } // might be an unavailable-reference replacement image
934
935
936 //printf("cleanup_image POC=%d\n",img->PicOrderCntVal);
937
938 // mark all slice-headers locked by this image as unused
939
940 /* Note: we cannot use the SPS here, because it may already be outdated if a
941 new SPS was sent before this image was cleaned up.
942 */
943
944 for (int i=0;i<img->ctb_info_size;i++)
945 {
946 int sliceHeaderIdx = img->ctb_info[i].SliceHeaderIndex;
947
948 slice_segment_header* shdr;
949 shdr = &ctx->slice[ sliceHeaderIdx ];
950
951 //printf("cleanup SHDR %d\n",sliceHeaderIdx);
952
953 shdr->inUse = false;
954 }
955
956 img->sps = NULL; // this may not be valid anymore in the future
957 img->pps = NULL; // this may not be valid anymore in the future
958 }
959
960
961 void writeFrame_Y(decoder_context* ctx,const char* filename)
962 {
963 int w = ctx->img->width;
964 int h = ctx->img->height;
965 //int c_idx=0;
966 int ctb_size = 64; // HACK
967
968 int stride = ctx->img->stride;
969
970 for (int ctbY=0;ctbY<ctx->current_sps->PicHeightInCtbsY;ctbY++)
971 for (int ctbX=0;ctbX<ctx->current_sps->PicWidthInCtbsY;ctbX++)
972 {
973 int x0 = ctbX*ctb_size;
974 int y0 = ctbY*ctb_size;
975
976
977 uint8_t *src = &ctx->img->y[y0 * stride + x0];
978
979 printf("%s %d %d\n",filename,x0,y0);
980 int dx,dy;
981 for (dy=0;dy<ctb_size;dy++)
982 if (y0+dy < h)
983 {
984 printf("%s %d %d ",filename,y0+dy,x0);
985
986 for (dx=0;dx<ctb_size;dx++)
987 if (x0+dx < w)
988 {
989 printf("%02x ",*(src+dx+dy*stride));
990 }
991
992 printf("\n");
993 }
994 }
995 }
996
997
998 void push_current_picture_to_output_queue(decoder_context* ctx)
999 {
1000 if (ctx->img) {
1001 //ctx->img->PicState = UsedForShortTermReference;
1002
1003 // post-process image
1004
1005 #if SAVE_INTERMEDIATE_IMAGES
1006 char buf[1000];
1007 sprintf(buf,"pre-lf-%05d.yuv", ctx->img->PicOrderCntVal);
1008 write_picture_to_file(ctx->img, buf);
1009 #endif
1010
1011 //writeFrame_Y(ctx,"raw");
1012 apply_deblocking_filter(ctx);
1013 //writeFrame_Y(ctx,"deblk");
1014
1015 #if SAVE_INTERMEDIATE_IMAGES
1016 sprintf(buf,"pre-sao-%05d.yuv", ctx->img->PicOrderCntVal);
1017 write_picture_to_file(ctx->img, buf);
1018 #endif
1019
1020 apply_sample_adaptive_offset(ctx);
1021 //writeFrame_Y(ctx,"sao");
1022
1023 #if SAVE_INTERMEDIATE_IMAGES
1024 sprintf(buf,"sao-%05d.yuv", ctx->img->PicOrderCntVal);
1025 write_picture_to_file(ctx->img, buf);
1026 #endif
1027
1028 // push image into output queue
1029
1030 if (ctx->img->PicOutputFlag) {
1031 set_conformance_window(ctx->img,
1032 ctx->current_sps->conf_win_left_offset,
1033 ctx->current_sps->conf_win_right_offset,
1034 ctx->current_sps->conf_win_top_offset,
1035 ctx->current_sps->conf_win_bottom_offset);
1036
1037 loginfo(LogDPB,"new picture has output-flag=true\n");
1038
1039 assert(ctx->reorder_output_queue_length < DE265_DPB_SIZE);
1040 ctx->reorder_output_queue[ ctx->reorder_output_queue_length++ ] = ctx->img;
1041
1042 loginfo(LogDPB,"push image %d into reordering queue\n", ctx->img->PicOrderCntVal);
1043 }
1044
1045 ctx->last_decoded_image = ctx->img;
1046 ctx->img = NULL;
1047
1048 /*
1049 if (isRAP(ctx->nal_unit_type)) {
1050 ctx->last_RAP_picture_NAL_type = ctx->nal_unit_type;
1051
1052 ctx->last_RAP_was_CRA_and_first_image_of_sequence =
1053 isCRA(ctx->nal_unit_type) && ctx->first_decoded_picture;
1054 }
1055 */
1056
1057 // next image is not the first anymore
1058
1059 ctx->first_decoded_picture = false;
1060
1061
1062 // check for full reorder buffers
1063
1064 int sublayer = ctx->current_vps->vps_max_sub_layers -1;
1065
1066 int maxNumPicsInReorderBuffer = ctx->current_vps->layer[sublayer].vps_max_num_reorder_pics;
1067
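    // if more pictures are waiting for reordering than vps_max_num_reorder_pics allows, force out the one with the smallest POC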
1068 if (ctx->reorder_output_queue_length > maxNumPicsInReorderBuffer) {
1069 flush_next_picture_from_reorder_buffer(ctx);
1070 }
1071
1072
1073 loginfo(LogDPB, "DPB reorder queue (after push): ");
1074 for (int i=0;i<ctx->reorder_output_queue_length;i++) {
1075 loginfo(LogDPB, "*%d ", ctx->reorder_output_queue[i]->PicOrderCntVal);
1076 }
1077 loginfo(LogDPB,"*\n");
1078
1079 loginfo(LogDPB, "DPB output queue (after push): ");
1080 for (int i=0;i<ctx->image_output_queue_length;i++) {
1081 loginfo(LogDPB, "*%d ", ctx->image_output_queue[i]->PicOrderCntVal);
1082 }
1083 loginfo(LogDPB,"*\n");
1084 }
1085 }
1086
1087
1088 /* Alloc a new image in the DPB and return its index.
1089 If there is no space for a new image, return -1.
1090 */
1091 int initialize_new_DPB_image(decoder_context* ctx,const seq_parameter_set* sps)
1092 {
1093 loginfo(LogHeaders,"initialize_new_DPB_image\n");
1094
1095 //printf("initialize_new_DPB_image()\n");
1096 log_dpb_content(ctx);
1097
1098 int free_image_buffer_idx = -1;
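  // a DPB slot is free when its picture is neither waiting for output nor used as a reference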
1099 for (int i=0;i<DE265_DPB_SIZE;i++) {
1100 if (ctx->dpb[i].PicOutputFlag==false && ctx->dpb[i].PicState == UnusedForReference) {
1101 free_image_buffer_idx = i;
1102 break;
1103 }
1104 }
1105
1106 //printf("free buffer index = %d\n", free_image_buffer_idx);
1107
1108 if (free_image_buffer_idx == -1) {
1109 return -1;
1110 }
1111
1112 de265_image* img = &ctx->dpb[free_image_buffer_idx];
1113
1114 int w = sps->pic_width_in_luma_samples;
1115 int h = sps->pic_height_in_luma_samples;
1116
1117 enum de265_chroma chroma;
1118 switch (sps->chroma_format_idc) {
1119 case 0: chroma = de265_chroma_mono; break;
1120 case 1: chroma = de265_chroma_420; break;
1121 case 2: chroma = de265_chroma_422; break;
1122 case 3: chroma = de265_chroma_444; break;
1123 default: chroma = de265_chroma_420; assert(0); break; // should never happen
1124 }
1125
1126 de265_alloc_image(img, w,h, chroma, sps);
1127
1128 img->integrity = INTEGRITY_CORRECT;
1129
1130 return free_image_buffer_idx;
1131 }
1132
1133
1134 // returns whether we can continue decoding the stream or whether we should give up
1135 bool process_slice_segment_header(decoder_context* ctx, slice_segment_header* hdr,
1136 de265_error* err, de265_PTS pts, void* user_data)
1137 {
1138 *err = DE265_OK;
1139
1140 // get PPS and SPS for this slice
1141
1142 int pps_id = hdr->slice_pic_parameter_set_id;
1143 if (ctx->pps[pps_id].pps_read==false) {
1144 logerror(LogHeaders, "PPS %d has not been read\n", pps_id);
1145 assert(false); // TODO
1146 }
1147
1148 ctx->current_pps = &ctx->pps[pps_id];
1149 ctx->current_sps = &ctx->sps[ (int)ctx->current_pps->seq_parameter_set_id ];
1150 ctx->current_vps = &ctx->vps[ (int)ctx->current_sps->video_parameter_set_id ];
1151
1152
1153 // --- prepare decoding of new picture ---
1154
1155 //if (hdr->slice_pic_order_cnt_lsb != ctx->current_image_poc_lsb) {
1156 if (hdr->first_slice_segment_in_pic_flag) {
1157
1158 // previous picture has been completely decoded
1159
1160 push_current_picture_to_output_queue(ctx);
1161
1162 ctx->current_image_poc_lsb = hdr->slice_pic_order_cnt_lsb;
1163
1164
1165 seq_parameter_set* sps = ctx->current_sps;
1166
1167
1168 // --- find and allocate image buffer for decoding ---
1169
1170 int image_buffer_idx;
1171 image_buffer_idx = initialize_new_DPB_image(ctx,sps);
1172 if (image_buffer_idx == -1) {
1173 *err = DE265_ERROR_IMAGE_BUFFER_FULL;
1174 return false;
1175 }
1176
1177 de265_image* img = &ctx->dpb[image_buffer_idx];
1178 img->pts = pts;
1179 img->user_data = user_data;
1180 ctx->img = img;
1181
1182 img->sps = ctx->current_sps;
1183 img->pps = ctx->current_pps;
1184
1185 reset_decoder_context_for_new_picture(ctx);
1186 prepare_new_picture(ctx);
1187
1188
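    // NoRaslOutputFlag: when set, RASL pictures associated with this IRAP depend on references that are unavailable, so they are not output (see below)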
1189 if (isIRAP(ctx->nal_unit_type)) {
1190 if (isIDR(ctx->nal_unit_type) ||
1191 isBLA(ctx->nal_unit_type) ||
1192 ctx->first_decoded_picture ||
1193 ctx->FirstAfterEndOfSequenceNAL)
1194 {
1195 ctx->NoRaslOutputFlag = true;
1196 ctx->FirstAfterEndOfSequenceNAL = false;
1197 }
1198 else if (0) // TODO: set HandleCraAsBlaFlag by external means
1199 {
1200 }
1201 else
1202 {
1203 ctx->NoRaslOutputFlag = false;
1204 ctx->HandleCraAsBlaFlag = false;
1205 }
1206 }
1207
1208
1209 if (isRASL(ctx->nal_unit_type) &&
1210 ctx->NoRaslOutputFlag)
1211 {
1212 ctx->img->PicOutputFlag = false;
1213 }
1214 else
1215 {
1216 ctx->img->PicOutputFlag = !!hdr->pic_output_flag;
1217 }
1218
1219 process_picture_order_count(ctx,hdr);
1220
1221 if (hdr->first_slice_segment_in_pic_flag) {
1222 // mark picture so that it is not overwritten by unavailable reference frames
1223 img->PicState = UsedForShortTermReference;
1224
1225 process_reference_picture_set(ctx,hdr);
1226 }
1227
1228 img->PicState = UsedForShortTermReference;
1229
1230 //generate_unavailable_reference_pictures(ctx,hdr);
1231
1232 log_set_current_POC(ctx->img->PicOrderCntVal);
1233 }
1234
1235 if (hdr->slice_type == SLICE_TYPE_B ||
1236 hdr->slice_type == SLICE_TYPE_P)
1237 {
1238 bool success = construct_reference_picture_lists(ctx,hdr);
1239 if (!success) {
1240 return false;
1241 }
1242 }
1243
1244 //printf("process slice segment header\n");
1245
1246 loginfo(LogHeaders,"end of process-slice-header\n");
1247 log_dpb_content(ctx);
1248
1249
1250 if (hdr->dependent_slice_segment_flag==0) {
1251 hdr->SliceAddrRS = hdr->slice_segment_address;
1252 } else {
1253 const pic_parameter_set* pps = ctx->current_pps;
1254 int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[hdr->slice_segment_address] -1 ];
1255
1256 hdr->SliceAddrRS = ctx->img->ctb_info[prevCtb].SliceAddrRS;
1257 }
1258
1259 loginfo(LogHeaders,"SliceAddrRS = %d\n",hdr->SliceAddrRS);
1260
1261 return true;
1262 }
1263
1264
1265 slice_segment_header* get_SliceHeader(decoder_context* ctx, int x, int y)
1266 {
1267 return &ctx->slice[ get_SliceHeaderIndex(ctx->img, ctx->current_sps,x,y) ];
1268 }
1269
1270 slice_segment_header* get_SliceHeaderCtb(decoder_context* ctx, int ctbX, int ctbY)
1271 {
1272 return &ctx->slice[ ctx->img->ctb_info[ctbX + ctbY*ctx->current_sps->PicWidthInCtbsY].SliceHeaderIndex ];
1273 }
1274
1275
1276 const PredVectorInfo* get_mv_info(const decoder_context* ctx,int x,int y)
1277 {
1278 int log2PuSize = 2; // (ctx->current_sps->Log2MinCbSizeY-f);
1279 int idx = (x>>log2PuSize) + (y>>log2PuSize)*ctx->img->pb_info_stride;
1280
1281 //int rootIdx = ctx->img->pb_rootIdx[idx];
1282 //return &ctx->img->pb_info[rootIdx].mvi;
1283
1284 return &ctx->img->pb_info[idx].mvi;
1285 }
1286
1287
1288 const PredVectorInfo* get_img_mv_info(const decoder_context* ctx,
1289 const de265_image* img, int x,int y)
1290 {
1291 int log2PuSize = 2; // (ctx->current_sps->Log2MinCbSizeY-f);
1292 int idx = (x>>log2PuSize) + (y>>log2PuSize)*ctx->img->pb_info_stride;
1293
1294 //int rootIdx = img->pb_rootIdx[idx];
1295 //return &img->pb_info[rootIdx].mvi;
1296
1297 return &img->pb_info[idx].mvi;
1298 }
1299
1300
1301 void set_mv_info(decoder_context* ctx,int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv)
1302 {
1303 int log2PuSize = 2; // (ctx->current_sps->Log2MinCbSizeY-f);
1304
1305 int xPu = x >> log2PuSize;
1306 int yPu = y >> log2PuSize;
1307 int wPu = nPbW >> log2PuSize;
1308 int hPu = nPbH >> log2PuSize;
1309
1310 int stride = ctx->img->pb_info_stride; // ctx->current_sps->PicWidthInMinCbsY << f;
1311
1312 //int rootIdx = ctx->img->pb_info_nextRootIdx++;
1313 //ctx->img->pb_info[rootIdx].mvi = *mv;
1314
1315 for (int pby=0;pby<hPu;pby++)
1316 for (int pbx=0;pbx<wPu;pbx++)
1317 {
1318 //ctx->img->pb_rootIdx[ xPu+pbx + (yPu+pby)*stride ] = rootIdx;
1319 ctx->img->pb_info[ xPu+pbx + (yPu+pby)*stride ].mvi = *mv;
1320 }
1321
1322 //printf("%dx%d -> %dx%d size %d\n",nPbW,nPbH, wPu,hPu,sizeof(*mv));
1323
1324 /*
1325 fprintf(stderr,"set_mv_info %d;%d [%d;%d] to %d;%d (POC=%d)\n",x,y,nPbW,nPbH,
1326 mv->mv[0].x,mv->mv[0].y,
1327 ctx->img->PicOrderCntVal);
1328 */
1329 }
1330
1331
1332
1333 bool available_zscan(const de265_image* img,
1334 int xCurr,int yCurr, int xN,int yN)
1335 {
1336 seq_parameter_set* sps = img->sps;
1337 pic_parameter_set* pps = img->pps;
1338
1339 if (xN<0 || yN<0) return false;
1340 if (xN>=sps->pic_width_in_luma_samples ||
1341 yN>=sps->pic_height_in_luma_samples) return false;
1342
1343 int minBlockAddrN = pps->MinTbAddrZS[ (xN>>sps->Log2MinTrafoSize) +
1344 (yN>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ];
1345 int minBlockAddrCurr = pps->MinTbAddrZS[ (xCurr>>sps->Log2MinTrafoSize) +
1346 (yCurr>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ];
1347
1348 if (minBlockAddrN > minBlockAddrCurr) return false;
1349
1350 int xCurrCtb = xCurr >> sps->Log2CtbSizeY;
1351 int yCurrCtb = yCurr >> sps->Log2CtbSizeY;
1352 int xNCtb = xN >> sps->Log2CtbSizeY;
1353 int yNCtb = yN >> sps->Log2CtbSizeY;
1354
1355 if (get_SliceAddrRS(img,sps, xCurrCtb,yCurrCtb) !=
1356 get_SliceAddrRS(img,sps, xNCtb, yNCtb)) {
1357 return false;
1358 }
1359
1360 if (pps->TileIdRS[xCurrCtb + yCurrCtb*sps->PicWidthInCtbsY] !=
1361 pps->TileIdRS[xNCtb + yNCtb *sps->PicWidthInCtbsY]) {
1362 return false;
1363 }
1364
1365 return true;
1366 }
1367
1368
1369 bool available_pred_blk(const decoder_context* ctx,
1370 int xC,int yC, int nCbS, int xP, int yP, int nPbW, int nPbH, int partIdx,
1371 int xN,int yN)
1372 {
1373 logtrace(LogMotion,"C:%d;%d P:%d;%d N:%d;%d size=%d;%d\n",xC,yC,xP,yP,xN,yN,nPbW,nPbH);
1374
1375 int sameCb = (xC <= xN && xN < xC+nCbS &&
1376 yC <= yN && yN < yC+nCbS);
1377
1378 bool availableN;
1379
1380 if (!sameCb) {
1381 availableN = available_zscan(ctx->img,xP,yP,xN,yN);
1382 }
1383 else {
1384 availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS &&
1385 partIdx==1 &&
1386 yN >= yC+nPbH && xN < xC+nPbW);
1387 }
1388
1389 if (availableN && get_pred_mode(ctx->img,ctx->current_sps,xN,yN) == MODE_INTRA) {
1390 availableN = false;
1391 }
1392
1393 return availableN;
1394 }
1395
1396
1397 static const char *output_filename;
1398
1399 LIBDE265_API void set_output_filename(const char* filename)
1400 {
1401 output_filename = filename;
1402 }
1403
1404 LIBDE265_API void write_picture(const de265_image* img)
1405 {
1406 static FILE* fh = NULL;
1407 if (fh==NULL) { fh = fopen(output_filename, "wb"); }
1408
1409 for (int y=0;y<de265_get_image_height(img,0);y++)
1410 fwrite(img->y + y*img->stride, de265_get_image_width(img,0), 1, fh);
1411
1412 for (int y=0;y<de265_get_image_height(img,1);y++)
1413 fwrite(img->cb + y*img->chroma_stride, de265_get_image_width(img,1), 1, fh);
1414
1415 for (int y=0;y<de265_get_image_height(img,2);y++)
1416 fwrite(img->cr + y*img->chroma_stride, de265_get_image_width(img,2), 1, fh);
1417
1418 fflush(fh);
1419 //fclose(fh);
1420 }
1421
1422
1423 void write_picture_to_file(const de265_image* img, const char* filename)
1424 {
1425 FILE* fh = fopen(filename, "wb");
1426
1427 for (int y=0;y<de265_get_image_height(img,0);y++)
1428 fwrite(img->y + y*img->stride, de265_get_image_width(img,0), 1, fh);
1429
1430 for (int y=0;y<de265_get_image_height(img,1);y++)
1431 fwrite(img->cb + y*img->chroma_stride, de265_get_image_width(img,1), 1, fh);
1432
1433 for (int y=0;y<de265_get_image_height(img,2);y++)
1434 fwrite(img->cr + y*img->chroma_stride, de265_get_image_width(img,2), 1, fh);
1435
1436 fflush(fh);
1437 fclose(fh);
1438 }
1439
1440
1441 void draw_block_boundary(const decoder_context* ctx,
1442 uint8_t* img,int stride,
1443 int x,int y,int hBlkSize, int vBlkSize, uint8_t value)
1444 {
1445 for (int i=0;i<vBlkSize;i++)
1446 {
1447 int yi = y + i;
1448
1449 if (yi < ctx->current_sps->pic_height_in_luma_samples) {
1450 img[yi*stride + x] = value;
1451 }
1452 }
1453
1454 for (int i=0;i<hBlkSize;i++)
1455 {
1456 int xi = x + i;
1457
1458 if (xi < ctx->current_sps->pic_width_in_luma_samples) {
1459 img[y*stride + xi] = value;
1460 }
1461 }
1462 }
1463
1464
1465 #include "intrapred.h"
1466
1467 void draw_intra_pred_mode(const decoder_context* ctx,
1468 uint8_t* img,int stride,
1469 int x0,int y0,int log2BlkSize,
1470 enum IntraPredMode mode, uint8_t value)
1471 {
1472 int w = 1<<log2BlkSize;
1473
1474 if (mode==0) {
1475 // Planar -> draw square
1476
1477 for (int i=-w*1/4;i<=w*1/4;i++)
1478 {
1479 img[(y0+w/2+i)*stride + x0+w*1/4] = value;
1480 img[(y0+w/2+i)*stride + x0+w*3/4] = value;
1481 img[(y0+w*1/4)*stride + x0+w/2+i] = value;
1482 img[(y0+w*3/4)*stride + x0+w/2+i] = value;
1483 }
1484 }
1485 else if (mode==1) {
1486 // DC -> draw circle
1487
1488 for (int i=-w/4;i<w/4;i++)
1489 {
1490 int k = (sqrt((double)(w*w - i*i*16))+2)/4;
1491
1492 img[(y0+w/2+k)*stride + x0+w/2+i] = value;
1493 img[(y0+w/2-k)*stride + x0+w/2+i] = value;
1494 img[(y0+w/2+i)*stride + x0+w/2+k] = value;
1495 img[(y0+w/2+i)*stride + x0+w/2-k] = value;
1496 }
1497 }
1498 else {
1499 // angular -> draw line in prediction direction
1500
1501 int slope = intraPredAngle_table[mode];
1502 bool horiz = (mode<18);
1503
1504 if (horiz) {
1505 for (int i=-w/2;i<w/2;i++)
1506 {
1507 int dy = (slope*i+Sign(slope*i)*16)/32;
1508 int y = y0+w/2-dy;
1509 if (y>=0 && y<ctx->current_sps->pic_height_in_luma_samples) {
1510 img[y*stride + x0+i+w/2] = value;
1511 }
1512 }
1513 }
1514 else {
1515 for (int i=-w/2;i<w/2;i++)
1516 {
1517 int dx = (slope*i+Sign(slope*i)*16)/32;
1518 int x = x0+w/2-dx;
1519 if (x>=0 && x<ctx->current_sps->pic_width_in_luma_samples) {
1520 img[(y0+i+w/2)*stride + x] = value;
1521 }
1522 }
1523 }
1524 }
1525 }
1526
1527
1528 void drawTBgrid(const decoder_context* ctx, uint8_t* img, int stride,
1529 int x0,int y0, uint8_t value, int log2CbSize, int trafoDepth)
1530 {
1531 int split_transform_flag = get_split_transform_flag(ctx->img, ctx->current_sps,x0,y0,trafoDepth);
1532 if (split_transform_flag) {
1533 int x1 = x0 + ((1<<(log2CbSize-trafoDepth))>>1);
1534 int y1 = y0 + ((1<<(log2CbSize-trafoDepth))>>1);
1535 drawTBgrid(ctx,img,stride,x0,y0,value,log2CbSize,trafoDepth+1);
1536 drawTBgrid(ctx,img,stride,x1,y0,value,log2CbSize,trafoDepth+1);
1537 drawTBgrid(ctx,img,stride,x0,y1,value,log2CbSize,trafoDepth+1);
1538 drawTBgrid(ctx,img,stride,x1,y1,value,log2CbSize,trafoDepth+1);
1539 }
1540 else {
1541 draw_block_boundary(ctx,img,stride,x0,y0,1<<(log2CbSize-trafoDepth),1<<(log2CbSize-trafoDepth), value);
1542 }
1543 }
1544
1545
1546 enum DrawMode {
1547 Partitioning_CB,
1548 Partitioning_TB,
1549 Partitioning_PB,
1550 IntraPredMode,
1551 PBPredMode,
1552 PBMotionVectors
1553 };
1554
1555
1556 void tint_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint8_t color)
1557 {
1558 for (int y=0;y<h;y++)
1559 for (int x=0;x<w;x++)
1560 {
1561 int xp = x0+x;
1562 int yp = y0+y;
1563
1564 img[xp+yp*stride] = (img[xp+yp*stride] + color)/2;
1565 }
1566 }
1567
1568
1569 void draw_PB_block(const decoder_context* ctx,uint8_t* img,int stride,
1570 int x0,int y0, int w,int h, enum DrawMode what, uint8_t value)
1571 {
1572 if (what == Partitioning_PB) {
1573 draw_block_boundary(ctx,img,stride,x0,y0,w,h, value);
1574 }
1575 else if (what == PBPredMode) {
1576 enum PredMode predMode = get_pred_mode(ctx->img,ctx->current_sps,x0,y0);
1577
1578 uint8_t cols[3][3] = { { 255,0,0 }, { 0,0,255 }, { 0,255,0 } };
1579
1580 tint_rect(img,stride, x0,y0,w,h, cols[predMode][value]);
1581 }
1582 else if (what == PBMotionVectors) {
1583 assert(false); // TODO
1584 }
1585 }
1586
1587
1588 void draw_tree_grid(const decoder_context* ctx, uint8_t* img, int stride,
1589 uint8_t value, enum DrawMode what)
1590 {
1591 const seq_parameter_set* sps = ctx->current_sps;
1592 int minCbSize = sps->MinCbSizeY;
1593
1594 for (int y0=0;y0<sps->PicHeightInMinCbsY;y0++)
1595 for (int x0=0;x0<sps->PicWidthInMinCbsY;x0++)
1596 {
1597 int log2CbSize = get_log2CbSize_cbUnits(ctx->img,sps,x0,y0);
1598 if (log2CbSize==0) {
1599 continue;
1600 }
1601
1602 int xb = x0*minCbSize;
1603 int yb = y0*minCbSize;
1604
1605
1606 if (what == Partitioning_TB) {
1607 drawTBgrid(ctx,img,stride,x0*minCbSize,y0*minCbSize, value, log2CbSize, 0);
1608 }
1609 else if (what == Partitioning_CB) {
1610 draw_block_boundary(ctx,img,stride,xb,yb, 1<<log2CbSize,1<<log2CbSize, value);
1611 }
1612 else if (what == Partitioning_PB ||
1613 what == PBPredMode) {
1614 enum PartMode partMode = get_PartMode(ctx->img,sps,xb,yb);
1615
1616 int CbSize = 1<<log2CbSize;
1617 int HalfCbSize = (1<<(log2CbSize-1));
1618
1619 switch (partMode) {
1620 case PART_2Nx2N:
1621 draw_PB_block(ctx,img,stride,xb,yb,CbSize,CbSize, what,value);
1622 break;
1623 case PART_NxN:
1624 draw_PB_block(ctx,img,stride,xb, yb, CbSize/2,CbSize/2, what,value);
1625 draw_PB_block(ctx,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize/2, what,value);
1626 draw_PB_block(ctx,img,stride,xb ,yb+HalfCbSize,CbSize/2,CbSize/2, what,value);
1627 draw_PB_block(ctx,img,stride,xb+HalfCbSize,yb+HalfCbSize,CbSize/2,CbSize/2, what,value);
1628 break;
1629 case PART_2NxN:
1630 draw_PB_block(ctx,img,stride,xb, yb, CbSize ,CbSize/2, what,value);
1631 draw_PB_block(ctx,img,stride,xb, yb+HalfCbSize,CbSize ,CbSize/2, what,value);
1632 break;
1633 case PART_Nx2N:
1634 draw_PB_block(ctx,img,stride,xb, yb, CbSize/2,CbSize, what,value);
1635 draw_PB_block(ctx,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize, what,value);
1636 break;
1637 case PART_2NxnU:
1638 draw_PB_block(ctx,img,stride,xb, yb, CbSize ,CbSize/4, what,value);
1639 draw_PB_block(ctx,img,stride,xb, yb+CbSize/4 ,CbSize ,CbSize*3/4, what,value);
1640 break;
1641 case PART_2NxnD:
1642 draw_PB_block(ctx,img,stride,xb, yb, CbSize ,CbSize*3/4, what,value);
1643 draw_PB_block(ctx,img,stride,xb, yb+CbSize*3/4,CbSize ,CbSize/4, what,value);
1644 break;
1645 case PART_nLx2N:
1646 draw_PB_block(ctx,img,stride,xb, yb, CbSize/4 ,CbSize, what,value);
1647 draw_PB_block(ctx,img,stride,xb+CbSize/4 ,yb, CbSize*3/4,CbSize, what,value);
1648 break;
1649 case PART_nRx2N:
1650 draw_PB_block(ctx,img,stride,xb, yb, CbSize*3/4,CbSize, what,value);
1651 draw_PB_block(ctx,img,stride,xb+CbSize*3/4,yb, CbSize/4 ,CbSize, what,value);
1652 break;
1653 default:
1654 assert(false);
1655 break;
1656 }
1657 }
1658 else if (what==IntraPredMode) {
1659 enum PredMode predMode = get_pred_mode(ctx->img,sps,xb,yb);
1660 if (predMode == MODE_INTRA) {
1661 enum PartMode partMode = get_PartMode(ctx->img,sps,xb,yb);
1662
1663 int HalfCbSize = (1<<(log2CbSize-1));
1664
1665 switch (partMode) {
1666 case PART_2Nx2N:
1667 draw_intra_pred_mode(ctx,img,stride,xb,yb,log2CbSize,
1668 get_IntraPredMode(ctx->img,sps,xb,yb), value);
1669 break;
1670 case PART_NxN:
1671 draw_intra_pred_mode(ctx,img,stride,xb, yb, log2CbSize-1,
1672 get_IntraPredMode(ctx->img,sps,xb,yb), value);
1673 draw_intra_pred_mode(ctx,img,stride,xb+HalfCbSize,yb, log2CbSize-1,
1674 get_IntraPredMode(ctx->img,sps,xb+HalfCbSize,yb), value);
1675 draw_intra_pred_mode(ctx,img,stride,xb ,yb+HalfCbSize,log2CbSize-1,
1676 get_IntraPredMode(ctx->img,sps,xb,yb+HalfCbSize), value);
1677 draw_intra_pred_mode(ctx,img,stride,xb+HalfCbSize,yb+HalfCbSize,log2CbSize-1,
1678 get_IntraPredMode(ctx->img,sps,xb+HalfCbSize,yb+HalfCbSize), value);
1679 break;
1680 default:
1681 assert(false);
1682 break;
1683 }
1684 }
1685 }
1686 }
1687 }
1688
1689
1690 void draw_CB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value)
1691 {
1692 draw_tree_grid(ctx,img,stride,value, Partitioning_CB);
1693 }
1694
1695 void draw_TB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value)
1696 {
1697 draw_tree_grid(ctx,img,stride,value, Partitioning_TB);
1698 }
1699
1700 void draw_PB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value)
1701 {
1702 draw_tree_grid(ctx,img,stride,value, Partitioning_PB);
1703 }
1704
1705 void draw_intra_pred_modes(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value)
1706 {
1707 draw_tree_grid(ctx,img,stride,value, IntraPredMode);
1708 }
1709
1710 void draw_PB_pred_modes(const decoder_context* ctx, uint8_t* r, uint8_t* g, uint8_t* b, int stride)
1711 {
1712 draw_tree_grid(ctx,r,stride,0, PBPredMode);
1713 draw_tree_grid(ctx,g,stride,1, PBPredMode);
1714 draw_tree_grid(ctx,b,stride,2, PBPredMode);
1715 }
1716
1717
1718 void add_warning(decoder_context* ctx, de265_error warning, bool once)
1719 {
1720 // check if warning was already shown
1721 bool add=true;
1722 if (once) {
1723 for (int i=0;i<ctx->nWarningsShown;i++) {
1724 if (ctx->warnings_shown[i] == warning) {
1725 add=false;
1726 break;
1727 }
1728 }
1729 }
1730
1731 if (!add) {
1732 return;
1733 }
1734
1735
1736 // if this is a one-time warning, remember that it was shown
1737
1738 if (once) {
1739 if (ctx->nWarningsShown < MAX_WARNINGS) {
1740 ctx->warnings_shown[ctx->nWarningsShown++] = warning;
1741 }
1742 }
1743
1744
1745 // add warning to output queue
1746
1747 if (ctx->nWarnings == MAX_WARNINGS) {
1748 ctx->warnings[MAX_WARNINGS-1] = DE265_WARNING_WARNING_BUFFER_FULL;
1749 return;
1750 }
1751
1752 ctx->warnings[ctx->nWarnings++] = warning;
1753
1754 }
1755
1756 de265_error get_warning(decoder_context* ctx)
1757 {
1758 if (ctx->nWarnings==0) {
1759 return DE265_OK;
1760 }
1761
1762 de265_error warn = ctx->warnings[0];
1763 ctx->nWarnings--;
1764 memmove(ctx->warnings, &ctx->warnings[1], ctx->nWarnings*sizeof(de265_error));
1765
1766 return warn;
1767 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "decctx.h"
21 #include "util.h"
22 #include "sao.h"
23 #include "sei.h"
24 #include "deblock.h"
25
26 #include <string.h>
27 #include <assert.h>
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31
32 #include "fallback.h"
33
34 #ifdef HAVE_CONFIG_H
35 #include "config.h"
36 #endif
37
38 #ifdef HAVE_SSE4_1
39 #include "x86/sse.h"
40 #endif
41
42 #define SAVE_INTERMEDIATE_IMAGES 0
43
44 #if SAVE_INTERMEDIATE_IMAGES
45 #include "visualize.h"
46 #endif
47
48 extern void thread_decode_CTB_row(void* d);
49 extern void thread_decode_slice_segment(void* d);
50
51
52 thread_context::thread_context()
53 {
54 /*
55 CtbAddrInRS = 0;
56 CtbAddrInTS = 0;
57
58 CtbX = 0;
59 CtbY = 0;
60 */
61
62 /*
63 refIdx[0] = refIdx[1] = 0;
64 mvd[0][0] = mvd[0][1] = mvd[1][0] = mvd[1][1] = 0;
65 merge_flag = 0;
66 merge_idx = 0;
67 mvp_lX_flag[0] = mvp_lX_flag[1] = 0;
68 inter_pred_idc = 0;
69 */
70
71 /*
72 enum IntraPredMode IntraPredModeC; // chroma intra-prediction mode for current CB
73 */
74
75 /*
76 cu_transquant_bypass_flag = false;
77 memset(transform_skip_flag,0, 3*sizeof(uint8_t));
78 */
79
80
81 //memset(coeffList,0,sizeof(int16_t)*3*32*32);
82 //memset(coeffPos,0,sizeof(int16_t)*3*32*32);
83 //memset(nCoeff,0,sizeof(int16_t)*3);
84
85
86
87 IsCuQpDeltaCoded = false;
88 CuQpDelta = 0;
89
90 /*
91 currentQPY = 0;
92 currentQG_x = 0;
93 currentQG_y = 0;
94 lastQPYinPreviousQG = 0;
95 */
96
97 /*
98 qPYPrime = 0;
99 qPCbPrime = 0;
100 qPCrPrime = 0;
101 */
102
103 /*
104 memset(&cabac_decoder, 0, sizeof(CABAC_decoder));
105 memset(&ctx_model, 0, sizeof(ctx_model));
106 */
107
108 decctx = NULL;
109 img = NULL;
110 shdr = NULL;
111
112
113 //memset(this,0,sizeof(thread_context));
114
115 // some compilers/linkers don't align struct members correctly,
116 // adjust if necessary
117 int offset = (uintptr_t)_coeffBuf & 0x0f;
118
119 if (offset == 0) {
120 coeffBuf = (int16_t *) &_coeffBuf; // correctly aligned already
121 }
122 else {
123 coeffBuf = (int16_t *) (((uint8_t *)_coeffBuf) + (16-offset));
124 }
125
126 memset(coeffBuf, 0, 32*32*sizeof(int16_t));
127 }
128
129
130 slice_unit::slice_unit(decoder_context* decctx)
131 : ctx(decctx),
132 nal(NULL),
133 shdr(NULL),
134 flush_reorder_buffer(false),
135 thread_contexts(NULL),
136 imgunit(NULL)
137 {
138 state = Unprocessed;
139 }
140
141 slice_unit::~slice_unit()
142 {
143 ctx->nal_parser.free_NAL_unit(nal);
144
145 if (thread_contexts) {
146 delete[] thread_contexts;
147 }
148 }
149
150
151 void slice_unit::allocate_thread_contexts(int n)
152 {
153 assert(thread_contexts==NULL);
154
155 thread_contexts = new thread_context[n];
156 }
157
158
159 image_unit::image_unit()
160 {
161 img=NULL;
162 role=Invalid;
163 state=Unprocessed;
164 }
165
166
167 image_unit::~image_unit()
168 {
169 for (int i=0;i<slice_units.size();i++) {
170 delete slice_units[i];
171 }
172
173 for (int i=0;i<tasks.size();i++) {
174 delete tasks[i];
175 }
176 }
177
178
179 decoder_context::decoder_context()
180 {
181 //memset(ctx, 0, sizeof(decoder_context));
182
183 // --- parameters ---
184
185 param_sei_check_hash = false;
186 param_conceal_stream_errors = true;
187 param_suppress_faulty_pictures = false;
188
189 param_disable_deblocking = false;
190 param_disable_sao = false;
191 //param_disable_mc_residual_idct = false;
192 //param_disable_intra_residual_idct = false;
193
194 // --- processing ---
195
196 param_sps_headers_fd = -1;
197 param_vps_headers_fd = -1;
198 param_pps_headers_fd = -1;
199 param_slice_headers_fd = -1;
200
201 set_acceleration_functions(de265_acceleration_AUTO);
202
203 param_image_allocation_functions = de265_image::default_image_allocation;
204 param_image_allocation_userdata = NULL;
205
206 /*
207 memset(&vps, 0, sizeof(video_parameter_set)*DE265_MAX_VPS_SETS);
208 memset(&sps, 0, sizeof(seq_parameter_set) *DE265_MAX_SPS_SETS);
209 memset(&pps, 0, sizeof(pic_parameter_set) *DE265_MAX_PPS_SETS);
210 memset(&slice,0,sizeof(slice_segment_header)*DE265_MAX_SLICES);
211 */
212
213 current_vps = NULL;
214 current_sps = NULL;
215 current_pps = NULL;
216
217 //memset(&thread_pool,0,sizeof(struct thread_pool));
218 num_worker_threads = 0;
219
220
221 // frame-rate
222
223 limit_HighestTid = 6; // decode all temporal layers (up to layer 6)
224 framerate_ratio = 100; // decode all 100%
225
226 goal_HighestTid = 6;
227 current_HighestTid = 6;
228 layer_framerate_ratio = 100;
229
230 compute_framedrop_table();
231
232
233 //
234
235 current_image_poc_lsb = 0;
236 first_decoded_picture = 0;
237 NoRaslOutputFlag = 0;
238 HandleCraAsBlaFlag = 0;
239 FirstAfterEndOfSequenceNAL = 0;
240 PicOrderCntMsb = 0;
241 prevPicOrderCntLsb = 0;
242 prevPicOrderCntMsb = 0;
243 img = NULL;
244
245 /*
246 int PocLsbLt[MAX_NUM_REF_PICS];
247 int UsedByCurrPicLt[MAX_NUM_REF_PICS];
248 int DeltaPocMsbCycleLt[MAX_NUM_REF_PICS];
249 int CurrDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS];
250 int FollDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS];
251
252 int NumPocStCurrBefore;
253 int NumPocStCurrAfter;
254 int NumPocStFoll;
255 int NumPocLtCurr;
256 int NumPocLtFoll;
257
258 // These lists contain absolute POC values.
259 int PocStCurrBefore[MAX_NUM_REF_PICS]; // used for reference in current picture, smaller POC
260 int PocStCurrAfter[MAX_NUM_REF_PICS]; // used for reference in current picture, larger POC
261 int PocStFoll[MAX_NUM_REF_PICS]; // not used for reference in current picture, but in future picture
262 int PocLtCurr[MAX_NUM_REF_PICS]; // used in current picture
263 int PocLtFoll[MAX_NUM_REF_PICS]; // used in some future picture
264
265 // These lists contain indices into the DPB.
266 int RefPicSetStCurrBefore[DE265_DPB_SIZE];
267 int RefPicSetStCurrAfter[DE265_DPB_SIZE];
268 int RefPicSetStFoll[DE265_DPB_SIZE];
269 int RefPicSetLtCurr[DE265_DPB_SIZE];
270 int RefPicSetLtFoll[DE265_DPB_SIZE];
271
272
273 uint8_t nal_unit_type;
274
275 char IdrPicFlag;
276 char RapPicFlag;
277 */
278
279
280
281 // --- internal data ---
282
283 first_decoded_picture = true;
284 //ctx->FirstAfterEndOfSequenceNAL = true;
285 //ctx->last_RAP_picture_NAL_type = NAL_UNIT_UNDEFINED;
286
287 //de265_init_image(&ctx->coeff);
288
289 // --- decoded picture buffer ---
290
291 current_image_poc_lsb = -1; // any invalid number
292 }
293
294
295 decoder_context::~decoder_context()
296 {
297 while (!image_units.empty()) {
298 delete image_units.back();
299 image_units.pop_back();
300 }
301 }
302
303
304 void decoder_context::set_image_allocation_functions(de265_image_allocation* allocfunc,
305 void* userdata)
306 {
307 if (allocfunc) {
308 param_image_allocation_functions = *allocfunc;
309 param_image_allocation_userdata = userdata;
310 }
311 else {
312 assert(false); // actually, it makes no sense to reset the allocation functions
313
314 param_image_allocation_functions = de265_image::default_image_allocation;
315 param_image_allocation_userdata = NULL;
316 }
317 }
318
319
320 de265_error decoder_context::start_thread_pool(int nThreads)
321 {
322 ::start_thread_pool(&thread_pool, nThreads);
323
324 num_worker_threads = nThreads;
325
326 return DE265_OK;
327 }
328
329
330 void decoder_context::stop_thread_pool()
331 {
332 if (get_num_worker_threads()>0) {
333 //flush_thread_pool(&ctx->thread_pool);
334 ::stop_thread_pool(&thread_pool);
335 }
336 }
337
338
339 void decoder_context::reset()
340 {
341 if (num_worker_threads>0) {
342 //flush_thread_pool(&ctx->thread_pool);
343 ::stop_thread_pool(&thread_pool);
344 }
345
346 // --------------------------------------------------
347
348 #if 0
349 ctx->end_of_stream = false;
350 ctx->pending_input_NAL = NULL;
351 ctx->current_vps = NULL;
352 ctx->current_sps = NULL;
353 ctx->current_pps = NULL;
354 ctx->num_worker_threads = 0;
355 ctx->current_image_poc_lsb = 0;
356 ctx->first_decoded_picture = 0;
357 ctx->NoRaslOutputFlag = 0;
358 ctx->HandleCraAsBlaFlag = 0;
359 ctx->FirstAfterEndOfSequenceNAL = 0;
360 ctx->PicOrderCntMsb = 0;
361 ctx->prevPicOrderCntLsb = 0;
362 ctx->prevPicOrderCntMsb = 0;
363 ctx->NumPocStCurrBefore=0;
364 ctx->NumPocStCurrAfter=0;
365 ctx->NumPocStFoll=0;
366 ctx->NumPocLtCurr=0;
367 ctx->NumPocLtFoll=0;
368 ctx->nal_unit_type=0;
369 ctx->IdrPicFlag=0;
370 ctx->RapPicFlag=0;
371 #endif
372
373 img = NULL;
374
375
376 // TODO: remove all pending image_units
377
378
379 // --- decoded picture buffer ---
380
381 current_image_poc_lsb = -1; // any invalid number
382 first_decoded_picture = true;
383
384
385 // --- remove all pictures from output queue ---
386
387 // there was a bug where peek_next_image did not return NULL on empty output queues.
388 // This was (indirectly) fixed by recreating the DPB buffer, but clearing it like
389 // this should actually be sufficient.
390 // The error showed up while scrubbing the ToS video in VLC.
391 dpb.clear();
392
393 nal_parser.remove_pending_input_data();
394
395
396 while (!image_units.empty()) {
397 delete image_units.back();
398 image_units.pop_back();
399 }
400
401 // --- start threads again ---
402
403 if (num_worker_threads>0) {
404 // TODO: need error checking
405 start_thread_pool(num_worker_threads);
406 }
407 }
408
409 void decoder_context::set_acceleration_functions(enum de265_acceleration l)
410 {
411 // fill scalar functions first (so that function table is completely filled)
412
413 init_acceleration_functions_fallback(&acceleration);
414
415
416 // override functions with optimized variants
417
418 #ifdef HAVE_SSE4_1
419 if (l>=de265_acceleration_SSE) {
420 init_acceleration_functions_sse(&acceleration);
421 }
422 #endif
423 }
424
425
426 void decoder_context::init_thread_context(thread_context* tctx)
427 {
428 // zero scrap memory for coefficient blocks
429 memset(tctx->_coeffBuf, 0, sizeof(tctx->_coeffBuf)); // TODO: check if we can safely remove this
430
431 tctx->currentQG_x = -1;
432 tctx->currentQG_y = -1;
433
434
435
436 // --- find QPY that was active at the end of the previous slice ---
437
438 // find the previous CTB in TS order
439
440 const pic_parameter_set* pps = &tctx->img->pps;
441 const seq_parameter_set* sps = &tctx->img->sps;
442
443
444 if (tctx->shdr->slice_segment_address > 0) {
445 int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[tctx->shdr->slice_segment_address] -1 ];
446
447 int ctbX = prevCtb % sps->PicWidthInCtbsY;
448 int ctbY = prevCtb / sps->PicWidthInCtbsY;
449
450
451 // take the pixel at the bottom right corner (but consider that the image size might be smaller)
452
453 int x = ((ctbX+1) << sps->Log2CtbSizeY)-1;
454 int y = ((ctbY+1) << sps->Log2CtbSizeY)-1;
455
456 x = std::min(x,sps->pic_width_in_luma_samples-1);
457 y = std::min(y,sps->pic_height_in_luma_samples-1);
458
459 //printf("READ QPY: %d %d -> %d (should %d)\n",x,y,imgunit->img->get_QPY(x,y), tc.currentQPY);
460
461 //if (tctx->shdr->dependent_slice_segment_flag) { // TODO: do we need this condition ?
462 tctx->currentQPY = tctx->img->get_QPY(x,y);
463 //}
464 }
465 }
466
467
468 void decoder_context::add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream)
469 {
470 thread_task_ctb_row* task = new thread_task_ctb_row;
471 task->firstSliceSubstream = firstSliceSubstream;
472 task->tctx = tctx;
473 tctx->task = task;
474
475 add_task(&thread_pool, task);
476
477 tctx->imgunit->tasks.push_back(task);
478 }
479
480
481 void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream)
482 {
483 thread_task_slice_segment* task = new thread_task_slice_segment;
484 task->firstSliceSubstream = firstSliceSubstream;
485 task->tctx = tctx;
486 tctx->task = task;
487
488 add_task(&thread_pool, task);
489
490 tctx->imgunit->tasks.push_back(task);
491 }
492
493
494 de265_error decoder_context::read_vps_NAL(bitreader& reader)
495 {
496 logdebug(LogHeaders,"---> read VPS\n");
497
498 video_parameter_set vps;
499 de265_error err = ::read_vps(this,&reader,&vps);
500 if (err != DE265_OK) {
501 return err;
502 }
503
504 if (param_vps_headers_fd>=0) {
505 dump_vps(&vps, param_vps_headers_fd);
506 }
507
508 process_vps(&vps);
509
510 return DE265_OK;
511 }
512
513 de265_error decoder_context::read_sps_NAL(bitreader& reader)
514 {
515 logdebug(LogHeaders,"----> read SPS\n");
516
517 seq_parameter_set sps;
518 de265_error err;
519
520 if ((err=sps.read(this, &reader)) != DE265_OK) {
521 return err;
522 }
523
524 if (param_sps_headers_fd>=0) {
525 sps.dump_sps(param_sps_headers_fd);
526 }
527
528 process_sps(&sps);
529
530 return DE265_OK;
531 }
532
533 de265_error decoder_context::read_pps_NAL(bitreader& reader)
534 {
535 logdebug(LogHeaders,"----> read PPS\n");
536
537 pic_parameter_set pps;
538
539 bool success = pps.read(&reader,this);
540
541 if (param_pps_headers_fd>=0) {
542 pps.dump_pps(param_pps_headers_fd);
543 }
544
545 if (success) {
546 process_pps(&pps);
547 }
548
549 return success ? DE265_OK : DE265_WARNING_PPS_HEADER_INVALID;
550 }
551
552 de265_error decoder_context::read_sei_NAL(bitreader& reader, bool suffix)
553 {
554 logdebug(LogHeaders,"----> read SEI\n");
555
556 sei_message sei;
557
558 //push_current_picture_to_output_queue();
559
560 de265_error err = DE265_OK;
561
562 if ((err=read_sei(&reader,&sei, suffix, current_sps)) == DE265_OK) {
563 dump_sei(&sei, current_sps);
564
565 if (image_units.empty()==false && suffix) {
566 image_units.back()->suffix_SEIs.push_back(sei);
567 }
568 }
569 else {
570 add_warning(err, false);
571 }
572
573 return err;
574 }
575
576 de265_error decoder_context::read_eos_NAL(bitreader& reader)
577 {
578 FirstAfterEndOfSequenceNAL = true;
579 return DE265_OK;
580 }
581
582 de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, nal_header& nal_hdr)
583 {
584 logdebug(LogHeaders,"---> read slice segment header\n");
585
586
587 // --- read slice header ---
588
589 slice_segment_header* shdr = new slice_segment_header;
590 bool continueDecoding;
591 de265_error err = shdr->read(&reader,this, &continueDecoding);
592 if (!continueDecoding) {
593 if (img) { img->integrity = INTEGRITY_NOT_DECODED; }
594 delete shdr;
595 return err;
596 }
597
598 if (param_slice_headers_fd>=0) {
599 shdr->dump_slice_segment_header(this, param_slice_headers_fd);
600 }
601
602
603 if (process_slice_segment_header(this, shdr, &err, nal->pts, &nal_hdr, nal->user_data) == false)
604 {
605 img->integrity = INTEGRITY_NOT_DECODED;
606 delete shdr;
607 return err;
608 }
609
610 this->img->add_slice_segment_header(shdr);
611
612 skip_bits(&reader,1); // TODO: why?
613 prepare_for_CABAC(&reader);
614
615
616 // modify entry_point_offsets
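  // the offsets in the slice header refer to the raw NAL payload; subtract the bytes skipped before each offset (presumably the start-code emulation-prevention bytes removed by the NAL parser) so they match the stripped data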
617
618 int headerLength = reader.data - nal->data();
619 for (int i=0;i<shdr->num_entry_point_offsets;i++) {
620 shdr->entry_point_offset[i] -= nal->num_skipped_bytes_before(shdr->entry_point_offset[i],
621 headerLength);
622 }
623
624
625
626 // --- start a new image if this is the first slice ---
627
628 if (shdr->first_slice_segment_in_pic_flag) {
629 image_unit* imgunit = new image_unit;
630 imgunit->img = this->img;
631 image_units.push_back(imgunit);
632 }
633
634
635 // --- add slice to current picture ---
636
637 if ( ! image_units.empty() ) {
638
639 slice_unit* sliceunit = new slice_unit(this);
640 sliceunit->nal = nal;
641 sliceunit->shdr = shdr;
642 sliceunit->reader = reader;
643
644 sliceunit->flush_reorder_buffer = flush_reorder_buffer_at_this_frame;
645
646
647 image_units.back()->slice_units.push_back(sliceunit);
648 }
649
650 decode_some();
651
652 return DE265_OK;
653 }
654
655
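// small helper: O(n) pop-front for std::vector, shifting all remaining elements down by one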
656 template <class T> void pop_front(std::vector<T>& vec)
657 {
658 for (int i=1;i<vec.size();i++)
659 vec[i-1] = vec[i];
660
661 vec.pop_back();
662 }
663
664
665 de265_error decoder_context::decode_some()
666 {
667 de265_error err = DE265_OK;
668
669 if (0) {
670 static int cnt=0;
671 cnt++;
672 if (cnt<5) return DE265_OK;
673 }
674
675 if (image_units.empty()) { return DE265_OK; } // nothing to do
676
677
678 // decode something if there is work to do
679
680 if ( ! image_units.empty() && ! image_units[0]->slice_units.empty() ) {
681
682 image_unit* imgunit = image_units[0];
683 slice_unit* sliceunit = imgunit->slice_units[0];
684
685 pop_front(imgunit->slice_units);
686
687 if (sliceunit->flush_reorder_buffer) {
688 dpb.flush_reorder_buffer();
689 }
690
691 //err = decode_slice_unit_sequential(imgunit, sliceunit);
692 err = decode_slice_unit_parallel(imgunit, sliceunit);
693 if (err) {
694 return err;
695 }
696
697 delete sliceunit;
698 }
699
700
701
702 // if we have decoded all slices of the current image and no more slices
703 // will be added to it, output the image
704
705 if ( ( image_units.size()>=2 && image_units[0]->slice_units.empty() ) ||
706 ( image_units.size()>=1 && image_units[0]->slice_units.empty() &&
707 nal_parser.number_of_NAL_units_pending()==0 && nal_parser.is_end_of_stream() )) {
708
709 image_unit* imgunit = image_units[0];
710
711
712
713 // run post-processing filters (deblocking & SAO)
714
715 if (img->decctx->num_worker_threads)
716 run_postprocessing_filters_parallel(imgunit);
717 else
718 run_postprocessing_filters_sequential(imgunit->img);
719
720 // process suffix SEIs
721
722 for (int i=0;i<imgunit->suffix_SEIs.size();i++) {
723 const sei_message& sei = imgunit->suffix_SEIs[i];
724
725 err = process_sei(&sei, imgunit->img);
726 if (err != DE265_OK)
727 break;
728 }
729
730
731 push_picture_to_output_queue(imgunit);
732
733 // remove just decoded image unit from queue
734
735 delete imgunit;
736
737 pop_front(image_units);
738 }
739
740 return err;
741 }
742
743
744 de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit,
745 slice_unit* sliceunit)
746 {
747 de265_error err = DE265_OK;
748
749 /*
750 printf("decode slice POC=%d addr=%d, img=%p\n",
751 sliceunit->shdr->slice_pic_order_cnt_lsb,
752 sliceunit->shdr->slice_segment_address,
753 imgunit->img);
754 */
755
756 remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList);
757
758
759 struct thread_context tctx;
760
761 tctx.shdr = sliceunit->shdr;
762 tctx.img = imgunit->img;
763 tctx.decctx = this;
764 tctx.imgunit = imgunit;
765 tctx.CtbAddrInTS = imgunit->img->pps.CtbAddrRStoTS[tctx.shdr->slice_segment_address];
766 tctx.task = NULL;
767
768 init_thread_context(&tctx);
769
770 init_CABAC_decoder(&tctx.cabac_decoder,
771 sliceunit->reader.data,
772 sliceunit->reader.bytes_remaining);
773
774 // alloc CABAC-model array if entropy_coding_sync is enabled
775
776 if (pps->entropy_coding_sync_enabled_flag &&
777 sliceunit->shdr->first_slice_segment_in_pic_flag) {
778 imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) * CONTEXT_MODEL_TABLE_LENGTH );
779 }
780
781 if ((err=read_slice_segment_data(&tctx)) != DE265_OK)
782 { return err; }
783
784 return err;
785 }
786
787
788 de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit,
789 slice_unit* sliceunit)
790 {
791 de265_error err = DE265_OK;
792
793 remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList);
794
795
796
797 de265_image* img = imgunit->img;
798 const pic_parameter_set* pps = &img->pps;
799
800 bool use_WPP = (img->decctx->num_worker_threads > 0 &&
801 pps->entropy_coding_sync_enabled_flag);
802
803 bool use_tiles = (img->decctx->num_worker_threads > 0 &&
804 pps->tiles_enabled_flag);
805
806
807 // TODO: remove this warning later when we do frame-parallel decoding
808 if (img->decctx->num_worker_threads > 0 &&
809 pps->entropy_coding_sync_enabled_flag == false &&
810 pps->tiles_enabled_flag == false) {
811
812 img->decctx->add_warning(DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING, true);
813 }
814
815
816 // TODO: even though we cannot split this into several tasks, we should run it
817 // as a background thread
818 if (!use_WPP && !use_tiles) {
819 return decode_slice_unit_sequential(imgunit, sliceunit);
820 }
821
822
823 if (use_WPP && use_tiles) {
824 // TODO: this is not allowed ... output some warning or error
825 }
826
827
828 if (use_WPP) {
829 return decode_slice_unit_WPP(imgunit, sliceunit);
830 }
831 else if (use_tiles) {
832 return decode_slice_unit_tiles(imgunit, sliceunit);
833 }
834
835 assert(false);
836 return DE265_OK;
837 }
838
839
840 de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit,
841 slice_unit* sliceunit)
842 {
843 de265_error err = DE265_OK;
844
845 de265_image* img = imgunit->img;
846 slice_segment_header* shdr = sliceunit->shdr;
847 const pic_parameter_set* pps = &img->pps;
848
849 int nRows = shdr->num_entry_point_offsets +1;
850 int ctbsWidth = img->sps.PicWidthInCtbsY;
851
852
853 assert(img->num_threads_active() == 0);
854 img->thread_start(nRows);
855
856 //printf("-------- decode --------\n");
857
858
859 // reserve space to store entropy coding context models for each CTB row
860
861 if (shdr->first_slice_segment_in_pic_flag) {
862 // reserve space for nRows-1 because we don't need to save the CABAC model in the last CTB row
863 imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) * CONTEXT_MODEL_TABLE_LENGTH );
864 }
865
866
867 sliceunit->allocate_thread_contexts(nRows);
868
869
870 // first CTB in this slice
871 int ctbAddrRS = shdr->slice_segment_address;
872 int ctbRow = ctbAddrRS / ctbsWidth;
873
874 for (int entryPt=0;entryPt<nRows;entryPt++) {
875 // entry points other than the first start at CTB rows
876 if (entryPt>0) {
877 ctbRow++;
878 ctbAddrRS = ctbRow * ctbsWidth;
879 }
880
881
882 // prepare thread context
883
884 thread_context* tctx = sliceunit->get_thread_context(entryPt);
885
886 tctx->shdr = shdr;
887 tctx->decctx = img->decctx;
888 tctx->img = img;
889 tctx->imgunit = imgunit;
890 tctx->CtbAddrInTS = pps->CtbAddrRStoTS[ctbAddrRS];
891
892 init_thread_context(tctx);
893
894
895 // init CABAC
896
897 int dataStartIndex;
898 if (entryPt==0) { dataStartIndex=0; }
899 else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; }
900
901 int dataEnd;
902 if (entryPt==nRows-1) dataEnd = sliceunit->reader.bytes_remaining;
903 else dataEnd = shdr->entry_point_offset[entryPt];
904
905 init_CABAC_decoder(&tctx->cabac_decoder,
906 &sliceunit->reader.data[dataStartIndex],
907 dataEnd-dataStartIndex);
908
909 // add task
910
911 add_task_decode_CTB_row(tctx, entryPt==0);
912 }
913
914 #if 0
915 for (;;) {
916 printf("q:%d r:%d b:%d f:%d\n",
917 img->nThreadsQueued,
918 img->nThreadsRunning,
919 img->nThreadsBlocked,
920 img->nThreadsFinished);
921
922 if (img->debug_is_completed()) break;
923
924 usleep(1000);
925 }
926 #endif
927
928 img->wait_for_completion();
929
930 for (int i=0;i<imgunit->tasks.size();i++)
931 delete imgunit->tasks[i];
932 imgunit->tasks.clear();
933
934 return DE265_OK;
935 }
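/* Illustrative note on the substream segmentation above (values are hypothetical):
   the slice data is split by the entry_point_offset[] array. With nRows = 3 and
   entry_point_offset = {100, 250}, substream 0 decodes bytes [0,100), substream 1
   decodes [100,250), and the last substream decodes [250, bytes_remaining). */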
936
937 de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
938 slice_unit* sliceunit)
939 {
940 de265_error err = DE265_OK;
941
942 de265_image* img = imgunit->img;
943 slice_segment_header* shdr = sliceunit->shdr;
944 const pic_parameter_set* pps = &img->pps;
945
946 int nTiles = shdr->num_entry_point_offsets +1;
947 int ctbsWidth = img->sps.PicWidthInCtbsY;
948
949
950 assert(img->num_threads_active() == 0);
951 img->thread_start(nTiles);
952
953 sliceunit->allocate_thread_contexts(nTiles);
954
955
956 // first CTB in this slice
957 int ctbAddrRS = shdr->slice_segment_address;
958 int tileID = pps->TileIdRS[ctbAddrRS];
959
960 for (int entryPt=0;entryPt<nTiles;entryPt++) {
961      // entry points other than the first one start at the beginning of a tile
962 if (entryPt>0) {
963 tileID++;
964 int ctbX = pps->colBd[tileID % pps->num_tile_columns];
965 int ctbY = pps->rowBd[tileID / pps->num_tile_columns];
966 ctbAddrRS = ctbY * ctbsWidth + ctbX;
967 }
968
969 // set thread context
970
971 thread_context* tctx = sliceunit->get_thread_context(entryPt);
972
973 tctx->shdr = shdr;
974 tctx->decctx = img->decctx;
975 tctx->img = img;
976 tctx->imgunit = imgunit;
977 tctx->CtbAddrInTS = pps->CtbAddrRStoTS[ctbAddrRS];
978
979 init_thread_context(tctx);
980
981
982 // init CABAC
983
984 int dataStartIndex;
985 if (entryPt==0) { dataStartIndex=0; }
986 else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; }
987
988 int dataEnd;
989 if (entryPt==nTiles-1) dataEnd = sliceunit->reader.bytes_remaining;
990 else dataEnd = shdr->entry_point_offset[entryPt];
991
992 init_CABAC_decoder(&tctx->cabac_decoder,
993 &sliceunit->reader.data[dataStartIndex],
994 dataEnd-dataStartIndex);
995
996 // add task
997
998 add_task_decode_slice_segment(tctx, entryPt==0);
999 }
1000
1001 img->wait_for_completion();
1002
1003 for (int i=0;i<imgunit->tasks.size();i++)
1004 delete imgunit->tasks[i];
1005 imgunit->tasks.clear();
1006
1007 return DE265_OK;
1008 }
1009
1010
1011 de265_error decoder_context::decode_NAL(NAL_unit* nal)
1012 {
1013 //return decode_NAL_OLD(nal);
1014
1015 decoder_context* ctx = this;
1016
1017 de265_error err = DE265_OK;
1018
1019 bitreader reader;
1020 bitreader_init(&reader, nal->data(), nal->size());
1021
1022 nal_header nal_hdr;
1023 nal_read_header(&reader, &nal_hdr);
1024 ctx->process_nal_hdr(&nal_hdr);
1025
1026 loginfo(LogHighlevel,"NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n",
1027 nal->data()[0], nal->data()[1],
1028 get_NAL_name(nal_hdr.nal_unit_type),
1029 nal_hdr.nuh_temporal_id);
1030
1031 /*
1032 printf("NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n",
1033 nal->data()[0], nal->data()[1],
1034 get_NAL_name(nal_hdr.nal_unit_type),
1035 nal_hdr.nuh_temporal_id);
1036 */
1037
1038 // throw away NALs from higher TIDs than currently selected
1039 // TODO: better online switching of HighestTID
1040
1041 //printf("hTid: %d\n", current_HighestTid);
1042
1043 if (nal_hdr.nuh_temporal_id > current_HighestTid) {
1044 nal_parser.free_NAL_unit(nal);
1045 return DE265_OK;
1046 }
1047
1048
1049 if (nal_hdr.nal_unit_type<32) {
1050 err = read_slice_NAL(reader, nal, nal_hdr);
1051 }
1052 else switch (nal_hdr.nal_unit_type) {
1053 case NAL_UNIT_VPS_NUT:
1054 err = read_vps_NAL(reader);
1055 nal_parser.free_NAL_unit(nal);
1056 break;
1057
1058 case NAL_UNIT_SPS_NUT:
1059 err = read_sps_NAL(reader);
1060 nal_parser.free_NAL_unit(nal);
1061 break;
1062
1063 case NAL_UNIT_PPS_NUT:
1064 err = read_pps_NAL(reader);
1065 nal_parser.free_NAL_unit(nal);
1066 break;
1067
1068 case NAL_UNIT_PREFIX_SEI_NUT:
1069 case NAL_UNIT_SUFFIX_SEI_NUT:
1070 err = read_sei_NAL(reader, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT);
1071 nal_parser.free_NAL_unit(nal);
1072 break;
1073
1074 case NAL_UNIT_EOS_NUT:
1075 ctx->FirstAfterEndOfSequenceNAL = true;
1076 nal_parser.free_NAL_unit(nal);
1077 break;
1078 }
1079
1080 return err;
1081 }
1082
1083
1084 de265_error decoder_context::decode(int* more)
1085 {
1086 decoder_context* ctx = this;
1087
1088 // if the stream has ended, and no more NALs are to be decoded, flush all pictures
1089
1090 if (ctx->nal_parser.get_NAL_queue_length() == 0 &&
1091 ctx->nal_parser.is_end_of_stream() &&
1092 ctx->image_units.empty()) {
1093
1094 // flush all pending pictures into output queue
1095
1096 // ctx->push_current_picture_to_output_queue(); // TODO: not with new queue
1097 ctx->dpb.flush_reorder_buffer();
1098
1099 if (more) { *more = ctx->dpb.num_pictures_in_output_queue(); }
1100
1101 return DE265_OK;
1102 }
1103
1104
1105 // if NAL-queue is empty, we need more data
1106 // -> input stalled
1107
1108 if (ctx->nal_parser.is_end_of_stream() == false &&
1109 ctx->nal_parser.get_NAL_queue_length() == 0) {
1110 if (more) { *more=1; }
1111
1112 return DE265_ERROR_WAITING_FOR_INPUT_DATA;
1113 }
1114
1115
1116 // when there are no free image buffers in the DPB, pause decoding
1117 // -> output stalled
1118
1119 if (!ctx->dpb.has_free_dpb_picture(false)) {
1120 if (more) *more = 1;
1121 return DE265_ERROR_IMAGE_BUFFER_FULL;
1122 }
1123
1124
1125 // decode one NAL from the queue
1126
1127 de265_error err = DE265_OK;
1128
1129 if (ctx->nal_parser.number_of_NAL_units_pending()) {
1130 NAL_unit* nal = ctx->nal_parser.pop_from_NAL_queue();
1131 assert(nal);
1132 err = ctx->decode_NAL(nal);
1133 // ctx->nal_parser.free_NAL_unit(nal); TODO: do not free NAL with new loop
1134 }
1135 else {
1136 err = decode_some();
1137 }
1138
1139 if (more) {
1140 // decoding error is assumed to be unrecoverable
1141 *more = (err==DE265_OK);
1142 }
1143
1144 return err;
1145 }
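/* A minimal sketch of how this decode loop is typically driven (illustrative only,
   not the public C API; all members used here are declared in decctx.h):

     int more = 1;
     while (more) {
       de265_error err = ctx->decode(&more);
       if (err == DE265_ERROR_WAITING_FOR_INPUT_DATA) {
         // feed more NAL data into ctx->nal_parser, then retry
         continue;
       }

       while (ctx->num_pictures_in_output_queue() > 0) {
         de265_image* out = ctx->get_next_picture_in_output_queue();
         // ... consume 'out' ...
         ctx->pop_next_picture_in_output_queue();
       }
     }
*/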
1146
1147
1148 void decoder_context::process_nal_hdr(nal_header* nal)
1149 {
1150 nal_unit_type = nal->nal_unit_type;
1151
1152 IdrPicFlag = (nal->nal_unit_type == NAL_UNIT_IDR_W_RADL ||
1153 nal->nal_unit_type == NAL_UNIT_IDR_N_LP);
1154
1155 RapPicFlag = (nal->nal_unit_type >= 16 &&
1156 nal->nal_unit_type <= 23);
1157 }
1158
1159
1160 void decoder_context::process_vps(video_parameter_set* vps)
1161 {
1162 this->vps[ vps->video_parameter_set_id ] = *vps;
1163 }
1164
1165
1166 void decoder_context::process_sps(seq_parameter_set* sps)
1167 {
1168 //push_current_picture_to_output_queue();
1169
1170 this->sps[ sps->seq_parameter_set_id ] = *sps;
1171 }
1172
1173
1174 void decoder_context::process_pps(pic_parameter_set* pps)
1175 {
1176 //push_current_picture_to_output_queue();
1177
1178 this->pps[ (int)pps->pic_parameter_set_id ] = *pps;
1179 }
1180
1181
1182 /* 8.3.1
1183 */
1184 void decoder_context::process_picture_order_count(decoder_context* ctx, slice_segment_header* hdr)
1185 {
1186 loginfo(LogHeaders,"POC computation. lsb:%d prev.pic.lsb:%d msb:%d\n",
1187 hdr->slice_pic_order_cnt_lsb,
1188 ctx->prevPicOrderCntLsb,
1189 ctx->PicOrderCntMsb);
1190
1191 if (isIRAP(ctx->nal_unit_type) &&
1192 ctx->NoRaslOutputFlag)
1193 {
1194 ctx->PicOrderCntMsb=0;
1195
1196
1197 // flush all images from reorder buffer
1198
1199 flush_reorder_buffer_at_this_frame = true;
1200 //ctx->dpb.flush_reorder_buffer();
1201 }
1202 else
1203 {
1204 int MaxPicOrderCntLsb = ctx->current_sps->MaxPicOrderCntLsb;
1205
1206 if ((hdr->slice_pic_order_cnt_lsb < ctx->prevPicOrderCntLsb) &&
1207 (ctx->prevPicOrderCntLsb - hdr->slice_pic_order_cnt_lsb) >= MaxPicOrderCntLsb/2) {
1208 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb + MaxPicOrderCntLsb;
1209 }
1210 else if ((hdr->slice_pic_order_cnt_lsb > ctx->prevPicOrderCntLsb) &&
1211 (hdr->slice_pic_order_cnt_lsb - ctx->prevPicOrderCntLsb) > MaxPicOrderCntLsb/2) {
1212 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb - MaxPicOrderCntLsb;
1213 }
1214 else {
1215 ctx->PicOrderCntMsb = ctx->prevPicOrderCntMsb;
1216 }
1217 }
1218
1219 ctx->img->PicOrderCntVal = ctx->PicOrderCntMsb + hdr->slice_pic_order_cnt_lsb;
1220 ctx->img->picture_order_cnt_lsb = hdr->slice_pic_order_cnt_lsb;
1221
1222 loginfo(LogHeaders,"POC computation. new msb:%d POC=%d\n",
1223 ctx->PicOrderCntMsb,
1224 ctx->img->PicOrderCntVal);
1225
1226 if (ctx->img->nal_hdr.nuh_temporal_id==0 &&
1227 (isReferenceNALU(ctx->nal_unit_type) &&
1228 (!isRASL(ctx->nal_unit_type) && !isRADL(ctx->nal_unit_type))) &&
1229 1 /* sub-layer non-reference picture */) // TODO
1230 {
1231 loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n");
1232
1233 ctx->prevPicOrderCntLsb = hdr->slice_pic_order_cnt_lsb;
1234 ctx->prevPicOrderCntMsb = ctx->PicOrderCntMsb;
1235 }
1236 }
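/* Worked example of the MSB wrap-around handling above (illustrative values):
   with MaxPicOrderCntLsb = 256, prevPicOrderCntLsb = 250 and prevPicOrderCntMsb = 0,
   a non-IRAP slice with slice_pic_order_cnt_lsb = 4 satisfies (4 < 250) and
   (250-4 >= 256/2), so PicOrderCntMsb becomes 0 + 256 = 256 and
   PicOrderCntVal = 256 + 4 = 260. */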
1237
1238
1239 /* 8.3.3.2
1240 Returns DPB index of the generated picture.
1241 */
1242 int decoder_context::generate_unavailable_reference_picture(decoder_context* ctx,
1243 const seq_parameter_set* sps,
1244 int POC, bool longTerm)
1245 {
1246 assert(ctx->dpb.has_free_dpb_picture(true));
1247
1248 int idx = ctx->dpb.new_image(ctx->current_sps, this);
1249 assert(idx>=0);
1250 //printf("-> fill with unavailable POC %d\n",POC);
1251
1252 de265_image* img = ctx->dpb.get_image(idx);
1253
1254 img->fill_image(1<<(sps->BitDepth_Y-1),
1255 1<<(sps->BitDepth_C-1),
1256 1<<(sps->BitDepth_C-1));
1257
1258 img->fill_pred_mode(MODE_INTRA);
1259
1260 img->PicOrderCntVal = POC;
1261 img->picture_order_cnt_lsb = POC & (sps->MaxPicOrderCntLsb-1);
1262 img->PicOutputFlag = false;
1263 img->PicState = (longTerm ? UsedForLongTermReference : UsedForShortTermReference);
1264 img->integrity = INTEGRITY_UNAVAILABLE_REFERENCE;
1265
1266 return idx;
1267 }
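/* Note: for the common 8-bit case, the fill values above are 1<<(8-1) = 128 for
   the luma and both chroma planes, i.e. the concealment picture is mid-level grey. */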
1268
1269
1270 /* 8.3.2 invoked once per picture
1271
1272 This function will mark pictures in the DPB as 'unused' or 'used for long-term reference'
1273 */
1274 void decoder_context::process_reference_picture_set(decoder_context* ctx, slice_segment_header* hdr)
1275 {
1276 std::vector<int> removeReferencesList;
1277
1278 const int currentID = ctx->img->get_ID();
1279
1280
1281 if (isIRAP(ctx->nal_unit_type) && ctx->NoRaslOutputFlag) {
1282
1283 int currentPOC = ctx->img->PicOrderCntVal;
1284
1285 // reset DPB
1286
1287 /* The standard says: "When the current picture is an IRAP picture with NoRaslOutputFlag
1288 equal to 1, all reference pictures currently in the DPB (if any) are marked as
1289 "unused for reference".
1290
1291 This seems to be wrong as it also throws out the first CRA picture in a stream like
1292 RAP_A (decoding order: CRA,POC=64, RASL,POC=60). Removing only the pictures with
1293 lower POCs seems to be compliant to the reference decoder.
1294 */
1295
1296 for (int i=0;i<dpb.size();i++) {
1297 de265_image* img = ctx->dpb.get_image(i);
1298
1299 if (img->PicState != UnusedForReference &&
1300 img->PicOrderCntVal < currentPOC &&
1301 img->removed_at_picture_id > ctx->img->get_ID()) {
1302
1303 removeReferencesList.push_back(img->get_ID());
1304 img->removed_at_picture_id = ctx->img->get_ID();
1305
1306 //printf("will remove ID %d (a)\n",img->get_ID());
1307 }
1308 }
1309 }
1310
1311
1312 if (isIDR(ctx->nal_unit_type)) {
1313
1314 // clear all reference pictures
1315
1316 ctx->NumPocStCurrBefore = 0;
1317 ctx->NumPocStCurrAfter = 0;
1318 ctx->NumPocStFoll = 0;
1319 ctx->NumPocLtCurr = 0;
1320 ctx->NumPocLtFoll = 0;
1321 }
1322 else {
1323 const ref_pic_set* rps = &hdr->CurrRps;
1324
1325 // (8-98)
1326
1327 int i,j,k;
1328
1329 // scan ref-pic-set for smaller POCs and fill into PocStCurrBefore / PocStFoll
1330
1331 for (i=0, j=0, k=0;
1332 i<rps->NumNegativePics;
1333 i++)
1334 {
1335 if (rps->UsedByCurrPicS0[i]) {
1336 ctx->PocStCurrBefore[j++] = ctx->img->PicOrderCntVal + rps->DeltaPocS0[i];
1337 //printf("PocStCurrBefore = %d\n",ctx->PocStCurrBefore[j-1]);
1338 }
1339 else {
1340 ctx->PocStFoll[k++] = ctx->img->PicOrderCntVal + rps->DeltaPocS0[i];
1341 }
1342 }
1343
1344 ctx->NumPocStCurrBefore = j;
1345
1346
1347 // scan ref-pic-set for larger POCs and fill into PocStCurrAfter / PocStFoll
1348
1349 for (i=0, j=0;
1350 i<rps->NumPositivePics;
1351 i++)
1352 {
1353 if (rps->UsedByCurrPicS1[i]) {
1354 ctx->PocStCurrAfter[j++] = ctx->img->PicOrderCntVal + rps->DeltaPocS1[i];
1355 //printf("PocStCurrAfter = %d\n",ctx->PocStCurrAfter[j-1]);
1356 }
1357 else {
1358 ctx->PocStFoll[k++] = ctx->img->PicOrderCntVal + rps->DeltaPocS1[i];
1359 }
1360 }
1361
1362 ctx->NumPocStCurrAfter = j;
1363 ctx->NumPocStFoll = k;
1364
1365
1366 // find used / future long-term references
1367
1368 for (i=0, j=0, k=0;
1369 //i<ctx->current_sps->num_long_term_ref_pics_sps + hdr->num_long_term_pics;
1370 i<hdr->num_long_term_sps + hdr->num_long_term_pics;
1371 i++)
1372 {
1373 int pocLt = ctx->PocLsbLt[i];
1374
1375 if (hdr->delta_poc_msb_present_flag[i]) {
1376 int currentPictureMSB = ctx->img->PicOrderCntVal - hdr->slice_pic_order_cnt_lsb;
1377 pocLt += currentPictureMSB
1378 - ctx->DeltaPocMsbCycleLt[i] * ctx->current_sps->MaxPicOrderCntLsb;
1379 }
1380
1381 if (ctx->UsedByCurrPicLt[i]) {
1382 ctx->PocLtCurr[j] = pocLt;
1383 ctx->CurrDeltaPocMsbPresentFlag[j] = hdr->delta_poc_msb_present_flag[i];
1384 j++;
1385 }
1386 else {
1387 ctx->PocLtFoll[k] = pocLt;
1388 ctx->FollDeltaPocMsbPresentFlag[k] = hdr->delta_poc_msb_present_flag[i];
1389 k++;
1390 }
1391 }
1392
1393 ctx->NumPocLtCurr = j;
1394 ctx->NumPocLtFoll = k;
1395 }
1396
1397
1398 // (old 8-99) / (new 8-106)
1399 // 1.
1400
1401 std::vector<bool> picInAnyList(dpb.size(), false);
1402
1403
1404 dpb.log_dpb_content();
1405
1406 for (int i=0;i<ctx->NumPocLtCurr;i++) {
1407 int k;
1408 if (!ctx->CurrDeltaPocMsbPresentFlag[i]) {
1409 k = ctx->dpb.DPB_index_of_picture_with_LSB(ctx->PocLtCurr[i], currentID, true);
1410 }
1411 else {
1412 k = ctx->dpb.DPB_index_of_picture_with_POC(ctx->PocLtCurr[i], currentID, true);
1413 }
1414
1415 ctx->RefPicSetLtCurr[i] = k; // -1 == "no reference picture"
1416 if (k>=0) picInAnyList[k]=true;
1417 else {
1418 // TODO, CHECK: is it ok that we generate a picture with POC = LSB (PocLtCurr)
1419 // We do not know the correct MSB
1420 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
1421 ctx->PocLtCurr[i], true);
1422 ctx->RefPicSetLtCurr[i] = k = concealedPicture;
1423 picInAnyList[concealedPicture]=true;
1424 }
1425
1426 if (ctx->dpb.get_image(k)->integrity != INTEGRITY_CORRECT) {
1427 ctx->img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE;
1428 }
1429 }
1430
1431
1432 for (int i=0;i<ctx->NumPocLtFoll;i++) {
1433 int k;
1434 if (!ctx->FollDeltaPocMsbPresentFlag[i]) {
1435 k = ctx->dpb.DPB_index_of_picture_with_LSB(ctx->PocLtFoll[i], currentID, true);
1436 }
1437 else {
1438 k = ctx->dpb.DPB_index_of_picture_with_POC(ctx->PocLtFoll[i], currentID, true);
1439 }
1440
1441 ctx->RefPicSetLtFoll[i] = k; // -1 == "no reference picture"
1442 if (k>=0) picInAnyList[k]=true;
1443 else {
1444 int concealedPicture = k = generate_unavailable_reference_picture(ctx, ctx->current_sps,
1445 ctx->PocLtFoll[i], true);
1446 ctx->RefPicSetLtFoll[i] = concealedPicture;
1447 picInAnyList[concealedPicture]=true;
1448 }
1449 }
1450
1451
1452 // 2. Mark all pictures in RefPicSetLtCurr / RefPicSetLtFoll as UsedForLongTermReference
1453
1454 for (int i=0;i<ctx->NumPocLtCurr;i++) {
1455 ctx->dpb.get_image(ctx->RefPicSetLtCurr[i])->PicState = UsedForLongTermReference;
1456 }
1457
1458 for (int i=0;i<ctx->NumPocLtFoll;i++) {
1459 ctx->dpb.get_image(ctx->RefPicSetLtFoll[i])->PicState = UsedForLongTermReference;
1460 }
1461
1462
1463 // 3.
1464
1465 for (int i=0;i<ctx->NumPocStCurrBefore;i++) {
1466 int k = ctx->dpb.DPB_index_of_picture_with_POC(ctx->PocStCurrBefore[i], currentID);
1467
1468 //printf("st curr before, poc=%d -> idx=%d\n",ctx->PocStCurrBefore[i], k);
1469
1470 ctx->RefPicSetStCurrBefore[i] = k; // -1 == "no reference picture"
1471 if (k>=0) picInAnyList[k]=true;
1472 else {
1473 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
1474 ctx->PocStCurrBefore[i], false);
1475 ctx->RefPicSetStCurrBefore[i] = k = concealedPicture;
1476 picInAnyList[concealedPicture]=true;
1477
1478 //printf(" concealed: %d\n", concealedPicture);
1479 }
1480
1481 if (ctx->dpb.get_image(k)->integrity != INTEGRITY_CORRECT) {
1482 ctx->img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE;
1483 }
1484 }
1485
1486 for (int i=0;i<ctx->NumPocStCurrAfter;i++) {
1487 int k = ctx->dpb.DPB_index_of_picture_with_POC(ctx->PocStCurrAfter[i], currentID);
1488
1489 //printf("st curr after, poc=%d -> idx=%d\n",ctx->PocStCurrAfter[i], k);
1490
1491 ctx->RefPicSetStCurrAfter[i] = k; // -1 == "no reference picture"
1492 if (k>=0) picInAnyList[k]=true;
1493 else {
1494 int concealedPicture = generate_unavailable_reference_picture(ctx, ctx->current_sps,
1495 ctx->PocStCurrAfter[i], false);
1496 ctx->RefPicSetStCurrAfter[i] = k = concealedPicture;
1497 picInAnyList[concealedPicture]=true;
1498
1499 //printf(" concealed: %d\n", concealedPicture);
1500 }
1501
1502 if (ctx->dpb.get_image(k)->integrity != INTEGRITY_CORRECT) {
1503 ctx->img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE;
1504 }
1505 }
1506
1507 for (int i=0;i<ctx->NumPocStFoll;i++) {
1508 int k = ctx->dpb.DPB_index_of_picture_with_POC(ctx->PocStFoll[i], currentID);
1509 // if (k<0) { assert(false); } // IGNORE
1510
1511 ctx->RefPicSetStFoll[i] = k; // -1 == "no reference picture"
1512 if (k>=0) picInAnyList[k]=true;
1513 }
1514
1515 // 4. any picture that is not marked for reference is put into the "UnusedForReference" state
1516
1517 for (int i=0;i<dpb.size();i++)
1518 if (!picInAnyList[i]) // no reference
1519 {
1520 de265_image* dpbimg = ctx->dpb.get_image(i);
1521 if (dpbimg != ctx->img && // not the current picture
1522 dpbimg->removed_at_picture_id > ctx->img->get_ID()) // has not been removed before
1523 {
1524 if (dpbimg->PicState != UnusedForReference) {
1525 removeReferencesList.push_back(dpbimg->get_ID());
1526 //printf("will remove ID %d (b)\n",dpbimg->get_ID());
1527
1528 dpbimg->removed_at_picture_id = ctx->img->get_ID();
1529 }
1530 }
1531 }
1532
1533 hdr->RemoveReferencesList = removeReferencesList;
1534
1535 //remove_images_from_dpb(hdr->RemoveReferencesList);
1536 }
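/* Illustrative example of the (8-98) derivation above (values are hypothetical):
   for a current picture with PicOrderCntVal = 8, DeltaPocS0 = {-4,-8} with
   UsedByCurrPicS0 = {1,0}, and DeltaPocS1 = {+4} with UsedByCurrPicS1 = {1},
   the scan yields PocStCurrBefore = {4}, PocStCurrAfter = {12} and PocStFoll = {0}. */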
1537
1538
1539 // 8.3.4
1540 // Returns whether we can continue decoding (or whether there is a severe error).
1541 /* Called at beginning of each slice.
1542
1543 Constructs
1544 - the RefPicList[2][], containing indices into the DPB, and
1545 - the RefPicList_POC[2][], containing POCs.
1546 - LongTermRefPic[2][] is also set to true if it is a long-term reference
1547 */
1548 bool decoder_context::construct_reference_picture_lists(decoder_context* ctx, slice_segment_header* hdr)
1549 {
1550 int NumPocTotalCurr = hdr->NumPocTotalCurr;
1551 int NumRpsCurrTempList0 = libde265_max(hdr->num_ref_idx_l0_active, NumPocTotalCurr);
1552
1553 // TODO: fold code for both lists together
1554
1555 int RefPicListTemp0[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ?
1556 int RefPicListTemp1[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ?
1557 char isLongTerm[2][3*MAX_NUM_REF_PICS];
1558
1559 memset(isLongTerm,0,2*3*MAX_NUM_REF_PICS);
1560
1561 /* --- Fill RefPicListTmp0 with reference pictures in this order:
1562 1) short term, past POC
1563 2) short term, future POC
1564 3) long term
1565 */
1566
1567 int rIdx=0;
1568 while (rIdx < NumRpsCurrTempList0) {
1569 for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList0; rIdx++,i++)
1570 RefPicListTemp0[rIdx] = ctx->RefPicSetStCurrBefore[i];
1571
1572 for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList0; rIdx++,i++)
1573 RefPicListTemp0[rIdx] = ctx->RefPicSetStCurrAfter[i];
1574
1575 for (int i=0;i<ctx->NumPocLtCurr && rIdx<NumRpsCurrTempList0; rIdx++,i++) {
1576 RefPicListTemp0[rIdx] = ctx->RefPicSetLtCurr[i];
1577 isLongTerm[0][rIdx] = true;
1578 }
1579
1580 // This check is to prevent an endless loop when no images are added above.
1581 if (rIdx==0) {
1582 ctx->add_warning(DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST, false);
1583 return false;
1584 }
1585 }
1586
1587 if (hdr->num_ref_idx_l0_active > 15) {
1588 ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
1589 return false;
1590 }
1591
1592 for (rIdx=0; rIdx<hdr->num_ref_idx_l0_active; rIdx++) {
1593 int idx = hdr->ref_pic_list_modification_flag_l0 ? hdr->list_entry_l0[rIdx] : rIdx;
1594
1595 hdr->RefPicList[0][rIdx] = RefPicListTemp0[idx];
1596 hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx];
1597
1598 // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector)
1599 hdr->RefPicList_POC[0][rIdx] = ctx->dpb.get_image(hdr->RefPicList[0][rIdx])->PicOrderCntVal;
1600 hdr->RefPicList_PicState[0][rIdx] = ctx->dpb.get_image(hdr->RefPicList[0][rIdx])->PicState;
1601 }
1602
1603
1604 /* --- Fill RefPicListTmp1 with reference pictures in this order:
1605 1) short term, future POC
1606 2) short term, past POC
1607 3) long term
1608 */
1609
1610 if (hdr->slice_type == SLICE_TYPE_B) {
1611 int NumRpsCurrTempList1 = libde265_max(hdr->num_ref_idx_l1_active, NumPocTotalCurr);
1612
1613 int rIdx=0;
1614 while (rIdx < NumRpsCurrTempList1) {
1615 for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList1; rIdx++,i++)
1616 RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrAfter[i];
1617
1618 for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList1; rIdx++,i++)
1619 RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrBefore[i];
1620
1621 for (int i=0;i<ctx->NumPocLtCurr && rIdx<NumRpsCurrTempList1; rIdx++,i++) {
1622 RefPicListTemp1[rIdx] = ctx->RefPicSetLtCurr[i];
1623 isLongTerm[1][rIdx] = true;
1624 }
1625 }
1626
1627 assert(hdr->num_ref_idx_l1_active <= 15);
1628 for (rIdx=0; rIdx<hdr->num_ref_idx_l1_active; rIdx++) {
1629 int idx = hdr->ref_pic_list_modification_flag_l1 ? hdr->list_entry_l1[rIdx] : rIdx;
1630
1631 hdr->RefPicList[1][rIdx] = RefPicListTemp1[idx];
1632 hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx];
1633
1634      // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector)
1635 hdr->RefPicList_POC[1][rIdx] = ctx->dpb.get_image(hdr->RefPicList[1][rIdx])->PicOrderCntVal;
1636 hdr->RefPicList_PicState[1][rIdx] = ctx->dpb.get_image(hdr->RefPicList[1][rIdx])->PicState;
1637 }
1638 }
1639
1640
1641 // show reference picture lists
1642
1643 loginfo(LogHeaders,"RefPicList[0] =");
1644 for (rIdx=0; rIdx<hdr->num_ref_idx_l0_active; rIdx++) {
1645 loginfo(LogHeaders,"* [%d]=%d (LT=%d)",
1646 hdr->RefPicList[0][rIdx],
1647 hdr->RefPicList_POC[0][rIdx],
1648 hdr->LongTermRefPic[0][rIdx]
1649 );
1650 }
1651 loginfo(LogHeaders,"*\n");
1652
1653 if (hdr->slice_type == SLICE_TYPE_B) {
1654 loginfo(LogHeaders,"RefPicList[1] =");
1655 for (rIdx=0; rIdx<hdr->num_ref_idx_l1_active; rIdx++) {
1656 loginfo(LogHeaders,"* [%d]=%d (LT=%d)",
1657 hdr->RefPicList[1][rIdx],
1658 hdr->RefPicList_POC[1][rIdx],
1659 hdr->LongTermRefPic[1][rIdx]
1660 );
1661 }
1662 loginfo(LogHeaders,"*\n");
1663 }
1664
1665 return true;
1666 }
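/* Illustrative example of the list-0 construction above (hypothetical DPB indices):
   with RefPicSetStCurrBefore = {A}, RefPicSetStCurrAfter = {B}, no long-term
   references and num_ref_idx_l0_active = 3, the temporary list wraps around to
   {A, B, A}; without ref_pic_list_modification it is copied as-is into RefPicList[0]. */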
1667
1668
1669
1670 void decoder_context::run_postprocessing_filters_sequential(de265_image* img)
1671 {
1672 #if SAVE_INTERMEDIATE_IMAGES
1673 char buf[1000];
1674 sprintf(buf,"pre-lf-%05d.yuv", img->PicOrderCntVal);
1675 write_picture_to_file(img, buf);
1676 #endif
1677
1678 if (!img->decctx->param_disable_deblocking) {
1679 apply_deblocking_filter(img);
1680 }
1681
1682 #if SAVE_INTERMEDIATE_IMAGES
1683 sprintf(buf,"pre-sao-%05d.yuv", img->PicOrderCntVal);
1684 write_picture_to_file(img, buf);
1685 #endif
1686
1687 if (!img->decctx->param_disable_sao) {
1688 apply_sample_adaptive_offset_sequential(img);
1689 }
1690
1691 #if SAVE_INTERMEDIATE_IMAGES
1692 sprintf(buf,"sao-%05d.yuv", img->PicOrderCntVal);
1693 write_picture_to_file(img, buf);
1694 #endif
1695 }
1696
1697
1698 void decoder_context::run_postprocessing_filters_parallel(image_unit* imgunit)
1699 {
1700 de265_image* img = imgunit->img;
1701
1702 int saoWaitsForProgress = CTB_PROGRESS_PREFILTER;
1703 bool waitForCompletion = false;
1704
1705 if (!img->decctx->param_disable_deblocking) {
1706 add_deblocking_tasks(imgunit);
1707 saoWaitsForProgress = CTB_PROGRESS_DEBLK_H;
1708 }
1709
1710 if (!img->decctx->param_disable_sao) {
1711 waitForCompletion |= add_sao_tasks(imgunit, saoWaitsForProgress);
1712 //apply_sample_adaptive_offset(img);
1713 }
1714
1715 img->wait_for_completion();
1716 }
1717
1718 /*
1719 void decoder_context::push_current_picture_to_output_queue()
1720 {
1721 push_picture_to_output_queue(img);
1722 }
1723 */
1724
1725 de265_error decoder_context::push_picture_to_output_queue(image_unit* imgunit)
1726 {
1727 de265_image* outimg = imgunit->img;
1728
1729 if (outimg==NULL) { return DE265_OK; }
1730
1731
1732 // push image into output queue
1733
1734 if (outimg->PicOutputFlag) {
1735 loginfo(LogDPB,"new picture has output-flag=true\n");
1736
1737 if (outimg->integrity != INTEGRITY_CORRECT &&
1738 param_suppress_faulty_pictures) {
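      // faulty picture is suppressed: it is simply not inserted into the reorder buffer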
1739 }
1740 else {
1741 dpb.insert_image_into_reorder_buffer(outimg);
1742 }
1743
1744 loginfo(LogDPB,"push image %d into reordering queue\n", outimg->PicOrderCntVal);
1745 }
1746
1747 // check for full reorder buffers
1748
1749 int sublayer = outimg->vps.vps_max_sub_layers -1;
1750 int maxNumPicsInReorderBuffer = outimg->vps.layer[sublayer].vps_max_num_reorder_pics;
1751
1752 if (dpb.num_pictures_in_reorder_buffer() > maxNumPicsInReorderBuffer) {
1753 dpb.output_next_picture_in_reorder_buffer();
1754 }
1755
1756 dpb.log_dpb_queues();
1757
1758 return DE265_OK;
1759 }
1760
1761
1762 // returns whether we can continue decoding the stream or whether we should give up
1763 bool decoder_context::process_slice_segment_header(decoder_context* ctx, slice_segment_header* hdr,
1764 de265_error* err, de265_PTS pts,
1765 nal_header* nal_hdr,
1766 void* user_data)
1767 {
1768 *err = DE265_OK;
1769
1770 flush_reorder_buffer_at_this_frame = false;
1771
1772
1773 // get PPS and SPS for this slice
1774
1775 int pps_id = hdr->slice_pic_parameter_set_id;
1776 if (ctx->pps[pps_id].pps_read==false) {
1777 logerror(LogHeaders, "PPS %d has not been read\n", pps_id);
1778 assert(false); // TODO
1779 }
1780
1781 ctx->current_pps = &ctx->pps[pps_id];
1782 ctx->current_sps = &ctx->sps[ (int)ctx->current_pps->seq_parameter_set_id ];
1783 ctx->current_vps = &ctx->vps[ (int)ctx->current_sps->video_parameter_set_id ];
1784
1785 calc_tid_and_framerate_ratio();
1786
1787
1788 // --- prepare decoding of new picture ---
1789
1790 if (hdr->first_slice_segment_in_pic_flag) {
1791
1792 // previous picture has been completely decoded
1793
1794 //ctx->push_current_picture_to_output_queue();
1795
1796 ctx->current_image_poc_lsb = hdr->slice_pic_order_cnt_lsb;
1797
1798
1799 seq_parameter_set* sps = ctx->current_sps;
1800
1801
1802 // --- find and allocate image buffer for decoding ---
1803
1804 int image_buffer_idx;
1805 image_buffer_idx = ctx->dpb.new_image(sps, this);
1806 if (image_buffer_idx == -1) {
1807 *err = DE265_ERROR_IMAGE_BUFFER_FULL;
1808 return false;
1809 }
1810
1811 de265_image* img = ctx->dpb.get_image(image_buffer_idx);
1812 img->pts = pts;
1813 img->user_data = user_data;
1814 img->nal_hdr = *nal_hdr;
1815 ctx->img = img;
1816
1817 img->vps = *ctx->current_vps;
1818 img->sps = *ctx->current_sps;
1819 img->pps = *ctx->current_pps;
1820 img->decctx = ctx;
1821
1822 img->clear_metadata();
1823
1824
1825 if (isIRAP(ctx->nal_unit_type)) {
1826 if (isIDR(ctx->nal_unit_type) ||
1827 isBLA(ctx->nal_unit_type) ||
1828 ctx->first_decoded_picture ||
1829 ctx->FirstAfterEndOfSequenceNAL)
1830 {
1831 ctx->NoRaslOutputFlag = true;
1832 ctx->FirstAfterEndOfSequenceNAL = false;
1833 }
1834 else if (0) // TODO: set HandleCraAsBlaFlag by external means
1835 {
1836 }
1837 else
1838 {
1839 ctx->NoRaslOutputFlag = false;
1840 ctx->HandleCraAsBlaFlag = false;
1841 }
1842 }
1843
1844
1845 if (isRASL(ctx->nal_unit_type) &&
1846 ctx->NoRaslOutputFlag)
1847 {
1848 ctx->img->PicOutputFlag = false;
1849 }
1850 else
1851 {
1852 ctx->img->PicOutputFlag = !!hdr->pic_output_flag;
1853 }
1854
1855 process_picture_order_count(ctx,hdr);
1856
1857 if (hdr->first_slice_segment_in_pic_flag) {
1858 // mark picture so that it is not overwritten by unavailable reference frames
1859 img->PicState = UsedForShortTermReference;
1860
1861 process_reference_picture_set(ctx,hdr);
1862 }
1863
1864 img->PicState = UsedForShortTermReference;
1865
1866 log_set_current_POC(ctx->img->PicOrderCntVal);
1867
1868
1869 // next image is not the first anymore
1870
1871 first_decoded_picture = false;
1872 }
1873
1874 if (hdr->slice_type == SLICE_TYPE_B ||
1875 hdr->slice_type == SLICE_TYPE_P)
1876 {
1877 bool success = construct_reference_picture_lists(ctx,hdr);
1878 if (!success) {
1879 return false;
1880 }
1881 }
1882
1883 //printf("process slice segment header\n");
1884
1885 loginfo(LogHeaders,"end of process-slice-header\n");
1886 ctx->dpb.log_dpb_content();
1887
1888
1889 if (hdr->dependent_slice_segment_flag==0) {
1890 hdr->SliceAddrRS = hdr->slice_segment_address;
1891 } else {
1892 hdr->SliceAddrRS = ctx->previous_slice_header->SliceAddrRS;
1893 }
1894
1895 ctx->previous_slice_header = hdr;
1896
1897
1898 loginfo(LogHeaders,"SliceAddrRS = %d\n",hdr->SliceAddrRS);
1899
1900 return true;
1901 }
1902
1903
1904 void decoder_context::remove_images_from_dpb(const std::vector<int>& removeImageList)
1905 {
1906 for (int i=0;i<removeImageList.size();i++) {
1907 int idx = dpb.DPB_index_of_picture_with_ID( removeImageList[i] );
1908 if (idx>=0) {
1909 //printf("remove ID %d\n", removeImageList[i]);
1910 de265_image* dpbimg = dpb.get_image( idx );
1911 dpbimg->PicState = UnusedForReference;
1912 }
1913 }
1914 }
1915
1916
1917
1918 /*
1919 . 0 1 2 <- goal_HighestTid
1920 +-----+-----+-----+
1921 | -0->| -1->| -2->|
1922 +-----+-----+-----+
1923 0 33 66 100 <- framerate_ratio
1924 */
1925
1926 int decoder_context::get_highest_TID() const
1927 {
1928 if (current_sps) { return current_sps->sps_max_sub_layers-1; }
1929 if (current_vps) { return current_vps->vps_max_sub_layers-1; }
1930
1931 return 6;
1932 }
1933
1934 void decoder_context::set_limit_TID(int max_tid)
1935 {
1936 limit_HighestTid = max_tid;
1937 calc_tid_and_framerate_ratio();
1938 }
1939
1940 int decoder_context::change_framerate(int more)
1941 {
1942 if (current_sps == NULL) { return framerate_ratio; }
1943
1944 int highestTid = get_highest_TID();
1945
1946 assert(more>=-1 && more<=1);
1947
1948 goal_HighestTid += more;
1949 goal_HighestTid = std::max(goal_HighestTid, 0);
1950 goal_HighestTid = std::min(goal_HighestTid, highestTid);
1951
1952 framerate_ratio = framedrop_tid_index[goal_HighestTid];
1953
1954 calc_tid_and_framerate_ratio();
1955
1956 return framerate_ratio;
1957 }
1958
1959 void decoder_context::set_framerate_ratio(int percent)
1960 {
1961 framerate_ratio = percent;
1962 calc_tid_and_framerate_ratio();
1963 }
1964
1965 void decoder_context::compute_framedrop_table()
1966 {
1967 int highestTID = get_highest_TID();
1968
1969 for (int tid=highestTID ; tid>=0 ; tid--) {
1970 int lower = 100 * tid /(highestTID+1);
1971 int higher = 100 * (tid+1)/(highestTID+1);
1972
1973 for (int l=lower; l<=higher; l++) {
1974 int ratio = 100 * (l-lower) / (higher-lower);
1975
1976      // if this tid would exceed our TID limit, clamp to the limit and decode that layer at full frame-rate
1977 if (tid > limit_HighestTid) {
1978 tid = limit_HighestTid;
1979 ratio = 100;
1980 }
1981
1982 framedrop_tab[l].tid = tid;
1983 framedrop_tab[l].ratio = ratio;
1984 }
1985
1986 framedrop_tid_index[tid] = higher;
1987 }
1988
1989 #if 0
1990 for (int i=0;i<=100;i++) {
1991 printf("%d%%: %d/%d",i, framedrop_tab[i].tid, framedrop_tab[i].ratio);
1992 for (int k=0;k<=highestTID;k++) {
1993 if (framedrop_tid_index[k] == i) printf(" ** TID=%d **",k);
1994 }
1995 printf("\n");
1996 }
1997 #endif
1998 }
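/* Worked example for the table above, matching the diagram before get_highest_TID()
   (illustrative values): with highestTID = 2 and limit_HighestTid >= 2, ratios
   0..33% map to tid 0, 33..66% to tid 1 and 66..100% to tid 2, and
   framedrop_tid_index[] becomes {33, 66, 100}. */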
1999
2000 void decoder_context::calc_tid_and_framerate_ratio()
2001 {
2002 int highestTID = get_highest_TID();
2003
2004
2005 // if number of temporal layers changed, we have to recompute the framedrop table
2006
2007 if (framedrop_tab[100].tid != highestTID) {
2008 compute_framedrop_table();
2009 }
2010
2011 goal_HighestTid = framedrop_tab[framerate_ratio].tid;
2012 layer_framerate_ratio = framedrop_tab[framerate_ratio].ratio;
2013
2014 // TODO: for now, we switch immediately
2015 current_HighestTid = goal_HighestTid;
2016 }
2017
2018
2019 void error_queue::add_warning(de265_error warning, bool once)
2020 {
2021 // check if warning was already shown
2022 bool add=true;
2023 if (once) {
2024 for (int i=0;i<nWarningsShown;i++) {
2025 if (warnings_shown[i] == warning) {
2026 add=false;
2027 break;
2028 }
2029 }
2030 }
2031
2032 if (!add) {
2033 return;
2034 }
2035
2036
2037 // if this is a one-time warning, remember that it was shown
2038
2039 if (once) {
2040 if (nWarningsShown < MAX_WARNINGS) {
2041 warnings_shown[nWarningsShown++] = warning;
2042 }
2043 }
2044
2045
2046 // add warning to output queue
2047
2048 if (nWarnings == MAX_WARNINGS) {
2049 warnings[MAX_WARNINGS-1] = DE265_WARNING_WARNING_BUFFER_FULL;
2050 return;
2051 }
2052
2053 warnings[nWarnings++] = warning;
2054 }
2055
2056 error_queue::error_queue()
2057 {
2058 nWarnings = 0;
2059 nWarningsShown = 0;
2060 }
2061
2062 de265_error error_queue::get_warning()
2063 {
2064 if (nWarnings==0) {
2065 return DE265_OK;
2066 }
2067
2068 de265_error warn = warnings[0];
2069 nWarnings--;
2070 memmove(warnings, &warnings[1], nWarnings*sizeof(de265_error));
2071
2072 return warn;
2073 }
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2828 #include "libde265/image.h"
2929 #include "libde265/motion.h"
3030 #include "libde265/de265.h"
31 #include "libde265/dpb.h"
32 #include "libde265/sei.h"
3133 #include "libde265/threads.h"
3234 #include "libde265/acceleration.h"
33
34 #define DE265_MAX_VPS_SETS 16
35 #define DE265_MAX_SPS_SETS 16
36 #define DE265_MAX_PPS_SETS 64
37 #define DE265_MAX_SLICES 512 // TODO: make this dynamic
38 #define DE265_IMAGE_OUTPUT_QUEUE_LEN 2
39
40 // TODO: check required value
41 #define DE265_DPB_OUTPUT_IMAGES 20
42 #define DE265_DPB_RESILIENCE_IMAGES 5
43 #define DE265_DPB_SIZE (DE265_DPB_OUTPUT_IMAGES + DE265_DPB_RESILIENCE_IMAGES)
44
45 #define DE265_NAL_FREE_LIST_SIZE 16
46 #define DE265_SKIPPED_BYTES_INITIAL_SIZE 16
35 #include "libde265/nal-parser.h"
36
37 #define DE265_MAX_VPS_SETS 16 // this is the maximum as defined in the standard
38 #define DE265_MAX_SPS_SETS 16 // this is the maximum as defined in the standard
39 #define DE265_MAX_PPS_SETS 64 // this is the maximum as defined in the standard
40 #define MAX_THREAD_CONTEXTS 68 // enough for 4K @ 32 pixel CTBs, but TODO: make this dynamic
4741
4842 #define MAX_WARNINGS 20
4943
5044
51 // split_cu_flag CB (MinCbSizeY)
52 // skip_flag CB
53 // pcm_flag CB
54 // prev_intra_luma_pred_flag CB
55 // rem_intra_luma_pred_mode CB
56 // mpm_idx CB
57 // intra_chroma_pred_mode CB
58
59
60 typedef struct NAL_unit {
61 nal_header header;
62
63 rbsp_buffer nal_data;
64
65 de265_PTS pts;
66 void* user_data;
67
68 int* skipped_bytes; // up to position[x], there were 'x' skipped bytes
69 int num_skipped_bytes;
70 int max_skipped_bytes;
71
72 union {
73 seq_parameter_set sps;
74 pic_parameter_set pps;
75 // slice_segment_header slice_hdr;
76 };
77 } NAL_unit;
78
79
80
8145 struct slice_segment_header;
82
83 typedef struct thread_context
46 struct image_unit;
47
48
49 struct thread_context
8450 {
85 uint8_t inUse; // thread_context is used for the current decoding process
51 thread_context();
8652
8753 int CtbAddrInRS;
8854 int CtbAddrInTS;
134100 context_model ctx_model[CONTEXT_MODEL_TABLE_LENGTH];
135101
136102 struct decoder_context* decctx;
103 struct de265_image *img;
137104 struct slice_segment_header* shdr;
138 } thread_context;
139
140
141
142
143 typedef struct decoder_context {
105
106 struct image_unit* imgunit;
107 struct thread_task* task; // executing thread_task or NULL if not multi-threaded
108
109 private:
110 thread_context(const thread_context&); // not allowed
111 const thread_context& operator=(const thread_context&); // not allowed
112 };
113
114
115
116 class error_queue
117 {
118 public:
119 error_queue();
120
121 void add_warning(de265_error warning, bool once);
122 de265_error get_warning();
123
124 private:
125 de265_error warnings[MAX_WARNINGS];
126 int nWarnings;
127 de265_error warnings_shown[MAX_WARNINGS]; // warnings that have already occurred
128 int nWarningsShown;
129 };
130
131
132
133 struct slice_unit
134 {
135 slice_unit(decoder_context* decctx);
136 ~slice_unit();
137
138 NAL_unit* nal; // we are the owner
139 slice_segment_header* shdr; // not the owner (de265_image is owner)
140 bitreader reader;
141
142 struct image_unit* imgunit;
143
144 bool flush_reorder_buffer;
145
146 enum { Unprocessed,
147 Inprogress,
148 Decoded
149 } state;
150
151 void allocate_thread_contexts(int n);
152 thread_context* get_thread_context(int n) { return &thread_contexts[n]; }
153
154 private:
155 thread_context* thread_contexts; /* NOTE: cannot use std::vector, because thread_context has
156 no copy constructor. */
157
158 decoder_context* ctx;
159
160
161 slice_unit(const slice_unit&); // not allowed
162 const slice_unit& operator=(const slice_unit&); // not allowed
163 };
164
165
166 struct image_unit
167 {
168 image_unit();
169 ~image_unit();
170
171 de265_image* img;
172 de265_image sao_output; // if SAO is used, this is allocated and used as SAO output buffer
173
174 std::vector<slice_unit*> slice_units;
175 std::vector<sei_message> suffix_SEIs;
176
177 enum { Invalid, // headers not read yet
178 Unknown, // SPS/PPS available
179 Reference, // will be used as reference
180 Leaf // not a reference picture
181 } role;
182
183 enum { Unprocessed,
184 InProgress,
185 Decoded,
186 Dropped // will not be decoded
187 } state;
188
189 std::vector<thread_task*> tasks; // we are the owner
190
191 /* Saved context models for WPP.
192 There is one saved model for the initialization of each CTB row.
193 The array is unused for non-WPP streams. */
194 std::vector<context_model> ctx_models; // TODO: move this into image ?
195 };
196
197
198
199 class decoder_context : public error_queue {
200 public:
201 decoder_context();
202 ~decoder_context();
203
204 de265_error start_thread_pool(int nThreads);
205 void stop_thread_pool();
206
207 void reset();
208
209 /* */ seq_parameter_set* get_sps(int id) { return &sps[id]; }
210 const seq_parameter_set* get_sps(int id) const { return &sps[id]; }
211 /* */ pic_parameter_set* get_pps(int id) { return &pps[id]; }
212 const pic_parameter_set* get_pps(int id) const { return &pps[id]; }
213
214 /*
215 const slice_segment_header* get_SliceHeader_atCtb(int ctb) {
216 return img->slices[img->get_SliceHeaderIndex_atIndex(ctb)];
217 }
218 */
219
220 uint8_t get_nal_unit_type() const { return nal_unit_type; }
221 bool get_RapPicFlag() const { return RapPicFlag; }
222
223 de265_error decode_NAL(NAL_unit* nal);
224
225 de265_error decode(int* more);
226 de265_error decode_some();
227
228 de265_error decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit);
229 de265_error decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit);
230 de265_error decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit);
231 de265_error decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit);
232
233 void process_nal_hdr(nal_header*);
234 void process_vps(video_parameter_set*);
235 void process_sps(seq_parameter_set*);
236 void process_pps(pic_parameter_set*);
237
238 bool process_slice_segment_header(decoder_context*, slice_segment_header*,
239 de265_error*, de265_PTS pts,
240 nal_header* nal_hdr, void* user_data);
241
242 //void push_current_picture_to_output_queue();
243 de265_error push_picture_to_output_queue(image_unit*);
244
144245
145246 // --- parameters ---
146247
147248 bool param_sei_check_hash;
148 int param_HighestTid;
149249 bool param_conceal_stream_errors;
250 bool param_suppress_faulty_pictures;
150251
151252 int param_sps_headers_fd;
152253 int param_vps_headers_fd;
153254 int param_pps_headers_fd;
154255 int param_slice_headers_fd;
155256
156
157 // --- decoder administration ---
257 bool param_disable_deblocking;
258 bool param_disable_sao;
259 //bool param_disable_mc_residual_idct; // not implemented yet
260 //bool param_disable_intra_residual_idct; // not implemented yet
261
262 void set_image_allocation_functions(de265_image_allocation* allocfunc, void* userdata);
263
264 de265_image_allocation param_image_allocation_functions;
265 void* param_image_allocation_userdata;
266
267
268 // --- accelerated DSP functions ---
269
270 void set_acceleration_functions(enum de265_acceleration);
158271
159272 struct acceleration_functions acceleration; // CPU optimized functions
160273
161 de265_error warnings[MAX_WARNINGS];
162 int nWarnings;
163 de265_error warnings_shown[MAX_WARNINGS]; // warnings that have already occurred
164 int nWarningsShown;
165
166274
167275 // --- input stream data ---
168276
169 // byte-stream level
170
171 bool end_of_stream; // data in pending_input_data is end of stream
172 int input_push_state;
173 NAL_unit* pending_input_NAL;
174
175 // NAL level
176
177 NAL_unit** NAL_queue; // enqueued NALs have stuffing bytes removed
178 int NAL_queue_len;
179 int NAL_queue_size;
180
181 int nBytes_in_NAL_queue;
182
183 NAL_unit** NAL_free_list; // DE265_NAL_FREE_LIST_SIZE
184 int NAL_free_list_len;
185 int NAL_free_list_size;
186
187
277 NAL_Parser nal_parser;
278
279
280 int get_num_worker_threads() const { return num_worker_threads; }
281
282 /* */ de265_image* get_image(int dpb_index) { return dpb.get_image(dpb_index); }
283 const de265_image* get_image(int dpb_index) const { return dpb.get_image(dpb_index); }
284
285 de265_image* get_next_picture_in_output_queue() { return dpb.get_next_picture_in_output_queue(); }
286 int num_pictures_in_output_queue() const { return dpb.num_pictures_in_output_queue(); }
287 void pop_next_picture_in_output_queue() { dpb.pop_next_picture_in_output_queue(); }
288
289 private:
290 de265_error read_vps_NAL(bitreader&);
291 de265_error read_sps_NAL(bitreader&);
292 de265_error read_pps_NAL(bitreader&);
293 de265_error read_sei_NAL(bitreader& reader, bool suffix);
294 de265_error read_eos_NAL(bitreader& reader);
295 de265_error read_slice_NAL(bitreader&, NAL_unit* nal, nal_header& nal_hdr);
296
297 private:
188298 // --- internal data ---
189299
190300 video_parameter_set vps[ DE265_MAX_VPS_SETS ];
191301 seq_parameter_set sps[ DE265_MAX_SPS_SETS ];
192302 pic_parameter_set pps[ DE265_MAX_PPS_SETS ];
193 slice_segment_header slice[ DE265_MAX_SLICES ];
194303
195304 video_parameter_set* current_vps;
196305 seq_parameter_set* current_sps;
197306 pic_parameter_set* current_pps;
198307
308 public:
199309 struct thread_pool thread_pool;
310
311 private:
200312 int num_worker_threads;
201313
202314
203 // --- sequence level ---
204
205 int HighestTid;
206
207
315 public:
316 // --- frame dropping ---
317
318 void set_limit_TID(int tid);
319 int get_highest_TID() const;
320 int get_current_TID() const { return current_HighestTid; }
321 int change_framerate(int more_vs_less); // 1: more, -1: less
322 void set_framerate_ratio(int percent);
323
324 private:
325 // input parameters
326 int limit_HighestTid; // never switch to a layer above this one
327 int framerate_ratio;
328
329 // current control parameters
330 int goal_HighestTid; // this is the layer we want to decode at
331 int layer_framerate_ratio; // ratio of frames to keep in the current layer
332
333 int current_HighestTid; // the layer which we are currently decoding
334
335 struct {
336 int8_t tid;
337 int8_t ratio;
338 } framedrop_tab[100+1];
339 int framedrop_tid_index[6+1];
340
341 void compute_framedrop_table();
342 void calc_tid_and_framerate_ratio();
343
344 private:
208345 // --- decoded picture buffer ---
209346
210 de265_image dpb[DE265_DPB_SIZE]; // decoded picture buffer
211
212 de265_image* reorder_output_queue[DE265_DPB_SIZE];
213 int reorder_output_queue_length;
214
215 de265_image* image_output_queue[DE265_DPB_SIZE];
216 int image_output_queue_length;
217
218 de265_image* last_decoded_image;
347 decoded_picture_buffer dpb;
219348
220349 int current_image_poc_lsb;
221350 bool first_decoded_picture;
229358
230359 de265_image* img;
231360
361 public:
362 const slice_segment_header* previous_slice_header; /* Remember the last slice for a successive
363 dependent slice. */
364
232365
233366 // --- motion compensation ---
234367
368 public:
235369 int PocLsbLt[MAX_NUM_REF_PICS];
236370 int UsedByCurrPicLt[MAX_NUM_REF_PICS];
237371 int DeltaPocMsbCycleLt[MAX_NUM_REF_PICS];
372 private:
238373 int CurrDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS];
239374 int FollDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS];
240375
253388 int PocLtFoll[MAX_NUM_REF_PICS]; // used in some future picture
254389
255390 // These lists contain indices into the DPB.
256 int RefPicSetStCurrBefore[DE265_DPB_SIZE];
257 int RefPicSetStCurrAfter[DE265_DPB_SIZE];
258 int RefPicSetStFoll[DE265_DPB_SIZE];
259 int RefPicSetLtCurr[DE265_DPB_SIZE];
260 int RefPicSetLtFoll[DE265_DPB_SIZE];
261
262
263 // --- decoded image data --- TODO: all this should move into de265_image
264
265 // de265_image coeff; // transform coefficients / TODO: don't use de265_image for this
391 int RefPicSetStCurrBefore[MAX_NUM_REF_PICS];
392 int RefPicSetStCurrAfter[MAX_NUM_REF_PICS];
393 int RefPicSetStFoll[MAX_NUM_REF_PICS];
394 int RefPicSetLtCurr[MAX_NUM_REF_PICS];
395 int RefPicSetLtFoll[MAX_NUM_REF_PICS];
396
266397
267398 // --- parameters derived from parameter sets ---
268399
274405 char RapPicFlag;
275406
276407
277 // --- decoder runtime data ---
278
279 struct thread_context thread_context[MAX_THREAD_CONTEXTS];
280
281 } decoder_context;
282
283
284 void init_decoder_context(decoder_context*);
285 void set_acceleration_functions(decoder_context* ctx, enum de265_acceleration);
286 void reset_decoder_context_for_new_picture(decoder_context* ctx);
287 void free_decoder_context(decoder_context*);
288
289
290 void cleanup_image(decoder_context* ctx, de265_image* img);
291
292 NAL_unit* alloc_NAL_unit(decoder_context*, int size, int skipped_size);
293 void free_NAL_unit(decoder_context*, NAL_unit*);
294
295 NAL_unit* pop_from_NAL_queue(decoder_context*);
296 void push_to_NAL_queue(decoder_context*,NAL_unit*);
297
298
299 void flush_next_picture_from_reorder_buffer(decoder_context* ctx);
300 int initialize_new_DPB_image(decoder_context* ctx,const seq_parameter_set* sps);
301
302 seq_parameter_set* get_sps(decoder_context* ctx, int id);
303
304 void process_nal_hdr(decoder_context*, nal_header*);
305 void process_vps(decoder_context*, video_parameter_set*);
306 void process_sps(decoder_context*, seq_parameter_set*);
307 void process_pps(decoder_context*, pic_parameter_set*);
308 bool process_slice_segment_header(decoder_context*, slice_segment_header*,
309 de265_error*, de265_PTS pts, void* user_data);
310
311 int get_next_slice_index(decoder_context* ctx);
312 int get_next_thread_context_index(decoder_context* ctx);
313
314 void add_warning(decoder_context* ctx, de265_error warning, bool once);
315 de265_error get_warning(decoder_context* ctx);
316
317 // TODO void free_currently_unused_memory(decoder_context* ctx); // system is low on memory, free some (e.g. unused images in the DPB)
318
319
320 // --- decoder 2D data arrays ---
321 // All coordinates are in pixels if not stated otherwise.
322
323 void debug_dump_cb_info(const decoder_context*);
324
325 slice_segment_header* get_SliceHeader(decoder_context*, int x, int y);
326 slice_segment_header* get_SliceHeaderCtb(decoder_context* ctx, int ctbX, int ctbY);
327
328
329 const PredVectorInfo* get_mv_info(const decoder_context* ctx,int x,int y);
330 const PredVectorInfo* get_img_mv_info(const decoder_context* ctx,
331 const de265_image* img, int x,int y);
332 void set_mv_info(decoder_context* ctx,int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv);
333
334 // TODO: move to some utility file
335 bool available_zscan(const de265_image* ctx,
336 int xCurr,int yCurr, int xN,int yN);
337
338 bool available_pred_blk(const decoder_context* ctx,
339 int xC,int yC, int nCbS, int xP, int yP, int nPbW, int nPbH, int partIdx,
340 int xN,int yN);
341
342 bool has_free_dpb_picture(const decoder_context* ctx, bool high_priority);
343 void push_current_picture_to_output_queue(decoder_context* ctx);
344
345 // --- debug ---
346
347 LIBDE265_API void set_output_filename(const char* filename);
348 LIBDE265_API void write_picture(const de265_image* img);
349 void write_picture_to_file(const de265_image* img, const char* filename);
350
351 void draw_CB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value);
352 void draw_TB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value);
353 void draw_PB_grid(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value);
354 void draw_PB_pred_modes(const decoder_context* ctx, uint8_t* r, uint8_t* g, uint8_t* b, int stride);
355 void draw_intra_pred_modes(const decoder_context* ctx, uint8_t* img, int stride, uint8_t value);
408 // --- image unit queue ---
409
410 std::vector<image_unit*> image_units;
411
412 bool flush_reorder_buffer_at_this_frame;
413
414 private:
415 void init_thread_context(class thread_context* tctx);
416 void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream);
417 void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream);
418
419
420 void process_picture_order_count(decoder_context* ctx, slice_segment_header* hdr);
421 int generate_unavailable_reference_picture(decoder_context* ctx, const seq_parameter_set* sps,
422 int POC, bool longTerm);
423 void process_reference_picture_set(decoder_context* ctx, slice_segment_header* hdr);
424 bool construct_reference_picture_lists(decoder_context* ctx, slice_segment_header* hdr);
425
426
427 void remove_images_from_dpb(const std::vector<int>& removeImageList);
428 void run_postprocessing_filters_sequential(de265_image* img);
429 void run_postprocessing_filters_parallel(image_unit* img);
430 };
431
356432
357433 #endif
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "dpb.h"
21 #include "decctx.h"
22 #include <string.h>
23 #include <assert.h>
24
25
26 #define DPB_DEFAULT_MAX_IMAGES 30
27
28
29 decoded_picture_buffer::decoded_picture_buffer()
30 {
31 max_images_in_DPB = DPB_DEFAULT_MAX_IMAGES;
32 norm_images_in_DPB = DPB_DEFAULT_MAX_IMAGES;
33 }
34
35
36 decoded_picture_buffer::~decoded_picture_buffer()
37 {
38 for (int i=0;i<dpb.size();i++)
39 delete dpb[i];
40 }
41
42
43 void decoded_picture_buffer::log_dpb_content() const
44 {
45 for (int i=0;i<dpb.size();i++) {
46 loginfo(LogHighlevel, " DPB %d: POC=%d, ID=%d %s %s\n", i,
47 dpb[i]->PicOrderCntVal,
48 dpb[i]->get_ID(),
49 dpb[i]->PicState == UnusedForReference ? "unused" :
50 dpb[i]->PicState == UsedForShortTermReference ? "short-term" : "long-term",
51 dpb[i]->PicOutputFlag ? "output" : "---");
52 }
53 }
54
55
56 bool decoded_picture_buffer::has_free_dpb_picture(bool high_priority) const
57 {
58 // we will always adapt the buffer to insert high-priority images
59 if (high_priority) return true;
60
61 // quick test to check for free slots
62 if (dpb.size() < max_images_in_DPB) return true;
63
64 // scan for empty slots
65 for (int i=0;i<dpb.size();i++) {
66 if (dpb[i]->PicOutputFlag==false && dpb[i]->PicState == UnusedForReference) {
67 return true;
68 }
69 }
70
71 return false;
72 }
73
74
75 int decoded_picture_buffer::DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm) const
76 {
77 logdebug(LogHeaders,"DPB_index_of_picture_with_POC POC=%d\n",poc);
78
79 //log_dpb_content(ctx);
80 //loginfo(LogDPB,"searching for short-term reference POC=%d\n",poc);
81
82 if (preferLongTerm) {
83 for (int k=0;k<dpb.size();k++) {
84 if (dpb[k]->PicOrderCntVal == poc &&
85 dpb[k]->removed_at_picture_id > currentID &&
86 dpb[k]->PicState == UsedForLongTermReference) {
87 return k;
88 }
89 }
90 }
91
92 for (int k=0;k<dpb.size();k++) {
93 if (dpb[k]->PicOrderCntVal == poc &&
94 dpb[k]->removed_at_picture_id > currentID &&
95 dpb[k]->PicState != UnusedForReference) {
96 return k;
97 }
98 }
99
100 return -1;
101 }
102
103
104 int decoded_picture_buffer::DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm) const
105 {
106 logdebug(LogHeaders,"get access to picture with LSB %d from DPB\n",lsb);
107
108 if (preferLongTerm) {
109 for (int k=0;k<dpb.size();k++) {
110 if (dpb[k]->picture_order_cnt_lsb == lsb &&
111 dpb[k]->removed_at_picture_id > currentID &&
112 dpb[k]->PicState == UsedForLongTermReference) {
113 return k;
114 }
115 }
116 }
117
118 for (int k=0;k<dpb.size();k++) {
119 if (dpb[k]->picture_order_cnt_lsb == lsb &&
120 dpb[k]->removed_at_picture_id > currentID &&
121 dpb[k]->PicState != UnusedForReference) {
122 return k;
123 }
124 }
125
126 return -1;
127 }
128
129
130 int decoded_picture_buffer::DPB_index_of_picture_with_ID(int id) const
131 {
132 logdebug(LogHeaders,"get access to picture with ID %d from DPB\n",id);
133
134 for (int k=0;k<dpb.size();k++) {
135 if (dpb[k]->get_ID() == id) {
136 return k;
137 }
138 }
139
140 return -1;
141 }
142
143
144 void decoded_picture_buffer::output_next_picture_in_reorder_buffer()
145 {
146 assert(!reorder_output_queue.empty());
147
148 // search for picture in reorder buffer with minimum POC
149
150 int minPOC = reorder_output_queue[0]->PicOrderCntVal;
151 int minIdx = 0;
152 for (int i=1;i<reorder_output_queue.size();i++)
153 {
154 if (reorder_output_queue[i]->PicOrderCntVal < minPOC) {
155 minPOC = reorder_output_queue[i]->PicOrderCntVal;
156 minIdx = i;
157 }
158 }
159
160
161 // put image into output queue
162
163 image_output_queue.push_back(reorder_output_queue[minIdx]);
164
165
166 // remove image from reorder buffer
167
168 reorder_output_queue[minIdx] = reorder_output_queue.back();
169 reorder_output_queue.pop_back();
170 }
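/* Illustrative example (hypothetical POCs): if the reorder queue holds pictures
   with POCs {8, 4, 6}, the call above moves the picture with POC 4 into the
   output queue first. */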
171
172
173 bool decoded_picture_buffer::flush_reorder_buffer()
174 {
175 // return 'false' when there are no pictures in reorder buffer
176 if (reorder_output_queue.empty()) return false;
177
178 while (!reorder_output_queue.empty()) {
179 output_next_picture_in_reorder_buffer();
180 }
181
182 return true;
183 }
184
185
186 void decoded_picture_buffer::clear()
187 {
188 for (int i=0;i<dpb.size();i++) {
189 if (dpb[i]->PicOutputFlag ||
190 dpb[i]->PicState != UnusedForReference)
191 {
192 dpb[i]->PicOutputFlag = false;
193 dpb[i]->PicState = UnusedForReference;
194 dpb[i]->release();
195 }
196 }
197
198 reorder_output_queue.clear();
199 image_output_queue.clear();
200 }
201
202
203 int decoded_picture_buffer::new_image(const seq_parameter_set* sps,
204 decoder_context* decctx)
205 {
206 loginfo(LogHeaders,"DPB::new_image\n");
207 log_dpb_content();
208
209 // --- search for a free slot in the DPB ---
210
211 int free_image_buffer_idx = -1;
212 for (int i=0;i<dpb.size();i++) {
213 if (dpb[i]->can_be_released()) {
214 dpb[i]->release(); /* TODO: this is surely not the best place to free the image, but
215 we have to do it here because releasing it in de265_release_image()
216 would break the API compatibility. */
217
218 free_image_buffer_idx = i;
219 break;
220 }
221 }
222
223
224 // Try to free a buffer at the end if the DPB got too large.
225 /* This should also probably move to a better place as soon as the API allows for this. */
226
227 if (dpb.size() > norm_images_in_DPB && // buffer too large
228 free_image_buffer_idx != dpb.size()-1 && // last slot not reused in this alloc
229 dpb.back()->can_be_released()) // last slot is free
230 {
231 delete dpb.back();
232 dpb.pop_back();
233 }
234
235
236 // create a new image slot if no empty slot remaining
237
238 if (free_image_buffer_idx == -1) {
239 free_image_buffer_idx = dpb.size();
240 dpb.push_back(new de265_image);
241 }
242
243
244 // --- allocate new image ---
245
246 de265_image* img = dpb[free_image_buffer_idx];
247
248 int w = sps->pic_width_in_luma_samples;
249 int h = sps->pic_height_in_luma_samples;
250
251 enum de265_chroma chroma;
252 switch (sps->chroma_format_idc) {
253 case 0: chroma = de265_chroma_mono; break;
254 case 1: chroma = de265_chroma_420; break;
255 case 2: chroma = de265_chroma_422; break;
256 case 3: chroma = de265_chroma_444; break;
257 default: chroma = de265_chroma_420; assert(0); break; // should never happen
258 }
259
260 img->alloc_image(w,h, chroma, sps, true, decctx);
261
262 img->integrity = INTEGRITY_CORRECT;
263
264 return free_image_buffer_idx;
265 }
266
267
268 void decoded_picture_buffer::pop_next_picture_in_output_queue()
269 {
270 image_output_queue.pop_front();
271
272
273 loginfo(LogDPB, "DPB output queue: ");
274 for (int i=0;i<image_output_queue.size();i++) {
275 loginfo(LogDPB, "*%d ", image_output_queue[i]->PicOrderCntVal);
276 }
277 loginfo(LogDPB,"*\n");
278 }
279
280
281 void decoded_picture_buffer::log_dpb_queues() const
282 {
283 loginfo(LogDPB, "DPB reorder queue (after push): ");
284 for (int i=0;i<num_pictures_in_reorder_buffer();i++) {
285 loginfo(LogDPB, "*%d ", reorder_output_queue[i]->PicOrderCntVal);
286 }
287 loginfo(LogDPB,"*\n");
288
289 loginfo(LogDPB, "DPB output queue (after push): ");
290 for (int i=0;i<num_pictures_in_output_queue();i++) {
291 loginfo(LogDPB, "*%d ", image_output_queue[i]->PicOrderCntVal);
292 }
293 loginfo(LogDPB,"*\n");
294 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_DPB_H
21 #define DE265_DPB_H
22
23 #include "libde265/image.h"
24 #include "libde265/sps.h"
25
26 #include <deque>
27 #include <vector>
28
29
30
31 struct decoded_picture_buffer {
32 decoded_picture_buffer();
33 ~decoded_picture_buffer();
34
35 void set_max_size_of_DPB(int n) { max_images_in_DPB=n; }
36 void set_norm_size_of_DPB(int n) { norm_images_in_DPB=n; }
37
38 /* Alloc a new image in the DPB and return its index.
39 If there is no space for a new image, return -1. */
40 int new_image(const seq_parameter_set* sps, decoder_context* decctx);
41
42 /* Check for a free slot in the DPB. There are some slots reserved for
43 unavailable reference frames. If high_priority==true, these reserved slots
44 are included in the check. */
45 bool has_free_dpb_picture(bool high_priority) const;
46
47 /* Remove all pictures from DPB and queues. Decoding should be stopped while calling this. */
48 void clear();
49
50 int size() const { return dpb.size(); }
51
52 /* Raw access to the images. */
53 /* */ de265_image* get_image(int index) { return dpb[index]; }
54 const de265_image* get_image(int index) const { return dpb[index]; }
55
56 /* Search DPB for the slot index of a specific picture. */
57 int DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm=false) const;
58 int DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm=false) const;
59 int DPB_index_of_picture_with_ID (int id) const;
60
61
62 // --- reorder buffer ---
63
64 void insert_image_into_reorder_buffer(de265_image* img) {
65 reorder_output_queue.push_back(img);
66 }
67
68 int num_pictures_in_reorder_buffer() const { return reorder_output_queue.size(); }
69
70 // move next picture in reorder buffer to output queue
71 void output_next_picture_in_reorder_buffer();
72
73 // Move all pictures in reorder buffer to output buffer. Return true if there were any pictures.
74 bool flush_reorder_buffer();
75
76
77 // --- output buffer ---
78
79 int num_pictures_in_output_queue() const { return image_output_queue.size(); }
80
81 /* Get the next picture in the output queue, but do not remove it from the queue. */
82 de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); }
83
84 /* Remove the next picture in the output queue. */
85 void pop_next_picture_in_output_queue();
86
87
88 // --- debug ---
89
90 void log_dpb_content() const;
91 void log_dpb_queues() const;
92
93 private:
94 int max_images_in_DPB;
95 int norm_images_in_DPB;
96
97 std::vector<de265_image*> dpb; // decoded picture buffer
98
99 std::vector<de265_image*> reorder_output_queue;
100 std::deque<de265_image*> image_output_queue;
101
102 private:
103 decoded_picture_buffer(const decoded_picture_buffer&); // no copy
104 decoded_picture_buffer& operator=(const decoded_picture_buffer&); // no copy
105 };
106
107 #endif
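The interface above suggests the typical decode-side flow: grab a free slot with new_image(), decode into it, push the picture into the reorder buffer, and drain pictures from the output queue in POC order. A hedged usage sketch, not code from libde265 itself; the include path, 'sps', 'ctx' and 'reorder_delay' are assumptions supplied by the surrounding decoder:

#include "libde265/dpb.h"   // assumed include path for the header above

// Sketch only: one plausible way to drive the DPB API declared above.
static void decode_one_picture(decoded_picture_buffer& dpb,
                               const seq_parameter_set* sps,
                               decoder_context* ctx,
                               int reorder_delay)
{
  int idx = dpb.new_image(sps, ctx);            // returns -1 if no slot is free
  if (idx < 0) return;

  de265_image* img = dpb.get_image(idx);
  // ... decode the current picture into 'img' ...

  dpb.insert_image_into_reorder_buffer(img);    // queue for POC-ordered output

  if (dpb.num_pictures_in_reorder_buffer() > reorder_delay)
    dpb.output_next_picture_in_reorder_buffer();

  while (dpb.num_pictures_in_output_queue() > 0) {
    de265_image* out = dpb.get_next_picture_in_output_queue();
    // ... hand 'out' to the application ...
    dpb.pop_next_picture_in_output_queue();
  }
}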
+0
-271
libde265/fallback-dct.c
0
1 #include "fallback-dct.h"
2 #include "util.h"
3
4 #if defined(_MSC_VER) || defined(__MINGW32__)
5 # include <malloc.h>
6 #else
7 # include <alloca.h>
8 #endif
9
10 #include <assert.h>
11
12
13 void transform_skip_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
14 {
15 int nT = 4;
16 int bdShift2 = 20-8;
17
18 for (int y=0;y<nT;y++)
19 for (int x=0;x<nT;x++) {
20 int32_t c = coeffs[x+y*nT] << 7;
21 c = (c+(1<<(bdShift2-1)))>>bdShift2;
22
23 dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
24 }
25 }
26
27
28 void transform_bypass_8_fallback(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_t stride)
29 {
30 int bdShift2 = 20-8;
31
32 for (int y=0;y<nT;y++)
33 for (int x=0;x<nT;x++) {
34 int32_t c = coeffs[x+y*nT];
35
36 dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
37 }
38 }
39
40
41 static int8_t mat_8_357[4][4] = {
42 { 29, 55, 74, 84 },
43 { 74, 74, 0,-74 },
44 { 84,-29,-74, 55 },
45 { 55,-84, 74,-29 }
46 };
47
48
49
50 void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
51 {
52 int16_t g[4][4];
53
54 int postShift = 20-8; // 8 bit
55 int rndV = 1<<(7-1);
56 int rndH = 1<<(postShift-1);
57
58
59 // --- V ---
60
61 for (int c=0;c<4;c++) {
62
63 logtrace(LogTransform,"DST-V: ");
64 for (int r=0;r<4;r++) {
65 logtrace(LogTransform,"%d ",coeffs[c+r*4]);
66 }
67 logtrace(LogTransform,"* -> ");
68
69
70 for (int i=0;i<4;i++) {
71 int sum=0;
72
73 for (int j=0;j<4;j++) {
74 sum += mat_8_357[j][i] * coeffs[c+j*4];
75 }
76
77 g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
78 }
79
80
81 for (int y=0;y<4;y++) {
82 logtrace(LogTransform,"*%d ",g[y][c]);
83 }
84 logtrace(LogTransform,"*\n");
85 }
86
87
88 // --- H ---
89
90 for (int y=0;y<4;y++) {
91
92 logtrace(LogTransform,"DST-H: ");
93 for (int c=0;c<4;c++) {
94 logtrace(LogTransform,"%d ",g[y][c]);
95 }
96 logtrace(LogTransform,"* -> ");
97
98
99 for (int i=0;i<4;i++) {
100 int sum=0;
101
102 for (int j=0;j<4;j++) {
103 sum += mat_8_357[j][i] * g[y][j];
104 }
105
106 int out = Clip3(-32768,32767, (sum+rndH)>>postShift);
107
108 dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
109
110 logtrace(LogTransform,"*%d ",out);
111 }
112
113 logtrace(LogTransform,"*\n");
114 }
115 }
116
117
118
119 static int8_t mat_dct[32][32] = {
120 { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
121 { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
122 { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90},
123 { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90},
124 { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
125 { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
126 { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87},
127 { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
128 { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
129 { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
130 { 80, 9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70, 9, 80},
131 { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78},
132 { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
133 { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
134 { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70},
135 { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
136 { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
137 { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61},
138 { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
139 { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54},
140 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
141 { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
142 { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43},
143 { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38},
144 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
145 { 31,-78, 90,-61, 4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
146 { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25},
147 { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22},
148 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
149 { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
150 { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9},
151 { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
152 };
153
154
155
156
157 static void transform_dct_add_8(uint8_t *dst, ptrdiff_t stride,
158 int nT, int16_t *coeffs)
159 {
160 int postShift = 20-8; // 8 bit
161 int rnd1 = 1<<(7-1);
162 int rnd2 = 1<<(postShift-1);
163 int fact = (1<<(5-Log2(nT)));
164
165 int16_t g[32*32]; // actually, only [nT*nT] used
166
167 // TODO: valgrind reports that dst[] contains uninitialized data.
168 // Probably from intra-prediction.
169
170 /*
171 for (int i=0;i<nT*nT;i++) {
172 printf("%d\n",coeffs[i]);
173 }
174
175 for (int y=0;y<nT;y++) {
176 for (int i=0;i<nT;i++) {
177 printf("%d ",dst[y*stride+i]);
178 }
179 }
180 printf("\n");
181 */
182
183 for (int c=0;c<nT;c++) {
184
185 logtrace(LogTransform,"DCT-V: ");
186 for (int i=0;i<nT;i++) {
187 logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
188 }
189 logtrace(LogTransform,"* -> ");
190
191
192 // find last non-zero coefficient to reduce computations carried out in DCT
193
194 int lastCol = nT-1;
195 for (;lastCol>=0;lastCol--) {
196 if (coeffs[c+lastCol*nT]) { break; }
197 }
198
199 for (int i=0;i<nT;i++) {
200 int sum=0;
201
202 for (int j=0;j<=lastCol /*nT*/;j++) {
203 sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
204 }
205
206 g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7);
207
208 logtrace(LogTransform,"*%d ",g[c+i*nT]);
209 }
210 logtrace(LogTransform,"*\n");
211 }
212
213
214 for (int y=0;y<nT;y++) {
215
216 logtrace(LogTransform,"DCT-H: ");
217 for (int i=0;i<nT;i++) {
218 logtrace(LogTransform,"*%d ",g[i+y*nT]);
219 }
220 logtrace(LogTransform,"* -> ");
221
222
223 // find last non-zero coefficient to reduce computations carried out in DCT
224
225 int lastCol = nT-1;
226 for (;lastCol>=0;lastCol--) {
227 if (g[y*nT+lastCol]) { break; }
228 }
229
230
231 for (int i=0;i<nT;i++) {
232 int sum=0;
233
234 for (int j=0;j<=lastCol /*nT*/;j++) {
235 sum += mat_dct[fact*j][i] * g[y*nT+j];
236 }
237
238 //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift);
239 int out = (sum+rnd2)>>postShift;
240
241 //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i);
242 //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i]));
243 dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
244
245 logtrace(LogTransform,"*%d ",out);
246 }
247 logtrace(LogTransform,"*\n");
248 }
249 }
250
251
252 void transform_4x4_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
253 {
254 transform_dct_add_8(dst,stride, 4, coeffs);
255 }
256
257 void transform_8x8_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
258 {
259 transform_dct_add_8(dst,stride, 8, coeffs);
260 }
261
262 void transform_16x16_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
263 {
264 transform_dct_add_8(dst,stride, 16, coeffs);
265 }
266
267 void transform_32x32_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
268 {
269 transform_dct_add_8(dst,stride, 32, coeffs);
270 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "fallback-dct.h"
21 #include "util.h"
22
23 #if defined(_MSC_VER) || defined(__MINGW32__)
24 # include <malloc.h>
25 #else
26 # include <alloca.h>
27 #endif
28
29 #include <assert.h>
30
31
32 void transform_skip_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
33 {
34 int nT = 4;
35 int bdShift2 = 20-8;
36
37 for (int y=0;y<nT;y++)
38 for (int x=0;x<nT;x++) {
39 int32_t c = coeffs[x+y*nT] << 7;
40 c = (c+(1<<(bdShift2-1)))>>bdShift2;
41
42 dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
43 }
44 }
45
46
47 void transform_bypass_8_fallback(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_t stride)
48 {
49 int bdShift2 = 20-8;
50
51 for (int y=0;y<nT;y++)
52 for (int x=0;x<nT;x++) {
53 int32_t c = coeffs[x+y*nT];
54
55 dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
56 }
57 }
58
59
60 static int8_t mat_8_357[4][4] = {
61 { 29, 55, 74, 84 },
62 { 74, 74, 0,-74 },
63 { 84,-29,-74, 55 },
64 { 55,-84, 74,-29 }
65 };
66
67
68
69 void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
70 {
71 int16_t g[4][4];
72
73 int postShift = 20-8; // 8 bit
74 int rndV = 1<<(7-1);
75 int rndH = 1<<(postShift-1);
76
77
78 // --- V ---
79
80 for (int c=0;c<4;c++) {
81
82 logtrace(LogTransform,"DST-V: ");
83 for (int r=0;r<4;r++) {
84 logtrace(LogTransform,"%d ",coeffs[c+r*4]);
85 }
86 logtrace(LogTransform,"* -> ");
87
88
89 for (int i=0;i<4;i++) {
90 int sum=0;
91
92 for (int j=0;j<4;j++) {
93 sum += mat_8_357[j][i] * coeffs[c+j*4];
94 }
95
96 g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
97 }
98
99
100 for (int y=0;y<4;y++) {
101 logtrace(LogTransform,"*%d ",g[y][c]);
102 }
103 logtrace(LogTransform,"*\n");
104 }
105
106
107 // --- H ---
108
109 for (int y=0;y<4;y++) {
110
111 logtrace(LogTransform,"DST-H: ");
112 for (int c=0;c<4;c++) {
113 logtrace(LogTransform,"%d ",g[y][c]);
114 }
115 logtrace(LogTransform,"* -> ");
116
117
118 for (int i=0;i<4;i++) {
119 int sum=0;
120
121 for (int j=0;j<4;j++) {
122 sum += mat_8_357[j][i] * g[y][j];
123 }
124
125 int out = Clip3(-32768,32767, (sum+rndH)>>postShift);
126
127 dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
128
129 logtrace(LogTransform,"*%d ",out);
130 }
131
132 logtrace(LogTransform,"*\n");
133 }
134 }
135
136
137
138 static int8_t mat_dct[32][32] = {
139 { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
140 { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
141 { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90},
142 { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90},
143 { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
144 { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
145 { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87},
146 { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
147 { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
148 { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
149 { 80, 9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70, 9, 80},
150 { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78},
151 { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
152 { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
153 { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70},
154 { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
155 { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
156 { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61},
157 { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
158 { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54},
159 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
160 { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
161 { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43},
162 { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38},
163 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
164 { 31,-78, 90,-61, 4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
165 { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25},
166 { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22},
167 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
168 { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
169 { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9},
170 { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
171 };
172
173
174
175
176 static void transform_dct_add_8(uint8_t *dst, ptrdiff_t stride,
177 int nT, int16_t *coeffs)
178 {
179 int postShift = 20-8; // 8 bit
180 int rnd1 = 1<<(7-1);
181 int rnd2 = 1<<(postShift-1);
182 int fact = (1<<(5-Log2(nT)));
183
184 int16_t g[32*32]; // actually, only [nT*nT] used
185
186 // TODO: valgrind reports that dst[] contains uninitialized data.
187 // Probably from intra-prediction.
188
189 /*
190 for (int i=0;i<nT*nT;i++) {
191 printf("%d\n",coeffs[i]);
192 }
193
194 for (int y=0;y<nT;y++) {
195 for (int i=0;i<nT;i++) {
196 printf("%d ",dst[y*stride+i]);
197 }
198 }
199 printf("\n");
200 */
201
202 for (int c=0;c<nT;c++) {
203
204 logtrace(LogTransform,"DCT-V: ");
205 for (int i=0;i<nT;i++) {
206 logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
207 }
208 logtrace(LogTransform,"* -> ");
209
210
211 // find last non-zero coefficient to reduce computations carried out in DCT
212
213 int lastCol = nT-1;
214 for (;lastCol>=0;lastCol--) {
215 if (coeffs[c+lastCol*nT]) { break; }
216 }
217
218 for (int i=0;i<nT;i++) {
219 int sum=0;
220
221 for (int j=0;j<=lastCol /*nT*/;j++) {
222 sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
223 }
224
225 g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7);
226
227 logtrace(LogTransform,"*%d ",g[c+i*nT]);
228 }
229 logtrace(LogTransform,"*\n");
230 }
231
232
233 for (int y=0;y<nT;y++) {
234
235 logtrace(LogTransform,"DCT-H: ");
236 for (int i=0;i<nT;i++) {
237 logtrace(LogTransform,"*%d ",g[i+y*nT]);
238 }
239 logtrace(LogTransform,"* -> ");
240
241
242 // find last non-zero coefficient to reduce computations carried out in DCT
243
244 int lastCol = nT-1;
245 for (;lastCol>=0;lastCol--) {
246 if (g[y*nT+lastCol]) { break; }
247 }
248
249
250 for (int i=0;i<nT;i++) {
251 int sum=0;
252
253 for (int j=0;j<=lastCol /*nT*/;j++) {
254 sum += mat_dct[fact*j][i] * g[y*nT+j];
255 }
256
257 //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift);
258 int out = (sum+rnd2)>>postShift;
259
260 //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i);
261 //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i]));
262 dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
263
264 logtrace(LogTransform,"*%d ",out);
265 }
266 logtrace(LogTransform,"*\n");
267 }
268 }
269
270
271 void transform_4x4_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
272 {
273 transform_dct_add_8(dst,stride, 4, coeffs);
274 }
275
276 void transform_8x8_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
277 {
278 transform_dct_add_8(dst,stride, 8, coeffs);
279 }
280
281 void transform_16x16_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
282 {
283 transform_dct_add_8(dst,stride, 16, coeffs);
284 }
285
286 void transform_32x32_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
287 {
288 transform_dct_add_8(dst,stride, 32, coeffs);
289 }
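The inverse DCT above works in two 1-D passes: the vertical pass multiplies by the 64-scaled basis and shifts right by 7, the horizontal pass multiplies again and shifts by postShift = 20-8 = 12, so for 8-bit output the combined scaling is 64*64 / 2^19 = 1/128. A minimal hedged check of the DC path (illustrative only; it assumes fallback-dct.h declares transform_4x4_add_8_fallback):

#include "fallback-dct.h"   // assumed to declare transform_4x4_add_8_fallback
#include <stdio.h>
#include <string.h>
#include <stdint.h>

// Sketch only: a DC-only coefficient of 1024 should add 1024/128 = 8
// to every sample of a flat 4x4 prediction block.
int main(void)
{
  uint8_t dst[4*4];
  int16_t coeffs[4*4];

  memset(dst, 100, sizeof(dst));   // flat prediction of value 100
  memset(coeffs, 0, sizeof(coeffs));
  coeffs[0] = 1024;                // DC coefficient only

  transform_4x4_add_8_fallback(dst, coeffs, 4);

  printf("dst[0]=%d dst[15]=%d\n", dst[0], dst[15]);   // expected: 108 108
  return 0;
}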
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
019
120 #ifndef FALLBACK_DCT_H
221 #define FALLBACK_DCT_H
+0
-472
libde265/fallback-motion.c
0
1 #include "fallback-motion.h"
2 #include "util.h"
3
4 #if defined(_MSC_VER) || defined(__MINGW32__)
5 # include <malloc.h>
6 #else
7 # include <alloca.h>
8 #endif
9
10 #include <assert.h>
11
12
13 void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
14 int16_t *src, ptrdiff_t srcstride,
15 int width, int height)
16 {
17 int offset8bit = 32;
18 int shift8bit = 6;
19
20 assert((width&1)==0);
21
22 for (int y=0;y<height;y++) {
23 int16_t* in = &src[y*srcstride];
24 uint8_t* out = &dst[y*dststride];
25
26 for (int x=0;x<width;x+=2) {
27 out[0] = Clip1_8bit((in[0] + offset8bit)>>shift8bit);
28 out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit);
29 out+=2; in+=2;
30 }
31 }
32 }
33
34
35 void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
36 int16_t *src, ptrdiff_t srcstride,
37 int width, int height,
38 int w,int o,int log2WD)
39 {
40 assert(log2WD>=1); // TODO
41
42 const int rnd = (1<<(log2WD-1));
43
44 for (int y=0;y<height;y++) {
45 int16_t* in = &src[y*srcstride];
46 uint8_t* out = &dst[y*dststride];
47
48 for (int x=0;x<width;x++) {
49 out[0] = Clip1_8bit(((in[0]*w + rnd)>>log2WD) + o);
50 out++; in++;
51 }
52 }
53 }
54
55 void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
56 int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
57 int width, int height,
58 int w1,int o1, int w2,int o2, int log2WD)
59 {
60 assert(log2WD>=1); // TODO
61
62 const int rnd = ((o1+o2+1) << log2WD);
63
64 for (int y=0;y<height;y++) {
65 int16_t* in1 = &src1[y*srcstride];
66 int16_t* in2 = &src2[y*srcstride];
67 uint8_t* out = &dst[y*dststride];
68
69 for (int x=0;x<width;x++) {
70 out[0] = Clip1_8bit((in1[0]*w1 + in2[0]*w2 + rnd)>>(log2WD+1));
71 out++; in1++; in2++;
72 }
73 }
74 }
75
76
77 void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
78 int16_t *src1, int16_t *src2,
79 ptrdiff_t srcstride, int width,
80 int height)
81 {
82 int offset8bit = 64;
83 int shift8bit = 7;
84
85 assert((width&1)==0);
86
87 // I had a special case for 8-pixel parallel, unrolled code,
88 // but I did not see any speedup.
89
90 #if 0
91 for (int y=0;y<height;y++) {
92 int16_t* in1 = &src1[y*srcstride];
93 int16_t* in2 = &src2[y*srcstride];
94 uint8_t* out = &dst[y*dststride];
95
96 for (int x=0;x<width;x++) {
97 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
98 out++; in1++; in2++;
99 }
100 }
101 #endif
102
103 #if 0
104 if ((width&7)==0) {
105 for (int y=0;y<height;y++) {
106 int16_t* in1 = &src1[y*srcstride];
107 int16_t* in2 = &src2[y*srcstride];
108 uint8_t* out = &dst[y*dststride];
109
110 for (int x=0;x<width;x+=8) {
111 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
112 out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
113 out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit);
114 out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit);
115 out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit);
116 out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit);
117 out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit);
118 out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit);
119 out+=8; in1+=8; in2+=8;
120 }
121 }
122 }
123 else
124 #endif
125 {
126 for (int y=0;y<height;y++) {
127 int16_t* in1 = &src1[y*srcstride];
128 int16_t* in2 = &src2[y*srcstride];
129 uint8_t* out = &dst[y*dststride];
130
131 for (int x=0;x<width;x+=2) {
132 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
133 out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
134 out+=2; in1+=2; in2+=2;
135 }
136 }
137 }
138 }
139
140
141
142 void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
143 uint8_t *src, ptrdiff_t src_stride,
144 int width, int height,
145 int mx, int my, int16_t* mcbuffer)
146 {
147 int shift3 = 6;
148
149 for (int y=0;y<height;y++) {
150 int16_t* o = &out[y*out_stride];
151 uint8_t* i = &src[y*src_stride];
152
153 for (int x=0;x<width;x++) {
154 *o = *i << shift3;
155 o++;
156 i++;
157 }
158 }
159 }
160
161
162 void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
163 uint8_t *src, ptrdiff_t src_stride,
164 int nPbWC, int nPbHC,
165 int xFracC, int yFracC, int16_t* mcbuffer)
166 {
167 const int shift1 = 0;
168 const int shift2 = 6;
169 //const int shift3 = 6;
170
171 int extra_left = 1;
172 int extra_top = 1;
173 // int extra_right = 2;
174 int extra_bottom= 2;
175
176
177 int nPbH_extra = extra_top + nPbHC + extra_bottom;
178
179 int16_t* tmp2buf = (int16_t*)alloca( nPbWC * nPbH_extra * sizeof(int16_t) );
180
181 /*
182 int nPbW_extra = extra_left + nPbWC + extra_right;
183
184
185 printf("x,y FracC: %d/%d\n",xFracC,yFracC);
186
187 printf("---IN---\n");
188
189 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
190 uint8_t* p = &src[y*src_stride -extra_left];
191
192 for (int x=-extra_left;x<nPbWC+extra_right;x++) {
193 printf("%05d ",*p << 6);
194 p++;
195 }
196 printf("\n");
197 }
198 */
199
200
201 // H-filters
202
203 logtrace(LogMotion,"---H---\n");
204 //printf("---H---(%d)\n",xFracC);
205
206 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
207 uint8_t* p = &src[y*src_stride - extra_left];
208
209 for (int x=0;x<nPbWC;x++) {
210 int16_t v;
211 switch (xFracC) {
212 case 0: v = p[1]; break;
213 case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>shift1; break;
214 case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break;
215 case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break;
216 case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break;
217 case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break;
218 case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break;
219 default:
220 case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break;
221 }
222
223 //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v);
224
225 tmp2buf[y+extra_top + x*nPbH_extra] = v;
226 p++;
227
228 //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]);
229 }
230 //printf("\n");
231 }
232
233 // V-filters
234
235 int vshift = (xFracC==0 ? shift1 : shift2);
236
237 for (int x=0;x<nPbWC;x++) {
238 int16_t* p = &tmp2buf[x*nPbH_extra];
239
240 for (int y=0;y<nPbHC;y++) {
241 int16_t v;
242 //logtrace(LogMotion,"%x %x %x %x %x %x %x\n",p[0],p[1],p[2],p[3],p[4],p[5],p[6]);
243
244 switch (yFracC) {
245 case 0: v = p[1]; break;
246 case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>vshift; break;
247 case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break;
248 case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break;
249 case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break;
250 case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break;
251 case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break;
252 default:
253 case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break;
254 }
255
256 dst[x + y*dst_stride] = v;
257 p++;
258 }
259
260 }
261
262 /*
263 printf("---V---\n");
264 for (int y=0;y<nPbHC;y++) {
265 for (int x=0;x<nPbWC;x++) {
266 printf("%05d ",dst[x+y*dst_stride]);
267 }
268 printf("\n");
269 }
270 */
271 }
272
273
274
275
276 void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
277 uint8_t *src, ptrdiff_t srcstride,
278 int nPbW, int nPbH, int16_t* mcbuffer)
279 {
280 //const int shift1 = 0; // sps->BitDepth_Y-8;
281 const int shift2 = 6;
282
283 // straight copy
284
285 for (int y=0;y<nPbH;y++) {
286 uint8_t* p = src + srcstride*y;
287 int16_t* o = out + out_stride*y;
288
289 for (int x=0;x<nPbW;x+=4) {
290 #if 0
291 *o = *p << shift2;
292 o++; p++;
293 #else
294 // does not seem to be faster...
295 int16_t o0,o1,o2,o3;
296 o0 = p[0] << shift2;
297 o1 = p[1] << shift2;
298 o2 = p[2] << shift2;
299 o3 = p[3] << shift2;
300 o[0]=o0;
301 o[1]=o1;
302 o[2]=o2;
303 o[3]=o3;
304
305 o+=4;
306 p+=4;
307 #endif
308 }
309 }
310 }
311
312
313
314 static int extra_before[4] = { 0,3,3,2 };
315 static int extra_after [4] = { 0,3,4,4 };
316
317 void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
318 uint8_t *src, ptrdiff_t srcstride,
319 int nPbW, int nPbH, int16_t* mcbuffer,
320 int xFracL, int yFracL)
321 {
322 int extra_left = extra_before[xFracL];
323 //int extra_right = extra_after [xFracL];
324 int extra_top = extra_before[yFracL];
325 int extra_bottom = extra_after [yFracL];
326
327 //int nPbW_extra = extra_left + nPbW + extra_right;
328 int nPbH_extra = extra_top + nPbH + extra_bottom;
329
330 const int shift1 = 0; // sps->BitDepth_Y-8;
331 const int shift2 = 6;
332
333
334 // H-filters
335
336 switch (xFracL) {
337 case 0:
338 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
339 uint8_t* p = src + srcstride*y - extra_left;
340 int16_t* o = &mcbuffer[y+extra_top];
341
342 for (int x=0;x<nPbW;x++) {
343 *o = *p;
344 o += nPbH_extra;
345 p++;
346 }
347 }
348 break;
349 case 1:
350 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
351 uint8_t* p = src + srcstride*y - extra_left;
352 int16_t* o = &mcbuffer[y+extra_top];
353
354 for (int x=0;x<nPbW;x++) {
355 *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5] +p[6])>>shift1;
356 o += nPbH_extra;
357 p++;
358 }
359 }
360 break;
361 case 2:
362 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
363 uint8_t* p = src + srcstride*y - extra_left;
364 int16_t* o = &mcbuffer[y+extra_top];
365
366 for (int x=0;x<nPbW;x++) {
367 *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>shift1;
368 o += nPbH_extra;
369 p++;
370 }
371 }
372 break;
373 case 3:
374 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
375 uint8_t* p = src + srcstride*y - extra_left;
376 int16_t* o = &mcbuffer[y+extra_top];
377
378 for (int x=0;x<nPbW;x++) {
379 *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5] -p[6])>>shift1;
380 o += nPbH_extra;
381 p++;
382 }
383 }
384 break;
385 }
386
387
388 logtrace(LogMotion,"---H---\n");
389
390 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
391 for (int x=0;x<nPbW;x++) {
392 logtrace(LogMotion,"%04x ",mcbuffer[y+extra_top + x*nPbH_extra]);
393 }
394 logtrace(LogMotion,"\n");
395 }
396
397 // V-filters
398
399 int vshift = (xFracL==0 ? shift1 : shift2);
400
401 switch (yFracL) {
402 case 0:
403 for (int x=0;x<nPbW;x++) {
404 int16_t* p = &mcbuffer[x*nPbH_extra];
405 int16_t* o = &out[x];
406
407 for (int y=0;y<nPbH;y++) {
408 *o = *p;
409 o+=out_stride;
410 p++;
411 }
412 }
413 break;
414 case 1:
415 for (int x=0;x<nPbW;x++) {
416 int16_t* p = &mcbuffer[x*nPbH_extra];
417 int16_t* o = &out[x];
418
419 for (int y=0;y<nPbH;y++) {
420 *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5] +p[6])>>vshift;
421 o+=out_stride;
422 p++;
423 }
424 }
425 break;
426 case 2:
427 for (int x=0;x<nPbW;x++) {
428 int16_t* p = &mcbuffer[x*nPbH_extra];
429 int16_t* o = &out[x];
430
431 for (int y=0;y<nPbH;y++) {
432 *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>vshift;
433 o+=out_stride;
434 p++;
435 }
436 }
437 break;
438 case 3:
439 for (int x=0;x<nPbW;x++) {
440 int16_t* p = &mcbuffer[x*nPbH_extra];
441 int16_t* o = &out[x];
442
443 for (int y=0;y<nPbH;y++) {
444 *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5] -p[6])>>vshift;
445 o+=out_stride;
446 p++;
447 }
448 }
449 break;
450 }
451
452
453 logtrace(LogMotion,"---V---\n");
454 for (int y=0;y<nPbH;y++) {
455 for (int x=0;x<nPbW;x++) {
456 logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
457 }
458 logtrace(LogMotion,"\n");
459 }
460 }
461
462
463 #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \
464 uint8_t *src, ptrdiff_t srcstride, \
465 int nPbW, int nPbH, int16_t* mcbuffer) \
466 { put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y ); }
467
468 /* */ QPEL(0,1) QPEL(0,2) QPEL(0,3)
469 QPEL(1,0) QPEL(1,1) QPEL(1,2) QPEL(1,3)
470 QPEL(2,0) QPEL(2,1) QPEL(2,2) QPEL(2,3)
471 QPEL(3,0) QPEL(3,1) QPEL(3,2) QPEL(3,3)
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "fallback-motion.h"
21 #include "util.h"
22
23 #if defined(_MSC_VER) || defined(__MINGW32__)
24 # include <malloc.h>
25 #else
26 # include <alloca.h>
27 #endif
28
29 #include <assert.h>
30
31
32 void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
33 int16_t *src, ptrdiff_t srcstride,
34 int width, int height)
35 {
36 int offset8bit = 32;
37 int shift8bit = 6;
38
39 assert((width&1)==0);
40
41 for (int y=0;y<height;y++) {
42 int16_t* in = &src[y*srcstride];
43 uint8_t* out = &dst[y*dststride];
44
45 for (int x=0;x<width;x+=2) {
46 out[0] = Clip1_8bit((in[0] + offset8bit)>>shift8bit);
47 out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit);
48 out+=2; in+=2;
49 }
50 }
51 }
52
53
54 void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
55 int16_t *src, ptrdiff_t srcstride,
56 int width, int height,
57 int w,int o,int log2WD)
58 {
59 assert(log2WD>=1); // TODO
60
61 const int rnd = (1<<(log2WD-1));
62
63 for (int y=0;y<height;y++) {
64 int16_t* in = &src[y*srcstride];
65 uint8_t* out = &dst[y*dststride];
66
67 for (int x=0;x<width;x++) {
68 out[0] = Clip1_8bit(((in[0]*w + rnd)>>log2WD) + o);
69 out++; in++;
70 }
71 }
72 }
73
74 void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
75 int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
76 int width, int height,
77 int w1,int o1, int w2,int o2, int log2WD)
78 {
79 assert(log2WD>=1); // TODO
80
81 const int rnd = ((o1+o2+1) << log2WD);
82
83 for (int y=0;y<height;y++) {
84 int16_t* in1 = &src1[y*srcstride];
85 int16_t* in2 = &src2[y*srcstride];
86 uint8_t* out = &dst[y*dststride];
87
88 for (int x=0;x<width;x++) {
89 out[0] = Clip1_8bit((in1[0]*w1 + in2[0]*w2 + rnd)>>(log2WD+1));
90 out++; in1++; in2++;
91 }
92 }
93 }
94
95
96 void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
97 int16_t *src1, int16_t *src2,
98 ptrdiff_t srcstride, int width,
99 int height)
100 {
101 int offset8bit = 64;
102 int shift8bit = 7;
103
104 assert((width&1)==0);
105
106 // I had a special case for 8-pixel parallel, unrolled code,
107 // but I did not see any speedup.
108
109 #if 0
110 for (int y=0;y<height;y++) {
111 int16_t* in1 = &src1[y*srcstride];
112 int16_t* in2 = &src2[y*srcstride];
113 uint8_t* out = &dst[y*dststride];
114
115 for (int x=0;x<width;x++) {
116 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
117 out++; in1++; in2++;
118 }
119 }
120 #endif
121
122 #if 0
123 if ((width&7)==0) {
124 for (int y=0;y<height;y++) {
125 int16_t* in1 = &src1[y*srcstride];
126 int16_t* in2 = &src2[y*srcstride];
127 uint8_t* out = &dst[y*dststride];
128
129 for (int x=0;x<width;x+=8) {
130 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
131 out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
132 out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit);
133 out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit);
134 out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit);
135 out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit);
136 out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit);
137 out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit);
138 out+=8; in1+=8; in2+=8;
139 }
140 }
141 }
142 else
143 #endif
144 {
145 for (int y=0;y<height;y++) {
146 int16_t* in1 = &src1[y*srcstride];
147 int16_t* in2 = &src2[y*srcstride];
148 uint8_t* out = &dst[y*dststride];
149
150 for (int x=0;x<width;x+=2) {
151 out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
152 out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
153 out+=2; in1+=2; in2+=2;
154 }
155 }
156 }
157 }
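The sample values fed into the prediction functions above are the <<6-scaled intermediates produced by the interpolation routines further below; the unweighted copy removes that scaling with (in+32)>>6 and the bi-prediction average with (in1+in2+64)>>7. A tiny hedged arithmetic check (illustrative only, not part of the library):

#include <stdio.h>
#include <stdint.h>

// Sketch only: a flat prediction sample of 100 arrives scaled by 64 and
// comes out as 100 again on both the unweighted and the averaging path.
int main(void)
{
  int16_t pred1 = 100 << 6;              // scaled intermediate sample
  int16_t pred2 = 100 << 6;

  int uni = (pred1 + 32) >> 6;           // put_unweighted_pred_8 path
  int bi  = (pred1 + pred2 + 64) >> 7;   // put_weighted_pred_avg_8 path

  printf("uni=%d bi=%d\n", uni, bi);     // both print 100
  return 0;
}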
158
159
160
161 void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
162 uint8_t *src, ptrdiff_t src_stride,
163 int width, int height,
164 int mx, int my, int16_t* mcbuffer)
165 {
166 int shift3 = 6;
167
168 for (int y=0;y<height;y++) {
169 int16_t* o = &out[y*out_stride];
170 uint8_t* i = &src[y*src_stride];
171
172 for (int x=0;x<width;x++) {
173 *o = *i << shift3;
174 o++;
175 i++;
176 }
177 }
178 }
179
180
181 void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
182 uint8_t *src, ptrdiff_t src_stride,
183 int nPbWC, int nPbHC,
184 int xFracC, int yFracC, int16_t* mcbuffer)
185 {
186 const int shift1 = 0;
187 const int shift2 = 6;
188 //const int shift3 = 6;
189
190 int extra_left = 1;
191 int extra_top = 1;
192 // int extra_right = 2;
193 int extra_bottom= 2;
194
195
196 int nPbH_extra = extra_top + nPbHC + extra_bottom;
197
198 int16_t* tmp2buf = (int16_t*)alloca( nPbWC * nPbH_extra * sizeof(int16_t) );
199
200 /*
201 int nPbW_extra = extra_left + nPbWC + extra_right;
202
203
204 printf("x,y FracC: %d/%d\n",xFracC,yFracC);
205
206 printf("---IN---\n");
207
208 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
209 uint8_t* p = &src[y*src_stride -extra_left];
210
211 for (int x=-extra_left;x<nPbWC+extra_right;x++) {
212 printf("%05d ",*p << 6);
213 p++;
214 }
215 printf("\n");
216 }
217 */
218
219
220 // H-filters
221
222 logtrace(LogMotion,"---H---\n");
223 //printf("---H---(%d)\n",xFracC);
224
225 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
226 uint8_t* p = &src[y*src_stride - extra_left];
227
228 for (int x=0;x<nPbWC;x++) {
229 int16_t v;
230 switch (xFracC) {
231 case 0: v = p[1]; break;
232 case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>shift1; break;
233 case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break;
234 case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break;
235 case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break;
236 case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break;
237 case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break;
238 default:
239 case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break;
240 }
241
242 //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v);
243
244 tmp2buf[y+extra_top + x*nPbH_extra] = v;
245 p++;
246
247 //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]);
248 }
249 //printf("\n");
250 }
251
252 // V-filters
253
254 int vshift = (xFracC==0 ? shift1 : shift2);
255
256 for (int x=0;x<nPbWC;x++) {
257 int16_t* p = &tmp2buf[x*nPbH_extra];
258
259 for (int y=0;y<nPbHC;y++) {
260 int16_t v;
261 //logtrace(LogMotion,"%x %x %x %x %x %x %x\n",p[0],p[1],p[2],p[3],p[4],p[5],p[6]);
262
263 switch (yFracC) {
264 case 0: v = p[1]; break;
265 case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>vshift; break;
266 case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break;
267 case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break;
268 case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break;
269 case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break;
270 case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break;
271 default:
272 case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break;
273 }
274
275 dst[x + y*dst_stride] = v;
276 p++;
277 }
278
279 }
280
281 /*
282 printf("---V---\n");
283 for (int y=0;y<nPbHC;y++) {
284 for (int x=0;x<nPbWC;x++) {
285 printf("%05d ",dst[x+y*dst_stride]);
286 }
287 printf("\n");
288 }
289 */
290 }
291
292
293
294
295 void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
296 uint8_t *src, ptrdiff_t srcstride,
297 int nPbW, int nPbH, int16_t* mcbuffer)
298 {
299 //const int shift1 = 0; // sps->BitDepth_Y-8;
300 const int shift2 = 6;
301
302 // straight copy
303
304 for (int y=0;y<nPbH;y++) {
305 uint8_t* p = src + srcstride*y;
306 int16_t* o = out + out_stride*y;
307
308 for (int x=0;x<nPbW;x+=4) {
309 #if 0
310 *o = *p << shift2;
311 o++; p++;
312 #else
313 // does not seem to be faster...
314 int16_t o0,o1,o2,o3;
315 o0 = p[0] << shift2;
316 o1 = p[1] << shift2;
317 o2 = p[2] << shift2;
318 o3 = p[3] << shift2;
319 o[0]=o0;
320 o[1]=o1;
321 o[2]=o2;
322 o[3]=o3;
323
324 o+=4;
325 p+=4;
326 #endif
327 }
328 }
329 }
330
331
332
333 static int extra_before[4] = { 0,3,3,2 };
334 static int extra_after [4] = { 0,3,4,4 };
335
336 void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
337 uint8_t *src, ptrdiff_t srcstride,
338 int nPbW, int nPbH, int16_t* mcbuffer,
339 int xFracL, int yFracL)
340 {
341 int extra_left = extra_before[xFracL];
342 //int extra_right = extra_after [xFracL];
343 int extra_top = extra_before[yFracL];
344 int extra_bottom = extra_after [yFracL];
345
346 //int nPbW_extra = extra_left + nPbW + extra_right;
347 int nPbH_extra = extra_top + nPbH + extra_bottom;
348
349 const int shift1 = 0; // sps->BitDepth_Y-8;
350 const int shift2 = 6;
351
352
353 // H-filters
354
355 switch (xFracL) {
356 case 0:
357 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
358 uint8_t* p = src + srcstride*y - extra_left;
359 int16_t* o = &mcbuffer[y+extra_top];
360
361 for (int x=0;x<nPbW;x++) {
362 *o = *p;
363 o += nPbH_extra;
364 p++;
365 }
366 }
367 break;
368 case 1:
369 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
370 uint8_t* p = src + srcstride*y - extra_left;
371 int16_t* o = &mcbuffer[y+extra_top];
372
373 for (int x=0;x<nPbW;x++) {
374 *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5] +p[6])>>shift1;
375 o += nPbH_extra;
376 p++;
377 }
378 }
379 break;
380 case 2:
381 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
382 uint8_t* p = src + srcstride*y - extra_left;
383 int16_t* o = &mcbuffer[y+extra_top];
384
385 for (int x=0;x<nPbW;x++) {
386 *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>shift1;
387 o += nPbH_extra;
388 p++;
389 }
390 }
391 break;
392 case 3:
393 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
394 uint8_t* p = src + srcstride*y - extra_left;
395 int16_t* o = &mcbuffer[y+extra_top];
396
397 for (int x=0;x<nPbW;x++) {
398 *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5] -p[6])>>shift1;
399 o += nPbH_extra;
400 p++;
401 }
402 }
403 break;
404 }
405
406
407 logtrace(LogMotion,"---H---\n");
408
409 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
410 for (int x=0;x<nPbW;x++) {
411 logtrace(LogMotion,"%04x ",mcbuffer[y+extra_top + x*nPbH_extra]);
412 }
413 logtrace(LogMotion,"\n");
414 }
415
416 // V-filters
417
418 int vshift = (xFracL==0 ? shift1 : shift2);
419
420 switch (yFracL) {
421 case 0:
422 for (int x=0;x<nPbW;x++) {
423 int16_t* p = &mcbuffer[x*nPbH_extra];
424 int16_t* o = &out[x];
425
426 for (int y=0;y<nPbH;y++) {
427 *o = *p;
428 o+=out_stride;
429 p++;
430 }
431 }
432 break;
433 case 1:
434 for (int x=0;x<nPbW;x++) {
435 int16_t* p = &mcbuffer[x*nPbH_extra];
436 int16_t* o = &out[x];
437
438 for (int y=0;y<nPbH;y++) {
439 *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5] +p[6])>>vshift;
440 o+=out_stride;
441 p++;
442 }
443 }
444 break;
445 case 2:
446 for (int x=0;x<nPbW;x++) {
447 int16_t* p = &mcbuffer[x*nPbH_extra];
448 int16_t* o = &out[x];
449
450 for (int y=0;y<nPbH;y++) {
451 *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>vshift;
452 o+=out_stride;
453 p++;
454 }
455 }
456 break;
457 case 3:
458 for (int x=0;x<nPbW;x++) {
459 int16_t* p = &mcbuffer[x*nPbH_extra];
460 int16_t* o = &out[x];
461
462 for (int y=0;y<nPbH;y++) {
463 *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5] -p[6])>>vshift;
464 o+=out_stride;
465 p++;
466 }
467 }
468 break;
469 }
470
471
472 logtrace(LogMotion,"---V---\n");
473 for (int y=0;y<nPbH;y++) {
474 for (int x=0;x<nPbW;x++) {
475 logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
476 }
477 logtrace(LogMotion,"\n");
478 }
479 }
480
481
482 #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \
483 uint8_t *src, ptrdiff_t srcstride, \
484 int nPbW, int nPbH, int16_t* mcbuffer) \
485 { put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y ); }
486
487 /* */ QPEL(0,1) QPEL(0,2) QPEL(0,3)
488 QPEL(1,0) QPEL(1,1) QPEL(1,2) QPEL(1,3)
489 QPEL(2,0) QPEL(2,1) QPEL(2,2) QPEL(2,3)
490 QPEL(3,0) QPEL(3,1) QPEL(3,2) QPEL(3,3)
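Both the 4-tap chroma (epel) and 8-tap luma (qpel) kernels above are normalized to a gain of 64; the final >>6 happens later in the weighted/unweighted prediction stage, so a flat input area passes through unchanged. A small hedged check of that invariant (illustrative only; the tap values are copied from the switch statements above, and fractional position 0, which the code handles as a plain copy, is written here as an explicit 64-tap row):

#include <stdio.h>

// Sketch only: every fractional-position filter sums to 64.
static const int epel_taps[8][4] = {
  { 0,64, 0, 0}, {-2,58,10,-2}, {-4,54,16,-2}, {-6,46,28,-4},
  {-4,36,36,-4}, {-4,28,46,-6}, {-2,16,54,-4}, {-2,10,58,-2}
};

static const int qpel_taps[4][8] = {
  { 0, 0,  0,64, 0,  0, 0, 0},
  {-1, 4,-10,58,17, -5, 1, 0},
  {-1, 4,-11,40,40,-11, 4,-1},
  { 0, 1, -5,17,58,-10, 4,-1}
};

int main(void)
{
  for (int f=0; f<8; f++) {
    int s=0;
    for (int t=0; t<4; t++) s += epel_taps[f][t];
    printf("epel frac %d: sum %d\n", f, s);   // always 64
  }
  for (int f=0; f<4; f++) {
    int s=0;
    for (int t=0; t<8; t++) s += qpel_taps[f][t];
    printf("qpel frac %d: sum %d\n", f, s);   // always 64
  }
  return 0;
}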
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
019
120 #ifndef FALLBACK_MOTION_H
221 #define FALLBACK_MOTION_H
+0
-44
libde265/fallback.c
0
1 #include "fallback.h"
2 #include "fallback-motion.h"
3 #include "fallback-dct.h"
4
5
6 void init_acceleration_functions_fallback(struct acceleration_functions* accel)
7 {
8 accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback;
9 accel->put_unweighted_pred_8 = put_unweighted_pred_8_fallback;
10
11 accel->put_weighted_pred_8 = put_weighted_pred_8_fallback;
12 accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback;
13
14 accel->put_hevc_epel_8 = put_epel_8_fallback;
15 accel->put_hevc_epel_h_8 = put_epel_hv_8_fallback;
16 accel->put_hevc_epel_v_8 = put_epel_hv_8_fallback;
17 accel->put_hevc_epel_hv_8 = put_epel_hv_8_fallback;
18
19 accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback;
20 accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback;
21 accel->put_hevc_qpel_8[0][2] = put_qpel_0_2_fallback;
22 accel->put_hevc_qpel_8[0][3] = put_qpel_0_3_fallback;
23 accel->put_hevc_qpel_8[1][0] = put_qpel_1_0_fallback;
24 accel->put_hevc_qpel_8[1][1] = put_qpel_1_1_fallback;
25 accel->put_hevc_qpel_8[1][2] = put_qpel_1_2_fallback;
26 accel->put_hevc_qpel_8[1][3] = put_qpel_1_3_fallback;
27 accel->put_hevc_qpel_8[2][0] = put_qpel_2_0_fallback;
28 accel->put_hevc_qpel_8[2][1] = put_qpel_2_1_fallback;
29 accel->put_hevc_qpel_8[2][2] = put_qpel_2_2_fallback;
30 accel->put_hevc_qpel_8[2][3] = put_qpel_2_3_fallback;
31 accel->put_hevc_qpel_8[3][0] = put_qpel_3_0_fallback;
32 accel->put_hevc_qpel_8[3][1] = put_qpel_3_1_fallback;
33 accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback;
34 accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback;
35
36 accel->transform_skip_8 = transform_skip_8_fallback;
37 accel->transform_bypass_8 = transform_bypass_8_fallback;
38 accel->transform_4x4_luma_add_8 = transform_4x4_luma_add_8_fallback;
39 accel->transform_4x4_add_8 = transform_4x4_add_8_fallback;
40 accel->transform_8x8_add_8 = transform_8x8_add_8_fallback;
41 accel->transform_16x16_add_8 = transform_16x16_add_8_fallback;
42 accel->transform_32x32_add_8 = transform_32x32_add_8_fallback;
43 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "fallback.h"
21 #include "fallback-motion.h"
22 #include "fallback-dct.h"
23
24
25 void init_acceleration_functions_fallback(struct acceleration_functions* accel)
26 {
27 accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback;
28 accel->put_unweighted_pred_8 = put_unweighted_pred_8_fallback;
29
30 accel->put_weighted_pred_8 = put_weighted_pred_8_fallback;
31 accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback;
32
33 accel->put_hevc_epel_8 = put_epel_8_fallback;
34 accel->put_hevc_epel_h_8 = put_epel_hv_8_fallback;
35 accel->put_hevc_epel_v_8 = put_epel_hv_8_fallback;
36 accel->put_hevc_epel_hv_8 = put_epel_hv_8_fallback;
37
38 accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback;
39 accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback;
40 accel->put_hevc_qpel_8[0][2] = put_qpel_0_2_fallback;
41 accel->put_hevc_qpel_8[0][3] = put_qpel_0_3_fallback;
42 accel->put_hevc_qpel_8[1][0] = put_qpel_1_0_fallback;
43 accel->put_hevc_qpel_8[1][1] = put_qpel_1_1_fallback;
44 accel->put_hevc_qpel_8[1][2] = put_qpel_1_2_fallback;
45 accel->put_hevc_qpel_8[1][3] = put_qpel_1_3_fallback;
46 accel->put_hevc_qpel_8[2][0] = put_qpel_2_0_fallback;
47 accel->put_hevc_qpel_8[2][1] = put_qpel_2_1_fallback;
48 accel->put_hevc_qpel_8[2][2] = put_qpel_2_2_fallback;
49 accel->put_hevc_qpel_8[2][3] = put_qpel_2_3_fallback;
50 accel->put_hevc_qpel_8[3][0] = put_qpel_3_0_fallback;
51 accel->put_hevc_qpel_8[3][1] = put_qpel_3_1_fallback;
52 accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback;
53 accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback;
54
55 accel->transform_skip_8 = transform_skip_8_fallback;
56 accel->transform_bypass_8 = transform_bypass_8_fallback;
57 accel->transform_4x4_luma_add_8 = transform_4x4_luma_add_8_fallback;
58 accel->transform_4x4_add_8 = transform_4x4_add_8_fallback;
59 accel->transform_8x8_add_8 = transform_8x8_add_8_fallback;
60 accel->transform_16x16_add_8 = transform_16x16_add_8_fallback;
61 accel->transform_32x32_add_8 = transform_32x32_add_8_fallback;
62 }
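init_acceleration_functions_fallback() fills every entry of the dispatch table with the plain-C routines; platform-specific initializers may then overwrite individual pointers, and call sites go through the table rather than the _fallback symbols directly. A hedged sketch of that pattern (illustrative only; the include, the local buffers and the commented SIMD initializer line are assumptions):

#include "fallback.h"   // assumed to pull in struct acceleration_functions
#include <string.h>
#include <stdint.h>

// Sketch only: fill the table with the fallback routines, optionally let a
// SIMD initializer overwrite entries, then call through the pointers.
static void example(void)
{
  struct acceleration_functions accel;
  init_acceleration_functions_fallback(&accel);
  // init_acceleration_functions_sse(&accel);   // hypothetical SIMD override

  uint8_t dst[4*4];
  int16_t coeffs[4*4];
  memset(dst, 0, sizeof(dst));
  memset(coeffs, 0, sizeof(coeffs));

  accel.transform_4x4_add_8(dst, coeffs, 4);    // dispatched, not a direct call
}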
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
019
120 #ifndef DE265_FALLBACK_H
221 #define DE265_FALLBACK_H
+0
-647
libde265/image.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "image.h"
21 #include <stdlib.h>
22 #include <string.h>
23 #include <assert.h>
24 #ifdef HAVE_MALLOC_H
25 #include <malloc.h>
26 #endif
27
28 #ifdef HAVE___MINGW_ALIGNED_MALLOC
29 #define ALLOC_ALIGNED(alignment, size) __mingw_aligned_malloc((size), (alignment))
30 #define FREE_ALIGNED(mem) __mingw_aligned_free((mem))
31 #elif _WIN32
32 #define ALLOC_ALIGNED(alignment, size) _aligned_malloc((size), (alignment))
33 #define FREE_ALIGNED(mem) _aligned_free((mem))
34 #elif __APPLE__
35 static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) {
36 void *mem = NULL;
37 if (posix_memalign(&mem, alignment, size) != 0) {
38 return NULL;
39 }
40 return mem;
41 };
42 #define FREE_ALIGNED(mem) free((mem))
43 #else
44 #define ALLOC_ALIGNED(alignment, size) memalign((alignment), (size))
45 #define FREE_ALIGNED(mem) free((mem))
46 #endif
47
48 #define ALLOC_ALIGNED_16(size) ALLOC_ALIGNED(16, size)
49
50 static const int alignment = 16;
51
52
53 void de265_init_image(de265_image* img) // (optional) init variables, do not alloc image
54 {
55 memset(img, 0, sizeof(de265_image));
56
57 img->picture_order_cnt_lsb = -1; // undefined
58 img->PicOrderCntVal = -1; // undefined
59 img->PicState = UnusedForReference;
60
61 de265_mutex_init(&img->mutex);
62 de265_cond_init(&img->finished_cond);
63 }
64
65
66
67 de265_error de265_alloc_image(de265_image* img, int w,int h, enum de265_chroma c,
68 const seq_parameter_set* sps)
69 {
70 const int border=0; // TODO: remove the border altogether
71
72 // --- allocate image buffer (or reuse old one) ---
73
74 if (img->width != w || img->height != h || img->chroma_format != c || img->border != border) {
75
76 int chroma_width = w;
77 int chroma_height= h;
78
79 if (c==de265_chroma_420) {
80 chroma_width = (chroma_width +1)/2;
81 chroma_height = (chroma_height+1)/2;
82 }
83
84 if (c==de265_chroma_422) {
85 chroma_height = (chroma_height+1)/2;
86 }
87
88 img->stride = (w +2*border+alignment-1) / alignment * alignment;
89 img->chroma_stride = (chroma_width+2*border+alignment-1) / alignment * alignment;
90
91 img->width = w;
92 img->height= h;
93 img->border=border;
94 img->chroma_width = chroma_width;
95 img->chroma_height= chroma_height;
96
97 img->chroma_format= c;
98
99 FREE_ALIGNED(img->y_mem);
100 img->y_mem = (uint8_t *)ALLOC_ALIGNED_16(img->stride * (h+2*border));
101 img->y = img->y_mem + border + 2*border*img->stride;
102
103 if (c != de265_chroma_mono) {
104 FREE_ALIGNED(img->cb_mem);
105 FREE_ALIGNED(img->cr_mem);
106 img->cb_mem = (uint8_t *)ALLOC_ALIGNED_16(img->chroma_stride * (chroma_height+2*border));
107 img->cr_mem = (uint8_t *)ALLOC_ALIGNED_16(img->chroma_stride * (chroma_height+2*border));
108
109 img->cb = img->cb_mem + border + 2*border*img->chroma_stride;
110 img->cr = img->cr_mem + border + 2*border*img->chroma_stride;
111 } else {
112 img->cb_mem = NULL;
113 img->cr_mem = NULL;
114 img->cb = NULL;
115 img->cr = NULL;
116 }
117 }
118
119
120 // check for memory shortage
121
122 if (img->y_mem == NULL ||
123 img->cb_mem == NULL ||
124 img->cr_mem == NULL)
125 {
126 de265_free_image(img);
127 return DE265_ERROR_OUT_OF_MEMORY;
128 }
129
130
131 // --- allocate decoding info arrays ---
132
133 if (sps) {
134 // intra pred mode
135
136 int intraPredModeSize = sps->PicWidthInMinPUs * sps->PicHeightInMinPUs;
137 if (intraPredModeSize != img->intraPredModeSize) {
138 img->intraPredModeSize = intraPredModeSize;
139 free(img->intraPredMode);
140 img->intraPredMode = (uint8_t *) malloc(intraPredModeSize * sizeof(*img->intraPredMode));
141 }
142
143
144 // cb info
145
146 if (img->cb_info_size != sps->PicSizeInMinCbsY ||
147 img->cb_info == NULL) {
148 img->cb_info_size = sps->PicSizeInMinCbsY;
149 free(img->cb_info);
150 img->cb_info = (CB_ref_info*)malloc(sizeof(CB_ref_info) * img->cb_info_size);
151 }
152
153
154 // pb info
155
156 int puWidth = sps->PicWidthInMinCbsY << (sps->Log2MinCbSizeY -2);
157 int puHeight = sps->PicHeightInMinCbsY << (sps->Log2MinCbSizeY -2);
158
159 if (img->pb_info_size != puWidth*puHeight ||
160 img->pb_info == NULL) {
161 img->pb_info_size = puWidth*puHeight;
162 img->pb_info_stride = puWidth;
163 free(img->pb_info);
164 img->pb_info = (PB_ref_info*)malloc(sizeof(PB_ref_info) * img->pb_info_size);
165 }
166
167
168 // tu info
169
170 if (img->tu_info_size != sps->PicSizeInTbsY ||
171 img->tu_info == NULL) {
172 img->tu_info_size = sps->PicSizeInTbsY;
173 free(img->tu_info);
174 img->tu_info = (uint8_t*)malloc(sizeof(uint8_t) * img->tu_info_size);
175 }
176
177
178 // deblk info
179
180 int deblk_w = (sps->pic_width_in_luma_samples +3)/4;
181 int deblk_h = (sps->pic_height_in_luma_samples+3)/4;
182
183 if (img->deblk_width != deblk_w ||
184 img->deblk_height != deblk_h ||
185 img->deblk_info == NULL) {
186 img->deblk_width = deblk_w;
187 img->deblk_height = deblk_h;
188 img->deblk_info_size = deblk_w*deblk_h;
189 free(img->deblk_info);
190 img->deblk_info = (uint8_t*)malloc(sizeof(uint8_t) * img->deblk_info_size);
191 }
192
193
194 // CTB info
195
196 if (img->ctb_info_size != sps->PicSizeInCtbsY)
197 {
198 for (int i=0;i<img->ctb_info_size;i++)
199 { de265_progress_lock_destroy(&img->ctb_progress[i]); }
200
201 free(img->ctb_info);
202 free(img->ctb_progress);
203 img->ctb_info_size = sps->PicSizeInCtbsY;
204 img->ctb_info = (CTB_info *)malloc( sizeof(CTB_info) * img->ctb_info_size);
205 img->ctb_progress = (de265_progress_lock*)malloc( sizeof(de265_progress_lock)
206 * img->ctb_info_size);
207
208 for (int i=0;i<img->ctb_info_size;i++)
209 { de265_progress_lock_init(&img->ctb_progress[i]); }
210 }
211
212
213 // check for memory shortage
214
215 if (img->ctb_info == NULL ||
216 img->intraPredMode == NULL ||
217 img->cb_info == NULL ||
218 img->pb_info == NULL ||
219 img->tu_info == NULL ||
220 img->deblk_info == NULL)
221 {
222 de265_free_image(img);
223 return DE265_ERROR_OUT_OF_MEMORY;
224 }
225 }
226
227 return DE265_OK;
228 }
229
230
231 void de265_free_image(de265_image* img)
232 {
233 if (img->y) FREE_ALIGNED(img->y_mem);
234 if (img->cb) FREE_ALIGNED(img->cb_mem);
235 if (img->cr) FREE_ALIGNED(img->cr_mem);
236
237 for (int i=0;i<img->ctb_info_size;i++)
238 { de265_progress_lock_destroy(&img->ctb_progress[i]); }
239
240 free(img->ctb_progress);
241 free(img->cb_info);
242 free(img->pb_info);
243 free(img->tu_info);
244 free(img->deblk_info);
245 free(img->ctb_info);
246 free(img->intraPredMode);
247
248 de265_cond_destroy(&img->finished_cond);
249 de265_mutex_destroy(&img->mutex);
250
251 memset(img, 0, sizeof(de265_image));
252 }
253
254
255 void de265_fill_image(de265_image* img, int y,int cb,int cr)
256 {
257 if (y>=0) {
258 memset(img->y_mem, y, img->stride * (img->height+2*img->border));
259 }
260
261 if (cb>=0) {
262 memset(img->cb_mem, cb, img->chroma_stride * (img->chroma_height+2*img->border));
263 }
264
265 if (cr>=0) {
266 memset(img->cr_mem, cr, img->chroma_stride * (img->chroma_height+2*img->border));
267 }
268 }
269
270
271 void de265_copy_image(de265_image* dest, const de265_image* src)
272 {
273 for (int y=0;y<src->height;y++) {
274 memcpy(dest->y+y*dest->stride, src->y+y*src->stride, src->width);
275 }
276
277 if (src->chroma_format != de265_chroma_mono) {
278 for (int y=0;y<src->chroma_height;y++) {
279 memcpy(dest->cb+y*dest->chroma_stride, src->cb+y*src->chroma_stride, src->chroma_width);
280 memcpy(dest->cr+y*dest->chroma_stride, src->cr+y*src->chroma_stride, src->chroma_width);
281 }
282 }
283 }
284
285
286 void get_image_plane(const de265_image* img, int cIdx, uint8_t** image, int* stride)
287 {
288 switch (cIdx) {
289 case 0: *image = img->y; if (stride) *stride = img->stride; break;
290 case 1: *image = img->cb; if (stride) *stride = img->chroma_stride; break;
291 case 2: *image = img->cr; if (stride) *stride = img->chroma_stride; break;
292 default: *image = NULL; if (stride) *stride = 0; break;
293 }
294 }
295
296 void set_conformance_window(de265_image* img, int left,int right,int top,int bottom)
297 {
298 int WinUnitX, WinUnitY;
299
300 switch (img->chroma_format) {
301 case de265_chroma_mono: WinUnitX=1; WinUnitY=1; break;
302 case de265_chroma_420: WinUnitX=2; WinUnitY=2; break;
303 case de265_chroma_422: WinUnitX=2; WinUnitY=1; break;
304 case de265_chroma_444: WinUnitX=1; WinUnitY=1; break;
305 default:
306 assert(0);
307 }
308
309 img->y_confwin = img->y + left*WinUnitX + top*WinUnitY*img->stride;
310 img->cb_confwin= img->cb+ left + top*img->chroma_stride;
311 img->cr_confwin= img->cr+ left + top*img->chroma_stride;
312
313 img->width_confwin = img->width - (left+right)*WinUnitX;
314 img->height_confwin= img->height- (top+bottom)*WinUnitY;
315 img->chroma_width_confwin = img->chroma_width -left-right;
316 img->chroma_height_confwin= img->chroma_height-top-bottom;
317 }
318
319 void increase_pending_tasks(de265_image* img, int n)
320 {
321 de265_sync_add_and_fetch(&img->tasks_pending, n);
322 }
323
324 void decrease_pending_tasks(de265_image* img, int n)
325 {
326 de265_mutex_lock(&img->mutex);
327
328 int pending = de265_sync_sub_and_fetch(&img->tasks_pending, n);
329
330 //printf("pending: %d\n",pending);
331
332 assert(pending >= 0);
333
334 if (pending==0) {
335 de265_cond_broadcast(&img->finished_cond, &img->mutex);
336 }
337
338 de265_mutex_unlock(&img->mutex);
339 }
340
341 void wait_for_completion(de265_image* img)
342 {
343 de265_mutex_lock(&img->mutex);
344 while (img->tasks_pending>0) {
345 de265_cond_wait(&img->finished_cond, &img->mutex);
346 }
347 de265_mutex_unlock(&img->mutex);
348 }
349
350
351
352 void prepare_image_for_decoding(de265_image* img)
353 {
354 // TODO: maybe we could avoid the memset by ensuring that all data is written to
355 // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset.
356
357 memset(img->cb_info, 0,img->cb_info_size * sizeof(CB_ref_info));
358
359 memset(img->tu_info, 0,img->tu_info_size * sizeof(uint8_t));
360 memset(img->deblk_info,0,img->deblk_info_size * sizeof(uint8_t));
361 memset(img->ctb_info, 0,img->ctb_info_size * sizeof(CTB_info));
362
363 for (int i=0;i<img->ctb_info_size;i++) {
364 img->ctb_progress[i].progress = CTB_PROGRESS_NONE;
365 }
366 }
367
368
369 #define PIXEL2CB(x) (x >> sps->Log2MinCbSizeY)
370 #define CB_IDX(x0,y0) (PIXEL2CB(x0) + PIXEL2CB(y0)*sps->PicWidthInMinCbsY)
371 #define SET_CB_BLK(x,y,log2BlkWidth, Field,value) \
372 int cbX = PIXEL2CB(x); \
373 int cbY = PIXEL2CB(y); \
374 int width = 1 << (log2BlkWidth - sps->Log2MinCbSizeY); \
375 for (int cby=cbY;cby<cbY+width;cby++) \
376 for (int cbx=cbX;cbx<cbX+width;cbx++) \
377 { \
378 img->cb_info[ cbx + cby*sps->PicWidthInMinCbsY ].Field = value; \
379 }
380
381 #define SET_CB_BLK_SAVE(x,y,log2BlkWidth, Field,value) \
382 int cbX = PIXEL2CB(x); \
383 int cbY = PIXEL2CB(y); \
384 int width = 1 << (log2BlkWidth - sps->Log2MinCbSizeY); \
385 for (int cby=cbY;cby<cbY+width;cby++) \
386 for (int cbx=cbX;cbx<cbX+width;cbx++) \
387 if (cbx < sps->PicWidthInMinCbsY && \
388 cby < sps->PicHeightInMinCbsY) \
389 { \
390 img->cb_info[ cbx + cby*sps->PicWidthInMinCbsY ].Field = value; \
391 }
392
393
394 void set_cu_skip_flag(const seq_parameter_set* sps, de265_image* img,
395 int x,int y, int log2BlkWidth, uint8_t flag)
396 {
397 SET_CB_BLK(x,y,log2BlkWidth, cu_skip_flag, flag);
398 }
399
400 uint8_t get_cu_skip_flag(const seq_parameter_set* sps, const de265_image* img, int x,int y)
401 {
402 int cbX = PIXEL2CB(x);
403 int cbY = PIXEL2CB(y);
404
405 return img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].cu_skip_flag;
406 }
407
408
409 void set_pred_mode(de265_image* img, const seq_parameter_set* sps,
410 int x,int y, int log2BlkWidth, enum PredMode mode)
411 {
412 SET_CB_BLK(x,y,log2BlkWidth, PredMode, mode);
413 }
414
415 enum PredMode get_pred_mode(const de265_image* img, const seq_parameter_set* sps, int x,int y)
416 {
417 int cbX = PIXEL2CB(x);
418 int cbY = PIXEL2CB(y);
419
420 return (enum PredMode)img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].PredMode;
421 }
422
423
424 void set_cu_transquant_bypass(const de265_image* img, const seq_parameter_set* sps,
425 int x,int y, int log2BlkWidth)
426 {
427 SET_CB_BLK(x,y,log2BlkWidth, cu_transquant_bypass, 1);
428 }
429
430 int get_cu_transquant_bypass(const de265_image* img, const seq_parameter_set* sps, int x,int y)
431 {
432 int cbX = PIXEL2CB(x);
433 int cbY = PIXEL2CB(y);
434
435 return img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].cu_transquant_bypass;
436 }
437
438
439 void set_pcm_flag(de265_image* img, const seq_parameter_set* sps,
440 int x,int y, int log2BlkWidth)
441 {
442 SET_CB_BLK(x,y,log2BlkWidth, pcm_flag, 1);
443 }
444
445 int get_pcm_flag(const de265_image* img, const seq_parameter_set* sps, int x,int y)
446 {
447 int cbX = PIXEL2CB(x);
448 int cbY = PIXEL2CB(y);
449
450 return img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].pcm_flag;
451 }
452
453
454 int get_log2CbSize(const de265_image* img, const seq_parameter_set* sps, int x0, int y0)
455 {
456 int cbX = PIXEL2CB(x0);
457 int cbY = PIXEL2CB(y0);
458
459 return (enum PredMode)img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].log2CbSize;
460 }
461
462 void set_log2CbSize(de265_image* img, const seq_parameter_set* sps, int x0, int y0, int log2CbSize)
463 {
464 int cbX = PIXEL2CB(x0);
465 int cbY = PIXEL2CB(y0);
466
467 img->cb_info[ cbX + cbY*sps->PicWidthInMinCbsY ].log2CbSize = log2CbSize;
468
469 // assume that remaining cb_info blocks are initialized to zero
470 }
471
472 // coordinates in CB units
473 int get_log2CbSize_cbUnits(de265_image* img, const seq_parameter_set* sps, int x0, int y0)
474 {
475 return (enum PredMode)img->cb_info[ x0 + y0*sps->PicWidthInMinCbsY ].log2CbSize;
476 }
477
478
479 void set_PartMode(de265_image* img, const seq_parameter_set* sps,
480 int x,int y, enum PartMode mode)
481 {
482 img->cb_info[ CB_IDX(x,y) ].PartMode = mode;
483 }
484
485 enum PartMode get_PartMode(const de265_image* img, const seq_parameter_set* sps, int x,int y)
486 {
487 return (enum PartMode)img->cb_info[ CB_IDX(x,y) ].PartMode;
488 }
489
490
491 void set_ctDepth(de265_image* img, const seq_parameter_set* sps,
492 int x,int y, int log2BlkWidth, int depth)
493 {
494 SET_CB_BLK(x,y,log2BlkWidth, ctDepth, depth);
495 }
496
497 int get_ctDepth(const de265_image* img, const seq_parameter_set* sps, int x,int y)
498 {
499 return img->cb_info[ CB_IDX(x,y) ].ctDepth;
500 }
501
502
503 void set_QPY(de265_image* img, const seq_parameter_set* sps,
504 const pic_parameter_set* pps, int x,int y, int log2BlkWidth, int QP_Y)
505 {
506 assert(x>=0 && x<sps->pic_width_in_luma_samples);
507 assert(y>=0 && y<sps->pic_height_in_luma_samples);
508
509 SET_CB_BLK (x, y, log2BlkWidth, QP_Y, QP_Y);
510 }
511
512 int get_QPY(const de265_image* img, const seq_parameter_set* sps,int x,int y)
513 {
514 return img->cb_info[CB_IDX(x,y)].QP_Y;
515 }
516
517
518 #define PIXEL2TU(x) (x >> sps->Log2MinTrafoSize)
519 #define TU_IDX(x0,y0) (PIXEL2TU(x0) + PIXEL2TU(y0)*sps->PicWidthInTbsY)
520
521 #define OR_TU_BLK(x,y,log2BlkWidth, value) \
522 int tuX = PIXEL2TU(x); \
523 int tuY = PIXEL2TU(y); \
524 int width = 1 << (log2BlkWidth - sps->Log2MinTrafoSize); \
525 for (int tuy=tuY;tuy<tuY+width;tuy++) \
526 for (int tux=tuX;tux<tuX+width;tux++) \
527 { \
528 img->tu_info[ tux + tuy*sps->PicWidthInTbsY ] |= value; \
529 }
530
531 void set_split_transform_flag(de265_image* img,const seq_parameter_set* sps,
532 int x0,int y0,int trafoDepth)
533 {
534 img->tu_info[TU_IDX(x0,y0)] |= (1<<trafoDepth);
535 }
536
537 int get_split_transform_flag(const de265_image* img, const seq_parameter_set* sps,
538 int x0,int y0,int trafoDepth)
539 {
540 int idx = TU_IDX(x0,y0);
541 return (img->tu_info[idx] & (1<<trafoDepth));
542 }
543
544
545 void set_nonzero_coefficient(de265_image* img,const seq_parameter_set* sps,
546 int x,int y, int log2TrafoSize)
547 {
548 OR_TU_BLK(x,y,log2TrafoSize, TU_FLAG_NONZERO_COEFF);
549 }
550
551
552 int get_nonzero_coefficient(const de265_image* img,const seq_parameter_set* sps,
553 int x,int y)
554 {
555 return img->tu_info[TU_IDX(x,y)] & TU_FLAG_NONZERO_COEFF;
556 }
557
558
559 enum IntraPredMode get_IntraPredMode(const de265_image* img, const seq_parameter_set* sps, int x,int y)
560 {
561 int PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize) * sps->PicWidthInMinPUs;
562
563 return (enum IntraPredMode) img->intraPredMode[PUidx];
564 }
565
566
567 void set_deblk_flags(de265_image* img, int x0,int y0, uint8_t flags)
568 {
569 const int xd = x0/4;
570 const int yd = y0/4;
571
572 if (xd<img->deblk_width && yd<img->deblk_height) {
573 img->deblk_info[xd + yd*img->deblk_width] |= flags;
574 }
575 }
576
577 uint8_t get_deblk_flags(const de265_image* img, int x0,int y0)
578 {
579 const int xd = x0/4;
580 const int yd = y0/4;
581 assert (xd<img->deblk_width && yd<img->deblk_height);
582
583 return img->deblk_info[xd + yd*img->deblk_width];
584 }
585
586 void set_deblk_bS(de265_image* img, int x0,int y0, uint8_t bS)
587 {
588 uint8_t* data = &img->deblk_info[x0/4 + y0/4*img->deblk_width];
589 *data &= ~DEBLOCK_BS_MASK;
590 *data |= bS;
591 }
592
593 uint8_t get_deblk_bS(const de265_image* img, int x0,int y0)
594 {
595 return img->deblk_info[x0/4 + y0/4*img->deblk_width] & DEBLOCK_BS_MASK;
596 }
597
598
599 void set_SliceAddrRS(de265_image* img, const seq_parameter_set* sps,
600 int ctbX, int ctbY, int SliceAddrRS)
601 {
602 assert(ctbX + ctbY*sps->PicWidthInCtbsY < img->ctb_info_size);
603 img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].SliceAddrRS = SliceAddrRS;
604 }
605
606 int get_SliceAddrRS(const de265_image* img, const seq_parameter_set* sps, int ctbX, int ctbY)
607 {
608 return img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].SliceAddrRS;
609 }
610
611 int get_SliceAddrRS_atCtbRS(const de265_image* img, const seq_parameter_set* sps, int ctbRS)
612 {
613 return img->ctb_info[ctbRS].SliceAddrRS;
614 }
615
616 void set_SliceHeaderIndex(de265_image* img, const seq_parameter_set* sps,
617 int x, int y, int SliceHeaderIndex)
618 {
619 int ctbX = x >> sps->Log2CtbSizeY;
620 int ctbY = y >> sps->Log2CtbSizeY;
621 img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].SliceHeaderIndex = SliceHeaderIndex;
622 }
623
624 int get_SliceHeaderIndex(const de265_image* img, const seq_parameter_set* sps, int x, int y)
625 {
626 int ctbX = x >> sps->Log2CtbSizeY;
627 int ctbY = y >> sps->Log2CtbSizeY;
628 return img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].SliceHeaderIndex;
629 }
630
631
632
633 void set_sao_info(de265_image* img,const seq_parameter_set* sps,
634 int ctbX,int ctbY,const sao_info* saoinfo)
635 {
636 assert(ctbX + ctbY*sps->PicWidthInCtbsY < img->ctb_info_size);
637 memcpy(&img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].saoInfo,
638 saoinfo,
639 sizeof(sao_info));
640 }
641
642 const sao_info* get_sao_info(const de265_image* img,const seq_parameter_set* sps, int ctbX,int ctbY)
643 {
644 assert(ctbX + ctbY*sps->PicWidthInCtbsY < img->ctb_info_size);
645 return &img->ctb_info[ctbX + ctbY*sps->PicWidthInCtbsY].saoInfo;
646 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "image.h"
21 #include "decctx.h"
22
23 #include <stdlib.h>
24 #include <string.h>
25 #include <assert.h>
26
27 #include <limits>
28
29
30 #ifdef HAVE_MALLOC_H
31 #include <malloc.h>
32 #endif
33
34 #ifdef HAVE_SSE4_1
35 #define MEMORY_PADDING 8
36 #else
37 #define MEMORY_PADDING 0
38 #endif
39
40 #ifdef HAVE___MINGW_ALIGNED_MALLOC
41 #define ALLOC_ALIGNED(alignment, size) __mingw_aligned_malloc((size), (alignment))
42 #define FREE_ALIGNED(mem) __mingw_aligned_free((mem))
43 #elif _WIN32
44 #define ALLOC_ALIGNED(alignment, size) _aligned_malloc((size), (alignment))
45 #define FREE_ALIGNED(mem) _aligned_free((mem))
46 #elif __APPLE__
47 static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) {
48 void *mem = NULL;
49 if (posix_memalign(&mem, alignment, size) != 0) {
50 return NULL;
51 }
52 return mem;
53 };
54 #define FREE_ALIGNED(mem) free((mem))
55 #else
56 #define ALLOC_ALIGNED(alignment, size) memalign((alignment), (size))
57 #define FREE_ALIGNED(mem) free((mem))
58 #endif
59
60 #define ALLOC_ALIGNED_16(size) ALLOC_ALIGNED(16, size)
61
62 static const int alignment = 16;
63
64
65 static int de265_image_get_buffer(de265_decoder_context* ctx,
66 de265_image_spec* spec, de265_image* img, void* userdata)
67 {
68 int luma_stride = (spec->width + spec->alignment-1) / spec->alignment * spec->alignment;
69 int chroma_stride = (spec->width/2 + spec->alignment-1) / spec->alignment * spec->alignment;
70
71 int luma_height = spec->height;
72 int chroma_height = (spec->height+1)/2;
73
74 uint8_t* p[3] = { 0,0,0 };
75 p[0] = (uint8_t *)ALLOC_ALIGNED_16(luma_stride * luma_height + MEMORY_PADDING);
76 p[1] = (uint8_t *)ALLOC_ALIGNED_16(chroma_stride * chroma_height + MEMORY_PADDING);
77 p[2] = (uint8_t *)ALLOC_ALIGNED_16(chroma_stride * chroma_height + MEMORY_PADDING);
78
79 if (p[0]==NULL || p[1]==NULL || p[2]==NULL) {
80 for (int i=0;i<3;i++)
81 if (p[i]) {
82 FREE_ALIGNED(p[i]);
83 }
84
85 return 0;
86 }
87
88 img->set_image_plane(0, p[0], luma_stride, NULL);
89 img->set_image_plane(1, p[1], chroma_stride, NULL);
90 img->set_image_plane(2, p[2], chroma_stride, NULL);
91
92 return 1;
93 }
94
95 static void de265_image_release_buffer(de265_decoder_context* ctx,
96 de265_image* img, void* userdata)
97 {
98 for (int i=0;i<3;i++) {
99 uint8_t* p = (uint8_t*)img->get_image_plane(i);
100 assert(p);
101 FREE_ALIGNED(p);
102 }
103 }
104
105
106 de265_image_allocation de265_image::default_image_allocation = {
107 de265_image_get_buffer,
108 de265_image_release_buffer
109 };
110
111
112 void de265_image::set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata)
113 {
114 pixels[cIdx] = mem;
115 plane_user_data[cIdx] = userdata;
116
117 if (cIdx==0) { this->stride = stride; }
118 else { this->chroma_stride = stride; }
119 }
120
121
122 uint32_t de265_image::s_next_image_ID = 0;
123
124 de265_image::de265_image()
125 {
126 ID = -1;
127 removed_at_picture_id = 0; // picture not used, so we can assume it has been removed
128
129 decctx = NULL;
130
131 //alloc_functions.get_buffer = NULL;
132 //alloc_functions.release_buffer = NULL;
133
134 for (int c=0;c<3;c++) {
135 pixels[c] = NULL;
136 pixels_confwin[c] = NULL;
137 plane_user_data[c] = NULL;
138 }
139
140 width=height=0;
141
142 pts = 0;
143 user_data = NULL;
144
145 ctb_progress = NULL;
146
147 integrity = INTEGRITY_NOT_DECODED;
148
149 picture_order_cnt_lsb = -1; // undefined
150 PicOrderCntVal = -1; // undefined
151 PicState = UnusedForReference;
152 PicOutputFlag = false;
153
154 de265_mutex_init(&mutex);
155 de265_cond_init(&finished_cond);
156 }
157
158
159 de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c,
160 const seq_parameter_set* sps, bool allocMetadata,
161 decoder_context* ctx)
162 {
163 if (allocMetadata) { assert(sps); }
164
165 ID = s_next_image_ID++;
166 removed_at_picture_id = std::numeric_limits<int32_t>::max();
167
168 decctx = ctx;
169
170 nThreadsQueued = 0;
171 nThreadsRunning = 0;
172 nThreadsBlocked = 0;
173 nThreadsFinished = 0;
174 nThreadsTotal = 0;
175
176 // --- allocate image buffer ---
177
178 chroma_format= c;
179
180 width = w;
181 height = h;
182 chroma_width = w;
183 chroma_height= h;
184
185 de265_image_spec spec;
186
187 int WinUnitX, WinUnitY;
188
189 switch (chroma_format) {
190 case de265_chroma_mono: WinUnitX=1; WinUnitY=1; break;
191 case de265_chroma_420: WinUnitX=2; WinUnitY=2; break;
192 case de265_chroma_422: WinUnitX=2; WinUnitY=1; break;
193 case de265_chroma_444: WinUnitX=1; WinUnitY=1; break;
194 default:
195 assert(0);
196 }
197
198 switch (chroma_format) {
199 case de265_chroma_420:
200 spec.format = de265_image_format_YUV420P8;
201 chroma_width = (chroma_width +1)/2;
202 chroma_height = (chroma_height+1)/2;
203 break;
204
205 case de265_chroma_422:
206 spec.format = de265_image_format_YUV422P8;
207 chroma_height = (chroma_height+1)/2;
208 break;
209
210 default:
211 assert(false); // TODO: not implemented yet
212 break;
213 }
214
215 spec.width = w;
216 spec.height = h;
217 spec.alignment = 16;
218
219
220 // conformance window cropping
221
222 int left = sps ? sps->conf_win_left_offset : 0;
223 int right = sps ? sps->conf_win_right_offset : 0;
224 int top = sps ? sps->conf_win_top_offset : 0;
225 int bottom = sps ? sps->conf_win_bottom_offset : 0;
226
227 width_confwin = width - (left+right)*WinUnitX;
228 height_confwin= height- (top+bottom)*WinUnitY;
229 chroma_width_confwin = chroma_width -left-right;
230 chroma_height_confwin= chroma_height-top-bottom;
231
232 spec.crop_left = left *WinUnitX;
233 spec.crop_right = right*WinUnitX;
234 spec.crop_top = top *WinUnitY;
235 spec.crop_bottom= bottom*WinUnitY;
236
237 spec.visible_width = width_confwin;
238 spec.visible_height= height_confwin;
239
240
241 // allocate memory and set conformance window pointers
242
243 void* alloc_userdata = decctx->param_image_allocation_userdata;
244 bool mem_alloc_success = decctx->param_image_allocation_functions.get_buffer(decctx, &spec, this,
245 alloc_userdata);
246
247 pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride;
248 pixels_confwin[1] = pixels[1] + left + top*chroma_stride;
249 pixels_confwin[2] = pixels[2] + left + top*chroma_stride;
250
251
252 // check for memory shortage
253
254 if (!mem_alloc_success)
255 {
256 return DE265_ERROR_OUT_OF_MEMORY;
257 }
258
259 //alloc_functions = *allocfunc;
260 //alloc_userdata = userdata;
261
262 // --- allocate decoding info arrays ---
263
264 if (allocMetadata) {
265 // intra pred mode
266
267 mem_alloc_success &= intraPredMode.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs,
268 sps->Log2MinPUSize);
269
270 // cb info
271
272 mem_alloc_success &= cb_info.alloc(sps->PicWidthInMinCbsY, sps->PicHeightInMinCbsY,
273 sps->Log2MinCbSizeY);
274
275 // pb info
276
277 int puWidth = sps->PicWidthInMinCbsY << (sps->Log2MinCbSizeY -2);
278 int puHeight = sps->PicHeightInMinCbsY << (sps->Log2MinCbSizeY -2);
279
280 mem_alloc_success &= pb_info.alloc(puWidth,puHeight, 2);
281
282
283 // tu info
284
285 mem_alloc_success &= tu_info.alloc(sps->PicWidthInTbsY, sps->PicHeightInTbsY,
286 sps->Log2MinTrafoSize);
287
288 // deblk info
289
290 int deblk_w = (sps->pic_width_in_luma_samples +3)/4;
291 int deblk_h = (sps->pic_height_in_luma_samples+3)/4;
292
293 mem_alloc_success &= deblk_info.alloc(deblk_w, deblk_h, 2);
294
295 // CTB info
296
297 if (ctb_info.data_size != sps->PicSizeInCtbsY)
298 {
299 delete[] ctb_progress;
300
301 mem_alloc_success &= ctb_info.alloc(sps->PicWidthInCtbsY, sps->PicHeightInCtbsY,
302 sps->Log2CtbSizeY);
303
304 ctb_progress = new de265_progress_lock[ ctb_info.data_size ];
305 }
306
307
308 // check for memory shortage
309
310 if (!mem_alloc_success)
311 {
312 return DE265_ERROR_OUT_OF_MEMORY;
313 }
314 }
315
316 return DE265_OK;
317 }
318
319
320 de265_image::~de265_image()
321 {
322 release();
323
324 // free progress locks
325
326 if (ctb_progress) {
327 delete[] ctb_progress;
328 }
329
330 de265_cond_destroy(&finished_cond);
331 de265_mutex_destroy(&mutex);
332 }
333
334
335 void de265_image::release()
336 {
337 // free image memory
338
339 if (decctx) {
340 de265_image_allocation* allocfunc = &decctx->param_image_allocation_functions;
341 if (allocfunc->release_buffer &&
342 pixels[0])
343 {
344 allocfunc->release_buffer(decctx, this, decctx->param_image_allocation_userdata);
345
346 for (int i=0;i<3;i++)
347 {
348 pixels[i] = NULL;
349 pixels_confwin[i] = NULL;
350 }
351 }
352 }
353
354 // free slices
355
356 for (int i=0;i<slices.size();i++) {
357 delete slices[i];
358 }
359 slices.clear();
360 }
361
362
363 void de265_image::fill_image(int y,int cb,int cr)
364 {
365 if (y>=0) {
366 memset(pixels[0], y, stride * height);
367 }
368
369 if (cb>=0) {
370 memset(pixels[1], cb, chroma_stride * chroma_height);
371 }
372
373 if (cr>=0) {
374 memset(pixels[2], cr, chroma_stride * chroma_height);
375 }
376 }
377
378
379 de265_error de265_image::copy_image(const de265_image* src)
380 {
381 /* TODO: actually, since we allocate the image only for internal purposes, we
382 do not have to call the external allocation routines for this. However, then
383 we have to track for each image how to release it again.
384 Another option would be to save the copied data not in a de265_image at all.
385 */
386
387 de265_error err = alloc_image(src->width, src->height, src->chroma_format, &src->sps, false, src->decctx);
388 if (err != DE265_OK) {
389 return err;
390 }
391
392 copy_lines_from(src, 0, src->height);
393
394 return err;
395 }
396
397
398 // end = last line + 1
399 void de265_image::copy_lines_from(const de265_image* src, int first, int end)
400 {
401 assert(src->stride == stride &&
402 src->chroma_stride == chroma_stride);
403
404 if (end > src->height) end=src->height;
405
406 assert(first % 2 == 0);
407 assert(end % 2 == 0);
408
409 if (src->stride == stride) {
410 memcpy(pixels[0] + first*stride,
411 src->pixels[0] + first*src->stride,
412 (end-first)*stride);
413 }
414 else {
415 for (int yp=first;yp<end;yp++) {
416 memcpy(pixels[0]+yp*stride, src->pixels[0]+yp*src->stride, src->width);
417 }
418 }
419
420 int first_chroma = first>>1;
421 int end_chroma = end>>1;
422
423 if (src->chroma_format != de265_chroma_mono) {
424 if (src->chroma_stride == chroma_stride) {
425 memcpy(pixels[1] + first_chroma*chroma_stride,
426 src->pixels[1] + first_chroma*chroma_stride,
427 (end_chroma-first_chroma) * chroma_stride);
428 memcpy(pixels[2] + first_chroma*chroma_stride,
429 src->pixels[2] + first_chroma*chroma_stride,
430 (end_chroma-first_chroma) * chroma_stride);
431 }
432 else {
433 for (int y=first_chroma;y<end_chroma;y++) {
434 memcpy(pixels[1]+y*chroma_stride, src->pixels[1]+y*src->chroma_stride, src->chroma_width);
435 memcpy(pixels[2]+y*chroma_stride, src->pixels[2]+y*src->chroma_stride, src->chroma_width);
436 }
437 }
438 }
439 }
440
441
442 void de265_image::exchange_pixel_data_with(de265_image& b)
443 {
444 for (int i=0;i<3;i++) {
445 std::swap(pixels[i], b.pixels[i]);
446 std::swap(pixels_confwin[i], b.pixels_confwin[i]);
447 std::swap(plane_user_data[i], b.plane_user_data[i]);
448 }
449
450 std::swap(stride, b.stride);
451 std::swap(chroma_stride, b.chroma_stride);
452 }
453
454
455 void de265_image::thread_start(int nThreads)
456 {
457 de265_mutex_lock(&mutex);
458
459 nThreadsQueued += nThreads;
460 nThreadsTotal += nThreads;
461
462 de265_mutex_unlock(&mutex);
463 }
464
465 void de265_image::thread_run()
466 {
467 de265_mutex_lock(&mutex);
468 nThreadsQueued--;
469 nThreadsRunning++;
470 de265_mutex_unlock(&mutex);
471 }
472
473 void de265_image::thread_blocks()
474 {
475 de265_mutex_lock(&mutex);
476 nThreadsRunning--;
477 nThreadsBlocked++;
478 de265_mutex_unlock(&mutex);
479 }
480
481 void de265_image::thread_unblocks()
482 {
483 de265_mutex_lock(&mutex);
484 nThreadsBlocked--;
485 nThreadsRunning++;
486 de265_mutex_unlock(&mutex);
487 }
488
489 void de265_image::thread_finishes()
490 {
491 de265_mutex_lock(&mutex);
492
493 nThreadsRunning--;
494 nThreadsFinished++;
495 assert(nThreadsRunning >= 0);
496
497 if (nThreadsFinished==nThreadsTotal) {
498 de265_cond_broadcast(&finished_cond, &mutex);
499 }
500
501 de265_mutex_unlock(&mutex);
502 }
503
504 void de265_image::wait_for_progress(thread_task* task, int ctbx,int ctby, int progress)
505 {
506 const int ctbW = sps.PicWidthInCtbsY;
507
508 wait_for_progress(task, ctbx + ctbW*ctby, progress);
509 }
510
511 void de265_image::wait_for_progress(thread_task* task, int ctbAddrRS, int progress)
512 {
513 de265_progress_lock* progresslock = &ctb_progress[ctbAddrRS];
514 if (progresslock->get_progress() < progress) {
515 thread_blocks();
516
517 assert(task!=NULL);
518 task->state = thread_task::Blocked;
519
520 /* TODO: check whether we are the first blocked task in the list.
521 If we are, we have to conceal input errors.
522 Simplest concealment: do not block.
523 */
524
525 progresslock->wait_for_progress(progress);
526 task->state = thread_task::Running;
527 thread_unblocks();
528 }
529 }
530
531
532 void de265_image::wait_for_completion()
533 {
534 de265_mutex_lock(&mutex);
535 while (nThreadsFinished!=nThreadsTotal) {
536 de265_cond_wait(&finished_cond, &mutex);
537 }
538 de265_mutex_unlock(&mutex);
539 }
540
541 bool de265_image::debug_is_completed() const
542 {
543 return nThreadsFinished==nThreadsTotal;
544 }
545
546
547
548 void de265_image::clear_metadata()
549 {
550 // TODO: maybe we could avoid the memset by ensuring that all data is written to
551 // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset.
552
553 cb_info.clear();
554 tu_info.clear();
555 ctb_info.clear();
556 deblk_info.clear();
557
558 // --- reset CTB progresses ---
559
560 for (int i=0;i<ctb_info.data_size;i++) {
561 ctb_progress[i].reset(CTB_PROGRESS_NONE);
562 }
563 }
564
565
566 void de265_image::set_mv_info(int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv)
567 {
568 int log2PuSize = 2;
569
570 int xPu = x >> log2PuSize;
571 int yPu = y >> log2PuSize;
572 int wPu = nPbW >> log2PuSize;
573 int hPu = nPbH >> log2PuSize;
574
575 int stride = pb_info.width_in_units;
576
577 for (int pby=0;pby<hPu;pby++)
578 for (int pbx=0;pbx<wPu;pbx++)
579 {
580 pb_info[ xPu+pbx + (yPu+pby)*stride ].mvi = *mv;
581 }
582 }
583
584
585 bool de265_image::available_zscan(int xCurr,int yCurr, int xN,int yN) const
586 {
587 if (xN<0 || yN<0) return false;
588 if (xN>=sps.pic_width_in_luma_samples ||
589 yN>=sps.pic_height_in_luma_samples) return false;
590
591 int minBlockAddrN = pps.MinTbAddrZS[ (xN>>sps.Log2MinTrafoSize) +
592 (yN>>sps.Log2MinTrafoSize) * sps.PicWidthInTbsY ];
593 int minBlockAddrCurr = pps.MinTbAddrZS[ (xCurr>>sps.Log2MinTrafoSize) +
594 (yCurr>>sps.Log2MinTrafoSize) * sps.PicWidthInTbsY ];
595
596 if (minBlockAddrN > minBlockAddrCurr) return false;
597
598 int xCurrCtb = xCurr >> sps.Log2CtbSizeY;
599 int yCurrCtb = yCurr >> sps.Log2CtbSizeY;
600 int xNCtb = xN >> sps.Log2CtbSizeY;
601 int yNCtb = yN >> sps.Log2CtbSizeY;
602
603 if (get_SliceAddrRS(xCurrCtb,yCurrCtb) !=
604 get_SliceAddrRS(xNCtb, yNCtb)) {
605 return false;
606 }
607
608 if (pps.TileIdRS[xCurrCtb + yCurrCtb*sps.PicWidthInCtbsY] !=
609 pps.TileIdRS[xNCtb + yNCtb *sps.PicWidthInCtbsY]) {
610 return false;
611 }
612
613 return true;
614 }
615
616
617 bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP,
618 int nPbW, int nPbH, int partIdx, int xN,int yN) const
619 {
620 logtrace(LogMotion,"C:%d;%d P:%d;%d N:%d;%d size=%d;%d\n",xC,yC,xP,yP,xN,yN,nPbW,nPbH);
621
622 int sameCb = (xC <= xN && xN < xC+nCbS &&
623 yC <= yN && yN < yC+nCbS);
624
625 bool availableN;
626
627 if (!sameCb) {
628 availableN = available_zscan(xP,yP,xN,yN);
629 }
630 else {
631 availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS &&
632 partIdx==1 &&
633 yN >= yC+nPbH && xN < xC+nPbW);
634 }
635
636 if (availableN && get_pred_mode(xN,yN) == MODE_INTRA) {
637 availableN = false;
638 }
639
640 return availableN;
641 }
642
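(Editor's sketch, not part of the imported sources.) de265_image_get_buffer()/de265_image_release_buffer() above form the default frame allocator; a client can supply its own pair with the same shape. The sketch below assumes only what is visible in this file (de265_image_spec, set_image_plane(), get_image_plane()); how the callbacks are registered with the decoder is outside this excerpt:

    #include "image.h"
    #include <new>

    static int my_get_buffer(de265_decoder_context* ctx,
                             de265_image_spec* spec, de265_image* img, void* userdata)
    {
      int luma_stride   = (spec->width   + spec->alignment-1) / spec->alignment * spec->alignment;
      int chroma_stride = (spec->width/2 + spec->alignment-1) / spec->alignment * spec->alignment;

      uint8_t* y  = new (std::nothrow) uint8_t[luma_stride   * spec->height];
      uint8_t* cb = new (std::nothrow) uint8_t[chroma_stride * ((spec->height+1)/2)];
      uint8_t* cr = new (std::nothrow) uint8_t[chroma_stride * ((spec->height+1)/2)];
      if (!y || !cb || !cr) { delete[] y; delete[] cb; delete[] cr; return 0; }

      img->set_image_plane(0, y,  luma_stride,   NULL);
      img->set_image_plane(1, cb, chroma_stride, NULL);
      img->set_image_plane(2, cr, chroma_stride, NULL);
      return 1;
    }

    static void my_release_buffer(de265_decoder_context* ctx,
                                  de265_image* img, void* userdata)
    {
      for (int i=0;i<3;i++) { delete[] img->get_image_plane(i); }
    }

    // A de265_image_allocation filled with these two callbacks can then replace
    // default_image_allocation (the registration entry point is not shown here).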
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2424 #include <config.h>
2525 #endif
2626
27 #include <assert.h>
2728 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
2831 #ifdef HAVE_STDBOOL_H
2932 #include <stdbool.h>
3033 #endif
3437 #include "libde265/motion.h"
3538 #include "libde265/threads.h"
3639 #include "libde265/slice.h"
40 #include "libde265/nal.h"
3741
3842
3943 enum PictureState {
6973
7074 #define CTB_PROGRESS_NONE 0
7175 #define CTB_PROGRESS_PREFILTER 1
72 #define CTB_PROGRESS_FILTERED 2
76 #define CTB_PROGRESS_DEBLK_V 2
77 #define CTB_PROGRESS_DEBLK_H 3
78 #define CTB_PROGRESS_SAO 4
79
80 template <class DataUnit> class MetaDataArray
81 {
82 public:
83 MetaDataArray() { data=NULL; data_size=0; log2unitSize=0; width_in_units=0; height_in_units=0; }
84 ~MetaDataArray() { free(data); }
85
86 bool alloc(int w,int h, int _log2unitSize) {
87 int size = w*h;
88
89 if (size != data_size) {
90 free(data);
91 data = (DataUnit*)malloc(size * sizeof(DataUnit));
92 data_size = size;
93 width_in_units = w;
94 height_in_units = h;
95 }
96
97 log2unitSize = _log2unitSize;
98
99 return data != NULL;
100 }
101
102 void clear() {
103 if (data) memset(data, 0, sizeof(DataUnit) * data_size);
104 }
105
106 const DataUnit& get(int x,int y) const {
107 int unitX = x>>log2unitSize;
108 int unitY = y>>log2unitSize;
109
110 return data[ unitX + unitY*width_in_units ];
111 }
112
113 DataUnit& get(int x,int y) {
114 int unitX = x>>log2unitSize;
115 int unitY = y>>log2unitSize;
116
117 return data[ unitX + unitY*width_in_units ];
118 }
119
120 void set(int x,int y, const DataUnit& d) {
121 int unitX = x>>log2unitSize;
122 int unitY = y>>log2unitSize;
123
124 data[ unitX + unitY*width_in_units ] = d;
125 }
126
127 DataUnit& operator[](int idx) { return data[idx]; }
128 const DataUnit& operator[](int idx) const { return data[idx]; }
129
130 // private:
131 DataUnit* data;
132 int data_size;
133 int log2unitSize;
134 int width_in_units;
135 int height_in_units;
136 };
137
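(Editor's illustration, not part of image.h.) The MetaDataArray template above stores one metadata record per block of (1<<log2unitSize) pixels; get()/set() shift pixel coordinates down to that unit grid. A small self-contained usage sketch, with arbitrarily chosen dimensions:

    MetaDataArray<uint8_t> depthMap;
    depthMap.alloc(/*w=*/10, /*h=*/8, /*log2unitSize=*/3);  // one entry per 8x8-pixel unit
    depthMap.clear();
    depthMap.set(17, 9, 2);            // pixel (17,9) maps to unit (2,1)
    uint8_t d = depthMap.get(23, 15);  // pixel (23,15) is the same unit -> d == 2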
138 #define SET_CB_BLK(x,y,log2BlkWidth, Field,value) \
139 int cbX = x >> cb_info.log2unitSize; \
140 int cbY = y >> cb_info.log2unitSize; \
141 int width = 1 << (log2BlkWidth - cb_info.log2unitSize); \
142 for (int cby=cbY;cby<cbY+width;cby++) \
143 for (int cbx=cbX;cbx<cbX+width;cbx++) \
144 { \
145 cb_info[ cbx + cby*cb_info.width_in_units ].Field = value; \
146 }
147
73148
74149 typedef struct {
75150 uint16_t SliceAddrRS;
76151 uint16_t SliceHeaderIndex; // index into array to slice header for this CTB
77152
78153 sao_info saoInfo;
79
80 uint16_t thread_context_id; // which thread-context is used to decode this CTB
154 bool deblock; // this CTB has to be deblocked
155 bool has_pcm; // pcm is used in this CTB
156 bool has_cu_transquant_bypass; // transquant_bypass is used in this CTB
81157 } CTB_info;
82158
83159
84160 typedef struct {
85161 uint8_t log2CbSize : 3; // [0;6] (1<<log2CbSize) = 64
86 uint8_t cu_skip_flag : 1; // only for decoding of current image
87 uint8_t ctDepth : 2; // [0:3]? (0:64, 1:32, 2:16, 3:8)
88 uint8_t PredMode : 2; // (enum PredMode) [0;2] must be safed for past images
89162 uint8_t PartMode : 3; // (enum PartMode) [0;7] set only in top-left of CB
90163 // TODO: could be removed if prediction-block-boundaries would be
91164 // set during decoding
165 uint8_t ctDepth : 2; // [0:3]? (0:64, 1:32, 2:16, 3:8)
166 uint8_t PredMode : 2; // (enum PredMode) [0;2] must be saved for past images
92167 uint8_t pcm_flag : 1; //
93168 uint8_t cu_transquant_bypass : 1;
94169
103178 } PB_ref_info;
104179
105180
106 /*
107 typedef struct {
108 //uint16_t cbf_cb; // bitfield (1<<depth)
109 //uint16_t cbf_cr; // bitfield (1<<depth)
110 //uint16_t cbf_luma; // bitfield (1<<depth)
111
112 //uint8_t IntraPredMode; // NOTE: can be thread-local // (enum IntraPredMode)
113 //uint8_t IntraPredModeC; // NOTE: can be thread-local // (enum IntraPredMode)
114
115 //uint8_t split_transform_flag; // NOTE: can be local if deblocking flags set during decoding
116 //uint8_t transform_skip_flag; // NOTE: can be in local context // read bit (1<<cIdx)
117 //uint8_t flags; // NOTE: can be removed if deblocking flags set during decoding (nonzero coefficients)
118 } TU_log_info;
119 */
120
121
122 typedef struct de265_image {
123 uint8_t* y; // pointer to pixel at (0,0), which is inside the optional image borders
124 uint8_t* cb;
125 uint8_t* cr;
126
127 uint8_t* y_mem; // usually, you don't use these, but the pointers above
128 uint8_t* cb_mem;
129 uint8_t* cr_mem;
181
182 struct de265_image {
183 de265_image();
184 ~de265_image();
185
186
187 de265_error alloc_image(int w,int h, enum de265_chroma c, const seq_parameter_set* sps,
188 bool allocMetadata, decoder_context* ctx);
189
190 bool is_allocated() const { return pixels[0] != NULL; }
191
192 void release();
193
194 void fill_image(int y,int u,int v);
195 de265_error copy_image(const de265_image* src);
196 void copy_lines_from(const de265_image* src, int first, int end);
197 void exchange_pixel_data_with(de265_image&);
198
199 uint32_t get_ID() const { return ID; }
200
201
202 /* */ uint8_t* get_image_plane(int cIdx) { return pixels[cIdx]; }
203 const uint8_t* get_image_plane(int cIdx) const { return pixels[cIdx]; }
204
205 void set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata);
206
207 uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos)
208 {
209 int stride = get_image_stride(cIdx);
210 return pixels[cIdx] + xpos + ypos*stride;
211 }
212
213 const uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) const
214 {
215 int stride = get_image_stride(cIdx);
216 return pixels[cIdx] + xpos + ypos*stride;
217 }
218
219 int get_image_stride(int cIdx) const
220 {
221 if (cIdx==0) return stride;
222 else return chroma_stride;
223 }
224
225 int get_luma_stride() const { return stride; }
226 int get_chroma_stride() const { return chroma_stride; }
227
228 int get_width (int cIdx=0) const { return cIdx==0 ? width : chroma_width; }
229 int get_height(int cIdx=0) const { return cIdx==0 ? height : chroma_height; }
230
231 enum de265_chroma get_chroma_format() const { return chroma_format; }
232
233
234 bool can_be_released() const { return PicOutputFlag==false && PicState==UnusedForReference; }
235
236
237 void add_slice_segment_header(slice_segment_header* shdr) {
238 shdr->slice_index = slices.size();
239 slices.push_back(shdr);
240 }
241
242
243 bool available_zscan(int xCurr,int yCurr, int xN,int yN) const;
244
245 bool available_pred_blk(int xC,int yC, int nCbS,
246 int xP, int yP, int nPbW, int nPbH, int partIdx,
247 int xN,int yN) const;
248
249
250 static de265_image_allocation default_image_allocation;
251
252 private:
253 uint32_t ID;
254 static uint32_t s_next_image_ID;
255
256 uint8_t* pixels[3];
130257
131258 enum de265_chroma chroma_format;
132259
133260 int width, height; // size in luma pixels
261
134262 int chroma_width, chroma_height;
135263 int stride, chroma_stride;
136264
137 int border;
138
265 public:
266 std::vector<slice_segment_header*> slices;
267
268 public:
139269
140270 // --- conformance cropping window ---
141271
142 uint8_t* y_confwin;
143 uint8_t* cb_confwin;
144 uint8_t* cr_confwin;
272 uint8_t* pixels_confwin[3]; // pointer to pixels in the conformance window
145273
146274 int width_confwin, height_confwin;
147275 int chroma_width_confwin, chroma_height_confwin;
148276
149
150277 // --- decoding info ---
151278
152279 // If PicOutputFlag==false && PicState==UnusedForReference, image buffer is free.
153280
154281 int picture_order_cnt_lsb;
155282 int PicOrderCntVal;
283 enum PictureState PicState;
156284 bool PicOutputFlag;
157 enum PictureState PicState;
158
159
160 seq_parameter_set* sps; // the SPS used for decoding this image
161 pic_parameter_set* pps; // the PPS used for decoding this image
162
163
164 CTB_info* ctb_info; // in raster scan
165 int ctb_info_size;
166
167 CB_ref_info* cb_info;
168 int cb_info_size;
169
170 PB_ref_info* pb_info;
171 int pb_info_size;
172 int pb_info_stride;
173
174 uint8_t* intraPredMode; // sps->PicWidthInMinPUs * sps->PicHeightInMinPUs
175 int intraPredModeSize;
176
177 uint8_t* tu_info;
178 int tu_info_size;
179
180 uint8_t* deblk_info;
181 int deblk_info_size;
182 int deblk_width;
183 int deblk_height;
184
185 // TODO CHECK: should this move to slice header? Can this be different for each slice in the image?
186
285
286 int32_t removed_at_picture_id;
287
288 video_parameter_set vps;
289 seq_parameter_set sps; // the SPS used for decoding this image
290 pic_parameter_set pps; // the PPS used for decoding this image
291 decoder_context* decctx;
292
293 private:
294 MetaDataArray<CTB_info> ctb_info;
295 MetaDataArray<CB_ref_info> cb_info;
296 MetaDataArray<PB_ref_info> pb_info;
297 MetaDataArray<uint8_t> intraPredMode;
298 MetaDataArray<uint8_t> tu_info;
299 MetaDataArray<uint8_t> deblk_info;
300
301 public:
187302 // --- meta information ---
188303
189304 de265_PTS pts;
190305 void* user_data;
306 void* plane_user_data[3]; // this is logically attached to the pixel data pointers
191307
192308 uint8_t integrity; /* Whether an error occurred while the image was decoded.
193309 When generated, this is initialized to INTEGRITY_CORRECT,
194310 and changed on decoding errors.
195311 */
196 uint8_t sei_hash_check_result;
312 bool sei_hash_check_result;
313
314 nal_header nal_hdr;
197315
198316 // --- multi core ---
199317
200318 de265_progress_lock* ctb_progress; // ctb_info_size
201319
202 ALIGNED_8(de265_sync_int tasks_pending); // number of tasks pending to complete decoding
320
321 void thread_start(int nThreads);
322 void thread_run();
323 void thread_blocks();
324 void thread_unblocks();
325 void thread_finishes(); /* NOTE: you should not access any data in the thread_task after
326 calling this, as this function may unlock other threads that
327 will push this image to the output queue and free all decoder data. */
328
329 void wait_for_progress(thread_task* task, int ctbx,int ctby, int progress);
330 void wait_for_progress(thread_task* task, int ctbAddrRS, int progress);
331
332 void wait_for_completion(); // block until image is decoded by background threads
333 bool debug_is_completed() const;
334 int num_threads_active() const { return nThreadsRunning + nThreadsBlocked; } // for debug only
335
336 //private:
337 int nThreadsQueued;
338 int nThreadsRunning;
339 int nThreadsBlocked;
340 int nThreadsFinished;
341 int nThreadsTotal;
342
343 // ALIGNED_8(de265_sync_int tasks_pending); // number of tasks pending to complete decoding
203344 de265_mutex mutex;
204345 de265_cond finished_cond;
205346
206 } de265_image;
207
208
209 void de265_init_image (de265_image* img); // (optional) init variables, do not alloc image
210 de265_error de265_alloc_image(de265_image* img, int w,int h, enum de265_chroma c,
211 const seq_parameter_set* sps);
212 void de265_free_image (de265_image* img);
213
214 void de265_fill_image(de265_image* img, int y,int u,int v);
215 void de265_copy_image(de265_image* dest, const de265_image* src);
216
217 void get_image_plane(const de265_image*, int cIdx, uint8_t** image, int* stride);
218 void set_conformance_window(de265_image* img, int left,int right,int top,int bottom);
219
220
221 void increase_pending_tasks(de265_image* img, int n);
222 void decrease_pending_tasks(de265_image* img, int n);
223 void wait_for_completion(de265_image* img); // block until image is decoded by background threads
224
225
226 void prepare_image_for_decoding(de265_image*);
227
228 void set_cu_skip_flag(const seq_parameter_set* sps, de265_image* img,
229 int x,int y, int log2BlkWidth, uint8_t flag);
230 uint8_t get_cu_skip_flag(const seq_parameter_set* sps, const de265_image* img, int x,int y);
231
232 void set_pred_mode(de265_image* img, const seq_parameter_set* sps,
233 int x,int y, int log2BlkWidth, enum PredMode mode);
234 enum PredMode get_pred_mode(const de265_image* img, const seq_parameter_set* sps, int x,int y);
235
236 void set_pcm_flag(de265_image* img, const seq_parameter_set* sps,
237 int x,int y, int log2BlkWidth);
238 int get_pcm_flag(const de265_image* img, const seq_parameter_set* sps, int x,int y);
239
240
241 void set_cu_transquant_bypass(const de265_image* img, const seq_parameter_set* sps,
242 int x,int y, int log2BlkWidth);
243 int get_cu_transquant_bypass(const de265_image* img, const seq_parameter_set* sps, int x,int y);
244
245
246 void set_log2CbSize(de265_image* img, const seq_parameter_set* sps, int x0, int y0, int log2CbSize);
247 int get_log2CbSize(const de265_image* img, const seq_parameter_set* sps, int x0, int y0);
248 int get_log2CbSize_cbUnits(de265_image* img, const seq_parameter_set* sps, int xCb, int yCb);
249
250
251 void set_PartMode( de265_image*, const seq_parameter_set*, int x,int y, enum PartMode);
252 enum PartMode get_PartMode(const de265_image*, const seq_parameter_set*, int x,int y);
253
254
255 void set_ctDepth(de265_image*, const seq_parameter_set*, int x,int y, int log2BlkWidth, int depth);
256 int get_ctDepth(const de265_image*, const seq_parameter_set*, int x,int y);
257
258 void set_QPY(de265_image*, const seq_parameter_set*,
259 const pic_parameter_set* pps, int x,int y, int log2BlkWidth, int QP_Y);
260 int get_QPY(const de265_image*, const seq_parameter_set*,int x0,int y0);
261
262 void set_split_transform_flag(de265_image* img,const seq_parameter_set* sps,
263 int x0,int y0,int trafoDepth);
264 int get_split_transform_flag(const de265_image* img, const seq_parameter_set* sps,
265 int x0,int y0,int trafoDepth);
266
267 void set_nonzero_coefficient(de265_image* img,const seq_parameter_set* sps,
268 int x,int y, int log2TrafoSize);
269
270 int get_nonzero_coefficient(const de265_image* img,const seq_parameter_set* sps,
271 int x,int y);
272
273 enum IntraPredMode get_IntraPredMode(const de265_image* img, const seq_parameter_set* sps, int x,int y);
274
275
276 void set_deblk_flags(de265_image* img, int x0,int y0, uint8_t flags);
277 uint8_t get_deblk_flags(const de265_image* img, int x0,int y0);
278
279 void set_deblk_bS(de265_image* img, int x0,int y0, uint8_t bS);
280 uint8_t get_deblk_bS(const de265_image* img, int x0,int y0);
281
282
283 // address of first CTB in slice
284 void set_SliceAddrRS(de265_image* img, const seq_parameter_set* sps,
285 int ctbX, int ctbY, int SliceAddrRS);
286 int get_SliceAddrRS(const de265_image* img, const seq_parameter_set* sps, int ctbX, int ctbY);
287 int get_SliceAddrRS_atCtbRS(const de265_image* img, const seq_parameter_set* sps, int ctbRS);
288
289
290 void set_SliceHeaderIndex(de265_image* img, const seq_parameter_set* sps,
291 int x, int y, int SliceHeaderIndex);
292 int get_SliceHeaderIndex(const de265_image* img, const seq_parameter_set* sps, int x, int y);
293
294 void set_sao_info(de265_image* img,const seq_parameter_set* sps,
295 int ctbX,int ctbY,const sao_info* saoinfo);
296 const sao_info* get_sao_info(const de265_image* img,const seq_parameter_set* sps, int ctbX,int ctbY);
297
347 public:
348
349 /* Clear all CTB/CB/PB decoding data of this image.
350 All CTB's processing states are set to 'unprocessed'.
351 */
352 void clear_metadata();
353
354
355 // --- CB metadata access ---
356
357 void set_pred_mode(int x,int y, int log2BlkWidth, enum PredMode mode)
358 {
359 SET_CB_BLK(x,y,log2BlkWidth, PredMode, mode);
360 }
361
362 void fill_pred_mode(enum PredMode mode)
363 {
364 for (int i=0;i<cb_info.data_size;i++)
365 { cb_info[i].PredMode = mode; }
366 }
367
368 enum PredMode get_pred_mode(int x,int y) const
369 {
370 return (enum PredMode)cb_info.get(x,y).PredMode;
371 }
372
373 uint8_t get_cu_skip_flag(int x,int y) const
374 {
375 return get_pred_mode(x,y)==MODE_SKIP;
376 }
377
378 void set_pcm_flag(int x,int y, int log2BlkWidth)
379 {
380 SET_CB_BLK(x,y,log2BlkWidth, pcm_flag, 1);
381 ctb_info.get(x,y).has_pcm = true;
382 }
383
384 int get_pcm_flag(int x,int y) const
385 {
386 return cb_info.get(x,y).pcm_flag;
387 }
388
389 void set_cu_transquant_bypass(int x,int y, int log2BlkWidth)
390 {
391 SET_CB_BLK(x,y,log2BlkWidth, cu_transquant_bypass, 1);
392 ctb_info.get(x,y).has_cu_transquant_bypass = true;
393 }
394
395 int get_cu_transquant_bypass(int x,int y) const
396 {
397 return cb_info.get(x,y).cu_transquant_bypass;
398 }
399
400 void set_log2CbSize(int x0, int y0, int log2CbSize)
401 {
402 cb_info.get(x0,y0).log2CbSize = log2CbSize;
403
404 // assume that remaining cb_info blocks are initialized to zero
405 }
406
407 int get_log2CbSize(int x0, int y0) const
408 {
409 return cb_info.get(x0,y0).log2CbSize;
410 }
411
412 // coordinates in CB units
413 int get_log2CbSize_cbUnits(int xCb, int yCb) const
414 {
415 return cb_info[ xCb + yCb*cb_info.width_in_units ].log2CbSize;
416 }
417
418 void set_PartMode(int x,int y, enum PartMode mode)
419 {
420 cb_info.get(x,y).PartMode = mode;
421 }
422
423 enum PartMode get_PartMode(int x,int y) const
424 {
425 return (enum PartMode)cb_info.get(x,y).PartMode;
426 }
427
428 void set_ctDepth(int x,int y, int log2BlkWidth, int depth)
429 {
430 SET_CB_BLK(x,y,log2BlkWidth, ctDepth, depth);
431 }
432
433 int get_ctDepth(int x,int y) const
434 {
435 return cb_info.get(x,y).ctDepth;
436 }
437
438 void set_QPY(int x,int y, int log2BlkWidth, int QP_Y)
439 {
440 SET_CB_BLK (x, y, log2BlkWidth, QP_Y, QP_Y);
441 }
442
443 int get_QPY(int x0,int y0) const
444 {
445 return cb_info.get(x0,y0).QP_Y;
446 }
447
448 // --- TU metadata access ---
449
450 void set_split_transform_flag(int x0,int y0,int trafoDepth)
451 {
452 tu_info.get(x0,y0) |= (1<<trafoDepth);
453 }
454
455 int get_split_transform_flag(int x0,int y0,int trafoDepth) const
456 {
457 return (tu_info.get(x0,y0) & (1<<trafoDepth));
458 }
459
460 void set_nonzero_coefficient(int x,int y, int log2TrafoSize)
461 {
462 const int tuX = x >> tu_info.log2unitSize;
463 const int tuY = y >> tu_info.log2unitSize;
464 const int width = 1 << (log2TrafoSize - tu_info.log2unitSize);
465
466 for (int tuy=tuY;tuy<tuY+width;tuy++)
467 for (int tux=tuX;tux<tuX+width;tux++)
468 {
469 tu_info[ tux + tuy*tu_info.width_in_units ] |= TU_FLAG_NONZERO_COEFF;
470 }
471 }
472
473 int get_nonzero_coefficient(int x,int y) const
474 {
475 return tu_info.get(x,y) & TU_FLAG_NONZERO_COEFF;
476 }
477
478
479 // --- intraPredMode metadata access ---
480
481 enum IntraPredMode get_IntraPredMode(int x,int y) const
482 {
483 return (enum IntraPredMode)intraPredMode.get(x,y);
484 }
485
486 enum IntraPredMode get_IntraPredMode_atIndex(int idx) const
487 {
488 return (enum IntraPredMode)intraPredMode[idx];
489 }
490
491 void set_IntraPredMode(int PUidx,int log2blkSize, enum IntraPredMode mode)
492 {
493 int pbSize = 1<<(log2blkSize - intraPredMode.log2unitSize);
494
495 for (int y=0;y<pbSize;y++)
496 for (int x=0;x<pbSize;x++)
497 intraPredMode[PUidx + x + y*intraPredMode.width_in_units] = mode;
498 }
499
500
501 // --- CTB metadata access ---
502
503 // address of first CTB in slice
504 void set_SliceAddrRS(int ctbX, int ctbY, int SliceAddrRS)
505 {
506 int idx = ctbX + ctbY*ctb_info.width_in_units;
507 ctb_info[idx].SliceAddrRS = SliceAddrRS;
508 }
509
510 int get_SliceAddrRS(int ctbX, int ctbY) const
511 {
512 return ctb_info[ctbX + ctbY*ctb_info.width_in_units].SliceAddrRS;
513 }
514
515 int get_SliceAddrRS_atCtbRS(int ctbRS) const
516 {
517 return ctb_info[ctbRS].SliceAddrRS;
518 }
519
520
521 void set_SliceHeaderIndex(int x, int y, int SliceHeaderIndex)
522 {
523 ctb_info.get(x,y).SliceHeaderIndex = SliceHeaderIndex;
524 }
525
526 int get_SliceHeaderIndex(int x, int y) const
527 {
528 return ctb_info.get(x,y).SliceHeaderIndex;
529 }
530
531 int get_SliceHeaderIndexCtb(int ctbX, int ctbY) const
532 {
533 return ctb_info[ctbX + ctbY*ctb_info.width_in_units].SliceHeaderIndex;
534 }
535
536 int get_SliceHeaderIndex_atIndex(int ctb) const
537 {
538 return ctb_info[ctb].SliceHeaderIndex;
539 }
540
541 slice_segment_header* get_SliceHeader(int x, int y)
542 {
543 return slices[ get_SliceHeaderIndex(x,y) ];
544 }
545
546 slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY)
547 {
548 return slices[ get_SliceHeaderIndexCtb(ctbX,ctbY) ];
549 }
550
551 const slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) const
552 {
553 return slices[ get_SliceHeaderIndexCtb(ctbX,ctbY) ];
554 }
555
556 void set_sao_info(int ctbX,int ctbY,const sao_info* saoinfo)
557 {
558 sao_info* sao = &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo;
559
560 memcpy(sao,
561 saoinfo,
562 sizeof(sao_info));
563 }
564
565 const sao_info* get_sao_info(int ctbX,int ctbY) const
566 {
567 return &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo;
568 }
569
570
571 void set_CtbDeblockFlag(int ctbX, int ctbY, bool flag)
572 {
573 int idx = ctbX + ctbY*ctb_info.width_in_units;
574 ctb_info[idx].deblock = flag;
575 }
576
577 bool get_CtbDeblockFlag(int ctbX, int ctbY) const
578 {
579 return ctb_info[ctbX + ctbY*ctb_info.width_in_units].deblock;
580 }
581
582
583 bool get_CTB_has_pcm(int ctbX,int ctbY) const
584 {
585 int idx = ctbX + ctbY*ctb_info.width_in_units;
586 return ctb_info[idx].has_pcm;
587 }
588
589 bool get_CTB_has_cu_transquant_bypass(int ctbX,int ctbY) const
590 {
591 int idx = ctbX + ctbY*ctb_info.width_in_units;
592 return ctb_info[idx].has_cu_transquant_bypass;
593 }
594
595
596
597 // --- DEBLK metadata access ---
598
599 int get_deblk_width() const { return deblk_info.width_in_units; }
600 int get_deblk_height() const { return deblk_info.height_in_units; }
601
602 void set_deblk_flags(int x0,int y0, uint8_t flags)
603 {
604 const int xd = x0/4;
605 const int yd = y0/4;
606
607 if (xd<deblk_info.width_in_units &&
608 yd<deblk_info.height_in_units) {
609 deblk_info[xd + yd*deblk_info.width_in_units] |= flags;
610 }
611 }
612
613 uint8_t get_deblk_flags(int x0,int y0) const
614 {
615 const int xd = x0/4;
616 const int yd = y0/4;
617
618 return deblk_info[xd + yd*deblk_info.width_in_units];
619 }
620
621 void set_deblk_bS(int x0,int y0, uint8_t bS)
622 {
623 uint8_t* data = &deblk_info[x0/4 + y0/4*deblk_info.width_in_units];
624 *data &= ~DEBLOCK_BS_MASK;
625 *data |= bS;
626 }
627
628 uint8_t get_deblk_bS(int x0,int y0) const
629 {
630 return deblk_info[x0/4 + y0/4*deblk_info.width_in_units] & DEBLOCK_BS_MASK;
631 }
632
633
634 // --- PB metadata access ---
635
636 const PredVectorInfo* get_mv_info(int x,int y) const
637 {
638 return &pb_info.get(x,y).mvi;
639 }
640
641 void set_mv_info(int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv);
298642
299643 // --- value logging ---
300644
645 };
646
647
301648 #endif
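A brief usage sketch follows (not part of the diff; it assumes the accessors above are members of the de265_image class that the rewritten intrapred.c further below operates on):

static void store_cb_metadata_example(de265_image* img,
                                      int x0, int y0, int log2CbSize,
                                      int qp, int depth)
{
  img->set_QPY(x0, y0, log2CbSize, qp);        // store QP for the whole coding block
  img->set_ctDepth(x0, y0, log2CbSize, depth); // store its coding-tree depth likewise
  int readback = img->get_QPY(x0, y0);         // reads take sample coordinates
  (void)readback;
}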
+0
-597
libde265/intrapred.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "intrapred.h"
21 #include "transform.h"
22 #include "util.h"
23 #include <assert.h>
24
25
26 #include <sys/types.h>
27 #include <string.h>
28
29
30 int nIntraPredictions;
31 int nAvail0;
32 int nAvailPart;
33 int nAvailAll;
34 int nAvailSz[32*2+32*2+1+1];
35
36 LIBDE265_API void showIntraPredictionProfile()
37 {
38 printf("nIntraPredictions: %d\n", nIntraPredictions);
39 printf(" with no available border samples: %d\n", nAvail0);
40 printf(" with partially available samples: %d\n", nAvailPart);
41 printf(" with complete border samples: %d\n", nAvailAll);
42
43 if (0) {
44 printf(" ");
45 for (int i=0;i<32*2+32*2+1+1;i++)
46 printf("%d ",nAvailSz[i]);
47 printf("\n");
48 }
49 }
50
51
52 void print_border(uint8_t* data, uint8_t* available, int nT)
53 {
54 for (int i=-2*nT ; i<=2*nT ; i++) {
55 if (i==0 || i==1 || i==-nT || i==nT+1) {
56 logtrace(LogIntraPred,"|");
57 } else {
58 logtrace(LogIntraPred," ");
59 }
60
61 if (available==NULL || available[i]) {
62 logtrace(LogIntraPred,"%02x",data[i]);
63 }
64 else {
65 logtrace(LogIntraPred,"--");
66 }
67 }
68 }
69
70
71 // (8.4.4.2.2)
72 void fill_border_samples(decoder_context* ctx, int xB,int yB,
73 int nT, int cIdx,
74 uint8_t* out_border)
75 {
76 seq_parameter_set* sps = ctx->current_sps;
77
78 uint8_t available_data[2*64 + 1];
79 uint8_t* available = &available_data[64];
80
81 uint8_t* image;
82 int stride;
83 get_image_plane(ctx->img, cIdx, &image, &stride);
84
85 const int chromaShift = (cIdx==0) ? 0 : 1;
86 const int TUShift = (cIdx==0) ? sps->Log2MinTrafoSize : sps->Log2MinTrafoSize-1;
87
88
89 // --- check for CTB boundaries ---
90
91 int xBLuma = (cIdx==0) ? xB : 2*xB;
92 int yBLuma = (cIdx==0) ? yB : 2*yB;
93 int nTLuma = (cIdx==0) ? nT : 2*nT;
94
95 int log2CtbSize = sps->Log2CtbSizeY;
96 int picWidthInCtbs = ctx->current_sps->PicWidthInCtbsY;
97 const pic_parameter_set* pps = ctx->current_pps;
98
99 bool availableLeft=true; // is CTB at left side available?
100 bool availableTop=true; // is CTB at top side available?
101 bool availableTopRight=true; // is CTB at top-right side available?
102 bool availableTopLeft=true; // is CTB at top-left pixel available?
103
104
105 // are we at left image border
106
107 if (xBLuma == 0) {
108 availableLeft = false;
109 availableTopLeft = false;
110 xBLuma = 0; // fake value, available flags are already set to false
111 }
112
113
114 // are we at top image border
115
116 if (yBLuma == 0) {
117 availableTop = false;
118 availableTopLeft = false;
119 availableTopRight = false;
120 yBLuma = 0; // fake value, available flags are already set to false
121 }
122
123 if (xBLuma+nTLuma >= sps->pic_width_in_luma_samples) {
124 availableTopRight=false;
125 }
126
127 // check for tile and slice boundaries
128
129 int xCurrCtb = xBLuma >> log2CtbSize;
130 int yCurrCtb = yBLuma >> log2CtbSize;
131 int xLeftCtb = (xBLuma-1) >> log2CtbSize;
132 int xRightCtb = (xBLuma+nTLuma) >> log2CtbSize;
133 int yTopCtb = (yBLuma-1) >> log2CtbSize;
134
135 int currCTBSlice = get_SliceAddrRS(ctx->img,sps, xCurrCtb,yCurrCtb);
136 int leftCTBSlice = availableLeft ? get_SliceAddrRS(ctx->img,sps, xLeftCtb, yCurrCtb) : -1;
137 int topCTBSlice = availableTop ? get_SliceAddrRS(ctx->img,sps, xCurrCtb, yTopCtb) : -1;
138 int toprightCTBSlice = availableTopRight ? get_SliceAddrRS(ctx->img,sps, xRightCtb, yTopCtb) : -1;
139 int topleftCTBSlice = availableTopLeft ? get_SliceAddrRS(ctx->img,sps, xLeftCtb, yTopCtb) : -1;
140
141 int currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs];
142 int leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : -1;
143 int topCTBTileID = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : -1;
144 int topleftCTBTileID = availableTopLeft ? pps->TileIdRS[xLeftCtb+yTopCtb*picWidthInCtbs] : -1;
145 int toprightCTBTileID= availableTopRight? pps->TileIdRS[xRightCtb+yTopCtb*picWidthInCtbs] : -1;
146
147 if (leftCTBSlice != currCTBSlice || leftCTBTileID != currCTBTileID ) availableLeft = false;
148 if (topCTBSlice != currCTBSlice || topCTBTileID != currCTBTileID ) availableTop = false;
149 if (topleftCTBSlice !=currCTBSlice||topleftCTBTileID!=currCTBTileID ) availableTopLeft = false;
150 if (toprightCTBSlice!=currCTBSlice||toprightCTBTileID!=currCTBTileID) availableTopRight= false;
151
152 int currBlockAddr = pps->MinTbAddrZS[ (xBLuma>>sps->Log2MinTrafoSize) +
153 (yBLuma>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ];
154
155
156 // number of pixels that are in the valid image area to the right and to the bottom
157
158 int nBottom = sps->pic_height_in_luma_samples - (cIdx==0 ? yB : 2*yB);
159 if (cIdx) nBottom=(nBottom+1)/2;
160 if (nBottom>2*nT) nBottom=2*nT;
161 int nRight = sps->pic_width_in_luma_samples - (cIdx==0 ? xB : 2*xB);
162 if (cIdx) nRight =(nRight +1)/2;
163 if (nRight >2*nT) nRight=2*nT;
164
165 int nAvail=0;
166
167 uint8_t firstValue;
168
169 memset(available-2*nT, 0, 4*nT+1);
170
171 {
172 // copy pixels at left column
173
174 for (int y=nBottom-1 ; y>=0 ; y-=4)
175 if (availableLeft)
176 {
177 int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
178 ((yB+y)>>TUShift) * sps->PicWidthInTbsY ];
179
180 bool availableN = NBlockAddr < currBlockAddr;
181
182 if (ctx->current_pps->constrained_intra_pred_flag) {
183 if (get_pred_mode(ctx->img,sps,(xB-1)<<chromaShift,(yB+y)<<chromaShift)!=MODE_INTRA)
184 availableN = false;
185 }
186
187 if (availableN) {
188 if (!nAvail) firstValue = image[xB-1 + (yB+y)*stride];
189
190 for (int i=0;i<4;i++) {
191 available[-y+i-1] = availableN;
192 out_border[-y+i-1] = image[xB-1 + (yB+y-i)*stride];
193 }
194
195 nAvail+=4;
196 }
197 }
198
199 // copy pixel at top-left position
200
201 if (availableTopLeft)
202 {
203 int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
204 ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
205
206 bool availableN = NBlockAddr < currBlockAddr;
207
208 if (ctx->current_pps->constrained_intra_pred_flag) {
209 if (get_pred_mode(ctx->img,sps,(xB-1)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
210 availableN = false;
211 }
212 }
213
214 if (availableN) {
215 if (!nAvail) firstValue = image[xB-1 + (yB-1)*stride];
216
217 out_border[0] = image[xB-1 + (yB-1)*stride];
218 available[0] = availableN;
219 nAvail++;
220 }
221 }
222
223 // copy pixels at top row
224
225 for (int x=0 ; x<nRight ; x+=4) {
226 bool borderAvailable;
227 if (x<nT) borderAvailable=availableTop;
228 else borderAvailable=availableTopRight;
229
230 if (borderAvailable)
231 {
232 int NBlockAddr = pps->MinTbAddrZS[ ((xB+x)>>TUShift) +
233 ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
234
235 bool availableN = NBlockAddr < currBlockAddr;
236
237 if (ctx->current_pps->constrained_intra_pred_flag) {
238 if (get_pred_mode(ctx->img,sps,(xB+x)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
239 availableN = false;
240 }
241 }
242
243
244 if (availableN) {
245 if (!nAvail) firstValue = image[xB+x + (yB-1)*stride];
246
247 for (int i=0;i<4;i++) {
248 out_border[x+i+1] = image[xB+x+i + (yB-1)*stride];
249 available[x+i+1] = availableN;
250 }
251
252 nAvail+=4;
253 }
254 }
255 }
256
257
258 // reference sample substitution
259
260 if (nAvail!=4*nT+1) {
261 if (nAvail==0) {
262 memset(out_border-2*nT, 1<<(sps->bit_depth_luma-1), 4*nT+1);
263 }
264 else {
265 if (!available[-2*nT]) {
266 out_border[-2*nT] = firstValue;
267 }
268
269 for (int i=-2*nT+1; i<=2*nT; i++)
270 if (!available[i]) {
271 out_border[i]=out_border[i-1];
272 }
273 }
274 }
275
276 logtrace(LogIntraPred,"availableN: ");
277 print_border(available,NULL,nT);
278 logtrace(LogIntraPred,"\n");
279
280 logtrace(LogIntraPred,"output: ");
281 print_border(out_border,NULL,nT);
282 logtrace(LogIntraPred,"\n");
283 }
284 }
285
286
287 // (8.4.4.2.3)
288 void intra_prediction_sample_filtering(decoder_context* ctx,
289 uint8_t* p,
290 int nT,
291 enum IntraPredMode intraPredMode)
292 {
293 int filterFlag;
294
295 if (intraPredMode==INTRA_DC || nT==4) {
296 filterFlag = 0;
297 } else {
298 // int-cast below prevents a typing problem that leads to wrong results when abs_value is a macro
299 int minDistVerHor = libde265_min( abs_value((int)intraPredMode-26),
300 abs_value((int)intraPredMode-10) );
301 switch (nT) {
302 case 8: filterFlag = (minDistVerHor>7) ? 1 : 0; break;
303 case 16: filterFlag = (minDistVerHor>1) ? 1 : 0; break;
304 case 32: filterFlag = (minDistVerHor>0) ? 1 : 0; break;
305 default: filterFlag = -1; assert(false); break; // should never happen
306 }
307 }
308
309
310 if (filterFlag) {
311 int biIntFlag = (ctx->current_sps->strong_intra_smoothing_enable_flag &&
312 nT==32 &&
313 abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(ctx->current_sps->bit_depth_luma-5)) &&
314 abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(ctx->current_sps->bit_depth_luma-5)))
315 ? 1 : 0;
316
317 uint8_t pF_mem[2*64+1];
318 uint8_t* pF = &pF_mem[64];
319
320 if (biIntFlag) {
321 pF[-2*nT] = p[-2*nT];
322 pF[ 2*nT] = p[ 2*nT];
323 pF[ 0] = p[ 0];
324
325 for (int i=1;i<=63;i++) {
326 pF[-i] = p[0] + ((i*(p[-64]-p[0])+32)>>6);
327 pF[ i] = p[0] + ((i*(p[ 64]-p[0])+32)>>6);
328 }
329 } else {
330 pF[-2*nT] = p[-2*nT];
331 pF[ 2*nT] = p[ 2*nT];
332
333 for (int i=-(2*nT-1) ; i<=2*nT-1 ; i++)
334 {
335 pF[i] = (p[i+1] + 2*p[i] + p[i-1] + 2) >> 2;
336 }
337 }
338
339
340 // copy back to original array
341
342 memcpy(p-2*nT, pF-2*nT, 4*nT+1);
343 }
344 else {
345 // do nothing ?
346 }
347
348
349 logtrace(LogIntraPred,"post filtering: ");
350 print_border(p,NULL,nT);
351 logtrace(LogIntraPred,"\n");
352 }
353
354
355 const int intraPredAngle_table[1+34] =
356 { 0, 0,32,26,21,17,13, 9, 5, 2, 0,-2,-5,-9,-13,-17,-21,-26,
357 -32,-26,-21,-17,-13,-9,-5,-2,0,2,5,9,13,17,21,26,32 };
358
359 static const int invAngle_table[25-10] =
360 { -4096,-1638,-910,-630,-482,-390,-315,-256,
361 -315,-390,-482,-630,-910,-1638,-4096 };
362
363
364 // TODO: clip to read BitDepthY
365 int Clip1Y(int x) { if (x<0) return 0; else if (x>255) return 255; else return x; }
366
367
368 // (8.4.4.2.6)
369 void intra_prediction_angular(decoder_context* ctx,
370 int xB0,int yB0,
371 enum IntraPredMode intraPredMode,
372 int nT,int cIdx,
373 uint8_t* border)
374 {
375 uint8_t ref_mem[2*64+1];
376 uint8_t* ref=&ref_mem[64];
377
378 uint8_t* pred;
379 int stride;
380 get_image_plane(ctx->img,cIdx,&pred,&stride);
381 pred += xB0 + yB0*stride;
382
383 int intraPredAngle = intraPredAngle_table[intraPredMode];
384
385 if (intraPredMode >= 18) {
386
387 for (int x=0;x<=nT;x++)
388 { ref[x] = border[x]; }
389
390 if (intraPredAngle<0) {
391 int invAngle = invAngle_table[intraPredMode-11];
392
393 if ((nT*intraPredAngle)>>5 < -1) {
394 for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) {
395 ref[x] = border[0-((x*invAngle+128)>>8)];
396 }
397 }
398 } else {
399 for (int x=nT+1; x<=2*nT;x++) {
400 ref[x] = border[x];
401 }
402 }
403
404 for (int y=0;y<nT;y++)
405 for (int x=0;x<nT;x++)
406 {
407 int iIdx = ((y+1)*intraPredAngle)>>5;
408 int iFact= ((y+1)*intraPredAngle)&31;
409
410 if (iFact != 0) {
411 pred[x+y*stride] = ((32-iFact)*ref[x+iIdx+1] + iFact*ref[x+iIdx+2] + 16)>>5;
412 } else {
413 pred[x+y*stride] = ref[x+iIdx+1];
414 }
415 }
416
417 if (intraPredMode==26 && cIdx==0 && nT<32) {
418 for (int y=0;y<nT;y++) {
419 pred[0+y*stride] = Clip1Y(border[1] + ((border[-1-y] - border[0])>>1));
420 }
421 }
422 }
423 else { // intraPredMode < 18
424
425 for (int x=0;x<=nT;x++)
426 { ref[x] = border[-x]; } // DIFF (neg)
427
428 if (intraPredAngle<0) {
429 int invAngle = invAngle_table[intraPredMode-11];
430
431 if ((nT*intraPredAngle)>>5 < -1) {
432 for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) {
433 ref[x] = border[((x*invAngle+128)>>8)]; // DIFF (neg)
434 }
435 }
436 } else {
437 for (int x=nT+1; x<=2*nT;x++) {
438 ref[x] = border[-x]; // DIFF (neg)
439 }
440 }
441
442 for (int y=0;y<nT;y++)
443 for (int x=0;x<nT;x++)
444 {
445 int iIdx = ((x+1)*intraPredAngle)>>5; // DIFF (x<->y)
446 int iFact= ((x+1)*intraPredAngle)&31; // DIFF (x<->y)
447
448 if (iFact != 0) {
449 pred[x+y*stride] = ((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5; // DIFF (x<->y)
450 } else {
451 pred[x+y*stride] = ref[y+iIdx+1]; // DIFF (x<->y)
452 }
453 }
454
455 if (intraPredMode==10 && cIdx==0 && nT<32) { // DIFF 26->10
456 for (int x=0;x<nT;x++) { // DIFF (x<->y)
457 pred[x] = Clip1Y(border[-1] + ((border[1+x] - border[0])>>1)); // DIFF (x<->y && neg)
458 }
459 }
460 }
461
462
463 logtrace(LogIntraPred,"result of angular intra prediction (mode=%d):\n",intraPredMode);
464
465 for (int y=0;y<nT;y++)
466 {
467 for (int x=0;x<nT;x++)
468 logtrace(LogIntraPred,"%02x ", pred[x+y*stride]);
469
470 logtrace(LogIntraPred,"\n");
471 }
472 }
473
474
475 void intra_prediction_planar(decoder_context* ctx,int xB0,int yB0,int nT,int cIdx,
476 uint8_t* border)
477 {
478 uint8_t* pred;
479 int stride;
480 get_image_plane(ctx->img,cIdx,&pred,&stride);
481 pred += xB0 + yB0*stride;
482
483 int Log2_nT = Log2(nT);
484
485 for (int y=0;y<nT;y++)
486 for (int x=0;x<nT;x++)
487 {
488 pred[x+y*stride] = ((nT-1-x)*border[-1-y] + (x+1)*border[ 1+nT] +
489 (nT-1-y)*border[ 1+x] + (y+1)*border[-1-nT] + nT) >> (Log2_nT+1);
490 }
491
492
493 logtrace(LogIntraPred,"result of planar prediction\n");
494
495 for (int y=0;y<nT;y++)
496 {
497 for (int x=0;x<nT;x++)
498 logtrace(LogIntraPred,"%02x ", pred[x+y*stride]);
499
500 logtrace(LogIntraPred,"\n");
501 }
502 }
503
504
505 void intra_prediction_DC(decoder_context* ctx,int xB0,int yB0,int nT,int cIdx,
506 uint8_t* border)
507 {
508 uint8_t* pred;
509 int stride;
510 get_image_plane(ctx->img,cIdx,&pred,&stride);
511 pred += xB0 + yB0*stride;
512
513 int Log2_nT = Log2(nT);
514
515 int dcVal = 0;
516 for (int i=0;i<nT;i++)
517 {
518 dcVal += border[ i+1];
519 dcVal += border[-i-1];
520 }
521
522 dcVal += nT;
523 dcVal >>= Log2_nT+1;
524
525 if (cIdx==0 && nT<32) {
526 pred[0] = (border[-1] + 2*dcVal + border[1] +2) >> 2;
527
528 for (int x=1;x<nT;x++) { pred[x] = (border[ x+1] + 3*dcVal+2)>>2; }
529 for (int y=1;y<nT;y++) { pred[y*stride] = (border[-y-1] + 3*dcVal+2)>>2; }
530 for (int y=1;y<nT;y++)
531 for (int x=1;x<nT;x++)
532 {
533 pred[x+y*stride] = dcVal;
534 }
535 } else {
536 for (int y=0;y<nT;y++)
537 for (int x=0;x<nT;x++)
538 {
539 pred[x+y*stride] = dcVal;
540 }
541 }
542
543
544 /*
545 printf("INTRAPRED DC\n");
546 for (int y=0;y<nT;y++) {
547 for (int x=0;x<nT;x++)
548 {
549 printf("%d ",pred[x+y*stride]);
550 }
551 printf("\n");
552 }
553 */
554 }
555
556
557
558 // (8.4.4.2.1)
559 void decode_intra_prediction(decoder_context* ctx,
560 int xB0,int yB0,
561 enum IntraPredMode intraPredMode,
562 int nT, int cIdx)
563 {
564 logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
565 xB0,yB0, intraPredMode, nT,cIdx);
566 /*
567 printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
568 xB0,yB0, intraPredMode, nT,cIdx);
569 */
570
571 nIntraPredictions++;
572
573 uint8_t border_pixels_mem[2*64+1];
574 uint8_t* border_pixels = &border_pixels_mem[64];
575
576 fill_border_samples(ctx, xB0,yB0, nT, cIdx, border_pixels);
577
578 if (cIdx==0) {
579 intra_prediction_sample_filtering(ctx, border_pixels, nT, intraPredMode);
580 }
581
582
583 switch (intraPredMode) {
584 case INTRA_PLANAR:
585 intra_prediction_planar(ctx,xB0,yB0,nT,cIdx, border_pixels);
586 break;
587 case INTRA_DC:
588 intra_prediction_DC(ctx,xB0,yB0,nT,cIdx, border_pixels);
589 break;
590 default:
591 intra_prediction_angular(ctx,xB0,yB0,intraPredMode,nT,cIdx, border_pixels);
592 break;
593 }
594 }
595
596
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "intrapred.h"
21 #include "transform.h"
22 #include "util.h"
23 #include <assert.h>
24
25
26 #include <sys/types.h>
27 #include <string.h>
28
29
30 #ifdef DE265_LOG_TRACE
31 void print_border(uint8_t* data, uint8_t* available, int nT)
32 {
33 for (int i=-2*nT ; i<=2*nT ; i++) {
34 if (i==0 || i==1 || i==-nT || i==nT+1) {
35 logtrace(LogIntraPred,"|");
36 } else {
37 logtrace(LogIntraPred," ");
38 }
39
40 if (available==NULL || available[i]) {
41 logtrace(LogIntraPred,"%02x",data[i]);
42 }
43 else {
44 logtrace(LogIntraPred,"--");
45 }
46 }
47 }
48 #else
49 #define print_border(data, available, nT)
50 #endif
51
52
53 // (8.4.4.2.2)
54 void fill_border_samples(de265_image* img, int xB,int yB,
55 int nT, int cIdx,
56 uint8_t* out_border)
57 {
58 const seq_parameter_set* sps = &img->sps;
59 const pic_parameter_set* pps = &img->pps;
60
61 uint8_t available_data[2*64 + 1];
62 uint8_t* available = &available_data[64];
63
64 uint8_t* image;
65 int stride;
66 image = img->get_image_plane(cIdx);
67 stride = img->get_image_stride(cIdx);
68
69 const int chromaShift = (cIdx==0) ? 0 : 1;
70 const int TUShift = (cIdx==0) ? sps->Log2MinTrafoSize : sps->Log2MinTrafoSize-1;
71
72
73 // --- check for CTB boundaries ---
74
75 int xBLuma = (cIdx==0) ? xB : 2*xB;
76 int yBLuma = (cIdx==0) ? yB : 2*yB;
77 int nTLuma = (cIdx==0) ? nT : 2*nT;
78
79 int log2CtbSize = sps->Log2CtbSizeY;
80 int picWidthInCtbs = sps->PicWidthInCtbsY;
81
82 bool availableLeft=true; // is CTB at left side available?
83 bool availableTop=true; // is CTB at top side available?
84 bool availableTopRight=true; // is CTB at top-right side available?
85 bool availableTopLeft=true; // is CTB at top-left pixel available?
86
87
88 // are we at left image border
89
90 if (xBLuma == 0) {
91 availableLeft = false;
92 availableTopLeft = false;
93 xBLuma = 0; // fake value, available flags are already set to false
94 }
95
96
97 // are we at top image border
98
99 if (yBLuma == 0) {
100 availableTop = false;
101 availableTopLeft = false;
102 availableTopRight = false;
103 yBLuma = 0; // fake value, available flags are already set to false
104 }
105
106 if (xBLuma+nTLuma >= sps->pic_width_in_luma_samples) {
107 availableTopRight=false;
108 }
109
110 // check for tile and slice boundaries
111
112 int xCurrCtb = xBLuma >> log2CtbSize;
113 int yCurrCtb = yBLuma >> log2CtbSize;
114 int xLeftCtb = (xBLuma-1) >> log2CtbSize;
115 int xRightCtb = (xBLuma+nTLuma) >> log2CtbSize;
116 int yTopCtb = (yBLuma-1) >> log2CtbSize;
117
118 int currCTBSlice = img->get_SliceAddrRS(xCurrCtb,yCurrCtb);
119 int leftCTBSlice = availableLeft ? img->get_SliceAddrRS(xLeftCtb, yCurrCtb) : -1;
120 int topCTBSlice = availableTop ? img->get_SliceAddrRS(xCurrCtb, yTopCtb) : -1;
121 int toprightCTBSlice = availableTopRight ? img->get_SliceAddrRS(xRightCtb, yTopCtb) : -1;
122 int topleftCTBSlice = availableTopLeft ? img->get_SliceAddrRS(xLeftCtb, yTopCtb) : -1;
123
124 int currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs];
125 int leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : -1;
126 int topCTBTileID = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : -1;
127 int topleftCTBTileID = availableTopLeft ? pps->TileIdRS[xLeftCtb+yTopCtb*picWidthInCtbs] : -1;
128 int toprightCTBTileID= availableTopRight? pps->TileIdRS[xRightCtb+yTopCtb*picWidthInCtbs] : -1;
129
130 if (leftCTBSlice != currCTBSlice || leftCTBTileID != currCTBTileID ) availableLeft = false;
131 if (topCTBSlice != currCTBSlice || topCTBTileID != currCTBTileID ) availableTop = false;
132 if (topleftCTBSlice !=currCTBSlice||topleftCTBTileID!=currCTBTileID ) availableTopLeft = false;
133 if (toprightCTBSlice!=currCTBSlice||toprightCTBTileID!=currCTBTileID) availableTopRight= false;
134
135 int currBlockAddr = pps->MinTbAddrZS[ (xBLuma>>sps->Log2MinTrafoSize) +
136 (yBLuma>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ];
137
138
139 // number of pixels that are in the valid image area to the right and to the bottom
140
141 int nBottom = sps->pic_height_in_luma_samples - (cIdx==0 ? yB : 2*yB);
142 if (cIdx) nBottom=(nBottom+1)/2;
143 if (nBottom>2*nT) nBottom=2*nT;
144 int nRight = sps->pic_width_in_luma_samples - (cIdx==0 ? xB : 2*xB);
145 if (cIdx) nRight =(nRight +1)/2;
146 if (nRight >2*nT) nRight=2*nT;
147
148 int nAvail=0;
149
150 uint8_t firstValue;
151
152 memset(available-2*nT, 0, 4*nT+1);
153
154 {
155 // copy pixels at left column
156
157 for (int y=nBottom-1 ; y>=0 ; y-=4)
158 if (availableLeft)
159 {
160 int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
161 ((yB+y)>>TUShift) * sps->PicWidthInTbsY ];
162
163 bool availableN = NBlockAddr < currBlockAddr;
164
165 if (pps->constrained_intra_pred_flag) {
166 if (img->get_pred_mode((xB-1)<<chromaShift,(yB+y)<<chromaShift)!=MODE_INTRA)
167 availableN = false;
168 }
169
170 if (availableN) {
171 if (!nAvail) firstValue = image[xB-1 + (yB+y)*stride];
172
173 for (int i=0;i<4;i++) {
174 available[-y+i-1] = availableN;
175 out_border[-y+i-1] = image[xB-1 + (yB+y-i)*stride];
176 }
177
178 nAvail+=4;
179 }
180 }
181
182 // copy pixel at top-left position
183
184 if (availableTopLeft)
185 {
186 int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
187 ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
188
189 bool availableN = NBlockAddr < currBlockAddr;
190
191 if (pps->constrained_intra_pred_flag) {
192 if (img->get_pred_mode((xB-1)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
193 availableN = false;
194 }
195 }
196
197 if (availableN) {
198 if (!nAvail) firstValue = image[xB-1 + (yB-1)*stride];
199
200 out_border[0] = image[xB-1 + (yB-1)*stride];
201 available[0] = availableN;
202 nAvail++;
203 }
204 }
205
206 // copy pixels at top row
207
208 for (int x=0 ; x<nRight ; x+=4) {
209 bool borderAvailable;
210 if (x<nT) borderAvailable=availableTop;
211 else borderAvailable=availableTopRight;
212
213 if (borderAvailable)
214 {
215 int NBlockAddr = pps->MinTbAddrZS[ ((xB+x)>>TUShift) +
216 ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
217
218 bool availableN = NBlockAddr < currBlockAddr;
219
220 if (pps->constrained_intra_pred_flag) {
221 if (img->get_pred_mode((xB+x)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
222 availableN = false;
223 }
224 }
225
226
227 if (availableN) {
228 if (!nAvail) firstValue = image[xB+x + (yB-1)*stride];
229
230 for (int i=0;i<4;i++) {
231 out_border[x+i+1] = image[xB+x+i + (yB-1)*stride];
232 available[x+i+1] = availableN;
233 }
234
235 nAvail+=4;
236 }
237 }
238 }
239
240
241 // reference sample substitution
242
243 if (nAvail!=4*nT+1) {
244 if (nAvail==0) {
245 memset(out_border-2*nT, 1<<(sps->bit_depth_luma-1), 4*nT+1);
246 }
247 else {
248 if (!available[-2*nT]) {
249 out_border[-2*nT] = firstValue;
250 }
251
252 for (int i=-2*nT+1; i<=2*nT; i++)
253 if (!available[i]) {
254 out_border[i]=out_border[i-1];
255 }
256 }
257 }
258
259 logtrace(LogIntraPred,"availableN: ");
260 print_border(available,NULL,nT);
261 logtrace(LogIntraPred,"\n");
262
263 logtrace(LogIntraPred,"output: ");
264 print_border(out_border,NULL,nT);
265 logtrace(LogIntraPred,"\n");
266 }
267 }
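For orientation, the border array that fill_border_samples() writes is laid out around a centred pointer, exactly as decode_intra_prediction() below allocates it:

uint8_t border_mem[2*64+1];
uint8_t* border = &border_mem[64];   // border[0]         : top-left neighbour sample
                                     // border[-1..-2*nT] : left column, top to bottom
                                     // border[ 1.. 2*nT] : top row (incl. top-right), left to right
fill_border_samples(img, xB, yB, nT, cIdx, border);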
268
269
270 // (8.4.4.2.3)
271 void intra_prediction_sample_filtering(de265_image* img,
272 uint8_t* p,
273 int nT,
274 enum IntraPredMode intraPredMode)
275 {
276 int filterFlag;
277
278 if (intraPredMode==INTRA_DC || nT==4) {
279 filterFlag = 0;
280 } else {
281 // int-cast below prevents a typing problem that leads to wrong results when abs_value is a macro
282 int minDistVerHor = libde265_min( abs_value((int)intraPredMode-26),
283 abs_value((int)intraPredMode-10) );
284 switch (nT) {
285 case 8: filterFlag = (minDistVerHor>7) ? 1 : 0; break;
286 case 16: filterFlag = (minDistVerHor>1) ? 1 : 0; break;
287 case 32: filterFlag = (minDistVerHor>0) ? 1 : 0; break;
288 default: filterFlag = -1; assert(false); break; // should never happen
289 }
290 }
291
292
293 if (filterFlag) {
294 int biIntFlag = (img->sps.strong_intra_smoothing_enable_flag &&
295 nT==32 &&
296 abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(img->sps.bit_depth_luma-5)) &&
297 abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(img->sps.bit_depth_luma-5)))
298 ? 1 : 0;
299
300 uint8_t pF_mem[2*64+1];
301 uint8_t* pF = &pF_mem[64];
302
303 if (biIntFlag) {
304 pF[-2*nT] = p[-2*nT];
305 pF[ 2*nT] = p[ 2*nT];
306 pF[ 0] = p[ 0];
307
308 for (int i=1;i<=63;i++) {
309 pF[-i] = p[0] + ((i*(p[-64]-p[0])+32)>>6);
310 pF[ i] = p[0] + ((i*(p[ 64]-p[0])+32)>>6);
311 }
312 } else {
313 pF[-2*nT] = p[-2*nT];
314 pF[ 2*nT] = p[ 2*nT];
315
316 for (int i=-(2*nT-1) ; i<=2*nT-1 ; i++)
317 {
318 pF[i] = (p[i+1] + 2*p[i] + p[i-1] + 2) >> 2;
319 }
320 }
321
322
323 // copy back to original array
324
325 memcpy(p-2*nT, pF-2*nT, 4*nT+1);
326 }
327 else {
328 // do nothing ?
329 }
330
331
332 logtrace(LogIntraPred,"post filtering: ");
333 print_border(p,NULL,nT);
334 logtrace(LogIntraPred,"\n");
335 }
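A worked example of the filterFlag decision above (informal): for nT==16 and intraPredMode==20, minDistVerHor = min(|20-26|, |20-10|) = 6 > 1, so the [1 2 1]/4 smoothing is applied; for nT==16 and mode 25 the distance is only 1 and the border samples are left untouched; DC mode and 4x4 blocks are never filtered.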
336
337
338 const int intraPredAngle_table[1+34] =
339 { 0, 0,32,26,21,17,13, 9, 5, 2, 0,-2,-5,-9,-13,-17,-21,-26,
340 -32,-26,-21,-17,-13,-9,-5,-2,0,2,5,9,13,17,21,26,32 };
341
342 static const int invAngle_table[25-10] =
343 { -4096,-1638,-910,-630,-482,-390,-315,-256,
344 -315,-390,-482,-630,-910,-1638,-4096 };
345
346
347 // TODO: clip to read BitDepthY
348 LIBDE265_INLINE static int Clip1Y(int x) { if (x<0) return 0; else if (x>255) return 255; else return x; }
349
350
351 // (8.4.4.2.6)
352 void intra_prediction_angular(de265_image* img,
353 int xB0,int yB0,
354 enum IntraPredMode intraPredMode,
355 int nT,int cIdx,
356 uint8_t* border)
357 {
358 uint8_t ref_mem[2*64+1];
359 uint8_t* ref=&ref_mem[64];
360
361 uint8_t* pred;
362 int stride;
363 pred = img->get_image_plane_at_pos(cIdx,xB0,yB0);
364 stride = img->get_image_stride(cIdx);
365
366 int intraPredAngle = intraPredAngle_table[intraPredMode];
367
368 if (intraPredMode >= 18) {
369
370 for (int x=0;x<=nT;x++)
371 { ref[x] = border[x]; }
372
373 if (intraPredAngle<0) {
374 int invAngle = invAngle_table[intraPredMode-11];
375
376 if ((nT*intraPredAngle)>>5 < -1) {
377 for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) {
378 ref[x] = border[0-((x*invAngle+128)>>8)];
379 }
380 }
381 } else {
382 for (int x=nT+1; x<=2*nT;x++) {
383 ref[x] = border[x];
384 }
385 }
386
387 for (int y=0;y<nT;y++)
388 for (int x=0;x<nT;x++)
389 {
390 int iIdx = ((y+1)*intraPredAngle)>>5;
391 int iFact= ((y+1)*intraPredAngle)&31;
392
393 if (iFact != 0) {
394 pred[x+y*stride] = ((32-iFact)*ref[x+iIdx+1] + iFact*ref[x+iIdx+2] + 16)>>5;
395 } else {
396 pred[x+y*stride] = ref[x+iIdx+1];
397 }
398 }
399
400 if (intraPredMode==26 && cIdx==0 && nT<32) {
401 for (int y=0;y<nT;y++) {
402 pred[0+y*stride] = Clip1Y(border[1] + ((border[-1-y] - border[0])>>1));
403 }
404 }
405 }
406 else { // intraPredMode < 18
407
408 for (int x=0;x<=nT;x++)
409 { ref[x] = border[-x]; } // DIFF (neg)
410
411 if (intraPredAngle<0) {
412 int invAngle = invAngle_table[intraPredMode-11];
413
414 if ((nT*intraPredAngle)>>5 < -1) {
415 for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) {
416 ref[x] = border[((x*invAngle+128)>>8)]; // DIFF (neg)
417 }
418 }
419 } else {
420 for (int x=nT+1; x<=2*nT;x++) {
421 ref[x] = border[-x]; // DIFF (neg)
422 }
423 }
424
425 for (int y=0;y<nT;y++)
426 for (int x=0;x<nT;x++)
427 {
428 int iIdx = ((x+1)*intraPredAngle)>>5; // DIFF (x<->y)
429 int iFact= ((x+1)*intraPredAngle)&31; // DIFF (x<->y)
430
431 if (iFact != 0) {
432 pred[x+y*stride] = ((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5; // DIFF (x<->y)
433 } else {
434 pred[x+y*stride] = ref[y+iIdx+1]; // DIFF (x<->y)
435 }
436 }
437
438 if (intraPredMode==10 && cIdx==0 && nT<32) { // DIFF 26->10
439 for (int x=0;x<nT;x++) { // DIFF (x<->y)
440 pred[x] = Clip1Y(border[-1] + ((border[1+x] - border[0])>>1)); // DIFF (x<->y && neg)
441 }
442 }
443 }
444
445
446 logtrace(LogIntraPred,"result of angular intra prediction (mode=%d):\n",intraPredMode);
447
448 for (int y=0;y<nT;y++)
449 {
450 for (int x=0;x<nT;x++)
451 logtrace(LogIntraPred,"%02x ", pred[x+y*stride]);
452
453 logtrace(LogIntraPred,"\n");
454 }
455 }
456
457
458 void intra_prediction_planar(de265_image* img,int xB0,int yB0,int nT,int cIdx,
459 uint8_t* border)
460 {
461 uint8_t* pred;
462 int stride;
463 pred = img->get_image_plane_at_pos(cIdx,xB0,yB0);
464 stride = img->get_image_stride(cIdx);
465
466 int Log2_nT = Log2(nT);
467
468 for (int y=0;y<nT;y++)
469 for (int x=0;x<nT;x++)
470 {
471 pred[x+y*stride] = ((nT-1-x)*border[-1-y] + (x+1)*border[ 1+nT] +
472 (nT-1-y)*border[ 1+x] + (y+1)*border[-1-nT] + nT) >> (Log2_nT+1);
473 }
474
475
476 logtrace(LogIntraPred,"result of planar prediction\n");
477
478 for (int y=0;y<nT;y++)
479 {
480 for (int x=0;x<nT;x++)
481 logtrace(LogIntraPred,"%02x ", pred[x+y*stride]);
482
483 logtrace(LogIntraPred,"\n");
484 }
485 }
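A quick worked example of the planar formula above (informal): for nT==4 and a flat border where every sample is 100, the sample at (0,0) becomes (3*100 + 1*100 + 3*100 + 1*100 + 4) >> 3 == 100, so a flat border reproduces a flat block, while a border gradient is interpolated bilinearly across the block.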
486
487
488 void intra_prediction_DC(de265_image* img,int xB0,int yB0,int nT,int cIdx,
489 uint8_t* border)
490 {
491 uint8_t* pred;
492 int stride;
493 pred = img->get_image_plane_at_pos(cIdx,xB0,yB0);
494 stride = img->get_image_stride(cIdx);
495
496 int Log2_nT = Log2(nT);
497
498 int dcVal = 0;
499 for (int i=0;i<nT;i++)
500 {
501 dcVal += border[ i+1];
502 dcVal += border[-i-1];
503 }
504
505 dcVal += nT;
506 dcVal >>= Log2_nT+1;
507
508 if (cIdx==0 && nT<32) {
509 pred[0] = (border[-1] + 2*dcVal + border[1] +2) >> 2;
510
511 for (int x=1;x<nT;x++) { pred[x] = (border[ x+1] + 3*dcVal+2)>>2; }
512 for (int y=1;y<nT;y++) { pred[y*stride] = (border[-y-1] + 3*dcVal+2)>>2; }
513 for (int y=1;y<nT;y++)
514 for (int x=1;x<nT;x++)
515 {
516 pred[x+y*stride] = dcVal;
517 }
518 } else {
519 for (int y=0;y<nT;y++)
520 for (int x=0;x<nT;x++)
521 {
522 pred[x+y*stride] = dcVal;
523 }
524 }
525
526
527 /*
528 printf("INTRAPRED DC\n");
529 for (int y=0;y<nT;y++) {
530 for (int x=0;x<nT;x++)
531 {
532 printf("%d ",pred[x+y*stride]);
533 }
534 printf("\n");
535 }
536 */
537 }
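Similarly for the DC predictor above (informal): with nT==4 and all eight border samples equal to 100, dcVal = (8*100 + 4) >> 3 == 100, and the luma edge filter keeps the corner unchanged: pred[0] = (100 + 2*100 + 100 + 2) >> 2 == 100.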
538
539
540
541 // (8.4.4.2.1)
542 void decode_intra_prediction(de265_image* img,
543 int xB0,int yB0,
544 enum IntraPredMode intraPredMode,
545 int nT, int cIdx)
546 {
547 logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
548 xB0,yB0, intraPredMode, nT,cIdx);
549 /*
550 printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
551 xB0,yB0, intraPredMode, nT,cIdx);
552 */
553
554 uint8_t border_pixels_mem[2*64+1];
555 uint8_t* border_pixels = &border_pixels_mem[64];
556
557 fill_border_samples(img, xB0,yB0, nT, cIdx, border_pixels);
558
559 if (cIdx==0) {
560 intra_prediction_sample_filtering(img, border_pixels, nT, intraPredMode);
561 }
562
563
564 switch (intraPredMode) {
565 case INTRA_PLANAR:
566 intra_prediction_planar(img,xB0,yB0,nT,cIdx, border_pixels);
567 break;
568 case INTRA_DC:
569 intra_prediction_DC(img,xB0,yB0,nT,cIdx, border_pixels);
570 break;
571 default:
572 intra_prediction_angular(img,xB0,yB0,intraPredMode,nT,cIdx, border_pixels);
573 break;
574 }
575 }
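A minimal call sketch (an assumption, not taken from the diff): predicting one 16x16 luma transform block at sample position (xB0,yB0) of the picture being reconstructed, using the per-PB mode stored in the image metadata.

static void predict_luma_tb_example(de265_image* img, int xB0, int yB0)
{
  enum IntraPredMode mode = img->get_IntraPredMode(xB0, yB0);   // mode stored per PB unit
  decode_intra_prediction(img, xB0, yB0, mode, /*nT=*/16, /*cIdx=*/0);
}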
576
577
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
3636 void fill_border_samples(decoder_context* ctx, int xB,int yB,
3737 int nT, int cIdx, uint8_t* out_border);
3838
39 void decode_intra_prediction(decoder_context* ctx,
39 void decode_intra_prediction(de265_image* img,
4040 int xB0,int yB0,
4141 enum IntraPredMode intraPredMode,
4242 int nT, int cIdx);
+0
-295
libde265/md5.c
0 /*
1 * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
2 * MD5 Message-Digest Algorithm (RFC 1321).
3 *
4 * Homepage:
5 * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
6 *
7 * Author:
8 * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
9 *
10 * This software was written by Alexander Peslyak in 2001. No copyright is
11 * claimed, and the software is hereby placed in the public domain.
12 * In case this attempt to disclaim copyright and place the software in the
13 * public domain is deemed null and void, then the software is
14 * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
15 * general public under the following terms:
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted.
19 *
20 * There's ABSOLUTELY NO WARRANTY, express or implied.
21 *
22 * (This is a heavily cut-down "BSD license".)
23 *
24 * This differs from Colin Plumb's older public domain implementation in that
25 * no exactly 32-bit integer data type is required (any 32-bit or wider
26 * unsigned integer data type will do), there's no compile-time endianness
27 * configuration, and the function prototypes match OpenSSL's. No code from
28 * Colin Plumb's implementation has been reused; this comment merely compares
29 * the properties of the two independent implementations.
30 *
31 * The primary goals of this implementation are portability and ease of use.
32 * It is meant to be fast, but not as fast as possible. Some known
33 * optimizations are not included to reduce source code size and avoid
34 * compile-time configuration.
35 */
36
37 #ifndef HAVE_OPENSSL
38
39 #include <string.h>
40
41 #include "md5.h"
42
43 /*
44 * The basic MD5 functions.
45 *
46 * F and G are optimized compared to their RFC 1321 definitions for
47 * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
48 * implementation.
49 */
50 #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
51 #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
52 #define H(x, y, z) ((x) ^ (y) ^ (z))
53 #define I(x, y, z) ((y) ^ ((x) | ~(z)))
54
55 /*
56 * The MD5 transformation for all four rounds.
57 */
58 #define STEP(f, a, b, c, d, x, t, s) \
59 (a) += f((b), (c), (d)) + (x) + (t); \
60 (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
61 (a) += (b);
62
63 /*
64 * SET reads 4 input bytes in little-endian byte order and stores them
65 * in a properly aligned word in host byte order.
66 *
67 * The check for little-endian architectures that tolerate unaligned
68 * memory accesses is just an optimization. Nothing will break if it
69 * doesn't work.
70 */
71 #if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
72 #define SET(n) \
73 (*(MD5_u32plus *)&ptr[(n) * 4])
74 #define GET(n) \
75 SET(n)
76 #else
77 #define SET(n) \
78 (ctx->block[(n)] = \
79 (MD5_u32plus)ptr[(n) * 4] | \
80 ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
81 ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
82 ((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
83 #define GET(n) \
84 (ctx->block[(n)])
85 #endif
86
87 /*
88 * This processes one or more 64-byte data blocks, but does NOT update
89 * the bit counters. There are no alignment requirements.
90 */
91 static void *body(MD5_CTX *ctx, void *data, unsigned long size)
92 {
93 unsigned char *ptr;
94 MD5_u32plus a, b, c, d;
95 MD5_u32plus saved_a, saved_b, saved_c, saved_d;
96
97 ptr = (unsigned char *)data;
98
99 a = ctx->a;
100 b = ctx->b;
101 c = ctx->c;
102 d = ctx->d;
103
104 do {
105 saved_a = a;
106 saved_b = b;
107 saved_c = c;
108 saved_d = d;
109
110 /* Round 1 */
111 STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
112 STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
113 STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
114 STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
115 STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
116 STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
117 STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
118 STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
119 STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
120 STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
121 STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
122 STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
123 STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
124 STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
125 STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
126 STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
127
128 /* Round 2 */
129 STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
130 STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
131 STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
132 STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
133 STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
134 STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
135 STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
136 STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
137 STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
138 STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
139 STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
140 STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
141 STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
142 STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
143 STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
144 STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
145
146 /* Round 3 */
147 STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
148 STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
149 STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
150 STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
151 STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
152 STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
153 STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
154 STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
155 STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
156 STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
157 STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
158 STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
159 STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
160 STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
161 STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
162 STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)
163
164 /* Round 4 */
165 STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
166 STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
167 STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
168 STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
169 STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
170 STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
171 STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
172 STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
173 STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
174 STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
175 STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
176 STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
177 STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
178 STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
179 STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
180 STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
181
182 a += saved_a;
183 b += saved_b;
184 c += saved_c;
185 d += saved_d;
186
187 ptr += 64;
188 } while (size -= 64);
189
190 ctx->a = a;
191 ctx->b = b;
192 ctx->c = c;
193 ctx->d = d;
194
195 return ptr;
196 }
197
198 void MD5_Init(MD5_CTX *ctx)
199 {
200 ctx->a = 0x67452301;
201 ctx->b = 0xefcdab89;
202 ctx->c = 0x98badcfe;
203 ctx->d = 0x10325476;
204
205 ctx->lo = 0;
206 ctx->hi = 0;
207 }
208
209 void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size)
210 {
211 MD5_u32plus saved_lo;
212 unsigned long used, free;
213
214 saved_lo = ctx->lo;
215 if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
216 ctx->hi++;
217 ctx->hi += size >> 29;
218
219 used = saved_lo & 0x3f;
220
221 if (used) {
222 free = 64 - used;
223
224 if (size < free) {
225 memcpy(&ctx->buffer[used], data, size);
226 return;
227 }
228
229 memcpy(&ctx->buffer[used], data, free);
230 data = (unsigned char *)data + free;
231 size -= free;
232 body(ctx, ctx->buffer, 64);
233 }
234
235 if (size >= 64) {
236 data = body(ctx, data, size & ~(unsigned long)0x3f);
237 size &= 0x3f;
238 }
239
240 memcpy(ctx->buffer, data, size);
241 }
242
243 void MD5_Final(unsigned char *result, MD5_CTX *ctx)
244 {
245 unsigned long used, free;
246
247 used = ctx->lo & 0x3f;
248
249 ctx->buffer[used++] = 0x80;
250
251 free = 64 - used;
252
253 if (free < 8) {
254 memset(&ctx->buffer[used], 0, free);
255 body(ctx, ctx->buffer, 64);
256 used = 0;
257 free = 64;
258 }
259
260 memset(&ctx->buffer[used], 0, free - 8);
261
262 ctx->lo <<= 3;
263 ctx->buffer[56] = ctx->lo;
264 ctx->buffer[57] = ctx->lo >> 8;
265 ctx->buffer[58] = ctx->lo >> 16;
266 ctx->buffer[59] = ctx->lo >> 24;
267 ctx->buffer[60] = ctx->hi;
268 ctx->buffer[61] = ctx->hi >> 8;
269 ctx->buffer[62] = ctx->hi >> 16;
270 ctx->buffer[63] = ctx->hi >> 24;
271
272 body(ctx, ctx->buffer, 64);
273
274 result[0] = ctx->a;
275 result[1] = ctx->a >> 8;
276 result[2] = ctx->a >> 16;
277 result[3] = ctx->a >> 24;
278 result[4] = ctx->b;
279 result[5] = ctx->b >> 8;
280 result[6] = ctx->b >> 16;
281 result[7] = ctx->b >> 24;
282 result[8] = ctx->c;
283 result[9] = ctx->c >> 8;
284 result[10] = ctx->c >> 16;
285 result[11] = ctx->c >> 24;
286 result[12] = ctx->d;
287 result[13] = ctx->d >> 8;
288 result[14] = ctx->d >> 16;
289 result[15] = ctx->d >> 24;
290
291 memset(ctx, 0, sizeof(*ctx));
292 }
293
294 #endif
0 /*
1 * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
2 * MD5 Message-Digest Algorithm (RFC 1321).
3 *
4 * Homepage:
5 * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
6 *
7 * Author:
8 * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
9 *
10 * This software was written by Alexander Peslyak in 2001. No copyright is
11 * claimed, and the software is hereby placed in the public domain.
12 * In case this attempt to disclaim copyright and place the software in the
13 * public domain is deemed null and void, then the software is
14 * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
15 * general public under the following terms:
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted.
19 *
20 * There's ABSOLUTELY NO WARRANTY, express or implied.
21 *
22 * (This is a heavily cut-down "BSD license".)
23 *
24 * This differs from Colin Plumb's older public domain implementation in that
25 * no exactly 32-bit integer data type is required (any 32-bit or wider
26 * unsigned integer data type will do), there's no compile-time endianness
27 * configuration, and the function prototypes match OpenSSL's. No code from
28 * Colin Plumb's implementation has been reused; this comment merely compares
29 * the properties of the two independent implementations.
30 *
31 * The primary goals of this implementation are portability and ease of use.
32 * It is meant to be fast, but not as fast as possible. Some known
33 * optimizations are not included to reduce source code size and avoid
34 * compile-time configuration.
35 */
36
37 #ifndef HAVE_OPENSSL
38
39 #include <string.h>
40
41 #include "md5.h"
42
43 /*
44 * The basic MD5 functions.
45 *
46 * F and G are optimized compared to their RFC 1321 definitions for
47 * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
48 * implementation.
49 */
50 #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
51 #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
52 #define H(x, y, z) ((x) ^ (y) ^ (z))
53 #define I(x, y, z) ((y) ^ ((x) | ~(z)))
54
55 /*
56 * The MD5 transformation for all four rounds.
57 */
58 #define STEP(f, a, b, c, d, x, t, s) \
59 (a) += f((b), (c), (d)) + (x) + (t); \
60 (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
61 (a) += (b);
62
63 /*
64 * SET reads 4 input bytes in little-endian byte order and stores them
65 * in a properly aligned word in host byte order.
66 *
67 * The check for little-endian architectures that tolerate unaligned
68 * memory accesses is just an optimization. Nothing will break if it
69 * doesn't work.
70 */
71 #if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
72 #define SET(n) \
73 (*(MD5_u32plus *)&ptr[(n) * 4])
74 #define GET(n) \
75 SET(n)
76 #else
77 #define SET(n) \
78 (ctx->block[(n)] = \
79 (MD5_u32plus)ptr[(n) * 4] | \
80 ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
81 ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
82 ((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
83 #define GET(n) \
84 (ctx->block[(n)])
85 #endif
86
87 /*
88 * This processes one or more 64-byte data blocks, but does NOT update
89 * the bit counters. There are no alignment requirements.
90 */
91 static void *body(MD5_CTX *ctx, void *data, unsigned long size)
92 {
93 unsigned char *ptr;
94 MD5_u32plus a, b, c, d;
95 MD5_u32plus saved_a, saved_b, saved_c, saved_d;
96
97 ptr = (unsigned char *)data;
98
99 a = ctx->a;
100 b = ctx->b;
101 c = ctx->c;
102 d = ctx->d;
103
104 do {
105 saved_a = a;
106 saved_b = b;
107 saved_c = c;
108 saved_d = d;
109
110 /* Round 1 */
111 STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
112 STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
113 STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
114 STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
115 STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
116 STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
117 STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
118 STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
119 STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
120 STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
121 STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
122 STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
123 STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
124 STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
125 STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
126 STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
127
128 /* Round 2 */
129 STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
130 STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
131 STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
132 STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
133 STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
134 STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
135 STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
136 STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
137 STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
138 STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
139 STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
140 STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
141 STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
142 STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
143 STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
144 STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
145
146 /* Round 3 */
147 STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
148 STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
149 STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
150 STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
151 STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
152 STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
153 STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
154 STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
155 STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
156 STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
157 STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
158 STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
159 STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
160 STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
161 STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
162 STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)
163
164 /* Round 4 */
165 STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
166 STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
167 STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
168 STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
169 STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
170 STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
171 STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
172 STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
173 STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
174 STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
175 STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
176 STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
177 STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
178 STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
179 STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
180 STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
181
182 a += saved_a;
183 b += saved_b;
184 c += saved_c;
185 d += saved_d;
186
187 ptr += 64;
188 } while (size -= 64);
189
190 ctx->a = a;
191 ctx->b = b;
192 ctx->c = c;
193 ctx->d = d;
194
195 return ptr;
196 }
197
198 void MD5_Init(MD5_CTX *ctx)
199 {
200 ctx->a = 0x67452301;
201 ctx->b = 0xefcdab89;
202 ctx->c = 0x98badcfe;
203 ctx->d = 0x10325476;
204
205 ctx->lo = 0;
206 ctx->hi = 0;
207 }
208
209 void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size)
210 {
211 MD5_u32plus saved_lo;
212 unsigned long used, free;
213
214 saved_lo = ctx->lo;
215 if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
216 ctx->hi++;
217 ctx->hi += size >> 29;
218
219 used = saved_lo & 0x3f;
220
221 if (used) {
222 free = 64 - used;
223
224 if (size < free) {
225 memcpy(&ctx->buffer[used], data, size);
226 return;
227 }
228
229 memcpy(&ctx->buffer[used], data, free);
230 data = (unsigned char *)data + free;
231 size -= free;
232 body(ctx, ctx->buffer, 64);
233 }
234
235 if (size >= 64) {
236 data = body(ctx, data, size & ~(unsigned long)0x3f);
237 size &= 0x3f;
238 }
239
240 memcpy(ctx->buffer, data, size);
241 }
242
243 void MD5_Final(unsigned char *result, MD5_CTX *ctx)
244 {
245 unsigned long used, free;
246
247 used = ctx->lo & 0x3f;
248
249 ctx->buffer[used++] = 0x80;
250
251 free = 64 - used;
252
253 if (free < 8) {
254 memset(&ctx->buffer[used], 0, free);
255 body(ctx, ctx->buffer, 64);
256 used = 0;
257 free = 64;
258 }
259
260 memset(&ctx->buffer[used], 0, free - 8);
261
262 ctx->lo <<= 3;
263 ctx->buffer[56] = ctx->lo;
264 ctx->buffer[57] = ctx->lo >> 8;
265 ctx->buffer[58] = ctx->lo >> 16;
266 ctx->buffer[59] = ctx->lo >> 24;
267 ctx->buffer[60] = ctx->hi;
268 ctx->buffer[61] = ctx->hi >> 8;
269 ctx->buffer[62] = ctx->hi >> 16;
270 ctx->buffer[63] = ctx->hi >> 24;
271
272 body(ctx, ctx->buffer, 64);
273
274 result[0] = ctx->a;
275 result[1] = ctx->a >> 8;
276 result[2] = ctx->a >> 16;
277 result[3] = ctx->a >> 24;
278 result[4] = ctx->b;
279 result[5] = ctx->b >> 8;
280 result[6] = ctx->b >> 16;
281 result[7] = ctx->b >> 24;
282 result[8] = ctx->c;
283 result[9] = ctx->c >> 8;
284 result[10] = ctx->c >> 16;
285 result[11] = ctx->c >> 24;
286 result[12] = ctx->d;
287 result[13] = ctx->d >> 8;
288 result[14] = ctx->d >> 16;
289 result[15] = ctx->d >> 24;
290
291 memset(ctx, 0, sizeof(*ctx));
292 }
293
294 #endif
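A usage sketch (an assumption, not part of the diff): hashing one 8-bit image plane row by row, e.g. when checking a decoded-picture-hash SEI message.

static void md5_plane_example(unsigned char* plane, int width, int height, int stride,
                              unsigned char hash[16])
{
  MD5_CTX md5;
  MD5_Init(&md5);
  for (int y = 0; y < height; y++)
    MD5_Update(&md5, plane + y*stride, (unsigned long)width);  // one row of raw samples
  MD5_Final(hash, &md5);
}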
+0
-1956
libde265/motion.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "motion.h"
21 #include "motion_func.h"
22 #include "decctx.h"
23 #include "util.h"
24 #include <assert.h>
25
26
27 #include <sys/types.h>
28 #include <signal.h>
29 #include <string.h>
30
31 #if defined(_MSC_VER) || defined(__MINGW32__)
32 # include <malloc.h>
33 #else
34 # include <alloca.h>
35 #endif
36
37
38 #define MAX_CU_SIZE 64
39
40
41 enum {
42 // important! order like shown in 8.5.3.1.1
43 PRED_A1 = 0,
44 PRED_B1 = 1,
45 PRED_B0 = 2,
46 PRED_A0 = 3,
47 PRED_B2 = 4,
48 PRED_COL = 5,
49 PRED_ZERO= 6
50 };
51
52
53 typedef struct
54 {
55 uint8_t available[7];
56 PredVectorInfo pred_vector[7];
57 } MergingCandidates;
58
59
60 void reset_pred_vector(PredVectorInfo* pvec)
61 {
62 for (int X=0;X<2;X++) {
63 pvec->mv[X].x = 0;
64 pvec->mv[X].y = 0;
65 pvec->refIdx[X] = -1;
66 pvec->predFlag[X] = 0;
67 }
68 }
69
70
71 static int extra_before[4] = { 0,3,3,2 };
72 static int extra_after [4] = { 0,3,4,4 };
73
74 int FracCnt[4][4];
75 int SizeCnt[64][64];
76 int TotalCnt;
77 int InsideCnt,OutsideCnt;
78 int FullPelInsideCnt,FullPelOutsideCnt;
79 int NullCnt;
80
81 int BipredCnt;
82 int FullpelBipredCnt;
83 int FullpelPredCnt;
84 int TotalPredCnt; // number of prediction blocks
85
86 LIBDE265_API void showMotionProfile()
87 {
88 fprintf(stderr,"fractional pel positions:\n");
89 for (int y=0;y<4;y++)
90 for (int x=0;x<4;x++)
91 fprintf(stderr,"(%d,%d) %8d %4.1f%%\n",x,y,FracCnt[x][y],(float)(FracCnt[x][y] * 100) / TotalCnt);
92
93 fprintf(stderr,"block sizes:\n");
94 for (int x=0;x<64;x++)
95 for (int y=0;y<64;y++)
96 if (SizeCnt[x][y]) {
97 char tmp[128];
98 sprintf(tmp, "%dx%d", x+1, y+1);
99 fprintf(stderr,"%2dx%2d %8d %4.1f%%\n",x+1, y+1, SizeCnt[x][y],(float)(SizeCnt[x][y] * 100) / TotalCnt);
100 }
101
102
103 fprintf(stderr,"total cnt: %d\n", TotalCnt);
104 fprintf(stderr,"inside: %d, outside: %d\n", InsideCnt,OutsideCnt);
105 fprintf(stderr,"fullpel-inside: %d, fullpel-outside: %d\n", FullPelInsideCnt,FullPelOutsideCnt);
106 fprintf(stderr,"null-vectors: %d\n", NullCnt);
107 fprintf(stderr,"bi-pred: %d (%4.1f %%)\n", BipredCnt, BipredCnt*100.0/TotalPredCnt);
108 fprintf(stderr,"full-pel bi-pred: %d (%4.1f %%)\n", FullpelBipredCnt, FullpelBipredCnt*100.0/TotalPredCnt);
109 fprintf(stderr,"full-pel pred: %d (%4.1f %%)\n", FullpelPredCnt, FullpelPredCnt*100.0/TotalPredCnt);
110 }
111
112
113 void mc_luma(const decoder_context* ctx, int mv_x, int mv_y,
114 int xP,int yP,
115 int16_t* out, int out_stride,
116 uint8_t* img, int img_stride,
117 int nPbW, int nPbH)
118 {
119 const seq_parameter_set* sps = ctx->current_sps;
120
121 int xFracL = mv_x & 3;
122 int yFracL = mv_y & 3;
123
124 int xIntOffsL = xP + (mv_x>>2);
125 int yIntOffsL = yP + (mv_y>>2);
126
127 FracCnt[xFracL][yFracL]++;
128 SizeCnt[nPbW-1][nPbH-1]++;
129 TotalCnt++;
130 if (mv_x==0 && mv_y==0) { NullCnt++; }
131
132 // luma sample interpolation process (8.5.3.2.2.1)
133
134 //const int shift1 = sps->BitDepth_Y-8;
135 //const int shift2 = 6;
136 const int shift3 = 14 - sps->BitDepth_Y;
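  // For full-pel copies, samples are shifted left by shift3 so they end up at the same
  // 14-bit intermediate precision that the fractional interpolation filters produce.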
137
138 int w = sps->pic_width_in_luma_samples;
139 int h = sps->pic_height_in_luma_samples;
140
141 ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)];
142
143 if (xFracL==0 && yFracL==0) {
144 if (xIntOffsL >= 0 && yIntOffsL >= 0 &&
145 nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) {
146 FullPelInsideCnt++;
147 InsideCnt++;
148 }
149 else {
150 FullPelOutsideCnt++;
151 OutsideCnt++;
152 }
153
154 if (xIntOffsL >= 0 && yIntOffsL >= 0 &&
155 nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) {
156
157 ctx->acceleration.put_hevc_qpel_8[0][0](out, out_stride,
158 &img[yIntOffsL*img_stride + xIntOffsL],
159 img_stride,
160 nPbW,nPbH, mcbuffer);
161 }
162 else {
163 for (int y=0;y<nPbH;y++)
164 for (int x=0;x<nPbW;x++) {
165
166 int xA = Clip3(0,w-1,x + xIntOffsL);
167 int yA = Clip3(0,h-1,y + yIntOffsL);
168
169 out[y*out_stride+x] = img[ xA + yA*img_stride ] << shift3;
170 }
171 }
172
173 #ifdef DE265_LOG_TRACE
174 logtrace(LogMotion,"---MC luma %d %d = direct---\n",xFracL,yFracL);
175
176 for (int y=0;y<nPbH;y++) {
177 for (int x=0;x<nPbW;x++) {
178
179 int xA = Clip3(0,w-1,x + xIntOffsL);
180 int yA = Clip3(0,h-1,y + yIntOffsL);
181
182 logtrace(LogMotion,"%02x ", img[ xA + yA*img_stride ]);
183 }
184 logtrace(LogMotion,"\n");
185 }
186
187 logtrace(LogMotion," -> \n");
188
189 for (int y=0;y<nPbH;y++) {
190 for (int x=0;x<nPbW;x++) {
191
192 logtrace(LogMotion,"%02x ",out[y*out_stride+x] >> 6); // 6 will be used when summing predictions
193 }
194 logtrace(LogMotion,"\n");
195 }
196 #endif
197 }
198 else {
199 int extra_left = extra_before[xFracL];
200 int extra_right = extra_after [xFracL];
201 int extra_top = extra_before[yFracL];
202 int extra_bottom = extra_after [yFracL];
203
204 //int nPbW_extra = extra_left + nPbW + extra_right;
205 //int nPbH_extra = extra_top + nPbH + extra_bottom;
206
207
208 uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)];
209
210 uint8_t* src_ptr;
211 int src_stride;
212
213 if (-extra_left + xIntOffsL >= 0 &&
214 -extra_top + yIntOffsL >= 0 &&
215 nPbW+extra_right + xIntOffsL < w &&
216 nPbH+extra_bottom + yIntOffsL < h) {
217 src_ptr = &img[xIntOffsL + yIntOffsL*img_stride];
218 src_stride = img_stride;
219 }
220 else {
221 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
222 for (int x=-extra_left;x<nPbW+extra_right;x++) {
223
224 int xA = Clip3(0,w-1,x + xIntOffsL);
225 int yA = Clip3(0,h-1,y + yIntOffsL);
226
227 padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = img[ xA + yA*img_stride ];
228 }
229 }
230
231 src_ptr = &padbuf[extra_top*(MAX_CU_SIZE+16) + extra_left];
232 src_stride = MAX_CU_SIZE+16;
233 }
234
235 ctx->acceleration.put_hevc_qpel_8[xFracL][yFracL](out, out_stride,
236 src_ptr, src_stride,
237 nPbW,nPbH, mcbuffer);
238
239
240 logtrace(LogMotion,"---V---\n");
241 for (int y=0;y<nPbH;y++) {
242 for (int x=0;x<nPbW;x++) {
243 logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
244 }
245 logtrace(LogMotion,"\n");
246 }
247 }
248 }
249
250
251
252 void mc_chroma(const decoder_context* ctx, int mv_x, int mv_y,
253 int xP,int yP,
254 int16_t* out, int out_stride,
255 uint8_t* img, int img_stride,
256 int nPbWC, int nPbHC)
257 {
258 const seq_parameter_set* sps = ctx->current_sps;
259
260 // chroma sample interpolation process (8.5.3.2.2.2)
261
262 //const int shift1 = sps->BitDepth_C-8;
263 //const int shift2 = 6;
264 const int shift3 = 14 - sps->BitDepth_C;
265
266 int wC = sps->pic_width_in_luma_samples /sps->SubWidthC;
267 int hC = sps->pic_height_in_luma_samples/sps->SubHeightC;
268
269 int xFracC = mv_x & 7;
270 int yFracC = mv_y & 7;
271
272 int xIntOffsC = xP/2 + (mv_x>>3);
273 int yIntOffsC = yP/2 + (mv_y>>3);
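  // The unscaled luma MV is reused for chroma: with 4:2:0 subsampling it has eighth-pel
  // precision, so mv>>3 is the full-pel offset on the halved grid (xP/2,yP/2) and the
  // low three bits select the fractional phase.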
274
275 ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]);
276
277 if (xFracC == 0 && yFracC == 0) {
278 if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC &&
279 yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) {
280 ctx->acceleration.put_hevc_epel_8(out, out_stride,
281 &img[xIntOffsC + yIntOffsC*img_stride], img_stride,
282 nPbWC,nPbHC, 0,0, NULL);
283 }
284 else
285 {
286 for (int y=0;y<nPbHC;y++)
287 for (int x=0;x<nPbWC;x++) {
288
289 int xB = Clip3(0,wC-1,x + xIntOffsC);
290 int yB = Clip3(0,hC-1,y + yIntOffsC);
291
292 out[y*out_stride+x] = img[ xB + yB*img_stride ] << shift3;
293 }
294 }
295 }
296 else {
297 uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+3)];
298
299 uint8_t* src_ptr;
300 int src_stride;
301
302 int extra_top = 1;
303 int extra_left = 1;
304 int extra_right = 2;
305 int extra_bottom = 2;
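    // The 4-tap chroma interpolation filter reads one sample before and two samples
    // after the block in each direction, hence the padding margins above.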
306
307 if (xIntOffsC>=1 && nPbWC+xIntOffsC<=wC-2 &&
308 yIntOffsC>=1 && nPbHC+yIntOffsC<=hC-2) {
309 src_ptr = &img[xIntOffsC + yIntOffsC*img_stride];
310 src_stride = img_stride;
311 }
312 else {
313 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
314 for (int x=-extra_left;x<nPbWC+extra_right;x++) {
315
316 int xA = Clip3(0,wC-1,x + xIntOffsC);
317 int yA = Clip3(0,hC-1,y + yIntOffsC);
318
319 padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = img[ xA + yA*img_stride ];
320 }
321 }
322
323 src_ptr = &padbuf[extra_left + extra_top*(MAX_CU_SIZE+16)];
324 src_stride = MAX_CU_SIZE+16;
325 }
326
327
328 if (xFracC && yFracC) {
329 ctx->acceleration.put_hevc_epel_hv_8(out, out_stride,
330 src_ptr, src_stride,
331 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
332 }
333 else if (xFracC) {
334 ctx->acceleration.put_hevc_epel_h_8(out, out_stride,
335 src_ptr, src_stride,
336 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
337 }
338 else if (yFracC) {
339 ctx->acceleration.put_hevc_epel_v_8(out, out_stride,
340 src_ptr, src_stride,
341 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
342 }
343 else {
344 assert(false); // full-pel shifts are handled above
345 }
346 }
347 }
348
349
350
351 // 8.5.3.2
352 // NOTE: for full-pel shifts, we can introduce a fast path, simply copying without shifts
353 void generate_inter_prediction_samples(decoder_context* ctx,
354 slice_segment_header* shdr,
355 int xC,int yC,
356 int xB,int yB,
357 int nCS, int nPbW,int nPbH,
358 const VectorInfo* vi)
359 {
360 //const seq_parameter_set* sps = ctx->current_sps;
361
362 /*
363 if (vi->lum.predFlag[0]) {
364 assert(vi->lum.refIdx[0] >= 0);
365 assert(vi->lum.refIdx[0] <= 1);
366 }
367 */
368
369 TotalPredCnt++;
370
371 ALIGNED_16(int16_t) predSamplesL [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
372 ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
373
374 int xP = xC+xB;
375 int yP = yC+yB;
376
377 int predFlag[2];
378 predFlag[0] = vi->lum.predFlag[0];
379 predFlag[1] = vi->lum.predFlag[1];
380
381
382   // Some encoders signal bi-prediction with two identical MVs that reference the same picture.
383 // Identify this case and use only one MV.
384
385 // do this only without weighted prediction, because the weights/offsets may be different
386 if (ctx->current_pps->weighted_pred_flag==0) {
387 if (predFlag[0] && predFlag[1]) {
388 if (vi->lum.mv[0].x == vi->lum.mv[1].x &&
389 vi->lum.mv[0].y == vi->lum.mv[1].y &&
390 shdr->RefPicList[0][vi->lum.refIdx[0]] ==
391 shdr->RefPicList[1][vi->lum.refIdx[1]]) {
392 predFlag[1] = 0;
393 }
394 }
395 }
396
397
398 for (int l=0;l<2;l++) {
399 if (predFlag[l]) {
400 // 8.5.3.2.1
401
402 if (vi->lum.refIdx[l] >= MAX_NUM_REF_PICS) {
403 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
404 add_warning(ctx,DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
405 return;
406 }
407
408 de265_image* refPic;
409 refPic = &ctx->dpb[ shdr->RefPicList[l][vi->lum.refIdx[l]] ];
410
411 logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->lum.refIdx[l], shdr->RefPicList[l][vi->lum.refIdx[l]]);
412
413 if (refPic->PicState == UnusedForReference) {
414 //printf("state %d = %d\n",refPic->PicOrderCntVal, refPic->PicState);
415 }
416
417 //assert(refPic->PicState != UnusedForReference);
418 if (refPic->PicState == UnusedForReference) {
419 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
420 add_warning(ctx,DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
421 }
422 else {
423 // 8.5.3.2.2
424
425 logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n",
426 l,vi->lum.mv[l].x,vi->lum.mv[l].y,refPic->PicOrderCntVal);
427
428
429         // TODO: must the predSamples stride really be nCS, or can it be something smaller like nPbW?
430 mc_luma(ctx, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
431 predSamplesL[l],nCS, refPic->y,refPic->stride, nPbW,nPbH);
432
433
434 mc_chroma(ctx, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
435 predSamplesC[0][l],nCS, refPic->cb,refPic->chroma_stride, nPbW/2,nPbH/2);
436 mc_chroma(ctx, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
437 predSamplesC[1][l],nCS, refPic->cr,refPic->chroma_stride, nPbW/2,nPbH/2);
438 }
439 }
440 }
441
442
443 // weighted sample prediction (8.5.3.2.3)
444
445 //const int shift1 = 6; // TODO
446 //const int offset1= 1<<(shift1-1);
447
448 logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]);
449
450 if (shdr->slice_type == SLICE_TYPE_P) {
451 if (ctx->current_pps->weighted_pred_flag==0) {
452 if (predFlag[0]==1 && predFlag[1]==0) {
453 if ((vi->lum.mv[0].x & 3) == 0 &&
454 (vi->lum.mv[0].y & 3) == 0)
455 {
456 FullpelPredCnt++;
457 }
458
459 ctx->acceleration.put_unweighted_pred_8(&ctx->img->y[xP +yP*ctx->img->stride],
460 ctx->img->stride,
461 predSamplesL[0],nCS, nPbW,nPbH);
462 ctx->acceleration.put_unweighted_pred_8(&ctx->img->cb[xP/2 +yP/2*ctx->img->chroma_stride],
463 ctx->img->chroma_stride,
464 predSamplesC[0][0],nCS, nPbW/2,nPbH/2);
465 ctx->acceleration.put_unweighted_pred_8(&ctx->img->cr[xP/2 +yP/2*ctx->img->chroma_stride],
466 ctx->img->chroma_stride,
467 predSamplesC[1][0],nCS, nPbW/2,nPbH/2);
468 }
469 else {
470 add_warning(ctx, DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
471 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
472 }
473
474 //printf("unweighted\n");
475 }
476 else {
477 // weighted prediction
478
479 if (predFlag[0]==1 && predFlag[1]==0) {
480
481 int refIdx0 = vi->lum.refIdx[0];
482
483 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
484 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
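      // The "+(14-8)" term accounts for the 14-bit intermediate precision of the prediction
      // samples being brought back to the 8-bit output range; a different bit depth would
      // change these constants (see the TODO markers).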
485
486 int luma_w0 = shdr->LumaWeight[0][refIdx0];
487 int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
488
489 int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
490 int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
491 int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
492 int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
493
494 logtrace(LogMotion,"weighted-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
495
496 ctx->acceleration.put_weighted_pred_8(&ctx->img->y[xP +yP*ctx->img->stride],
497 ctx->img->stride,
498 predSamplesL[0],nCS, nPbW,nPbH,
499 luma_w0, luma_o0, luma_log2WD);
500 ctx->acceleration.put_weighted_pred_8(&ctx->img->cb[xP/2 +yP/2*ctx->img->chroma_stride],
501 ctx->img->chroma_stride,
502 predSamplesC[0][0],nCS, nPbW/2,nPbH/2,
503 chroma0_w0, chroma0_o0, chroma_log2WD);
504 ctx->acceleration.put_weighted_pred_8(&ctx->img->cr[xP/2 +yP/2*ctx->img->chroma_stride],
505 ctx->img->chroma_stride,
506 predSamplesC[1][0],nCS, nPbW/2,nPbH/2,
507 chroma1_w0, chroma1_o0, chroma_log2WD);
508 }
509 else {
510 add_warning(ctx, DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
511 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
512 }
513 }
514 }
515 else {
516 assert(shdr->slice_type == SLICE_TYPE_B);
517
518 //printf("unweighted\n");
519
520 if (predFlag[0]==1 && predFlag[1]==1) {
521 if (ctx->current_pps->weighted_bipred_flag==0) {
522 //const int shift2 = 15-8; // TODO: real bit depth
523 //const int offset2 = 1<<(shift2-1);
524
525 BipredCnt++;
526
527 if ((vi->lum.mv[0].x & 3) == 0 &&
528 (vi->lum.mv[0].y & 3) == 0 &&
529 (vi->lum.mv[1].x & 3) == 0 &&
530 (vi->lum.mv[1].y & 3) == 0)
531 {
532 FullpelBipredCnt++;
533 }
534
535 int16_t* in0 = predSamplesL[0];
536 int16_t* in1 = predSamplesL[1];
537 uint8_t* out = &ctx->img->y[xP + (yP+0)*ctx->img->stride];
538
539 ctx->acceleration.put_weighted_pred_avg_8(out, ctx->img->stride,
540 in0,in1, nCS, nPbW, nPbH);
541
542 int16_t* in00 = predSamplesC[0][0];
543 int16_t* in01 = predSamplesC[0][1];
544 int16_t* in10 = predSamplesC[1][0];
545 int16_t* in11 = predSamplesC[1][1];
546 uint8_t* out0 = &ctx->img->cb[xP/2 + (yP/2+0)*ctx->img->chroma_stride];
547 uint8_t* out1 = &ctx->img->cr[xP/2 + (yP/2+0)*ctx->img->chroma_stride];
548
549 ctx->acceleration.put_weighted_pred_avg_8(out0, ctx->img->chroma_stride,
550 in00,in01, nCS, nPbW/2, nPbH/2);
551 ctx->acceleration.put_weighted_pred_avg_8(out1, ctx->img->chroma_stride,
552 in10,in11, nCS, nPbW/2, nPbH/2);
553 }
554 else {
555 // weighted prediction
556
557 int refIdx0 = vi->lum.refIdx[0];
558 int refIdx1 = vi->lum.refIdx[1];
559
560 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
561 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
562
563 int luma_w0 = shdr->LumaWeight[0][refIdx0];
564 int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
565 int luma_w1 = shdr->LumaWeight[1][refIdx1];
566 int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(8-8)); // TODO: bitDepth
567
568 int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
569 int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
570 int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
571 int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
572 int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0];
573 int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(8-8)); // TODO: bitDepth
574 int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1];
575 int chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(8-8)); // TODO: bitDepth
576
577 logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
578 logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH);
579
580 int16_t* in0 = predSamplesL[0];
581 int16_t* in1 = predSamplesL[1];
582 uint8_t* out = &ctx->img->y[xP + (yP+0)*ctx->img->stride];
583
584 ctx->acceleration.put_weighted_bipred_8(out, ctx->img->stride,
585 in0,in1, nCS, nPbW, nPbH,
586 luma_w0,luma_o0,
587 luma_w1,luma_o1,
588 luma_log2WD);
589
590 int16_t* in00 = predSamplesC[0][0];
591 int16_t* in01 = predSamplesC[0][1];
592 int16_t* in10 = predSamplesC[1][0];
593 int16_t* in11 = predSamplesC[1][1];
594 uint8_t* out0 = &ctx->img->cb[xP/2 + (yP/2+0)*ctx->img->chroma_stride];
595 uint8_t* out1 = &ctx->img->cr[xP/2 + (yP/2+0)*ctx->img->chroma_stride];
596
597 ctx->acceleration.put_weighted_bipred_8(out0, ctx->img->chroma_stride,
598 in00,in01, nCS, nPbW/2, nPbH/2,
599 chroma0_w0,chroma0_o0,
600 chroma0_w1,chroma0_o1,
601 chroma_log2WD);
602 ctx->acceleration.put_weighted_bipred_8(out1, ctx->img->chroma_stride,
603 in10,in11, nCS, nPbW/2, nPbH/2,
604 chroma1_w0,chroma1_o0,
605 chroma1_w1,chroma1_o1,
606 chroma_log2WD);
607 }
608 }
609 else if (predFlag[0]==1 || predFlag[1]==1) {
610 int l = predFlag[0] ? 0 : 1;
611
612 if (ctx->current_pps->weighted_bipred_flag==0) {
613 if ((vi->lum.mv[l].x & 3) == 0 &&
614 (vi->lum.mv[l].y & 3) == 0)
615 {
616 FullpelPredCnt++;
617 }
618
619
620 ctx->acceleration.put_unweighted_pred_8(&ctx->img->y[xP +yP*ctx->img->stride],
621 ctx->img->stride,
622 predSamplesL[l],nCS, nPbW,nPbH);
623 ctx->acceleration.put_unweighted_pred_8(&ctx->img->cb[xP/2 +yP/2*ctx->img->chroma_stride],
624 ctx->img->chroma_stride,
625 predSamplesC[0][l],nCS, nPbW/2,nPbH/2);
626 ctx->acceleration.put_unweighted_pred_8(&ctx->img->cr[xP/2 +yP/2*ctx->img->chroma_stride],
627 ctx->img->chroma_stride,
628 predSamplesC[1][l],nCS, nPbW/2,nPbH/2);
629 }
630 else {
631 int refIdx = vi->lum.refIdx[l];
632
633 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
634 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
635
636 int luma_w = shdr->LumaWeight[l][refIdx];
637 int luma_o = shdr->luma_offset[l][refIdx] * (1<<(8-8)); // TODO: bitDepth
638
639 int chroma0_w = shdr->ChromaWeight[l][refIdx][0];
640 int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(8-8)); // TODO: bitDepth
641 int chroma1_w = shdr->ChromaWeight[l][refIdx][1];
642 int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(8-8)); // TODO: bitDepth
643
644 logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH);
645
646 ctx->acceleration.put_weighted_pred_8(&ctx->img->y[xP +yP*ctx->img->stride],
647 ctx->img->stride,
648 predSamplesL[l],nCS, nPbW,nPbH,
649 luma_w, luma_o, luma_log2WD);
650 ctx->acceleration.put_weighted_pred_8(&ctx->img->cb[xP/2 +yP/2*ctx->img->chroma_stride],
651 ctx->img->chroma_stride,
652 predSamplesC[0][l],nCS, nPbW/2,nPbH/2,
653 chroma0_w, chroma0_o, chroma_log2WD);
654 ctx->acceleration.put_weighted_pred_8(&ctx->img->cr[xP/2 +yP/2*ctx->img->chroma_stride],
655 ctx->img->chroma_stride,
656 predSamplesC[1][l],nCS, nPbW/2,nPbH/2,
657 chroma1_w, chroma1_o, chroma_log2WD);
658 }
659 }
660 else {
661 // TODO: check why it can actually happen that both predFlags[] are false.
662 // For now, we ignore this and continue decoding.
663
664 add_warning(ctx, DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
665 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
666 }
667 }
668
669
670 logtrace(LogTransform,"MC pixels (luma), position %d %d:\n", xP,yP);
671
672 for (int y=0;y<nPbH;y++) {
673 logtrace(LogTransform,"MC-y-%d-%d ",xP,yP+y);
674
675 for (int x=0;x<nPbW;x++) {
676 logtrace(LogTransform,"*%02x ", ctx->img->y[xP+x+(yP+y)*ctx->img->stride]);
677 }
678
679 logtrace(LogTransform,"*\n");
680 }
681
682
683 logtrace(LogTransform,"MC pixels (chroma cb), position %d %d:\n", xP/2,yP/2);
684
685 for (int y=0;y<nPbH/2;y++) {
686 logtrace(LogTransform,"MC-cb-%d-%d ",xP/2,yP/2+y);
687
688 for (int x=0;x<nPbW/2;x++) {
689 logtrace(LogTransform,"*%02x ", ctx->img->cb[xP/2+x+(yP/2+y)*ctx->img->chroma_stride]);
690 }
691
692 logtrace(LogTransform,"*\n");
693 }
694
695
696 logtrace(LogTransform,"MC pixels (chroma cr), position %d %d:\n", xP/2,yP/2);
697
698 for (int y=0;y<nPbH/2;y++) {
699 logtrace(LogTransform,"MC-cr-%d-%d ",xP/2,yP/2+y);
700
701 for (int x=0;x<nPbW/2;x++) {
702 logtrace(LogTransform,"*%02x ", ctx->img->cr[xP/2+x+(yP/2+y)*ctx->img->chroma_stride]);
703 }
704
705 logtrace(LogTransform,"*\n");
706 }
707 }
708
709
710 void logmvcand(PredVectorInfo p)
711 {
712 for (int v=0;v<2;v++) {
713 if (p.predFlag[v]) {
714 logtrace(LogMotion," %d: %s %d;%d ref=%d\n", v, p.predFlag[v] ? "yes":"no ",
715 p.mv[v].x,p.mv[v].y, p.refIdx[v]);
716 } else {
717 logtrace(LogMotion," %d: %s --;-- ref=--\n", v, p.predFlag[v] ? "yes":"no ");
718 }
719 }
720 }
721
722
723 bool equal_cand_MV(const PredVectorInfo* a, const PredVectorInfo* b)
724 {
725   // TODO: is this really correct? The standard (p.127) seems to compare candidates without checking predFlag.
726
727 for (int i=0;i<2;i++) {
728 if (a->predFlag[i] != b->predFlag[i]) return false;
729
730 if (a->predFlag[i]) {
731 if (a->mv[i].x != b->mv[i].x) return false;
732 if (a->mv[i].y != b->mv[i].y) return false;
733 if (a->refIdx[i] != b->refIdx[i]) return false;
734 }
735 }
736
737 return true;
738 }
739
740
741 /*
742 +--+ +--+--+
743 |B2| |B1|B0|
744 +--+----------------+--+--+
745 | |
746 | |
747 | |
748 | |
749 | |
750 | |
751 | |
752 +--+ |
753 |A1| |
754 +--+-------------------+
755 |A0|
756 +--+
757 */
758
759
760 // 8.5.3.1.2
761 // TODO: check whether we can fill the candidate list directly in this function and avoid the later copy
762 void derive_spatial_merging_candidates(const decoder_context* ctx,
763 int xC, int yC, int nCS, int xP, int yP,
764 uint8_t singleMCLFlag,
765 int nPbW, int nPbH,
766 int partIdx,
767 MergingCandidates* out_cand)
768 {
769 const pic_parameter_set* pps = ctx->current_pps;
770 int log2_parallel_merge_level = pps->log2_parallel_merge_level;
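  // Candidates that lie in the same merge estimation region (MER) as the current PB are
  // treated as unavailable, so merge candidate lists within one MER can be derived in parallel.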
771
772 enum PartMode PartMode = get_PartMode(ctx->img,ctx->current_sps,xC,yC);
773
774 // --- A1 ---
775
776 // a pixel within A1
777 int xA1 = xP-1;
778 int yA1 = yP+nPbH-1;
779
780 bool availableA1;
781
782 if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) &&
783 (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) {
784 availableA1 = false;
785 logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n");
786 }
787 else if (!singleMCLFlag &&
788 partIdx==1 &&
789 (PartMode==PART_Nx2N ||
790 PartMode==PART_nLx2N ||
791 PartMode==PART_nRx2N)) {
792 availableA1 = false;
793 logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n");
794 }
795 else {
796 availableA1 = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1);
797 if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n");
798 }
799
800 if (!availableA1) {
801 out_cand->available[PRED_A1] = 0;
802 reset_pred_vector(&out_cand->pred_vector[PRED_A1]);
803 }
804 else {
805 out_cand->available[PRED_A1] = 1;
806 out_cand->pred_vector[PRED_A1] = *get_mv_info(ctx,xA1,yA1);
807
808 logtrace(LogMotion,"spatial merging candidate A1:\n");
809 logmvcand(out_cand->pred_vector[PRED_A1]);
810 }
811
812
813 // --- B1 ---
814
815 int xB1 = xP+nPbW-1;
816 int yB1 = yP-1;
817
818 bool availableB1;
819
820 if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) &&
821 (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) {
822 availableB1 = false;
823 logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n");
824 }
825 else if (!singleMCLFlag &&
826 partIdx==1 &&
827 (PartMode==PART_2NxN ||
828 PartMode==PART_2NxnU ||
829 PartMode==PART_2NxnD)) {
830 availableB1 = false;
831 logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n");
832 }
833 else {
834 availableB1 = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1);
835 if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n");
836 }
837
838 if (!availableB1) {
839 out_cand->available[PRED_B1] = 0;
840 reset_pred_vector(&out_cand->pred_vector[PRED_B1]);
841 }
842 else {
843 out_cand->available[PRED_B1] = 1;
844 out_cand->pred_vector[PRED_B1] = *get_mv_info(ctx,xB1,yB1);
845
846 if (availableA1 &&
847 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
848 &out_cand->pred_vector[PRED_B1])) {
849 out_cand->available[PRED_B1] = 0;
850 logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n");
851 }
852 else {
853 logtrace(LogMotion,"spatial merging candidate B1:\n");
854 logmvcand(out_cand->pred_vector[PRED_B1]);
855 }
856 }
857
858
859 // --- B0 ---
860
861 int xB0 = xP+nPbW;
862 int yB0 = yP-1;
863
864 bool availableB0;
865
866 if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) &&
867 (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) {
868 availableB0 = false;
869 logtrace(LogMotion,"spatial merging candidate B0: below parallel merge level\n");
870 }
871 else {
872 availableB0 = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB0,yB0);
873 if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n");
874 }
875
876 if (!availableB0) {
877 out_cand->available[PRED_B0] = 0;
878 reset_pred_vector(&out_cand->pred_vector[PRED_B0]);
879 }
880 else {
881 out_cand->available[PRED_B0] = 1;
882 out_cand->pred_vector[PRED_B0] = *get_mv_info(ctx,xB0,yB0);
883
884 if (availableB1 &&
885 equal_cand_MV(&out_cand->pred_vector[PRED_B1],
886 &out_cand->pred_vector[PRED_B0])) {
887 out_cand->available[PRED_B0] = 0;
888 logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n");
889 }
890 else {
891 logtrace(LogMotion,"spatial merging candidate B0:\n");
892 logmvcand(out_cand->pred_vector[PRED_B0]);
893 }
894 }
895
896
897 // --- A0 ---
898
899 int xA0 = xP-1;
900 int yA0 = yP+nPbH;
901
902 bool availableA0;
903
904 if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) &&
905 (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) {
906 availableA0 = false;
907 logtrace(LogMotion,"spatial merging candidate A0: below parallel merge level\n");
908 }
909 else {
910 availableA0 = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA0,yA0);
911 if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n");
912 }
913
914 if (!availableA0) {
915 out_cand->available[PRED_A0] = 0;
916 reset_pred_vector(&out_cand->pred_vector[PRED_A0]);
917 }
918 else {
919 out_cand->available[PRED_A0] = 1;
920 out_cand->pred_vector[PRED_A0] = *get_mv_info(ctx,xA0,yA0);
921
922 if (availableA1 &&
923 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
924 &out_cand->pred_vector[PRED_A0])) {
925 out_cand->available[PRED_A0] = 0;
926 logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n");
927 }
928 else {
929 logtrace(LogMotion,"spatial merging candidate A0:\n");
930 logmvcand(out_cand->pred_vector[PRED_A0]);
931 }
932 }
933
934
935 // --- B2 ---
936
937 int xB2 = xP-1;
938 int yB2 = yP-1;
939
940 bool availableB2;
941
942 if (out_cand->available[PRED_A0] && out_cand->available[PRED_A1] &&
943 out_cand->available[PRED_B0] && out_cand->available[PRED_B1]) {
944 availableB2 = false;
945 logtrace(LogMotion,"spatial merging candidate B2: ignore\n");
946 }
947 else if ((xP>>log2_parallel_merge_level) == (xB2>>log2_parallel_merge_level) &&
948 (yP>>log2_parallel_merge_level) == (yB2>>log2_parallel_merge_level)) {
949 availableB2 = false;
950 logtrace(LogMotion,"spatial merging candidate B2: below parallel merge level\n");
951 }
952 else {
953 availableB2 = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB2,yB2);
954 if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n");
955 }
956
957 if (!availableB2) {
958 out_cand->available[PRED_B2] = 0;
959 reset_pred_vector(&out_cand->pred_vector[PRED_B2]);
960 }
961 else {
962 out_cand->available[PRED_B2] = 1;
963 out_cand->pred_vector[PRED_B2] = *get_mv_info(ctx,xB2,yB2);
964
965 if (availableB1 &&
966 equal_cand_MV(&out_cand->pred_vector[PRED_B1],
967 &out_cand->pred_vector[PRED_B2])) {
968 out_cand->available[PRED_B2] = 0;
969 logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n");
970 }
971 else if (availableA1 &&
972 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
973 &out_cand->pred_vector[PRED_B2])) {
974 out_cand->available[PRED_B2] = 0;
975 logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n");
976 }
977 else {
978       logtrace(LogMotion,"spatial merging candidate B2:\n");
979       logmvcand(out_cand->pred_vector[PRED_B2]);
980 }
981 }
982 }
983
984
985 // 8.5.3.1.4
986 void derive_zero_motion_vector_candidates(decoder_context* ctx,
987 slice_segment_header* shdr,
988 PredVectorInfo* inout_mergeCandList,
989 int* inout_numCurrMergeCand)
990 {
991 logtrace(LogMotion,"derive_zero_motion_vector_candidates\n");
992
993 int numRefIdx;
994
995 if (shdr->slice_type==SLICE_TYPE_P) {
996 numRefIdx = shdr->num_ref_idx_l0_active;
997 }
998 else {
999 numRefIdx = libde265_min(shdr->num_ref_idx_l0_active,
1000 shdr->num_ref_idx_l1_active);
1001 }
1002
1003
1004 //int numInputMergeCand = *inout_numMergeCand;
1005 int zeroIdx = 0;
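  // Append zero-motion candidates until the list is full; each new candidate uses the next
  // reference index up to numRefIdx-1, after which refIdx 0 is repeated.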
1006
1007 while (*inout_numCurrMergeCand < shdr->MaxNumMergeCand) {
1008 // 1.
1009
1010 logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx);
1011
1012 PredVectorInfo* newCand = &inout_mergeCandList[*inout_numCurrMergeCand];
1013
1014 if (shdr->slice_type==SLICE_TYPE_P) {
1015 newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
1016 newCand->refIdx[1] = -1;
1017 newCand->predFlag[0] = 1;
1018 newCand->predFlag[1] = 0;
1019 }
1020 else {
1021 newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
1022 newCand->refIdx[1] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
1023 newCand->predFlag[0] = 1;
1024 newCand->predFlag[1] = 1;
1025 }
1026
1027 newCand->mv[0].x = 0;
1028 newCand->mv[0].y = 0;
1029 newCand->mv[1].x = 0;
1030 newCand->mv[1].y = 0;
1031
1032 (*inout_numCurrMergeCand)++;
1033
1034 // 2.
1035
1036 zeroIdx++;
1037 }
1038 }
1039
1040
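// Temporal MV scaling (cf. 8.5.3.2.8): scale 'mv' by the ratio of POC distances
// currDist/colDist in fixed-point arithmetic (tx is a Q14 reciprocal of colDist,
// distScaleFactor a Q8 factor clipped to [-4096,4095]). Returns false if colDist
// is zero; callers treat that as an MV scaling error.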
1041 bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist)
1042 {
1043 int td = Clip3(-128,127, colDist);
1044 int tb = Clip3(-128,127, currDist);
1045
1046 if (td==0) {
1047 *out_mv = mv;
1048 return false;
1049 }
1050 else {
1051 int tx = (16384 + (abs_value(td)>>1)) / td;
1052 int distScaleFactor = Clip3(-4096,4095, (tb*tx+32)>>6);
1053 out_mv->x = Clip3(-32768,32767,
1054 Sign(distScaleFactor*mv.x)*((abs_value(distScaleFactor*mv.x)+127)>>8));
1055 out_mv->y = Clip3(-32768,32767,
1056 Sign(distScaleFactor*mv.y)*((abs_value(distScaleFactor*mv.y)+127)>>8));
1057 return true;
1058 }
1059 }
1060
1061
1062 // (L1003) 8.5.3.2.8
1063
1064 void derive_collocated_motion_vectors(decoder_context* ctx,
1065 const slice_segment_header* shdr,
1066 int xP,int yP,
1067 int colPic,
1068 int xColPb,int yColPb,
1069 int refIdxLX, int X,
1070 MotionVector* out_mvLXCol,
1071 uint8_t* out_availableFlagLXCol)
1072 {
1073 logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP);
1074
1075 // TODO: has to get pred_mode from reference picture
1076 enum PredMode predMode = get_pred_mode(&ctx->dpb[colPic],ctx->current_sps, xColPb,yColPb);
1077
1078 if (predMode == MODE_INTRA) {
1079 out_mvLXCol->x = 0;
1080 out_mvLXCol->y = 0;
1081 *out_availableFlagLXCol = 0;
1082 return;
1083 }
1084 else {
1085 logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n",
1086 colPic,
1087 ctx->dpb[colPic].PicOrderCntVal,
1088 X,refIdxLX,shdr->RefPicList[X][refIdxLX]);
1089
1090 const de265_image* colImg = &ctx->dpb[colPic];
1091 if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) {
1092 out_mvLXCol->x = 0;
1093 out_mvLXCol->y = 0;
1094 *out_availableFlagLXCol = 0;
1095 return;
1096 }
1097
1098 const PredVectorInfo* mvi = get_img_mv_info(ctx,colImg,xColPb,yColPb);
1099 int listCol;
1100 int refIdxCol;
1101 MotionVector mvCol;
1102
1103 logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb);
1104 logmvcand(*mvi);
1105
1106 if (mvi->predFlag[0]==0) {
1107 mvCol = mvi->mv[1];
1108 refIdxCol = mvi->refIdx[1];
1109 listCol = 1;
1110 }
1111 else {
1112 if (mvi->predFlag[1]==0) {
1113 mvCol = mvi->mv[0];
1114 refIdxCol = mvi->refIdx[0];
1115 listCol = 0;
1116 }
1117 else {
1118 int AllDiffPicOrderCntLEZero = true;
1119
1120 const int PicOrderCntVal = ctx->img->PicOrderCntVal;
1121
1122 for (int rIdx=0; rIdx<shdr->num_ref_idx_l0_active && AllDiffPicOrderCntLEZero; rIdx++)
1123 {
1124 int aPOC = ctx->dpb[shdr->RefPicList[0][rIdx]].PicOrderCntVal;
1125
1126 if (aPOC > PicOrderCntVal) {
1127 AllDiffPicOrderCntLEZero = false;
1128 }
1129 }
1130
1131 for (int rIdx=0; rIdx<shdr->num_ref_idx_l1_active && AllDiffPicOrderCntLEZero; rIdx++)
1132 {
1133 int aPOC = ctx->dpb[shdr->RefPicList[1][rIdx]].PicOrderCntVal;
1134
1135 if (aPOC > PicOrderCntVal) {
1136 AllDiffPicOrderCntLEZero = false;
1137 }
1138 }
1139
1140 if (AllDiffPicOrderCntLEZero) {
1141 mvCol = mvi->mv[X];
1142 refIdxCol = mvi->refIdx[X];
1143 listCol = X;
1144 }
1145 else {
1146 int N = shdr->collocated_from_l0_flag;
1147 mvCol = mvi->mv[N];
1148 refIdxCol = mvi->refIdx[N];
1149 listCol = N;
1150 }
1151 }
1152 }
1153
1154
1155
1156 slice_segment_header* colShdr = &ctx->slice[ get_SliceHeaderIndex(colImg,ctx->current_sps,xColPb,yColPb) ];
1157
1158 if (shdr->LongTermRefPic[X][refIdxLX] !=
1159 colShdr->LongTermRefPic[listCol][refIdxCol]) {
1160 *out_availableFlagLXCol = 0;
1161 out_mvLXCol->x = 0;
1162 out_mvLXCol->y = 0;
1163 }
1164 else {
1165 *out_availableFlagLXCol = 1;
1166
1167 const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX];
1168
1169 int colDist = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol];
1170 int currDist = ctx->img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX];
1171
1172 logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist,
1173 colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol],
1174 ctx->img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX]
1175 );
1176
1177 if (isLongTerm || colDist == currDist) {
1178 *out_mvLXCol = mvCol;
1179 }
1180 else {
1181 if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) {
1182 //printf("A\n");
1183 add_warning(ctx, DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1184 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
1185 }
1186
1187 logtrace(LogMotion,"scale: %d;%d to %d;%d\n",
1188 mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y);
1189 }
1190 }
1191 }
1192 }
1193
1194
1195 // 8.5.3.1.7
1196 void derive_temporal_luma_vector_prediction(decoder_context* ctx,
1197 const slice_segment_header* shdr,
1198 int xP,int yP,
1199 int nPbW,int nPbH,
1200 int refIdxL, int X,
1201 MotionVector* out_mvLXCol,
1202 uint8_t* out_availableFlagLXCol)
1203 {
1204
1205 if (shdr->slice_temporal_mvp_enabled_flag == 0) {
1206 out_mvLXCol->x = 0;
1207 out_mvLXCol->y = 0;
1208 *out_availableFlagLXCol = 0;
1209 return;
1210 }
1211
1212 int Log2CtbSizeY = ctx->current_sps->Log2CtbSizeY;
1213
1214 int colPic; // TODO: this is the same for the whole slice. We can precompute it.
1215
1216 if (shdr->slice_type == SLICE_TYPE_B &&
1217 shdr->collocated_from_l0_flag == 0)
1218 {
1219 logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx);
1220
1221 // TODO: make sure that shdr->collocated_ref_idx is a valid index
1222 colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ];
1223 }
1224 else
1225 {
1226 logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx);
1227
1228 // TODO: make sure that shdr->collocated_ref_idx is a valid index
1229 colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ];
1230 }
1231
1232 logtrace(LogMotion,"collocated reference POC=%d\n",ctx->dpb[colPic].PicOrderCntVal);
1233
1234
1235 int xColPb,yColPb;
1236 int yColBr = yP + nPbH; // bottom right collocated motion vector position
1237 int xColBr = xP + nPbW;
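  // The temporal candidate is first taken from the block just below-right of the PB,
  // but only if that position stays inside the picture and within the same CTB row;
  // otherwise (see below) the PB-center position is used instead.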
1238
1239 if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) &&
1240 xColBr < ctx->current_sps->pic_width_in_luma_samples &&
1241 yColBr < ctx->current_sps->pic_height_in_luma_samples)
1242 {
1243 xColPb = xColBr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid
1244 yColPb = yColBr & ~0x0F;
1245
1246 derive_collocated_motion_vectors(ctx,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X,
1247 out_mvLXCol, out_availableFlagLXCol);
1248 }
1249 else
1250 {
1251 out_mvLXCol->x = 0;
1252 out_mvLXCol->y = 0;
1253 *out_availableFlagLXCol = 0;
1254 }
1255
1256
1257 if (*out_availableFlagLXCol==0) {
1258
1259 int xColCtr = xP+(nPbW>>1);
1260 int yColCtr = yP+(nPbH>>1);
1261
1262 xColPb = xColCtr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid
1263 yColPb = yColCtr & ~0x0F;
1264
1265 derive_collocated_motion_vectors(ctx,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X,
1266 out_mvLXCol, out_availableFlagLXCol);
1267 }
1268 }
1269
1270
1271 static int table_8_19[2][12] = {
1272 { 0,1,0,2,1,2,0,3,1,3,2,3 },
1273 { 1,0,2,0,2,1,3,0,3,1,3,2 }
1274 };
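// Each column of table_8_19 gives the pair (l0CandIdx, l1CandIdx) used to combine an
// L0 and an L1 candidate, in the order defined by Table 8-19 of the HEVC spec.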
1275
1276 // 8.5.3.1.3
1277 void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
1278 slice_segment_header* shdr,
1279 PredVectorInfo* inout_mergeCandList,
1280 int* inout_numMergeCand,
1281 int numOrigMergeCand)
1282 {
1283 if (*inout_numMergeCand>1 && *inout_numMergeCand < shdr->MaxNumMergeCand) {
1284 int numInputMergeCand = *inout_numMergeCand;
1285 int combIdx = 0;
1286 uint8_t combStop = false;
1287
1288 while (!combStop) {
1289 int l0CandIdx = table_8_19[0][combIdx];
1290 int l1CandIdx = table_8_19[1][combIdx];
1291
1292 if (l0CandIdx >= numInputMergeCand ||
1293 l1CandIdx >= numInputMergeCand) {
1294 assert(false); // bitstream error -> TODO: conceal error
1295 }
1296
1297 PredVectorInfo* l0Cand = &inout_mergeCandList[l0CandIdx];
1298 PredVectorInfo* l1Cand = &inout_mergeCandList[l1CandIdx];
1299
1300 logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx);
1301 logtrace(LogMotion,"l0Cand:\n"); logmvcand(*l0Cand);
1302 logtrace(LogMotion,"l1Cand:\n"); logmvcand(*l1Cand);
1303
1304 if (l0Cand->predFlag[0] && l1Cand->predFlag[1] &&
1305 (ctx->dpb[shdr->RefPicList[0][l0Cand->refIdx[0]]].PicOrderCntVal !=
1306 ctx->dpb[shdr->RefPicList[1][l1Cand->refIdx[1]]].PicOrderCntVal ||
1307 l0Cand->mv[0].x != l1Cand->mv[1].x ||
1308 l0Cand->mv[0].y != l1Cand->mv[1].y)) {
1309 PredVectorInfo* p = &inout_mergeCandList[ *inout_numMergeCand ];
1310 p->refIdx[0] = l0Cand->refIdx[0];
1311 p->refIdx[1] = l1Cand->refIdx[1];
1312 p->predFlag[0] = l0Cand->predFlag[0];
1313 p->predFlag[1] = l1Cand->predFlag[1];
1314 p->mv[0] = l0Cand->mv[0];
1315 p->mv[1] = l1Cand->mv[1];
1316 (*inout_numMergeCand)++;
1317
1318 logtrace(LogMotion,"result:\n");
1319 logmvcand(*p);
1320 }
1321
1322 combIdx++;
1323 if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) ||
1324 *inout_numMergeCand == shdr->MaxNumMergeCand) {
1325 combStop = true;
1326 }
1327 }
1328 }
1329 }
1330
1331
1332 // 8.5.3.1.1
1333 void derive_luma_motion_merge_mode(decoder_context* ctx,
1334 thread_context* tctx,
1335 int xC,int yC, int xP,int yP,
1336 int nCS, int nPbW,int nPbH, int partIdx,
1337 VectorInfo* out_vi)
1338 {
1339 slice_segment_header* shdr = tctx->shdr;
1340
1341 //int xOrigP = xP;
1342 //int yOrigP = yP;
1343 int nOrigPbW = nPbW;
1344 int nOrigPbH = nPbH;
1345
1346 int singleMCLFlag;
1347 singleMCLFlag = (ctx->current_pps->log2_parallel_merge_level > 2 && nCS==8);
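  // When the parallel merge level is larger than 4x4 and the CB is 8x8, all PUs of this
  // CB share a single merge candidate list, derived as if the CB were one 2Nx2N PU.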
1348
1349 if (singleMCLFlag) {
1350 xP=xC;
1351 yP=yC;
1352 nPbW=nCS;
1353 nPbH=nCS;
1354 partIdx=0;
1355 }
1356
1357 MergingCandidates mergeCand;
1358 derive_spatial_merging_candidates(ctx, xC,yC, nCS, xP,yP, singleMCLFlag,
1359 nPbW,nPbH,partIdx, &mergeCand);
1360
1361 int refIdxCol[2] = { 0,0 };
1362
1363 MotionVector mvCol[2];
1364 uint8_t predFlagLCol[2];
1365 derive_temporal_luma_vector_prediction(ctx,shdr, xP,yP,nPbW,nPbH, refIdxCol[0],0, &mvCol[0],
1366 &predFlagLCol[0]);
1367
1368 uint8_t availableFlagCol = predFlagLCol[0];
1369 predFlagLCol[1] = 0;
1370
1371 if (shdr->slice_type == SLICE_TYPE_B) {
1372 derive_temporal_luma_vector_prediction(ctx,shdr, xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1],
1373 &predFlagLCol[1]);
1374 availableFlagCol |= predFlagLCol[1];
1375 }
1376
1377
1378 // 4.
1379
1380 PredVectorInfo mergeCandList[5];
1381 int numMergeCand=0;
1382
1383 for (int i=0;i<5;i++) {
1384 if (mergeCand.available[i]) {
1385 mergeCandList[numMergeCand++] = mergeCand.pred_vector[i];
1386 }
1387 }
1388
1389 if (availableFlagCol) {
1390 // TODO: save in mergeCand directly...
1391 mergeCand.available[PRED_COL] = availableFlagCol;
1392 mergeCand.pred_vector[PRED_COL].mv[0] = mvCol[0];
1393 mergeCand.pred_vector[PRED_COL].mv[1] = mvCol[1];
1394 mergeCand.pred_vector[PRED_COL].predFlag[0] = predFlagLCol[0];
1395 mergeCand.pred_vector[PRED_COL].predFlag[1] = predFlagLCol[1];
1396 mergeCand.pred_vector[PRED_COL].refIdx[0] = refIdxCol[0];
1397 mergeCand.pred_vector[PRED_COL].refIdx[1] = refIdxCol[1];
1398
1399 mergeCandList[numMergeCand++] = mergeCand.pred_vector[PRED_COL];
1400 }
1401
1402 // 5.
1403
1404 //int numOrigMergeCand = numMergeCand;
1405
1406 // 6.
1407
1408 //int numCombMergeCand = 0;
1409
1410 if (shdr->slice_type == SLICE_TYPE_B) {
1411 derive_combined_bipredictive_merging_candidates(ctx, shdr,
1412 mergeCandList, &numMergeCand, numMergeCand);
1413
1414 //numCombMergeCand = numMergeCand - numOrigMergeCand;
1415 }
1416
1417
1418 // 7.
1419
1420 derive_zero_motion_vector_candidates(ctx, shdr,
1421 mergeCandList, &numMergeCand);
1422
1423 // 8.
1424
1425 int merge_idx = tctx->merge_idx; // get_merge_idx(ctx,xP,yP);
1426 out_vi->lum = mergeCandList[merge_idx];
1427
1428
1429 logtrace(LogMotion,"mergeCandList:\n");
1430 for (int i=0;i<shdr->MaxNumMergeCand;i++)
1431 {
1432 logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? " SELECTED":"");
1433 logmvcand(mergeCandList[i]);
1434 }
1435
1436 // 9.
1437
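  // Bi-prediction is not allowed for 8x4 and 4x8 PUs (nPbW+nPbH==12);
  // such a merge candidate is demoted to L0 uni-prediction.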
1438 if (out_vi->lum.predFlag[0] && out_vi->lum.predFlag[1] && nOrigPbW+nOrigPbH==12) {
1439 out_vi->lum.refIdx[1] = -1;
1440 out_vi->lum.predFlag[1] = 0;
1441 }
1442 }
1443
1444
1445 // 8.5.3.1.6
1446 void derive_spatial_luma_vector_prediction(decoder_context* ctx,
1447 const slice_segment_header* shdr,
1448 int xC,int yC,int nCS,int xP,int yP,
1449 int nPbW,int nPbH, int X,
1450 int refIdxLX, int partIdx,
1451 uint8_t out_availableFlagLXN[2],
1452 MotionVector out_mvLXN[2])
1453 {
1454 int isScaledFlagLX = 0;
1455
1456 const int A=0;
1457 const int B=1;
1458
1459 // --- A ---
1460
1461 // 1.
1462
1463 int xA[2], yA[2];
1464 xA[0] = xP-1;
1465 yA[0] = yP + nPbH;
1466 xA[1] = xA[0];
1467 yA[1] = yA[0]-1;
1468
1469 // 2.
1470
1471 out_availableFlagLXN[A] = 0;
1472 out_mvLXN[A].x = 0;
1473 out_mvLXN[A].y = 0;
1474
1475 // 3. / 4.
1476
1477 bool availableA[2];
1478 availableA[0] = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[0],yA[0]);
1479 availableA[1] = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[1],yA[1]);
1480
1481 // 5.
1482
1483 if (availableA[0] || availableA[1]) {
1484 isScaledFlagLX = 1;
1485 }
1486
1487 // 6. test A0 and A1 (Ak)
1488
1489 int refIdxA=-1;
1490
1491 // the POC we want to reference in this PB
1492 const int referenced_POC = ctx->dpb[ shdr->RefPicList[X][ refIdxLX ] ].PicOrderCntVal;
1493
1494 for (int k=0;k<=1;k++) {
1495 if (availableA[k] &&
1496 out_availableFlagLXN[A]==0 && // no A?-predictor so far
1497 get_pred_mode(ctx->img,ctx->current_sps,xA[k],yA[k]) != MODE_INTRA) {
1498
1499 int Y=1-X;
1500
1501 const PredVectorInfo* vi = get_mv_info(ctx, xA[k],yA[k]);
1502 logtrace(LogMotion,"MVP A%d=\n",k);
1503 logmvcand(*vi);
1504
1505 // check whether the predictor X is available and references the same POC
1506 if (vi->predFlag[X] &&
1507 ctx->dpb[ shdr->RefPicList[X][ vi->refIdx[X] ] ].PicOrderCntVal == referenced_POC) {
1508
1509 logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X);
1510
1511 out_availableFlagLXN[A]=1;
1512 out_mvLXN[A] = vi->mv[X];
1513 refIdxA = vi->refIdx[X];
1514 }
1515 // check whether the other predictor (Y) is available and references the same POC
1516 else if (vi->predFlag[Y] &&
1517 ctx->dpb[ shdr->RefPicList[Y][ vi->refIdx[Y] ] ].PicOrderCntVal == referenced_POC) {
1518
1519 logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y);
1520
1521 out_availableFlagLXN[A]=1;
1522 out_mvLXN[A] = vi->mv[Y];
1523 refIdxA = vi->refIdx[Y];
1524 }
1525 }
1526 }
1527
1528 // 7. If there is no predictor referencing the same POC, we take any other reference as
1529 // long as it is the same type of reference (long-term / short-term)
1530
1531 for (int k=0 ; k<=1 && out_availableFlagLXN[A]==0 ; k++) {
1532 int refPicList=-1;
1533
1534 if (availableA[k] &&
1535 // TODO: we could remove this call by storing the result of the similar computation above
1536 get_pred_mode(ctx->img,ctx->current_sps,xA[k],yA[k]) != MODE_INTRA) {
1537
1538 int Y=1-X;
1539
1540 const PredVectorInfo* vi = get_mv_info(ctx, xA[k],yA[k]);
1541 if (vi->predFlag[X]==1 &&
1542 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
1543
1544         logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,X);
1545
1546 out_availableFlagLXN[A]=1;
1547 out_mvLXN[A] = vi->mv[X];
1548 refIdxA = vi->refIdx[X];
1549 refPicList = X;
1550 }
1551 else if (vi->predFlag[Y]==1 &&
1552 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi->refIdx[Y] ]) {
1553
1554 logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,Y);
1555
1556 out_availableFlagLXN[A]=1;
1557 out_mvLXN[A] = vi->mv[Y];
1558 refIdxA = vi->refIdx[Y];
1559 refPicList = Y;
1560 }
1561 }
1562
1563 if (out_availableFlagLXN[A]==1) {
1564 assert(refIdxA>=0);
1565 assert(refPicList>=0);
1566
1567 const de265_image* refPicA = &ctx->dpb[ shdr->RefPicList[refPicList][refIdxA ] ];
1568 const de265_image* refPicX = &ctx->dpb[ shdr->RefPicList[X ][refIdxLX] ];
1569
1570 logtrace(LogMotion,"scale MVP A: A-POC:%d X-POC:%d\n",
1571 refPicA->PicOrderCntVal,refPicX->PicOrderCntVal);
1572
1573 if (refPicA->PicState == UsedForShortTermReference &&
1574 refPicX->PicState == UsedForShortTermReference) {
1575
1576 int distA = ctx->img->PicOrderCntVal - refPicA->PicOrderCntVal;
1577 int distX = ctx->img->PicOrderCntVal - referenced_POC;
1578
1579 if (!scale_mv(&out_mvLXN[A], out_mvLXN[A], distA, distX)) {
1580 add_warning(ctx, DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1581 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
1582 }
1583 }
1584 }
1585 }
1586
1587
1588 // --- B ---
1589
1590 // 1.
1591
1592 int xB[3], yB[3];
1593 xB[0] = xP+nPbW;
1594 yB[0] = yP-1;
1595 xB[1] = xB[0]-1;
1596 yB[1] = yP-1;
1597 xB[2] = xP-1;
1598 yB[2] = yP-1;
1599
1600 // 2.
1601
1602 out_availableFlagLXN[B] = 0;
1603 out_mvLXN[B].x = 0;
1604 out_mvLXN[B].y = 0;
1605
1606 // 3. test B0,B1,B2 (Bk)
1607
1608 int refIdxB=-1;
1609
1610 bool availableB[3];
1611 for (int k=0;k<3;k++) {
1612 availableB[k] = available_pred_blk(ctx, xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]);
1613
1614 if (availableB[k] && out_availableFlagLXN[B]==0) {
1615
1616 int Y=1-X;
1617
1618 const PredVectorInfo* vi = get_mv_info(ctx, xB[k],yB[k]);
1619 logtrace(LogMotion,"MVP B%d=\n",k);
1620 logmvcand(*vi);
1621
1622 if (vi->predFlag[X] &&
1623 ctx->dpb[ shdr->RefPicList[X][ vi->refIdx[X] ] ].PicOrderCntVal == referenced_POC) {
1624
1625 logtrace(LogMotion,"take B%d/L%d as B candidate with same POC\n",k,X);
1626
1627 out_availableFlagLXN[B]=1;
1628 out_mvLXN[B] = vi->mv[X];
1629 refIdxB = vi->refIdx[X];
1630 }
1631 else if (vi->predFlag[Y] &&
1632 ctx->dpb[ shdr->RefPicList[Y][ vi->refIdx[Y] ] ].PicOrderCntVal == referenced_POC) {
1633
1634 logtrace(LogMotion,"take B%d/L%d as B candidate with same POC\n",k,Y);
1635
1636 out_availableFlagLXN[B]=1;
1637 out_mvLXN[B] = vi->mv[Y];
1638 refIdxB = vi->refIdx[Y];
1639 }
1640 }
1641 }
1642
1643 // 4.
1644
1645 if (isScaledFlagLX==0 && // no A predictor,
1646 out_availableFlagLXN[B]) // but an unscaled B predictor
1647 {
1648 // use unscaled B predictor as A predictor
1649
1650 logtrace(LogMotion,"copy the same-POC B candidate as additional A candidate\n");
1651
1652 out_availableFlagLXN[A]=1;
1653 out_mvLXN[A] = out_mvLXN[B];
1654 refIdxA = refIdxB;
1655 }
1656
1657 // 5.
1658
1659 // If no A predictor, we output the unscaled B as the A predictor (above)
1660 // and also add a scaled B predictor here.
1661 // If there is (probably) an A predictor, no differing-POC B predictor is generated.
1662 if (isScaledFlagLX==0) {
1663 out_availableFlagLXN[B]=0;
1664
1665 for (int k=0 ; k<=2 && out_availableFlagLXN[B]==0 ; k++) {
1666 int refPicList=-1;
1667
1668 if (availableB[k]) {
1669 int Y=1-X;
1670
1671 const PredVectorInfo* vi = get_mv_info(ctx, xB[k],yB[k]);
1672 if (vi->predFlag[X]==1 &&
1673 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
1674 out_availableFlagLXN[B]=1;
1675 out_mvLXN[B] = vi->mv[X];
1676 refIdxB = vi->refIdx[X];
1677 refPicList = X;
1678 }
1679 else if (vi->predFlag[Y]==1 &&
1680 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi->refIdx[Y] ]) {
1681 out_availableFlagLXN[B]=1;
1682 out_mvLXN[B] = vi->mv[Y];
1683 refIdxB = vi->refIdx[Y];
1684 refPicList = Y;
1685 }
1686 }
1687
1688 if (out_availableFlagLXN[B]==1) {
1689 assert(refPicList>=0);
1690 assert(refIdxB>=0);
1691
1692 const de265_image* refPicB = &ctx->dpb[ shdr->RefPicList[refPicList][refIdxB ] ];
1693 const de265_image* refPicX = &ctx->dpb[ shdr->RefPicList[X ][refIdxLX] ];
1694 if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal &&
1695 refPicB->PicState == UsedForShortTermReference &&
1696 refPicX->PicState == UsedForShortTermReference) {
1697
1698 int distB = ctx->img->PicOrderCntVal - refPicB->PicOrderCntVal;
1699 int distX = ctx->img->PicOrderCntVal - referenced_POC;
1700
1701 logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal);
1702
1703 if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) {
1704 add_warning(ctx, DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1705 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
1706 }
1707 }
1708 }
1709 }
1710 }
1711 }
1712
1713 // 8.5.3.1.5
1714 MotionVector luma_motion_vector_prediction(decoder_context* ctx,
1715 thread_context* tctx,
1716 int xC,int yC,int nCS,int xP,int yP,
1717 int nPbW,int nPbH, int l,
1718 int refIdx, int partIdx)
1719 {
1720 const slice_segment_header* shdr = tctx->shdr;
1721
1722
1723 // 8.5.3.1.6: derive two spatial vector predictors A (0) and B (1)
1724
1725 uint8_t availableFlagLXN[2];
1726 MotionVector mvLXN[2];
1727
1728 derive_spatial_luma_vector_prediction(ctx, shdr, xC,yC, nCS, xP,yP, nPbW,nPbH, l, refIdx, partIdx,
1729 availableFlagLXN, mvLXN);
1730
1731 // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same,
1732 // derive a temporal predictor
1733
1734 uint8_t availableFlagLXCol;
1735 MotionVector mvLXCol;
1736
1737
1738 if (availableFlagLXN[0] &&
1739 availableFlagLXN[1] &&
1740 (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)) {
1741 availableFlagLXCol = 0;
1742 }
1743 else {
1744 derive_temporal_luma_vector_prediction(ctx, shdr, xP,yP, nPbW,nPbH, refIdx,l,
1745 &mvLXCol, &availableFlagLXCol);
1746 }
1747
1748
1749 // --- build candidate vector list with exactly two entries ---
1750
1751 int numMVPCandLX=0;
1752
1753 // spatial predictor A
1754
1755 MotionVector mvpList[3];
1756 if (availableFlagLXN[0])
1757 {
1758 mvpList[numMVPCandLX++] = mvLXN[0];
1759 }
1760
1761 // spatial predictor B (if not same as A)
1762
1763 if (availableFlagLXN[1] &&
1764       (!availableFlagLXN[0] || // in case A is not available, mvLXN[A] is zero-initialized and may coincide with mvLXN[B]
1765 (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)))
1766 {
1767 mvpList[numMVPCandLX++] = mvLXN[1];
1768 }
1769
1770 // temporal predictor
1771
1772 if (availableFlagLXCol)
1773 {
1774 mvpList[numMVPCandLX++] = mvLXCol;
1775 }
1776
1777 // fill with zero predictors
1778
1779 while (numMVPCandLX<2) {
1780 mvpList[numMVPCandLX].x = 0;
1781 mvpList[numMVPCandLX].y = 0;
1782 numMVPCandLX++;
1783 }
1784
1785
1786 // select predictor according to mvp_lX_flag
1787
1788 return mvpList[ tctx->mvp_lX_flag[l] ];
1789 }
1790
1791 void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const VectorInfo* mv)
1792 {
1793 #if DE265_LOG_TRACE
1794 int pred0 = mv->lum.predFlag[0];
1795 int pred1 = mv->lum.predFlag[1];
1796
1797 logtrace(LogMotion,
1798 "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode,
1799 pred0,
1800 pred0 ? mv->lum.mv[0].x : 0,pred0 ? mv->lum.mv[0].y : 0, pred0 ? mv->lum.refIdx[0] : 0,
1801 pred1,
1802 pred1 ? mv->lum.mv[1].x : 0,pred1 ? mv->lum.mv[1].y : 0, pred1 ? mv->lum.refIdx[1] : 0);
1803 #endif
1804 }
1805
1806
1807
1808 // 8.5.3.1
1809 void motion_vectors_and_ref_indices(decoder_context* ctx,
1810 thread_context* tctx,
1811 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx,
1812 VectorInfo* out_vi)
1813 {
1814 //slice_segment_header* shdr = tctx->shdr;
1815
1816 int xP = xC+xB;
1817 int yP = yC+yB;
1818
1819 enum PredMode predMode = get_pred_mode(ctx->img,ctx->current_sps, xC,yC);
1820
1821 if (predMode == MODE_SKIP ||
1822 (predMode == MODE_INTER && tctx->merge_flag))
1823 {
1824 derive_luma_motion_merge_mode(ctx,tctx, xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, out_vi);
1825
1826 logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi);
1827 }
1828 else {
1829 int mvdL[2][2];
1830 MotionVector mvpL[2];
1831
1832 for (int l=0;l<2;l++) {
1833 // 1.
1834
1835 enum InterPredIdc inter_pred_idc = (enum InterPredIdc)tctx->inter_pred_idc;
1836
1837 if (inter_pred_idc == PRED_BI ||
1838 (inter_pred_idc == PRED_L0 && l==0) ||
1839 (inter_pred_idc == PRED_L1 && l==1)) {
1840 out_vi->lum.refIdx[l] = tctx->refIdx[l];
1841 out_vi->lum.predFlag[l] = 1;
1842 }
1843 else {
1844 out_vi->lum.refIdx[l] = -1;
1845 out_vi->lum.predFlag[l] = 0;
1846 }
1847
1848 // 2.
1849
1850 mvdL[l][0] = tctx->mvd[l][0];
1851 mvdL[l][1] = tctx->mvd[l][1];
1852
1853
1854 if (out_vi->lum.predFlag[l]) {
1855 // 3.
1856
1857 mvpL[l] = luma_motion_vector_prediction(ctx,tctx,xC,yC,nCS,xP,yP, nPbW,nPbH, l,
1858 out_vi->lum.refIdx[l], partIdx);
1859
1860 // 4.
1861
1862 int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF;
1863 int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF;
1864
1865 out_vi->lum.mv[l].x = (x>=0x8000) ? x-0x10000 : x;
1866 out_vi->lum.mv[l].y = (y>=0x8000) ? y-0x10000 : y;
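        // mvp+mvd is wrapped modulo 2^16 and reinterpreted as a signed 16-bit value,
        // matching the spec's (mvpLX + mvdLX + 2^16) % 2^16 derivation of the final MV.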
1867 }
1868 }
1869
1870 logMV(xP,yP,nPbW,nPbH, "mvp", out_vi);
1871 }
1872 }
1873
1874
1875 // 8.5.3
1876 void decode_prediction_unit(decoder_context* ctx,
1877 thread_context* tctx,
1878 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx)
1879 {
1880 logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n",
1881 ctx->img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH);
1882
1883 slice_segment_header* shdr = tctx->shdr;
1884
1885 // 1.
1886
1887 VectorInfo vi;
1888 motion_vectors_and_ref_indices(ctx,tctx, xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi);
1889
1890 // 2.
1891
1892 generate_inter_prediction_samples(ctx,shdr, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi);
1893
1894
1895 set_mv_info(ctx,xC+xB,yC+yB,nPbW,nPbH, &vi.lum);
1896 }
1897
1898
1899 // 8.5.2
1900 #if 0
1901 void inter_prediction(decoder_context* ctx,slice_segment_header* shdr,
1902 int xC,int yC, int log2CbSize)
1903 {
1904 int nCS_L = 1<<log2CbSize;
1905 //int nCS_C = nCS_L>>1;
1906 int nCS1L = nCS_L>>1;
1907
1908 enum PartMode partMode = get_PartMode(ctx->img,ctx->current_sps,xC,yC);
1909 switch (partMode) {
1910 case PART_2Nx2N:
1911 decode_prediction_unit(ctx,shdr,xC,yC, 0,0, nCS_L, nCS_L,nCS_L, 0);
1912 break;
1913
1914 case PART_2NxN:
1915 decode_prediction_unit(ctx,shdr,xC,yC, 0,0, nCS_L, nCS_L,nCS1L, 0);
1916 decode_prediction_unit(ctx,shdr,xC,yC, 0,nCS1L, nCS_L, nCS_L,nCS1L, 1);
1917 break;
1918
1919 case PART_Nx2N:
1920 decode_prediction_unit(ctx,shdr,xC,yC, 0, 0, nCS_L, nCS1L,nCS_L, 0);
1921 decode_prediction_unit(ctx,shdr,xC,yC, nCS1L,0, nCS_L, nCS1L,nCS_L, 1);
1922 break;
1923
1924 case PART_2NxnU:
1925 decode_prediction_unit(ctx,shdr,xC,yC, 0,0, nCS_L, nCS_L,nCS1L>>1, 0);
1926 decode_prediction_unit(ctx,shdr,xC,yC, 0,nCS1L>>1, nCS_L, nCS_L,nCS1L + (nCS1L>>1), 1);
1927 break;
1928
1929 case PART_2NxnD:
1930 decode_prediction_unit(ctx,shdr,xC,yC, 0,0, nCS_L, nCS_L,nCS1L + (nCS1L>>1), 0);
1931 decode_prediction_unit(ctx,shdr,xC,yC, 0,nCS1L + (nCS1L>>1), nCS_L, nCS_L,nCS1L>>1, 1);
1932 break;
1933
1934 case PART_nLx2N:
1935 decode_prediction_unit(ctx,shdr,xC,yC, 0, 0, nCS_L, nCS1L>>1, nCS_L, 0);
1936 decode_prediction_unit(ctx,shdr,xC,yC, nCS1L>>1,0, nCS_L, nCS1L + (nCS1L>>1),nCS_L, 1);
1937 break;
1938
1939 case PART_nRx2N:
1940 decode_prediction_unit(ctx,shdr,xC,yC, 0, 0, nCS_L, nCS1L + (nCS1L>>1),nCS_L, 0);
1941 decode_prediction_unit(ctx,shdr,xC,yC, nCS1L + (nCS1L>>1),0, nCS_L, nCS1L>>1,nCS_L, 1);
1942 break;
1943
1944 case PART_NxN:
1945 decode_prediction_unit(ctx,shdr,xC,yC, 0, 0, nCS_L, nCS1L,nCS1L, 0);
1946 decode_prediction_unit(ctx,shdr,xC,yC, nCS1L,0, nCS_L, nCS1L,nCS1L, 1);
1947 decode_prediction_unit(ctx,shdr,xC,yC, 0, nCS1L, nCS_L, nCS1L,nCS1L, 2);
1948 decode_prediction_unit(ctx,shdr,xC,yC, nCS1L,nCS1L, nCS_L, nCS1L,nCS1L, 3);
1949 break;
1950
1951 default:
1952 assert(false); // undefined partitioning mode
1953 }
1954 }
1955 #endif
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "motion.h"
21 #include "decctx.h"
22 #include "util.h"
23 #include "dpb.h"
24 #include <assert.h>
25
26
27 #include <sys/types.h>
28 #include <signal.h>
29 #include <string.h>
30
31 #if defined(_MSC_VER) || defined(__MINGW32__)
32 # include <malloc.h>
33 #else
34 # include <alloca.h>
35 #endif
36
37
38 #define MAX_CU_SIZE 64
39
40
41 enum {
42 // important! order like shown in 8.5.3.1.1
43 PRED_A1 = 0,
44 PRED_B1 = 1,
45 PRED_B0 = 2,
46 PRED_A0 = 3,
47 PRED_B2 = 4,
48 PRED_COL = 5,
49 PRED_ZERO= 6
50 };
51
52
53 typedef struct
54 {
55 uint8_t available[7];
56 PredVectorInfo pred_vector[7];
57 } MergingCandidates;
58
59
60 void reset_pred_vector(PredVectorInfo* pvec)
61 {
62 for (int X=0;X<2;X++) {
63 pvec->mv[X].x = 0;
64 pvec->mv[X].y = 0;
65 pvec->refIdx[X] = -1;
66 pvec->predFlag[X] = 0;
67 }
68 }
69
70
71 static int extra_before[4] = { 0,3,3,2 };
72 static int extra_after [4] = { 0,3,4,4 };
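// Margins of extra reference samples needed by the fractional-pel luma
// interpolation filters, indexed by the fractional position (0..3):
// e.g. the half-pel 8-tap filter spans taps -3..+4, so it needs 3 samples
// before and 4 after the block; full-pel (index 0) needs no margin.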
73
74
75
76 void mc_luma(const decoder_context* ctx,
77 const de265_image* img, int mv_x, int mv_y,
78 int xP,int yP,
79 int16_t* out, int out_stride,
80 uint8_t* ref, int ref_stride,
81 int nPbW, int nPbH)
82 {
83 const seq_parameter_set* sps = &img->sps;
84
85 int xFracL = mv_x & 3;
86 int yFracL = mv_y & 3;
87
88 int xIntOffsL = xP + (mv_x>>2);
89 int yIntOffsL = yP + (mv_y>>2);
90
91 // luma sample interpolation process (8.5.3.2.2.1)
92
93 //const int shift1 = sps->BitDepth_Y-8;
94 //const int shift2 = 6;
95 const int shift3 = 14 - sps->BitDepth_Y;
96
97 int w = sps->pic_width_in_luma_samples;
98 int h = sps->pic_height_in_luma_samples;
99
100 ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)];
101
102 if (xFracL==0 && yFracL==0) {
107 if (xIntOffsL >= 0 && yIntOffsL >= 0 &&
108 nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) {
109
110 ctx->acceleration.put_hevc_qpel_8[0][0](out, out_stride,
111 &ref[yIntOffsL*ref_stride + xIntOffsL],
112 ref_stride,
113 nPbW,nPbH, mcbuffer);
114 }
115 else {
116 for (int y=0;y<nPbH;y++)
117 for (int x=0;x<nPbW;x++) {
118
119 int xA = Clip3(0,w-1,x + xIntOffsL);
120 int yA = Clip3(0,h-1,y + yIntOffsL);
121
122 out[y*out_stride+x] = ref[ xA + yA*ref_stride ] << shift3;
123 }
124 }
125
126 #ifdef DE265_LOG_TRACE
127 logtrace(LogMotion,"---MC luma %d %d = direct---\n",xFracL,yFracL);
128
129 for (int y=0;y<nPbH;y++) {
130 for (int x=0;x<nPbW;x++) {
131
132 int xA = Clip3(0,w-1,x + xIntOffsL);
133 int yA = Clip3(0,h-1,y + yIntOffsL);
134
135 logtrace(LogMotion,"%02x ", ref[ xA + yA*ref_stride ]);
136 }
137 logtrace(LogMotion,"\n");
138 }
139
140 logtrace(LogMotion," -> \n");
141
142 for (int y=0;y<nPbH;y++) {
143 for (int x=0;x<nPbW;x++) {
144
145 logtrace(LogMotion,"%02x ",out[y*out_stride+x] >> 6); // 6 will be used when summing predictions
146 }
147 logtrace(LogMotion,"\n");
148 }
149 #endif
150 }
151 else {
152 int extra_left = extra_before[xFracL];
153 int extra_right = extra_after [xFracL];
154 int extra_top = extra_before[yFracL];
155 int extra_bottom = extra_after [yFracL];
156
157 //int nPbW_extra = extra_left + nPbW + extra_right;
158 //int nPbH_extra = extra_top + nPbH + extra_bottom;
159
160
161 uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)];
162
163 uint8_t* src_ptr;
164 int src_stride;
165
166 if (-extra_left + xIntOffsL >= 0 &&
167 -extra_top + yIntOffsL >= 0 &&
168 nPbW+extra_right + xIntOffsL < w &&
169 nPbH+extra_bottom + yIntOffsL < h) {
170 src_ptr = &ref[xIntOffsL + yIntOffsL*ref_stride];
171 src_stride = ref_stride;
172 }
173 else {
174 for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
175 for (int x=-extra_left;x<nPbW+extra_right;x++) {
176
177 int xA = Clip3(0,w-1,x + xIntOffsL);
178 int yA = Clip3(0,h-1,y + yIntOffsL);
179
180 padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = ref[ xA + yA*ref_stride ];
181 }
182 }
183
184 src_ptr = &padbuf[extra_top*(MAX_CU_SIZE+16) + extra_left];
185 src_stride = MAX_CU_SIZE+16;
186 }
187
188 ctx->acceleration.put_hevc_qpel_8[xFracL][yFracL](out, out_stride,
189 src_ptr, src_stride,
190 nPbW,nPbH, mcbuffer);
191
192
193 logtrace(LogMotion,"---V---\n");
194 for (int y=0;y<nPbH;y++) {
195 for (int x=0;x<nPbW;x++) {
196 logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
197 }
198 logtrace(LogMotion,"\n");
199 }
200 }
201 }
202
203
204
205 void mc_chroma(const decoder_context* ctx,
206 const de265_image* img,
207 int mv_x, int mv_y,
208 int xP,int yP,
209 int16_t* out, int out_stride,
210 uint8_t* ref, int ref_stride,
211 int nPbWC, int nPbHC)
212 {
213 const seq_parameter_set* sps = &img->sps;
214
215 // chroma sample interpolation process (8.5.3.2.2.2)
216
217 //const int shift1 = sps->BitDepth_C-8;
218 //const int shift2 = 6;
219 const int shift3 = 14 - sps->BitDepth_C;
220
221 int wC = sps->pic_width_in_luma_samples /sps->SubWidthC;
222 int hC = sps->pic_height_in_luma_samples/sps->SubHeightC;
223
224 int xFracC = mv_x & 7;
225 int yFracC = mv_y & 7;
226
227 int xIntOffsC = xP/2 + (mv_x>>3);
228 int yIntOffsC = yP/2 + (mv_y>>3);
229
230 ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]);
231
232 if (xFracC == 0 && yFracC == 0) {
233 if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC &&
234 yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) {
235 ctx->acceleration.put_hevc_epel_8(out, out_stride,
236 &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride,
237 nPbWC,nPbHC, 0,0, NULL);
238 }
239 else
240 {
241 for (int y=0;y<nPbHC;y++)
242 for (int x=0;x<nPbWC;x++) {
243
244 int xB = Clip3(0,wC-1,x + xIntOffsC);
245 int yB = Clip3(0,hC-1,y + yIntOffsC);
246
247 out[y*out_stride+x] = ref[ xB + yB*ref_stride ] << shift3;
248 }
249 }
250 }
251 else {
252 uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+3)];
253
254 uint8_t* src_ptr;
255 int src_stride;
256
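    // margins for the 4-tap chroma interpolation filter, which spans taps -1..+2:
    // one extra sample left/above and two right/below of the block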
257 int extra_top = 1;
258 int extra_left = 1;
259 int extra_right = 2;
260 int extra_bottom = 2;
261
262 if (xIntOffsC>=1 && nPbWC+xIntOffsC<=wC-2 &&
263 yIntOffsC>=1 && nPbHC+yIntOffsC<=hC-2) {
264 src_ptr = &ref[xIntOffsC + yIntOffsC*ref_stride];
265 src_stride = ref_stride;
266 }
267 else {
268 for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
269 for (int x=-extra_left;x<nPbWC+extra_right;x++) {
270
271 int xA = Clip3(0,wC-1,x + xIntOffsC);
272 int yA = Clip3(0,hC-1,y + yIntOffsC);
273
274 padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = ref[ xA + yA*ref_stride ];
275 }
276 }
277
278 src_ptr = &padbuf[extra_left + extra_top*(MAX_CU_SIZE+16)];
279 src_stride = MAX_CU_SIZE+16;
280 }
281
282
283 if (xFracC && yFracC) {
284 ctx->acceleration.put_hevc_epel_hv_8(out, out_stride,
285 src_ptr, src_stride,
286 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
287 }
288 else if (xFracC) {
289 ctx->acceleration.put_hevc_epel_h_8(out, out_stride,
290 src_ptr, src_stride,
291 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
292 }
293 else if (yFracC) {
294 ctx->acceleration.put_hevc_epel_v_8(out, out_stride,
295 src_ptr, src_stride,
296 nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
297 }
298 else {
299 assert(false); // full-pel shifts are handled above
300 }
301 }
302 }
303
304
305
306 // 8.5.3.2
307 // NOTE: for full-pel shifts, we could introduce a fast path that simply copies without shifting
308 void generate_inter_prediction_samples(decoder_context* ctx,
309 de265_image* img,
310 slice_segment_header* shdr,
311 int xC,int yC,
312 int xB,int yB,
313 int nCS, int nPbW,int nPbH,
314 const VectorInfo* vi)
315 {
316 ALIGNED_16(int16_t) predSamplesL [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
317 ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
318
319 int xP = xC+xB;
320 int yP = yC+yB;
321
322 int predFlag[2];
323 predFlag[0] = vi->lum.predFlag[0];
324 predFlag[1] = vi->lum.predFlag[1];
325
326
327 // Some encoders use bi-prediction with two similar MVs.
328 // Identify this case and use only one MV.
329
330 // do this only without weighted prediction, because the weights/offsets may be different
331 if (img->pps.weighted_pred_flag==0) {
332 if (predFlag[0] && predFlag[1]) {
333 if (vi->lum.mv[0].x == vi->lum.mv[1].x &&
334 vi->lum.mv[0].y == vi->lum.mv[1].y &&
335 shdr->RefPicList[0][vi->lum.refIdx[0]] ==
336 shdr->RefPicList[1][vi->lum.refIdx[1]]) {
337 predFlag[1] = 0;
338 }
339 }
340 }
341
342
343 for (int l=0;l<2;l++) {
344 if (predFlag[l]) {
345 // 8.5.3.2.1
346
347 if (vi->lum.refIdx[l] >= MAX_NUM_REF_PICS) {
348 img->integrity = INTEGRITY_DECODING_ERRORS;
349 ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
350 return;
351 }
352
353 de265_image* refPic;
354 refPic = ctx->get_image(shdr->RefPicList[l][vi->lum.refIdx[l]]);
355
356 logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->lum.refIdx[l], shdr->RefPicList[l][vi->lum.refIdx[l]]);
357
358 if (refPic->PicState == UnusedForReference) {
359 img->integrity = INTEGRITY_DECODING_ERRORS;
360 ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
361 }
362 else {
363 // 8.5.3.2.2
364
365 logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n",
366 l,vi->lum.mv[l].x,vi->lum.mv[l].y,refPic->PicOrderCntVal);
367
368
369         // TODO: must the predSamples stride really be nCS, or can it be something smaller like nPbW?
370 mc_luma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
371 predSamplesL[l],nCS,
372 refPic->get_image_plane(0),refPic->get_luma_stride(), nPbW,nPbH);
373
374
375 mc_chroma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
376 predSamplesC[0][l],nCS, refPic->get_image_plane(1),
377 refPic->get_chroma_stride(), nPbW/2,nPbH/2);
378 mc_chroma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
379 predSamplesC[1][l],nCS, refPic->get_image_plane(2),
380 refPic->get_chroma_stride(), nPbW/2,nPbH/2);
381 }
382 }
383 }
384
385
386 // weighted sample prediction (8.5.3.2.3)
387
388 //const int shift1 = 6; // TODO
389 //const int offset1= 1<<(shift1-1);
390
391 logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]);
392
393 if (shdr->slice_type == SLICE_TYPE_P) {
394 if (img->pps.weighted_pred_flag==0) {
395 if (predFlag[0]==1 && predFlag[1]==0) {
396 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
397 img->get_image_stride(0),
398 predSamplesL[0],nCS, nPbW,nPbH);
399 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
400 img->get_image_stride(1),
401 predSamplesC[0][0],nCS, nPbW/2,nPbH/2);
402 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
403 img->get_image_stride(2),
404 predSamplesC[1][0],nCS, nPbW/2,nPbH/2);
405 }
406 else {
407 ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
408 img->integrity = INTEGRITY_DECODING_ERRORS;
409 }
410 }
411 else {
412 // weighted prediction
413
414 if (predFlag[0]==1 && predFlag[1]==0) {
415
416 int refIdx0 = vi->lum.refIdx[0];
417
418 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
419 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
420
421 int luma_w0 = shdr->LumaWeight[0][refIdx0];
422 int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
423
424 int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
425 int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
426 int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
427 int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
428
429 logtrace(LogMotion,"weighted-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
430
431 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
432 img->get_image_stride(0),
433 predSamplesL[0],nCS, nPbW,nPbH,
434 luma_w0, luma_o0, luma_log2WD);
435 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
436 img->get_image_stride(1),
437 predSamplesC[0][0],nCS, nPbW/2,nPbH/2,
438 chroma0_w0, chroma0_o0, chroma_log2WD);
439 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
440 img->get_image_stride(2),
441 predSamplesC[1][0],nCS, nPbW/2,nPbH/2,
442 chroma1_w0, chroma1_o0, chroma_log2WD);
443 }
444 else {
445 ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
446 img->integrity = INTEGRITY_DECODING_ERRORS;
447 }
448 }
449 }
450 else {
451 assert(shdr->slice_type == SLICE_TYPE_B);
452
453 if (predFlag[0]==1 && predFlag[1]==1) {
454 if (img->pps.weighted_bipred_flag==0) {
455 //const int shift2 = 15-8; // TODO: real bit depth
456 //const int offset2 = 1<<(shift2-1);
457
458 int16_t* in0 = predSamplesL[0];
459 int16_t* in1 = predSamplesL[1];
460 uint8_t* out = img->get_image_plane_at_pos(0, xP,yP);
461
462 ctx->acceleration.put_weighted_pred_avg_8(out, img->get_luma_stride(),
463 in0,in1, nCS, nPbW, nPbH);
464
465 int16_t* in00 = predSamplesC[0][0];
466 int16_t* in01 = predSamplesC[0][1];
467 int16_t* in10 = predSamplesC[1][0];
468 int16_t* in11 = predSamplesC[1][1];
469 uint8_t* out0 = img->get_image_plane_at_pos(1,xP/2,yP/2);
470 uint8_t* out1 = img->get_image_plane_at_pos(2,xP/2,yP/2);
471
472 ctx->acceleration.put_weighted_pred_avg_8(out0, img->get_chroma_stride(),
473 in00,in01, nCS, nPbW/2, nPbH/2);
474 ctx->acceleration.put_weighted_pred_avg_8(out1, img->get_chroma_stride(),
475 in10,in11, nCS, nPbW/2, nPbH/2);
476 }
477 else {
478 // weighted prediction
479
480 int refIdx0 = vi->lum.refIdx[0];
481 int refIdx1 = vi->lum.refIdx[1];
482
483 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
484 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
485
486 int luma_w0 = shdr->LumaWeight[0][refIdx0];
487 int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
488 int luma_w1 = shdr->LumaWeight[1][refIdx1];
489 int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(8-8)); // TODO: bitDepth
490
491 int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
492 int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
493 int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
494 int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
495 int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0];
496 int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(8-8)); // TODO: bitDepth
497 int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1];
498 int chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(8-8)); // TODO: bitDepth
499
500 logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
501 logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH);
502
503 int16_t* in0 = predSamplesL[0];
504 int16_t* in1 = predSamplesL[1];
505 uint8_t* out = img->get_image_plane_at_pos(0, xP,yP);
506
507 ctx->acceleration.put_weighted_bipred_8(out, img->get_luma_stride(),
508 in0,in1, nCS, nPbW, nPbH,
509 luma_w0,luma_o0,
510 luma_w1,luma_o1,
511 luma_log2WD);
512
513 int16_t* in00 = predSamplesC[0][0];
514 int16_t* in01 = predSamplesC[0][1];
515 int16_t* in10 = predSamplesC[1][0];
516 int16_t* in11 = predSamplesC[1][1];
517 uint8_t* out0 = img->get_image_plane_at_pos(1,xP/2,yP/2);
518 uint8_t* out1 = img->get_image_plane_at_pos(2,xP/2,yP/2);
519
520 ctx->acceleration.put_weighted_bipred_8(out0, img->get_chroma_stride(),
521 in00,in01, nCS, nPbW/2, nPbH/2,
522 chroma0_w0,chroma0_o0,
523 chroma0_w1,chroma0_o1,
524 chroma_log2WD);
525 ctx->acceleration.put_weighted_bipred_8(out1, img->get_chroma_stride(),
526 in10,in11, nCS, nPbW/2, nPbH/2,
527 chroma1_w0,chroma1_o0,
528 chroma1_w1,chroma1_o1,
529 chroma_log2WD);
530 }
531 }
532 else if (predFlag[0]==1 || predFlag[1]==1) {
533 int l = predFlag[0] ? 0 : 1;
534
535 if (img->pps.weighted_bipred_flag==0) {
536 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
537 img->get_image_stride(0),
538 predSamplesL[l],nCS, nPbW,nPbH);
539 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
540 img->get_image_stride(1),
541 predSamplesC[0][l],nCS, nPbW/2,nPbH/2);
542 ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
543 img->get_image_stride(2),
544 predSamplesC[1][l],nCS, nPbW/2,nPbH/2);
545 }
546 else {
547 int refIdx = vi->lum.refIdx[l];
548
549 int luma_log2WD = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
550 int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
551
552 int luma_w = shdr->LumaWeight[l][refIdx];
553 int luma_o = shdr->luma_offset[l][refIdx] * (1<<(8-8)); // TODO: bitDepth
554
555 int chroma0_w = shdr->ChromaWeight[l][refIdx][0];
556 int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(8-8)); // TODO: bitDepth
557 int chroma1_w = shdr->ChromaWeight[l][refIdx][1];
558 int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(8-8)); // TODO: bitDepth
559
560 logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH);
561
562 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
563 img->get_image_stride(0),
564 predSamplesL[l],nCS, nPbW,nPbH,
565 luma_w, luma_o, luma_log2WD);
566 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
567 img->get_image_stride(1),
568 predSamplesC[0][l],nCS, nPbW/2,nPbH/2,
569 chroma0_w, chroma0_o, chroma_log2WD);
570 ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
571 img->get_image_stride(2),
572 predSamplesC[1][l],nCS, nPbW/2,nPbH/2,
573 chroma1_w, chroma1_o, chroma_log2WD);
574 }
575 }
576 else {
577 // TODO: check why it can actually happen that both predFlags[] are false.
578 // For now, we ignore this and continue decoding.
579
580 ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
581 img->integrity = INTEGRITY_DECODING_ERRORS;
582 }
583 }
584
585 #if defined(DE265_LOG_TRACE) && 0
586 logtrace(LogTransform,"MC pixels (luma), position %d %d:\n", xP,yP);
587
588 for (int y=0;y<nPbH;y++) {
589 logtrace(LogTransform,"MC-y-%d-%d ",xP,yP+y);
590
591 for (int x=0;x<nPbW;x++) {
592 logtrace(LogTransform,"*%02x ", img->y[xP+x+(yP+y)*img->stride]);
593 }
594
595 logtrace(LogTransform,"*\n");
596 }
597
598
599 logtrace(LogTransform,"MC pixels (chroma cb), position %d %d:\n", xP/2,yP/2);
600
601 for (int y=0;y<nPbH/2;y++) {
602 logtrace(LogTransform,"MC-cb-%d-%d ",xP/2,yP/2+y);
603
604 for (int x=0;x<nPbW/2;x++) {
605 logtrace(LogTransform,"*%02x ", img->cb[xP/2+x+(yP/2+y)*img->chroma_stride]);
606 }
607
608 logtrace(LogTransform,"*\n");
609 }
610
611
612 logtrace(LogTransform,"MC pixels (chroma cr), position %d %d:\n", xP/2,yP/2);
613
614 for (int y=0;y<nPbH/2;y++) {
615 logtrace(LogTransform,"MC-cr-%d-%d ",xP/2,yP/2+y);
616
617 for (int x=0;x<nPbW/2;x++) {
618 logtrace(LogTransform,"*%02x ", img->cr[xP/2+x+(yP/2+y)*img->chroma_stride]);
619 }
620
621 logtrace(LogTransform,"*\n");
622 }
623 #endif
624 }
625
626
627 #ifdef DE265_LOG_TRACE
628 void logmvcand(PredVectorInfo p)
629 {
630 for (int v=0;v<2;v++) {
631 if (p.predFlag[v]) {
632 logtrace(LogMotion," %d: %s %d;%d ref=%d\n", v, p.predFlag[v] ? "yes":"no ",
633 p.mv[v].x,p.mv[v].y, p.refIdx[v]);
634 } else {
635 logtrace(LogMotion," %d: %s --;-- ref=--\n", v, p.predFlag[v] ? "yes":"no ");
636 }
637 }
638 }
639 #else
640 #define logmvcand(p)
641 #endif
642
643
644 LIBDE265_INLINE static bool equal_cand_MV(const PredVectorInfo* a, const PredVectorInfo* b)
645 {
646 // TODO: is this really correct? no check for predFlag? Standard says so... (p.127)
647
648 for (int i=0;i<2;i++) {
649 if (a->predFlag[i] != b->predFlag[i]) return false;
650
651 if (a->predFlag[i]) {
652 if (a->mv[i].x != b->mv[i].x) return false;
653 if (a->mv[i].y != b->mv[i].y) return false;
654 if (a->refIdx[i] != b->refIdx[i]) return false;
655 }
656 }
657
658 return true;
659 }
660
661
662 /*
663 +--+ +--+--+
664 |B2| |B1|B0|
665 +--+----------------+--+--+
666 | |
667 | |
668 | |
669 | |
670 | |
671 | |
672 | |
673 +--+ |
674 |A1| |
675 +--+-------------------+
676 |A0|
677 +--+
678 */
679
680
681 // 8.5.3.1.2
682 // TODO: check: can we fill the candidate list directly in this function and omit the copy later?
683 void derive_spatial_merging_candidates(const de265_image* img,
684 int xC, int yC, int nCS, int xP, int yP,
685 uint8_t singleMCLFlag,
686 int nPbW, int nPbH,
687 int partIdx,
688 MergingCandidates* out_cand)
689 {
690 const pic_parameter_set* pps = &img->pps;
691 int log2_parallel_merge_level = pps->log2_parallel_merge_level;
692
693 enum PartMode PartMode = img->get_PartMode(xC,yC);
694
695 // --- A1 ---
696
697 // a pixel within A1
698 int xA1 = xP-1;
699 int yA1 = yP+nPbH-1;
700
701 bool availableA1;
702
703 if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) &&
704 (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) {
705 availableA1 = false;
706 logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n");
707 }
708 else if (!singleMCLFlag &&
709 partIdx==1 &&
710 (PartMode==PART_Nx2N ||
711 PartMode==PART_nLx2N ||
712 PartMode==PART_nRx2N)) {
713 availableA1 = false;
714 logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n");
715 }
716 else {
717 availableA1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1);
718 if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n");
719 }
720
721 if (!availableA1) {
722 out_cand->available[PRED_A1] = 0;
723 reset_pred_vector(&out_cand->pred_vector[PRED_A1]);
724 }
725 else {
726 out_cand->available[PRED_A1] = 1;
727 out_cand->pred_vector[PRED_A1] = *img->get_mv_info(xA1,yA1);
728
729 logtrace(LogMotion,"spatial merging candidate A1:\n");
730 logmvcand(out_cand->pred_vector[PRED_A1]);
731 }
732
733
734 // --- B1 ---
735
736 int xB1 = xP+nPbW-1;
737 int yB1 = yP-1;
738
739 bool availableB1;
740
741 if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) &&
742 (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) {
743 availableB1 = false;
744 logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n");
745 }
746 else if (!singleMCLFlag &&
747 partIdx==1 &&
748 (PartMode==PART_2NxN ||
749 PartMode==PART_2NxnU ||
750 PartMode==PART_2NxnD)) {
751 availableB1 = false;
752 logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n");
753 }
754 else {
755 availableB1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1);
756 if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n");
757 }
758
759 if (!availableB1) {
760 out_cand->available[PRED_B1] = 0;
761 reset_pred_vector(&out_cand->pred_vector[PRED_B1]);
762 }
763 else {
764 out_cand->available[PRED_B1] = 1;
765 out_cand->pred_vector[PRED_B1] = *img->get_mv_info(xB1,yB1);
766
767 if (availableA1 &&
768 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
769 &out_cand->pred_vector[PRED_B1])) {
770 out_cand->available[PRED_B1] = 0;
771 logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n");
772 }
773 else {
774 logtrace(LogMotion,"spatial merging candidate B1:\n");
775 logmvcand(out_cand->pred_vector[PRED_B1]);
776 }
777 }
778
779
780 // --- B0 ---
781
782 int xB0 = xP+nPbW;
783 int yB0 = yP-1;
784
785 bool availableB0;
786
787 if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) &&
788 (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) {
789 availableB0 = false;
790 logtrace(LogMotion,"spatial merging candidate B0: below parallel merge level\n");
791 }
792 else {
793 availableB0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB0,yB0);
794 if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n");
795 }
796
797 if (!availableB0) {
798 out_cand->available[PRED_B0] = 0;
799 reset_pred_vector(&out_cand->pred_vector[PRED_B0]);
800 }
801 else {
802 out_cand->available[PRED_B0] = 1;
803 out_cand->pred_vector[PRED_B0] = *img->get_mv_info(xB0,yB0);
804
805 if (availableB1 &&
806 equal_cand_MV(&out_cand->pred_vector[PRED_B1],
807 &out_cand->pred_vector[PRED_B0])) {
808 out_cand->available[PRED_B0] = 0;
809 logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n");
810 }
811 else {
812 logtrace(LogMotion,"spatial merging candidate B0:\n");
813 logmvcand(out_cand->pred_vector[PRED_B0]);
814 }
815 }
816
817
818 // --- A0 ---
819
820 int xA0 = xP-1;
821 int yA0 = yP+nPbH;
822
823 bool availableA0;
824
825 if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) &&
826 (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) {
827 availableA0 = false;
828 logtrace(LogMotion,"spatial merging candidate A0: below parallel merge level\n");
829 }
830 else {
831 availableA0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA0,yA0);
832 if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n");
833 }
834
835 if (!availableA0) {
836 out_cand->available[PRED_A0] = 0;
837 reset_pred_vector(&out_cand->pred_vector[PRED_A0]);
838 }
839 else {
840 out_cand->available[PRED_A0] = 1;
841 out_cand->pred_vector[PRED_A0] = *img->get_mv_info(xA0,yA0);
842
843 if (availableA1 &&
844 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
845 &out_cand->pred_vector[PRED_A0])) {
846 out_cand->available[PRED_A0] = 0;
847 logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n");
848 }
849 else {
850 logtrace(LogMotion,"spatial merging candidate A0:\n");
851 logmvcand(out_cand->pred_vector[PRED_A0]);
852 }
853 }
854
855
856 // --- B2 ---
857
858 int xB2 = xP-1;
859 int yB2 = yP-1;
860
861 bool availableB2;
862
863 if (out_cand->available[PRED_A0] && out_cand->available[PRED_A1] &&
864 out_cand->available[PRED_B0] && out_cand->available[PRED_B1]) {
865 availableB2 = false;
866 logtrace(LogMotion,"spatial merging candidate B2: ignore\n");
867 }
868 else if ((xP>>log2_parallel_merge_level) == (xB2>>log2_parallel_merge_level) &&
869 (yP>>log2_parallel_merge_level) == (yB2>>log2_parallel_merge_level)) {
870 availableB2 = false;
871 logtrace(LogMotion,"spatial merging candidate B2: below parallel merge level\n");
872 }
873 else {
874 availableB2 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB2,yB2);
875 if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n");
876 }
877
878 if (!availableB2) {
879 out_cand->available[PRED_B2] = 0;
880 reset_pred_vector(&out_cand->pred_vector[PRED_B2]);
881 }
882 else {
883 out_cand->available[PRED_B2] = 1;
884 out_cand->pred_vector[PRED_B2] = *img->get_mv_info(xB2,yB2);
885
886 if (availableB1 &&
887 equal_cand_MV(&out_cand->pred_vector[PRED_B1],
888 &out_cand->pred_vector[PRED_B2])) {
889 out_cand->available[PRED_B2] = 0;
890 logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n");
891 }
892 else if (availableA1 &&
893 equal_cand_MV(&out_cand->pred_vector[PRED_A1],
894 &out_cand->pred_vector[PRED_B2])) {
895 out_cand->available[PRED_B2] = 0;
896 logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n");
897 }
898 else {
899       logtrace(LogMotion,"spatial merging candidate B2:\n");
900       logmvcand(out_cand->pred_vector[PRED_B2]);
901 }
902 }
903 }
904
905
906 // 8.5.3.1.4
907 void derive_zero_motion_vector_candidates(slice_segment_header* shdr,
908 PredVectorInfo* inout_mergeCandList,
909 int* inout_numCurrMergeCand)
910 {
911 logtrace(LogMotion,"derive_zero_motion_vector_candidates\n");
912
913 int numRefIdx;
914
915 if (shdr->slice_type==SLICE_TYPE_P) {
916 numRefIdx = shdr->num_ref_idx_l0_active;
917 }
918 else {
919 numRefIdx = libde265_min(shdr->num_ref_idx_l0_active,
920 shdr->num_ref_idx_l1_active);
921 }
922
923
924 //int numInputMergeCand = *inout_numMergeCand;
925 int zeroIdx = 0;
926
927 while (*inout_numCurrMergeCand < shdr->MaxNumMergeCand) {
928 // 1.
929
930 logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx);
931
932 PredVectorInfo* newCand = &inout_mergeCandList[*inout_numCurrMergeCand];
933
934 if (shdr->slice_type==SLICE_TYPE_P) {
935 newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
936 newCand->refIdx[1] = -1;
937 newCand->predFlag[0] = 1;
938 newCand->predFlag[1] = 0;
939 }
940 else {
941 newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
942 newCand->refIdx[1] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
943 newCand->predFlag[0] = 1;
944 newCand->predFlag[1] = 1;
945 }
946
947 newCand->mv[0].x = 0;
948 newCand->mv[0].y = 0;
949 newCand->mv[1].x = 0;
950 newCand->mv[1].y = 0;
951
952 (*inout_numCurrMergeCand)++;
953
954 // 2.
955
956 zeroIdx++;
957 }
958 }
959
960
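// Temporal motion-vector scaling (8.5.3.2.8): the collocated vector is scaled
// by the ratio of POC distances currDist/colDist using fixed-point arithmetic.
// Worked example derived from the code below: colDist=4, currDist=2 gives
// tx = (16384+2)/4 = 4096 and distScaleFactor = (2*4096+32)>>6 = 128, so an
// MV component of 16 becomes (128*16+127)>>8 = 8, i.e. it is halved.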
961 bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist)
962 {
963 int td = Clip3(-128,127, colDist);
964 int tb = Clip3(-128,127, currDist);
965
966 if (td==0) {
967 *out_mv = mv;
968 return false;
969 }
970 else {
971 int tx = (16384 + (abs_value(td)>>1)) / td;
972 int distScaleFactor = Clip3(-4096,4095, (tb*tx+32)>>6);
973 out_mv->x = Clip3(-32768,32767,
974 Sign(distScaleFactor*mv.x)*((abs_value(distScaleFactor*mv.x)+127)>>8));
975 out_mv->y = Clip3(-32768,32767,
976 Sign(distScaleFactor*mv.y)*((abs_value(distScaleFactor*mv.y)+127)>>8));
977 return true;
978 }
979 }
980
981
982 // (L1003) 8.5.3.2.8
983
984 void derive_collocated_motion_vectors(decoder_context* ctx,
985 de265_image* img,
986 const slice_segment_header* shdr,
987 int xP,int yP,
988 int colPic,
989 int xColPb,int yColPb,
990 int refIdxLX, int X,
991 MotionVector* out_mvLXCol,
992 uint8_t* out_availableFlagLXCol)
993 {
994 logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP);
995
996 const de265_image* colImg = ctx->get_image(colPic);
997 assert(colImg);
998 enum PredMode predMode = colImg->get_pred_mode(xColPb,yColPb);
999
1000 if (predMode == MODE_INTRA) {
1001 out_mvLXCol->x = 0;
1002 out_mvLXCol->y = 0;
1003 *out_availableFlagLXCol = 0;
1004 return;
1005 }
1006 else {
1007 logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n",
1008 colPic,
1009 colImg->PicOrderCntVal,
1010 X,refIdxLX,shdr->RefPicList[X][refIdxLX]);
1011
1012 if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) {
1013 out_mvLXCol->x = 0;
1014 out_mvLXCol->y = 0;
1015 *out_availableFlagLXCol = 0;
1016 return;
1017 }
1018
1019 const PredVectorInfo* mvi = colImg->get_mv_info(xColPb,yColPb);
1020 int listCol;
1021 int refIdxCol;
1022 MotionVector mvCol;
1023
1024 logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb);
1025 logmvcand(*mvi);
1026
1027 if (mvi->predFlag[0]==0) {
1028 mvCol = mvi->mv[1];
1029 refIdxCol = mvi->refIdx[1];
1030 listCol = 1;
1031 }
1032 else {
1033 if (mvi->predFlag[1]==0) {
1034 mvCol = mvi->mv[0];
1035 refIdxCol = mvi->refIdx[0];
1036 listCol = 0;
1037 }
1038 else {
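        // both lists contain motion data: if every reference picture of the
        // current slice has a POC <= the current POC (low-delay case), take the
        // MV from list X; otherwise the list is chosen by collocated_from_l0_flag.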
1039 int AllDiffPicOrderCntLEZero = true;
1040
1041 const int PicOrderCntVal = img->PicOrderCntVal;
1042
1043 for (int rIdx=0; rIdx<shdr->num_ref_idx_l0_active && AllDiffPicOrderCntLEZero; rIdx++)
1044 {
1045 const de265_image* imgA = ctx->get_image(shdr->RefPicList[0][rIdx]);
1046 int aPOC = imgA->PicOrderCntVal;
1047
1048 if (aPOC > PicOrderCntVal) {
1049 AllDiffPicOrderCntLEZero = false;
1050 }
1051 }
1052
1053 for (int rIdx=0; rIdx<shdr->num_ref_idx_l1_active && AllDiffPicOrderCntLEZero; rIdx++)
1054 {
1055 const de265_image* imgA = ctx->get_image(shdr->RefPicList[1][rIdx]);
1056 int aPOC = imgA->PicOrderCntVal;
1057
1058 if (aPOC > PicOrderCntVal) {
1059 AllDiffPicOrderCntLEZero = false;
1060 }
1061 }
1062
1063 if (AllDiffPicOrderCntLEZero) {
1064 mvCol = mvi->mv[X];
1065 refIdxCol = mvi->refIdx[X];
1066 listCol = X;
1067 }
1068 else {
1069 int N = shdr->collocated_from_l0_flag;
1070 mvCol = mvi->mv[N];
1071 refIdxCol = mvi->refIdx[N];
1072 listCol = N;
1073 }
1074 }
1075 }
1076
1077
1078
1079 const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ];
1080
1081 if (shdr->LongTermRefPic[X][refIdxLX] !=
1082 colShdr->LongTermRefPic[listCol][refIdxCol]) {
1083 *out_availableFlagLXCol = 0;
1084 out_mvLXCol->x = 0;
1085 out_mvLXCol->y = 0;
1086 }
1087 else {
1088 *out_availableFlagLXCol = 1;
1089
1090 const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX];
1091
1092 int colDist = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol];
1093 int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX];
1094
1095 logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist,
1096 colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol],
1097 img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX]
1098 );
1099
1100 if (isLongTerm || colDist == currDist) {
1101 *out_mvLXCol = mvCol;
1102 }
1103 else {
1104 if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) {
1105 ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1106 img->integrity = INTEGRITY_DECODING_ERRORS;
1107 }
1108
1109 logtrace(LogMotion,"scale: %d;%d to %d;%d\n",
1110 mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y);
1111 }
1112 }
1113 }
1114 }
1115
1116
1117 // 8.5.3.1.7
1118 void derive_temporal_luma_vector_prediction(decoder_context* ctx,
1119 de265_image* img,
1120 const slice_segment_header* shdr,
1121 int xP,int yP,
1122 int nPbW,int nPbH,
1123 int refIdxL, int X,
1124 MotionVector* out_mvLXCol,
1125 uint8_t* out_availableFlagLXCol)
1126 {
1127
1128 if (shdr->slice_temporal_mvp_enabled_flag == 0) {
1129 out_mvLXCol->x = 0;
1130 out_mvLXCol->y = 0;
1131 *out_availableFlagLXCol = 0;
1132 return;
1133 }
1134
1135 int Log2CtbSizeY = img->sps.Log2CtbSizeY;
1136
1137 int colPic; // TODO: this is the same for the whole slice. We can precompute it.
1138
1139 if (shdr->slice_type == SLICE_TYPE_B &&
1140 shdr->collocated_from_l0_flag == 0)
1141 {
1142 logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx);
1143
1144 // TODO: make sure that shdr->collocated_ref_idx is a valid index
1145 colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ];
1146 }
1147 else
1148 {
1149 logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx);
1150
1151 // TODO: make sure that shdr->collocated_ref_idx is a valid index
1152 colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ];
1153 }
1154
1155 //logtrace(LogMotion,"collocated reference POC=%d\n",ctx->dpb[colPic].PicOrderCntVal);
1156
1157
1158 int xColPb,yColPb;
1159 int yColBr = yP + nPbH; // bottom right collocated motion vector position
1160 int xColBr = xP + nPbW;
1161
1162 if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) &&
1163 xColBr < img->sps.pic_width_in_luma_samples &&
1164 yColBr < img->sps.pic_height_in_luma_samples)
1165 {
1166     xColPb = xColBr & ~0x0F; // reduce the resolution of collocated motion vectors to a 16-pixel grid
1167 yColPb = yColBr & ~0x0F;
1168
1169 derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X,
1170 out_mvLXCol, out_availableFlagLXCol);
1171 }
1172 else
1173 {
1174 out_mvLXCol->x = 0;
1175 out_mvLXCol->y = 0;
1176 *out_availableFlagLXCol = 0;
1177 }
1178
1179
1180 if (*out_availableFlagLXCol==0) {
1181
1182 int xColCtr = xP+(nPbW>>1);
1183 int yColCtr = yP+(nPbH>>1);
1184
1185     xColPb = xColCtr & ~0x0F; // reduce the resolution of collocated motion vectors to a 16-pixel grid
1186 yColPb = yColCtr & ~0x0F;
1187
1188 derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X,
1189 out_mvLXCol, out_availableFlagLXCol);
1190 }
1191 }
1192
1193
1194 static int table_8_19[2][12] = {
1195 { 0,1,0,2,1,2,0,3,1,3,2,3 },
1196 { 1,0,2,0,2,1,3,0,3,1,3,2 }
1197 };
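// For each combIdx, row 0 of table_8_19 selects the merge candidate that
// contributes its L0 motion data and row 1 the candidate contributing its L1
// motion data; e.g. combIdx 0 combines candidate 0's L0 part with
// candidate 1's L1 part into a new bi-predictive candidate.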
1198
1199 // 8.5.3.1.3
1200 void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
1201 slice_segment_header* shdr,
1202 PredVectorInfo* inout_mergeCandList,
1203 int* inout_numMergeCand,
1204 int numOrigMergeCand)
1205 {
1206 if (*inout_numMergeCand>1 && *inout_numMergeCand < shdr->MaxNumMergeCand) {
1207 int numInputMergeCand = *inout_numMergeCand;
1208 int combIdx = 0;
1209 uint8_t combStop = false;
1210
1211 while (!combStop) {
1212 int l0CandIdx = table_8_19[0][combIdx];
1213 int l1CandIdx = table_8_19[1][combIdx];
1214
1215 if (l0CandIdx >= numInputMergeCand ||
1216 l1CandIdx >= numInputMergeCand) {
1217 assert(false); // bitstream error -> TODO: conceal error
1218 }
1219
1220 PredVectorInfo* l0Cand = &inout_mergeCandList[l0CandIdx];
1221 PredVectorInfo* l1Cand = &inout_mergeCandList[l1CandIdx];
1222
1223 logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx);
1224 logtrace(LogMotion,"l0Cand:\n"); logmvcand(*l0Cand);
1225 logtrace(LogMotion,"l1Cand:\n"); logmvcand(*l1Cand);
1226
1227 const de265_image* img0 = l0Cand->predFlag[0] ? ctx->get_image(shdr->RefPicList[0][l0Cand->refIdx[0]]) : NULL;
1228 const de265_image* img1 = l1Cand->predFlag[1] ? ctx->get_image(shdr->RefPicList[1][l1Cand->refIdx[1]]) : NULL;
1229
1230 if (l0Cand->predFlag[0] && l1Cand->predFlag[1] &&
1231 (img0->PicOrderCntVal != img1->PicOrderCntVal ||
1232 l0Cand->mv[0].x != l1Cand->mv[1].x ||
1233 l0Cand->mv[0].y != l1Cand->mv[1].y)) {
1234 PredVectorInfo* p = &inout_mergeCandList[ *inout_numMergeCand ];
1235 p->refIdx[0] = l0Cand->refIdx[0];
1236 p->refIdx[1] = l1Cand->refIdx[1];
1237 p->predFlag[0] = l0Cand->predFlag[0];
1238 p->predFlag[1] = l1Cand->predFlag[1];
1239 p->mv[0] = l0Cand->mv[0];
1240 p->mv[1] = l1Cand->mv[1];
1241 (*inout_numMergeCand)++;
1242
1243 logtrace(LogMotion,"result:\n");
1244 logmvcand(*p);
1245 }
1246
1247 combIdx++;
1248 if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) ||
1249 *inout_numMergeCand == shdr->MaxNumMergeCand) {
1250 combStop = true;
1251 }
1252 }
1253 }
1254 }
1255
1256
1257 // 8.5.3.1.1
1258 void derive_luma_motion_merge_mode(decoder_context* ctx,
1259 thread_context* tctx,
1260 int xC,int yC, int xP,int yP,
1261 int nCS, int nPbW,int nPbH, int partIdx,
1262 VectorInfo* out_vi)
1263 {
1264 slice_segment_header* shdr = tctx->shdr;
1265
1266 //int xOrigP = xP;
1267 //int yOrigP = yP;
1268 int nOrigPbW = nPbW;
1269 int nOrigPbH = nPbH;
1270
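  // When the parallel merge level allows merging over areas larger than 4x4
  // (log2 level > 2) and this CU is 8x8, all PUs of the CU share a single
  // merge candidate list, derived as if the CU were one 2Nx2N partition.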
1271 int singleMCLFlag;
1272 singleMCLFlag = (tctx->img->pps.log2_parallel_merge_level > 2 && nCS==8);
1273
1274 if (singleMCLFlag) {
1275 xP=xC;
1276 yP=yC;
1277 nPbW=nCS;
1278 nPbH=nCS;
1279 partIdx=0;
1280 }
1281
1282 MergingCandidates mergeCand;
1283 derive_spatial_merging_candidates(tctx->img, xC,yC, nCS, xP,yP, singleMCLFlag,
1284 nPbW,nPbH,partIdx, &mergeCand);
1285
1286 int refIdxCol[2] = { 0,0 };
1287
1288 MotionVector mvCol[2];
1289 uint8_t predFlagLCol[2];
1290 derive_temporal_luma_vector_prediction(ctx,tctx->img,shdr, xP,yP,nPbW,nPbH,
1291 refIdxCol[0],0, &mvCol[0],
1292 &predFlagLCol[0]);
1293
1294 uint8_t availableFlagCol = predFlagLCol[0];
1295 predFlagLCol[1] = 0;
1296
1297 if (shdr->slice_type == SLICE_TYPE_B) {
1298 derive_temporal_luma_vector_prediction(ctx,tctx->img,shdr,
1299 xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1],
1300 &predFlagLCol[1]);
1301 availableFlagCol |= predFlagLCol[1];
1302 }
1303
1304
1305 // 4.
1306
1307 PredVectorInfo mergeCandList[5];
1308 int numMergeCand=0;
1309
1310 for (int i=0;i<5;i++) {
1311 if (mergeCand.available[i]) {
1312 mergeCandList[numMergeCand++] = mergeCand.pred_vector[i];
1313 }
1314 }
1315
1316 if (availableFlagCol) {
1317 // TODO: save in mergeCand directly...
1318 mergeCand.available[PRED_COL] = availableFlagCol;
1319 mergeCand.pred_vector[PRED_COL].mv[0] = mvCol[0];
1320 mergeCand.pred_vector[PRED_COL].mv[1] = mvCol[1];
1321 mergeCand.pred_vector[PRED_COL].predFlag[0] = predFlagLCol[0];
1322 mergeCand.pred_vector[PRED_COL].predFlag[1] = predFlagLCol[1];
1323 mergeCand.pred_vector[PRED_COL].refIdx[0] = refIdxCol[0];
1324 mergeCand.pred_vector[PRED_COL].refIdx[1] = refIdxCol[1];
1325
1326 mergeCandList[numMergeCand++] = mergeCand.pred_vector[PRED_COL];
1327 }
1328
1329 // 5.
1330
1331 //int numOrigMergeCand = numMergeCand;
1332
1333 // 6.
1334
1335 //int numCombMergeCand = 0;
1336
1337 if (shdr->slice_type == SLICE_TYPE_B) {
1338 derive_combined_bipredictive_merging_candidates(ctx, shdr,
1339 mergeCandList, &numMergeCand, numMergeCand);
1340
1341 //numCombMergeCand = numMergeCand - numOrigMergeCand;
1342 }
1343
1344
1345 // 7.
1346
1347 derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand);
1348
1349 // 8.
1350
1351 int merge_idx = tctx->merge_idx; // get_merge_idx(ctx,xP,yP);
1352 out_vi->lum = mergeCandList[merge_idx];
1353
1354
1355 logtrace(LogMotion,"mergeCandList:\n");
1356 for (int i=0;i<shdr->MaxNumMergeCand;i++)
1357 {
1358 logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? " SELECTED":"");
1359 logmvcand(mergeCandList[i]);
1360 }
1361
1362 // 9.
1363
1364 if (out_vi->lum.predFlag[0] && out_vi->lum.predFlag[1] && nOrigPbW+nOrigPbH==12) {
1365 out_vi->lum.refIdx[1] = -1;
1366 out_vi->lum.predFlag[1] = 0;
1367 }
1368 }
1369
1370
1371 // 8.5.3.1.6
1372 void derive_spatial_luma_vector_prediction(de265_image* img,
1373 const slice_segment_header* shdr,
1374 int xC,int yC,int nCS,int xP,int yP,
1375 int nPbW,int nPbH, int X,
1376 int refIdxLX, int partIdx,
1377 uint8_t out_availableFlagLXN[2],
1378 MotionVector out_mvLXN[2])
1379 {
1380 const decoder_context* ctx = img->decctx;
1381
1382 int isScaledFlagLX = 0;
1383
1384 const int A=0;
1385 const int B=1;
1386
1387 // --- A ---
1388
1389 // 1.
1390
1391 int xA[2], yA[2];
1392 xA[0] = xP-1;
1393 yA[0] = yP + nPbH;
1394 xA[1] = xA[0];
1395 yA[1] = yA[0]-1;
1396
1397 // 2.
1398
1399 out_availableFlagLXN[A] = 0;
1400 out_mvLXN[A].x = 0;
1401 out_mvLXN[A].y = 0;
1402
1403 // 3. / 4.
1404
1405 bool availableA[2];
1406 availableA[0] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[0],yA[0]);
1407 availableA[1] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[1],yA[1]);
1408
1409 // 5.
1410
1411 if (availableA[0] || availableA[1]) {
1412 isScaledFlagLX = 1;
1413 }
1414
1415 // 6. test A0 and A1 (Ak)
1416
1417 int refIdxA=-1;
1418
1419 // the POC we want to reference in this PB
1420 const int referenced_POC = ctx->get_image(shdr->RefPicList[X][ refIdxLX ])->PicOrderCntVal;
1421
1422 const int referenced_refIdx = refIdxLX;
1423
1424 for (int k=0;k<=1;k++) {
1425 if (availableA[k] &&
1426 out_availableFlagLXN[A]==0 && // no A?-predictor so far
1427 img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) {
1428
1429 int Y=1-X;
1430
1431 const PredVectorInfo* vi = img->get_mv_info(xA[k],yA[k]);
1432 logtrace(LogMotion,"MVP A%d=\n",k);
1433 logmvcand(*vi);
1434
1435 // check whether the predictor X is available and references the same POC
1436 if (vi->predFlag[X] &&
1437 ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ])->PicOrderCntVal == referenced_POC) {
1438 //vi->refIdx[X] == referenced_refIdx) {
1439
1440 logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X);
1441
1442 out_availableFlagLXN[A]=1;
1443 out_mvLXN[A] = vi->mv[X];
1444 refIdxA = vi->refIdx[X];
1445 }
1446 // check whether the other predictor (Y) is available and references the same POC
1447 else if (vi->predFlag[Y] &&
1448 ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ])->PicOrderCntVal == referenced_POC) {
1449 //vi->refIdx[Y] == referenced_refIdx) {
1450
1451 logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y);
1452
1453 out_availableFlagLXN[A]=1;
1454 out_mvLXN[A] = vi->mv[Y];
1455 refIdxA = vi->refIdx[Y];
1456 }
1457 }
1458 }
1459
1460 // 7. If there is no predictor referencing the same POC, we take any other reference as
1461 // long as it is the same type of reference (long-term / short-term)
1462
1463 for (int k=0 ; k<=1 && out_availableFlagLXN[A]==0 ; k++) {
1464 int refPicList=-1;
1465
1466 if (availableA[k] &&
1467 // TODO: we could remove this call by storing the result of the similar computation above
1468 img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) {
1469
1470 int Y=1-X;
1471
1472 const PredVectorInfo* vi = img->get_mv_info(xA[k],yA[k]);
1473 if (vi->predFlag[X]==1 &&
1474 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
1475
1476           logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,X);
1477
1478 out_availableFlagLXN[A]=1;
1479 out_mvLXN[A] = vi->mv[X];
1480 refIdxA = vi->refIdx[X];
1481 refPicList = X;
1482 }
1483 else if (vi->predFlag[Y]==1 &&
1484 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi->refIdx[Y] ]) {
1485
1486 logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,Y);
1487
1488 out_availableFlagLXN[A]=1;
1489 out_mvLXN[A] = vi->mv[Y];
1490 refIdxA = vi->refIdx[Y];
1491 refPicList = Y;
1492 }
1493 }
1494
1495 if (out_availableFlagLXN[A]==1) {
1496 assert(refIdxA>=0);
1497 assert(refPicList>=0);
1498
1499 const de265_image* refPicA = ctx->get_image(shdr->RefPicList[refPicList][refIdxA ]);
1500 const de265_image* refPicX = ctx->get_image(shdr->RefPicList[X ][refIdxLX]);
1501
1502 int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ];
1503 int picStateX = shdr->RefPicList_PicState[X ][refIdxLX];
1504
1505 int isLongTermA = shdr->LongTermRefPic[refPicList][refIdxA ];
1506 int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX];
1507
1508 logtrace(LogMotion,"scale MVP A: A-POC:%d X-POC:%d\n",
1509 refPicA->PicOrderCntVal,refPicX->PicOrderCntVal);
1510
1511 if (!isLongTermA && !isLongTermX)
1512 /*
1513 if (picStateA == UsedForShortTermReference &&
1514 picStateX == UsedForShortTermReference)
1515 */
1516 {
1517 int distA = img->PicOrderCntVal - refPicA->PicOrderCntVal;
1518 int distX = img->PicOrderCntVal - referenced_POC;
1519
1520 if (!scale_mv(&out_mvLXN[A], out_mvLXN[A], distA, distX)) {
1521 img->decctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1522 img->integrity = INTEGRITY_DECODING_ERRORS;
1523 }
1524 }
1525 }
1526 }
1527
1528
1529 // --- B ---
1530
1531 // 1.
1532
1533 int xB[3], yB[3];
1534 xB[0] = xP+nPbW;
1535 yB[0] = yP-1;
1536 xB[1] = xB[0]-1;
1537 yB[1] = yP-1;
1538 xB[2] = xP-1;
1539 yB[2] = yP-1;
1540
1541 // 2.
1542
1543 out_availableFlagLXN[B] = 0;
1544 out_mvLXN[B].x = 0;
1545 out_mvLXN[B].y = 0;
1546
1547 // 3. test B0,B1,B2 (Bk)
1548
1549 int refIdxB=-1;
1550
1551 bool availableB[3];
1552 for (int k=0;k<3;k++) {
1553 availableB[k] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]);
1554
1555 if (availableB[k] && out_availableFlagLXN[B]==0) {
1556
1557 int Y=1-X;
1558
1559 const PredVectorInfo* vi = img->get_mv_info(xB[k],yB[k]);
1560 logtrace(LogMotion,"MVP B%d=\n",k);
1561 logmvcand(*vi);
1562
1563
1564 if (vi->predFlag[X] &&
1565 ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ])->PicOrderCntVal == referenced_POC) {
1566 logtrace(LogMotion,"a) take B%d/L%d as B candidate with same POC\n",k,X);
1567
1568 out_availableFlagLXN[B]=1;
1569 out_mvLXN[B] = vi->mv[X];
1570 refIdxB = vi->refIdx[X];
1571 }
1572 else if (vi->predFlag[Y] &&
1573 ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ])->PicOrderCntVal == referenced_POC) {
1574 logtrace(LogMotion,"b) take B%d/L%d as B candidate with same POC\n",k,Y);
1575
1576 out_availableFlagLXN[B]=1;
1577 out_mvLXN[B] = vi->mv[Y];
1578 refIdxB = vi->refIdx[Y];
1579 }
1580 }
1581 }
1582
1583 // 4.
1584
1585 if (isScaledFlagLX==0 && // no A predictor,
1586 out_availableFlagLXN[B]) // but an unscaled B predictor
1587 {
1588 // use unscaled B predictor as A predictor
1589
1590 logtrace(LogMotion,"copy the same-POC B candidate as additional A candidate\n");
1591
1592 out_availableFlagLXN[A]=1;
1593 out_mvLXN[A] = out_mvLXN[B];
1594 refIdxA = refIdxB;
1595 }
1596
1597 // 5.
1598
1599 // If no A predictor, we output the unscaled B as the A predictor (above)
1600 // and also add a scaled B predictor here.
1601 // If there is (probably) an A predictor, no differing-POC B predictor is generated.
1602 if (isScaledFlagLX==0) {
1603 out_availableFlagLXN[B]=0;
1604
1605 for (int k=0 ; k<=2 && out_availableFlagLXN[B]==0 ; k++) {
1606 int refPicList=-1;
1607
1608 if (availableB[k]) {
1609 int Y=1-X;
1610
1611 const PredVectorInfo* vi = img->get_mv_info(xB[k],yB[k]);
1612
1613 if (vi->predFlag[X]==1 &&
1614 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
1615 out_availableFlagLXN[B]=1;
1616 out_mvLXN[B] = vi->mv[X];
1617 refIdxB = vi->refIdx[X];
1618 refPicList = X;
1619 }
1620 else if (vi->predFlag[Y]==1 &&
1621 shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi->refIdx[Y] ]) {
1622 out_availableFlagLXN[B]=1;
1623 out_mvLXN[B] = vi->mv[Y];
1624 refIdxB = vi->refIdx[Y];
1625 refPicList = Y;
1626 }
1627 }
1628
1629 if (out_availableFlagLXN[B]==1) {
1630 assert(refPicList>=0);
1631 assert(refIdxB>=0);
1632
1633 const de265_image* refPicB=img->decctx->get_image(shdr->RefPicList[refPicList][refIdxB ]);
1634 const de265_image* refPicX=img->decctx->get_image(shdr->RefPicList[X ][refIdxLX]);
1635
1636 int isLongTermB = shdr->LongTermRefPic[refPicList][refIdxB ];
1637 int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX];
1638
1639 if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal &&
1640 !isLongTermB && !isLongTermX) {
1641 int distB = img->PicOrderCntVal - refPicB->PicOrderCntVal;
1642 int distX = img->PicOrderCntVal - referenced_POC;
1643
1644 logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal);
1645
1646 if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) {
1647 img->decctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
1648 img->integrity = INTEGRITY_DECODING_ERRORS;
1649 }
1650 }
1651 }
1652 }
1653 }
1654 }
1655
1656 // 8.5.3.1.5
1657 MotionVector luma_motion_vector_prediction(decoder_context* ctx,
1658 thread_context* tctx,
1659 int xC,int yC,int nCS,int xP,int yP,
1660 int nPbW,int nPbH, int l,
1661 int refIdx, int partIdx)
1662 {
1663 const slice_segment_header* shdr = tctx->shdr;
1664
1665
1666 // 8.5.3.1.6: derive two spatial vector predictors A (0) and B (1)
1667
1668 uint8_t availableFlagLXN[2];
1669 MotionVector mvLXN[2];
1670
1671 derive_spatial_luma_vector_prediction(tctx->img, shdr, xC,yC, nCS, xP,yP, nPbW,nPbH, l, refIdx, partIdx,
1672 availableFlagLXN, mvLXN);
1673
1674 // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same,
1675 // derive a temporal predictor
1676
1677 uint8_t availableFlagLXCol;
1678 MotionVector mvLXCol;
1679
1680
1681 if (availableFlagLXN[0] &&
1682 availableFlagLXN[1] &&
1683 (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)) {
1684 availableFlagLXCol = 0;
1685 }
1686 else {
1687 derive_temporal_luma_vector_prediction(ctx, tctx->img, shdr, xP,yP, nPbW,nPbH, refIdx,l,
1688 &mvLXCol, &availableFlagLXCol);
1689 }
1690
1691
1692 // --- build candidate vector list with exactly two entries ---
1693
1694 int numMVPCandLX=0;
1695
1696 // spatial predictor A
1697
1698 MotionVector mvpList[3];
1699 if (availableFlagLXN[0])
1700 {
1701 mvpList[numMVPCandLX++] = mvLXN[0];
1702 }
1703
1704 // spatial predictor B (if not same as A)
1705
1706 if (availableFlagLXN[1] &&
1707       (!availableFlagLXN[0] || // in case A is not available but mvLXA was initialized to the same value as mvLXB
1708 (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)))
1709 {
1710 mvpList[numMVPCandLX++] = mvLXN[1];
1711 }
1712
1713 // temporal predictor
1714
1715 if (availableFlagLXCol)
1716 {
1717 mvpList[numMVPCandLX++] = mvLXCol;
1718 }
1719
1720 // fill with zero predictors
1721
1722 while (numMVPCandLX<2) {
1723 mvpList[numMVPCandLX].x = 0;
1724 mvpList[numMVPCandLX].y = 0;
1725 numMVPCandLX++;
1726 }
1727
1728
1729 // select predictor according to mvp_lX_flag
1730
1731 return mvpList[ tctx->mvp_lX_flag[l] ];
1732 }
1733
1734 #if DE265_LOG_TRACE
1735 void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const VectorInfo* mv)
1736 {
1737 int pred0 = mv->lum.predFlag[0];
1738 int pred1 = mv->lum.predFlag[1];
1739
1740 logtrace(LogMotion,
1741 "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode,
1742 pred0,
1743 pred0 ? mv->lum.mv[0].x : 0,pred0 ? mv->lum.mv[0].y : 0, pred0 ? mv->lum.refIdx[0] : 0,
1744 pred1,
1745 pred1 ? mv->lum.mv[1].x : 0,pred1 ? mv->lum.mv[1].y : 0, pred1 ? mv->lum.refIdx[1] : 0);
1746 }
1747 #else
1748 #define logMV(x0,y0,nPbW,nPbH,mode,mv)
1749 #endif
1750
1751
1752
1753 // 8.5.3.1
1754 void motion_vectors_and_ref_indices(decoder_context* ctx,
1755 thread_context* tctx,
1756 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx,
1757 VectorInfo* out_vi)
1758 {
1759 //slice_segment_header* shdr = tctx->shdr;
1760
1761 int xP = xC+xB;
1762 int yP = yC+yB;
1763
1764 enum PredMode predMode = tctx->img->get_pred_mode(xC,yC);
1765
1766 if (predMode == MODE_SKIP ||
1767 (predMode == MODE_INTER && tctx->merge_flag))
1768 {
1769 derive_luma_motion_merge_mode(ctx,tctx, xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, out_vi);
1770
1771 logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi);
1772 }
1773 else {
1774 int mvdL[2][2];
1775 MotionVector mvpL[2];
1776
1777 for (int l=0;l<2;l++) {
1778 // 1.
1779
1780 enum InterPredIdc inter_pred_idc = (enum InterPredIdc)tctx->inter_pred_idc;
1781
1782 if (inter_pred_idc == PRED_BI ||
1783 (inter_pred_idc == PRED_L0 && l==0) ||
1784 (inter_pred_idc == PRED_L1 && l==1)) {
1785 out_vi->lum.refIdx[l] = tctx->refIdx[l];
1786 out_vi->lum.predFlag[l] = 1;
1787 }
1788 else {
1789 out_vi->lum.refIdx[l] = -1;
1790 out_vi->lum.predFlag[l] = 0;
1791 }
1792
1793 // 2.
1794
1795 mvdL[l][0] = tctx->mvd[l][0];
1796 mvdL[l][1] = tctx->mvd[l][1];
1797
1798
1799 if (out_vi->lum.predFlag[l]) {
1800 // 3.
1801
1802 mvpL[l] = luma_motion_vector_prediction(ctx,tctx,xC,yC,nCS,xP,yP, nPbW,nPbH, l,
1803 out_vi->lum.refIdx[l], partIdx);
1804
1805 // 4.
1806
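        // adding 0x10000 before masking keeps the operand non-negative; the mask
        // computes (mvp + mvd) modulo 2^16, and the lines below reinterpret the
        // result as a signed 16-bit value (e.g. 0x7FFF + 1 wraps to -0x8000).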
1807 int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF;
1808 int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF;
1809
1810 out_vi->lum.mv[l].x = (x>=0x8000) ? x-0x10000 : x;
1811 out_vi->lum.mv[l].y = (y>=0x8000) ? y-0x10000 : y;
1812 }
1813 }
1814
1815 logMV(xP,yP,nPbW,nPbH, "mvp", out_vi);
1816 }
1817 }
1818
1819
1820 // 8.5.3
1821 void decode_prediction_unit(thread_context* tctx,
1822 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx)
1823 {
1824 logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n",
1825 tctx->img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH);
1826
1827 slice_segment_header* shdr = tctx->shdr;
1828
1829 // 1.
1830
1831 VectorInfo vi;
1832 motion_vectors_and_ref_indices(tctx->decctx,tctx, xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi);
1833
1834 // 2.
1835
1836 generate_inter_prediction_samples(tctx->decctx,tctx->img, shdr, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi);
1837
1838
1839 tctx->img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, &vi.lum);
1840 }
1841
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
4444 } VectorInfo;
4545
4646
47 void decode_prediction_unit(struct thread_context* shdr,
48 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx);
49
50 void inter_prediction(struct decoder_context* ctx,struct slice_segment_header* shdr,
51 int xC,int yC, int log2CbSize);
52
4753 #endif
+0
-32
libde265/motion_func.h
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_MOTION_FUNC_H
21 #define DE265_MOTION_FUNC_H
22
23 #include "libde265/decctx.h"
24
25 void decode_prediction_unit(decoder_context* ctx,thread_context* shdr,
26 int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx);
27
28 void inter_prediction(decoder_context* ctx,slice_segment_header* shdr,
29 int xC,int yC, int log2CbSize);
30
31 #endif
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "nal-parser.h"
21
22 #include <string.h>
23 #include <assert.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30
31
32 NAL_unit::NAL_unit()
33 : skipped_bytes(DE265_SKIPPED_BYTES_INITIAL_SIZE)
34 {
35 pts=0;
36 user_data = NULL;
37
38 nal_data = NULL;
39 data_size = 0;
40 capacity = 0;
41 }
42
43 NAL_unit::~NAL_unit()
44 {
45 free(nal_data);
46 }
47
48 void NAL_unit::clear()
49 {
50 header = nal_header();
51 pts = 0;
52 user_data = NULL;
53
54 // set size to zero but keep memory
55 data_size = 0;
56
57 skipped_bytes.clear();
58 }
59
60 void NAL_unit::resize(int new_size)
61 {
62 if (capacity < new_size) {
63 unsigned char* newbuffer = (unsigned char*)malloc(new_size);
64
65 if (nal_data != NULL) {
66 memcpy(newbuffer, nal_data, data_size);
67 free(nal_data);
68 }
69
70 nal_data = newbuffer;
71 capacity = new_size;
72 }
73 }
74
75 void NAL_unit::append(const unsigned char* in_data, int n)
76 {
77 resize(data_size + n);
78 memcpy(nal_data + data_size, in_data, n);
79 data_size += n;
80 }
81
82 void NAL_unit::set_data(const unsigned char* in_data, int n)
83 {
84 resize(n);
85 memcpy(nal_data, in_data, n);
86 data_size = n;
87 }
88
89 void NAL_unit::insert_skipped_byte(int pos)
90 {
91 skipped_bytes.push_back(pos);
92 }
93
94 int NAL_unit::num_skipped_bytes_before(int byte_position, int headerLength) const
95 {
96 for (int k=skipped_bytes.size()-1;k>=0;k--)
97 if (skipped_bytes[k]-headerLength <= byte_position) {
98 return k+1;
99 }
100
101 return 0;
102 }
103
104 void NAL_unit::remove_stuffing_bytes()
105 {
106 uint8_t* p = data();
107
108 for (int i=0;i<size()-2;i++)
109 {
110 #if 0
111 for (int k=i;k<i+64;k++)
112 if (i*0+k<size()) {
113 printf("%c%02x", (k==i) ? '[':' ', data()[k]);
114 }
115 printf("\n");
116 #endif
117
118 if (p[2]!=3 && p[2]!=0) {
119 // fast forward 3 bytes (2+1)
120 p+=2;
121 i+=2;
122 }
123 else {
124 if (p[0]==0 && p[1]==0 && p[2]==3) {
125 //printf("SKIP NAL @ %d\n",i+2+num_skipped_bytes);
126 insert_skipped_byte(i+2 + num_skipped_bytes());
127
128 memmove(p+2, p+3, size()-i-3);
129 set_size(size()-1);
130
131 p++;
132 i++;
133 }
134 }
135
136 p++;
137 }
138 }
139
140
141
142
143
144 NAL_Parser::NAL_Parser()
145 {
146 end_of_stream = false;
147 input_push_state = 0;
148 pending_input_NAL = NULL;
149 nBytes_in_NAL_queue = 0;
150 }
151
152
153 NAL_Parser::~NAL_Parser()
154 {
155 // --- free NAL queues ---
156
157 // empty NAL queue
158
159 NAL_unit* nal;
160 while ( (nal = pop_from_NAL_queue()) ) {
161 free_NAL_unit(nal);
162 }
163
164 // free the pending input NAL
165
166 if (pending_input_NAL != NULL) {
167 free_NAL_unit(pending_input_NAL);
168 }
169
170 // free all NALs in free-list
171
172 for (int i=0;i<NAL_free_list.size();i++) {
173 delete NAL_free_list[i];
174 }
175 }
176
177
178 NAL_unit* NAL_Parser::alloc_NAL_unit(int size)
179 {
180 NAL_unit* nal;
181
182 // --- get NAL-unit object ---
183
184 if (NAL_free_list.size() > 0) {
185 nal = NAL_free_list.back();
186 NAL_free_list.pop_back();
187 }
188 else {
189 nal = new NAL_unit;
190 }
191
192 nal->clear();
193 nal->resize(size);
194
195 return nal;
196 }
197
198 void NAL_Parser::free_NAL_unit(NAL_unit* nal)
199 {
200 if (NAL_free_list.size() < DE265_NAL_FREE_LIST_SIZE) {
201 NAL_free_list.push_back(nal);
202 }
203 else {
204 delete nal;
205 }
206 }
207
208 NAL_unit* NAL_Parser::pop_from_NAL_queue()
209 {
210 if (NAL_queue.empty()) {
211 return NULL;
212 }
213 else {
214 NAL_unit* nal = NAL_queue.front();
215 NAL_queue.pop();
216
217 nBytes_in_NAL_queue -= nal->size();
218
219 return nal;
220 }
221 }
222
223 void NAL_Parser::push_to_NAL_queue(NAL_unit* nal)
224 {
225 NAL_queue.push(nal);
226 nBytes_in_NAL_queue += nal->size();
227 }
228
229 de265_error NAL_Parser::push_data(const unsigned char* data, int len,
230 de265_PTS pts, void* user_data)
231 {
232 if (pending_input_NAL == NULL) {
233 pending_input_NAL = alloc_NAL_unit(len+3);
234 pending_input_NAL->pts = pts;
235 pending_input_NAL->user_data = user_data;
236 }
237
238 NAL_unit* nal = pending_input_NAL; // shortcut
239
240 // Resize the output buffer so that the complete input will fit.
241 // We add 3 because, in the worst case, 3 extra bytes are created for a single input byte.
242 nal->resize(nal->size() + len + 3);
243
244 unsigned char* out = nal->data() + nal->size();
245
246 for (int i=0;i<len;i++) {
247 /*
248 printf("state=%d input=%02x (%p) (output size: %d)\n",ctx->input_push_state, *data, data,
249 out - ctx->nal_data.data);
250 */
251
252 switch (input_push_state) {
253 case 0:
254 case 1:
255 if (*data == 0) { input_push_state++; }
256 else { input_push_state=0; }
257 break;
258 case 2:
259 if (*data == 1) { input_push_state=3; } // nal->clear_skipped_bytes(); }
260 else if (*data == 0) { } // *out++ = 0; }
261 else { input_push_state=0; }
262 break;
263 case 3:
264 *out++ = *data;
265 input_push_state = 4;
266 break;
267 case 4:
268 *out++ = *data;
269 input_push_state = 5;
270 break;
271
272 case 5:
273 if (*data==0) { input_push_state=6; }
274 else { *out++ = *data; }
275 break;
276
277 case 6:
278 if (*data==0) { input_push_state=7; }
279 else {
280 *out++ = 0;
281 *out++ = *data;
282 input_push_state=5;
283 }
284 break;
285
286 case 7:
287 if (*data==0) { *out++ = 0; }
288 else if (*data==3) {
289 *out++ = 0; *out++ = 0; input_push_state=5;
290
291 // remember which byte we removed
292 nal->insert_skipped_byte((out - nal->data()) + nal->num_skipped_bytes());
293 }
294 else if (*data==1) {
295
296 #if DEBUG_INSERT_STREAM_ERRORS
297 if ((rand()%100)<90 && nal_data.size>0) {
298 int pos = rand()%nal_data.size;
299 int bit = rand()%8;
300 nal->nal_data.data[pos] ^= 1<<bit;
301
302 //printf("inserted error...\n");
303 }
304 #endif
305
306 nal->set_size(out - nal->data());
307
308 // push this NAL to the decoder queue
309 push_to_NAL_queue(nal);
310
311
312 // initialize new, empty NAL unit
313
314 pending_input_NAL = alloc_NAL_unit(len+3);
315 pending_input_NAL->pts = pts;
316 nal = pending_input_NAL;
317 out = nal->data();
318
319 input_push_state=3;
320 //nal->clear_skipped_bytes();
321 }
322 else {
323 *out++ = 0;
324 *out++ = 0;
325 *out++ = *data;
326
327 input_push_state=5;
328 }
329 break;
330 }
331
332 data++;
333 }
334
335 nal->set_size(out - nal->data());
336 return DE265_OK;
337 }
338
339
340 de265_error NAL_Parser::push_NAL(const unsigned char* data, int len,
341 de265_PTS pts, void* user_data)
342 {
343
344 // Cannot use byte-stream input and NAL input at the same time.
345 assert(pending_input_NAL == NULL);
346
347 NAL_unit* nal = alloc_NAL_unit(len);
348 nal->set_data(data, len);
349 nal->pts = pts;
350 nal->user_data = user_data;
351
352 nal->remove_stuffing_bytes();
353
354 push_to_NAL_queue(nal);
355
356 return DE265_OK;
357 }
358
359
360 de265_error NAL_Parser::flush_data()
361 {
362 if (pending_input_NAL) {
363 NAL_unit* nal = pending_input_NAL;
364 uint8_t null[2] = { 0,0 };
365
366 // append bytes that are implied by the push state
367
368 if (input_push_state==6) { nal->append(null,1); }
369 if (input_push_state==7) { nal->append(null,2); }
370
371
372 // only push the NAL if it contains at least the NAL header
373
374 if (input_push_state>=5) {
375 push_to_NAL_queue(nal);
376 pending_input_NAL = NULL;
377 }
378
379 input_push_state = 0;
380 }
381
382 return DE265_OK;
383 }
384
385
386 void NAL_Parser::remove_pending_input_data()
387 {
388 // --- remove pending input data ---
389
390 if (pending_input_NAL) {
391 free_NAL_unit(pending_input_NAL);
392 pending_input_NAL = NULL;
393 }
394
395 for (;;) {
396 NAL_unit* nal = pop_from_NAL_queue();
397 if (nal) { free_NAL_unit(nal); }
398 else break;
399 }
400
401 input_push_state = 0;
402 nBytes_in_NAL_queue = 0;
403 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_NAL_PARSER_H
21 #define DE265_NAL_PARSER_H
22
23 #include "libde265/sps.h"
24 #include "libde265/pps.h"
25 #include "libde265/nal.h"
26
27 #include <vector>
28 #include <queue>
29
30 #define DE265_NAL_FREE_LIST_SIZE 16
31 #define DE265_SKIPPED_BYTES_INITIAL_SIZE 16
32
33
34 class NAL_unit {
35 public:
36 NAL_unit();
37 ~NAL_unit();
38
39 nal_header header;
40
41 de265_PTS pts;
42 void* user_data;
43
44
45 void clear();
46
47 // --- rbsp data ---
48
49 void resize(int new_size);
50 void append(const unsigned char* data, int n);
51 void set_data(const unsigned char* data, int n);
52
53 int size() const { return data_size; }
54 void set_size(int s) { data_size=s; }
55 unsigned char* data() { return nal_data; }
56 const unsigned char* data() const { return nal_data; }
57
58
59 // --- skipped stuffing bytes ---
60
61 int num_skipped_bytes_before(int byte_position, int headerLength) const;
62 int num_skipped_bytes() const { return skipped_bytes.size(); }
63
64 //void clear_skipped_bytes() { skipped_bytes.clear(); }
65
66 /* Mark a byte as skipped. It is assumed that the byte is already removed
67 from the input data. The NAL data is not modified.
68 */
69 void insert_skipped_byte(int pos);
70
71 /* Remove all stuffing bytes from NAL data. The NAL data is modified and
72 the removed bytes are marked as skipped bytes.
73 */
74 void remove_stuffing_bytes();
75
76 private:
77 unsigned char* nal_data;
78 int data_size;
79 int capacity;
80
81 std::vector<int> skipped_bytes; // positions of the removed stuffing bytes, in increasing order
82 };
83
84
85 class NAL_Parser
86 {
87 public:
88 NAL_Parser();
89 ~NAL_Parser();
90
91 de265_error push_data(const unsigned char* data, int len,
92 de265_PTS pts, void* user_data);
93
94 de265_error push_NAL(const unsigned char* data, int len,
95 de265_PTS pts, void* user_data);
96
97 NAL_unit* pop_from_NAL_queue();
98 void push_to_NAL_queue(NAL_unit*);
99 de265_error flush_data();
100 void mark_end_of_stream() { end_of_stream=true; }
101
102 void remove_pending_input_data();
103
104 int bytes_in_input_queue() const {
105 int size = nBytes_in_NAL_queue;
106 if (pending_input_NAL) { size += pending_input_NAL->size(); }
107 return size;
108 }
109
110 int number_of_NAL_units_pending() const {
111 int size = NAL_queue.size();
112 if (pending_input_NAL) { size++; }
113 return size;
114 }
115
116 void free_NAL_unit(NAL_unit*);
117
118
119 int get_NAL_queue_length() const { return NAL_queue.size(); }
120 bool is_end_of_stream() const { return end_of_stream; }
121
122 private:
123 // byte-stream level
124
125 bool end_of_stream; // data in pending_input_data is end of stream
126 int input_push_state;
127
128 NAL_unit* pending_input_NAL;
129
130
131 // NAL level
132
133 std::queue<NAL_unit*> NAL_queue; // enqueued NALs have stuffing bytes removed
134 int nBytes_in_NAL_queue; // data bytes currently in NAL_queue
135
136
137 // pool of unused NAL memory
138
139 std::vector<NAL_unit*> NAL_free_list; // maximum size: DE265_NAL_FREE_LIST_SIZE
140
141 NAL_unit* alloc_NAL_unit(int size);
142 };
143
144
145 #endif
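For orientation between the NAL_Parser implementation and the class declaration above, here is a minimal usage sketch of the byte-stream input path. It is not part of the upstream sources: the include path, the illustrative annex-B byte values, and the 0/NULL placeholders for pts and user_data are assumptions made only for this example.

#include "libde265/nal-parser.h"
#include <cstddef>
#include <cstdio>

int main()
{
  NAL_Parser parser;

  // Illustrative annex-B fragment: start code, two-byte NAL header,
  // payload containing one emulation-prevention byte (00 00 03 00 -> 00 00 00).
  const unsigned char stream[] = {
    0x00,0x00,0x01,                 // start code
    0x40,0x01,                      // NAL header
    0x11,0x00,0x00,0x03,0x00,0x22   // payload with stuffing byte
  };

  parser.push_data(stream, sizeof(stream), 0 /*pts*/, NULL /*user_data*/);
  parser.flush_data();   // push the pending NAL (byte-stream input has no trailing start code)

  while (NAL_unit* nal = parser.pop_from_NAL_queue()) {
    printf("NAL of %d bytes, %d stuffing byte(s) removed\n",
           nal->size(), nal->num_skipped_bytes());
    parser.free_NAL_unit(nal);
  }

  return 0;
}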
+0
-140
libde265/nal.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "nal.h"
21 #include <assert.h>
22
23
24 void nal_read_header(bitreader* reader, nal_header* hdr)
25 {
26 skip_bits(reader,1);
27 hdr->nal_unit_type = get_bits(reader,6);
28 hdr->nuh_layer_id = get_bits(reader,6);
29 hdr->nuh_temporal_id = get_bits(reader,3) -1;
30 }
31
32
33 bool isIDR(uint8_t unit_type)
34 {
35 return (unit_type == NAL_UNIT_IDR_W_RADL ||
36 unit_type == NAL_UNIT_IDR_N_LP);
37 }
38
39 bool isBLA(uint8_t unit_type)
40 {
41 return (unit_type == NAL_UNIT_BLA_W_LP ||
42 unit_type == NAL_UNIT_BLA_W_RADL ||
43 unit_type == NAL_UNIT_BLA_N_LP);
44 }
45
46 bool isCRA(uint8_t unit_type)
47 {
48 return unit_type == NAL_UNIT_CRA_NUT;
49 }
50
51 bool isRAP(uint8_t unit_type)
52 {
53 return isIDR(unit_type) || isBLA(unit_type) || isCRA(unit_type);
54 }
55
56 bool isRASL(uint8_t unit_type)
57 {
58 return (unit_type == NAL_UNIT_RASL_N ||
59 unit_type == NAL_UNIT_RASL_R);
60 }
61
62 bool isIRAP(uint8_t unit_type)
63 {
64 return (unit_type >= NAL_UNIT_BLA_W_LP &&
65 unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23);
66 }
67
68 bool isRADL(uint8_t unit_type)
69 {
70 return (unit_type == NAL_UNIT_RADL_N ||
71 unit_type == NAL_UNIT_RADL_R);
72 }
73
74
75 bool isReferenceNALU(uint8_t unit_type)
76 {
77 return ( ((unit_type <= NAL_UNIT_RESERVED_VCL_R15) && (unit_type%2 != 0)) ||
78 ((unit_type >= NAL_UNIT_BLA_W_LP) &&
79 (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) );
80 }
81
82
83 static const char* NAL_unit_name[] = {
84 "TRAIL_N", // 0
85 "TRAIL_R",
86 "TSA_N",
87 "TSA_R",
88 "STSA_N",
89 "STSA_R", // 5
90 "RADL_N",
91 "RADL_R",
92 "RASL_N",
93 "RASL_R",
94 "RESERVED_VCL_N10", // 10
95 "RESERVED_VCL_R11",
96 "RESERVED_VCL_N12",
97 "RESERVED_VCL_R13",
98 "RESERVED_VCL_N14",
99 "RESERVED_VCL_R15", // 15
100 "BLA_W_LP",
101 "BLA_W_RADL",
102 "BLA_N_LP",
103 "IDR_W_RADL",
104 "IDR_N_LP", // 20
105 "CRA_NUT",
106 "RESERVED_IRAP_VCL22",
107 "RESERVED_IRAP_VCL23",
108 "RESERVED_VCL24",
109 "RESERVED_VCL25", // 25
110 "RESERVED_VCL26",
111 "RESERVED_VCL27",
112 "RESERVED_VCL28",
113 "RESERVED_VCL29",
114 "RESERVED_VCL30", // 30
115 "RESERVED_VCL31",
116 "VPS",
117 "SPS",
118 "PPS",
119 "AUD", // 35
120 "EOS",
121 "EOB",
122 "FD",
123 "PREFIX_SEI",
124 "SUFFIX_SEI", // 40
125 "RESERVED_NVCL41",
126 "RESERVED_NVCL42",
127 "RESERVED_NVCL43",
128 "RESERVED_NVCL44",
129 "RESERVED_NVCL45", // 45
130 "RESERVED_NVCL46",
131 "RESERVED_NVCL47"
132 };
133
134 const char* get_NAL_name(uint8_t unit_type)
135 {
136 assert(unit_type <= 47);
137 return NAL_unit_name[unit_type];
138 }
139
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "nal.h"
21 #include <assert.h>
22
23
24 void nal_read_header(bitreader* reader, nal_header* hdr)
25 {
26 skip_bits(reader,1);
27 hdr->nal_unit_type = get_bits(reader,6);
28 hdr->nuh_layer_id = get_bits(reader,6);
29 hdr->nuh_temporal_id = get_bits(reader,3) -1;
30 }
31
32
33 bool isIDR(uint8_t unit_type)
34 {
35 return (unit_type == NAL_UNIT_IDR_W_RADL ||
36 unit_type == NAL_UNIT_IDR_N_LP);
37 }
38
39 bool isBLA(uint8_t unit_type)
40 {
41 return (unit_type == NAL_UNIT_BLA_W_LP ||
42 unit_type == NAL_UNIT_BLA_W_RADL ||
43 unit_type == NAL_UNIT_BLA_N_LP);
44 }
45
46 bool isCRA(uint8_t unit_type)
47 {
48 return unit_type == NAL_UNIT_CRA_NUT;
49 }
50
51 bool isRAP(uint8_t unit_type)
52 {
53 return isIDR(unit_type) || isBLA(unit_type) || isCRA(unit_type);
54 }
55
56 bool isRASL(uint8_t unit_type)
57 {
58 return (unit_type == NAL_UNIT_RASL_N ||
59 unit_type == NAL_UNIT_RASL_R);
60 }
61
62 bool isIRAP(uint8_t unit_type)
63 {
64 return (unit_type >= NAL_UNIT_BLA_W_LP &&
65 unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23);
66 }
67
68 bool isRADL(uint8_t unit_type)
69 {
70 return (unit_type == NAL_UNIT_RADL_N ||
71 unit_type == NAL_UNIT_RADL_R);
72 }
73
74
75 bool isReferenceNALU(uint8_t unit_type)
76 {
77 return ( ((unit_type <= NAL_UNIT_RESERVED_VCL_R15) && (unit_type%2 != 0)) ||
78 ((unit_type >= NAL_UNIT_BLA_W_LP) &&
79 (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) );
80 }
81
82
83 static const char* NAL_unit_name[] = {
84 "TRAIL_N", // 0
85 "TRAIL_R",
86 "TSA_N",
87 "TSA_R",
88 "STSA_N",
89 "STSA_R", // 5
90 "RADL_N",
91 "RADL_R",
92 "RASL_N",
93 "RASL_R",
94 "RESERVED_VCL_N10", // 10
95 "RESERVED_VCL_R11",
96 "RESERVED_VCL_N12",
97 "RESERVED_VCL_R13",
98 "RESERVED_VCL_N14",
99 "RESERVED_VCL_R15", // 15
100 "BLA_W_LP",
101 "BLA_W_RADL",
102 "BLA_N_LP",
103 "IDR_W_RADL",
104 "IDR_N_LP", // 20
105 "CRA_NUT",
106 "RESERVED_IRAP_VCL22",
107 "RESERVED_IRAP_VCL23",
108 "RESERVED_VCL24",
109 "RESERVED_VCL25", // 25
110 "RESERVED_VCL26",
111 "RESERVED_VCL27",
112 "RESERVED_VCL28",
113 "RESERVED_VCL29",
114 "RESERVED_VCL30", // 30
115 "RESERVED_VCL31",
116 "VPS",
117 "SPS",
118 "PPS",
119 "AUD", // 35
120 "EOS",
121 "EOB",
122 "FD",
123 "PREFIX_SEI",
124 "SUFFIX_SEI", // 40
125 "RESERVED_NVCL41",
126 "RESERVED_NVCL42",
127 "RESERVED_NVCL43",
128 "RESERVED_NVCL44",
129 "RESERVED_NVCL45", // 45
130 "RESERVED_NVCL46",
131 "RESERVED_NVCL47"
132 };
133
134 const char* get_NAL_name(uint8_t unit_type)
135 {
136 assert(unit_type <= 47);
137 return NAL_unit_name[unit_type];
138 }
139
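As a hand check of nal_read_header() in the file above, the sketch below decodes the two-byte NAL header with plain bit arithmetic instead of the bitreader, following the layout the function reads: 1 forbidden bit, 6-bit nal_unit_type, 6-bit nuh_layer_id, 3-bit nuh_temporal_id_plus1. This is not part of the upstream sources; the example header bytes are illustrative.

#include <stdint.h>
#include <cstdio>

// Decode the two-byte HEVC NAL header without a bitreader.
// Layout: 1 forbidden_zero_bit, 6 bits nal_unit_type,
//         6 bits nuh_layer_id, 3 bits nuh_temporal_id_plus1.
static void decode_nal_header(uint8_t b0, uint8_t b1)
{
  int nal_unit_type   = (b0 >> 1) & 0x3F;
  int nuh_layer_id    = ((b0 & 0x01) << 5) | (b1 >> 3);
  int nuh_temporal_id = (b1 & 0x07) - 1;   // same "-1" as nal_read_header()

  printf("type=%d layer=%d temporal=%d\n",
         nal_unit_type, nuh_layer_id, nuh_temporal_id);
}

int main()
{
  decode_nal_header(0x40, 0x01);  // -> type 32 (VPS), layer 0, temporal 0
  decode_nal_header(0x26, 0x01);  // -> type 19 (IDR_W_RADL), layer 0, temporal 0
  return 0;
}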
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
3131
3232 #include "libde265/bitstream.h"
3333
34 typedef struct {
34 struct nal_header {
35 nal_header() {
36 nal_unit_type = 0;
37 nuh_layer_id = 0;
38 nuh_temporal_id = 0;
39 }
40
3541 int nal_unit_type;
3642 int nuh_layer_id;
3743 int nuh_temporal_id;
38 } nal_header;
44 };
3945
4046 #define NAL_UNIT_TRAIL_N 0
4147 #define NAL_UNIT_TRAIL_R 1
+0
-593
libde265/pps.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "pps.h"
21 #include "pps_func.h"
22 #include "sps_func.h"
23 #include "util.h"
24
25 #include <assert.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #if defined(_MSC_VER) || defined(__MINGW32__)
29 # include <malloc.h>
30 #else
31 # include <alloca.h>
32 #endif
33
34
35 bool read_pps(bitreader* br, pic_parameter_set* pps, decoder_context* ctx)
36 {
37 pps->pps_read = false; // incomplete pps
38
39 int uvlc;
40 pps->pic_parameter_set_id = uvlc = get_uvlc(br);
41 if (uvlc >= DE265_MAX_PPS_SETS ||
42 uvlc == UVLC_ERROR) {
43 add_warning(ctx, DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
44 return false;
45 }
46
47 pps->seq_parameter_set_id = uvlc = get_uvlc(br);
48 if (uvlc >= DE265_MAX_PPS_SETS ||
49 uvlc == UVLC_ERROR) {
50 add_warning(ctx, DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
51 return false;
52 }
53
54 pps->dependent_slice_segments_enabled_flag = get_bits(br,1);
55 pps->output_flag_present_flag = get_bits(br,1);
56 pps->num_extra_slice_header_bits = get_bits(br,3);
57 pps->sign_data_hiding_flag = get_bits(br,1);
58 pps->cabac_init_present_flag = get_bits(br,1);
59 pps->num_ref_idx_l0_default_active = uvlc = get_uvlc(br);
60 if (uvlc == UVLC_ERROR) {
61 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
62 return false;
63 }
64 pps->num_ref_idx_l0_default_active++;
65
66 pps->num_ref_idx_l1_default_active = uvlc = get_uvlc(br);
67 if (uvlc == UVLC_ERROR) {
68 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
69 return false;
70 }
71 pps->num_ref_idx_l1_default_active++;
72
73
74 seq_parameter_set* sps = get_sps(ctx, pps->seq_parameter_set_id);
75 if (sps==NULL) {
76 add_warning(ctx, DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
77 return false;
78 }
79
80 if ((pps->pic_init_qp = get_svlc(br)) == UVLC_ERROR) {
81 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
82 return false;
83 }
84 pps->pic_init_qp += 26;
85
86 pps->constrained_intra_pred_flag = get_bits(br,1);
87 pps->transform_skip_enabled_flag = get_bits(br,1);
88 pps->cu_qp_delta_enabled_flag = get_bits(br,1);
89
90 if (pps->cu_qp_delta_enabled_flag) {
91 if ((pps->diff_cu_qp_delta_depth = get_uvlc(br)) == UVLC_ERROR) {
92 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
93 return false;
94 }
95 } else {
96 pps->diff_cu_qp_delta_depth = 0;
97 }
98
99 if ((pps->pic_cb_qp_offset = get_svlc(br)) == UVLC_ERROR) {
100 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
101 return false;
102 }
103
104 if ((pps->pic_cr_qp_offset = get_svlc(br)) == UVLC_ERROR) {
105 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
106 return false;
107 }
108
109 pps->pps_slice_chroma_qp_offsets_present_flag = get_bits(br,1);
110 pps->weighted_pred_flag = get_bits(br,1);
111 pps->weighted_bipred_flag = get_bits(br,1);
112 pps->transquant_bypass_enable_flag = get_bits(br,1);
113 pps->tiles_enabled_flag = get_bits(br,1);
114 pps->entropy_coding_sync_enabled_flag = get_bits(br,1);
115
116
117 // --- tiles ---
118
119 if (pps->tiles_enabled_flag ) {
120 pps->num_tile_columns = get_uvlc(br);
121 if (pps->num_tile_columns == UVLC_ERROR ||
122 pps->num_tile_columns > DE265_MAX_TILE_COLUMNS) {
123 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
124 return false;
125 }
126 pps->num_tile_columns++;
127
128 pps->num_tile_rows = get_uvlc(br);
129 if (pps->num_tile_rows == UVLC_ERROR ||
130 pps->num_tile_rows > DE265_MAX_TILE_ROWS) {
131 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
132 return false;
133 }
134 pps->num_tile_rows++;
135
136 pps->uniform_spacing_flag = get_bits(br,1);
137
138 if (pps->uniform_spacing_flag==false) {
139 int lastColumnWidth = sps->PicWidthInCtbsY;
140 int lastRowHeight = sps->PicHeightInCtbsY;
141
142 for (int i=0; i<pps->num_tile_columns-1; i++)
143 {
144 pps->colWidth[i] = get_uvlc(br);
145 if (pps->colWidth[i] == UVLC_ERROR) {
146 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
147 return false;
148 }
149 pps->colWidth[i]++;
150
151 lastColumnWidth -= pps->colWidth[i];
152 }
153
154 pps->colWidth[pps->num_tile_columns-1] = lastColumnWidth;
155
156 for (int i=0; i<pps->num_tile_rows-1; i++)
157 {
158 pps->rowHeight[i] = get_uvlc(br);
159 if (pps->rowHeight[i] == UVLC_ERROR) {
160 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
161 return false;
162 }
163 pps->rowHeight[i]++;
164 lastRowHeight -= pps->rowHeight[i];
165 }
166
167 pps->rowHeight[pps->num_tile_rows-1] = lastRowHeight;
168 }
169
170 pps->loop_filter_across_tiles_enabled_flag = get_bits(br,1);
171
172 } else {
173 pps->num_tile_columns = 1;
174 pps->num_tile_rows = 1;
175 pps->uniform_spacing_flag = 1;
176 pps->loop_filter_across_tiles_enabled_flag = 0;
177 }
178
179
180
181 if (pps->uniform_spacing_flag) {
182
183 // set columns widths
184
185 int *const colPos = (int *)alloca((pps->num_tile_columns+1) * sizeof(int));
186
187 for (int i=0;i<=pps->num_tile_columns;i++) {
188 colPos[i] = i*sps->PicWidthInCtbsY / pps->num_tile_columns;
189 }
190 for (int i=0;i<pps->num_tile_columns;i++) {
191 pps->colWidth[i] = colPos[i+1] - colPos[i];
192 }
193
194 // set row heights
195
196 int *const rowPos = (int *)alloca((pps->num_tile_rows+1) * sizeof(int));
197
198 for (int i=0;i<=pps->num_tile_rows;i++) {
199 rowPos[i] = i*sps->PicHeightInCtbsY / pps->num_tile_rows;
200 }
201 for (int i=0;i<pps->num_tile_rows;i++) {
202 pps->rowHeight[i] = rowPos[i+1] - rowPos[i];
203 }
204 }
205
206
207 // set tile boundaries
208
209 pps->colBd[0]=0;
210 for (int i=0;i<pps->num_tile_columns;i++) {
211 pps->colBd[i+1] = pps->colBd[i] + pps->colWidth[i];
212 }
213
214 pps->rowBd[0]=0;
215 for (int i=0;i<pps->num_tile_rows;i++) {
216 pps->rowBd[i+1] = pps->rowBd[i] + pps->rowHeight[i];
217 }
218
219
220
221 // alloc raster scan arrays
222
223 if (pps->CtbAddrRStoTS) { free(pps->CtbAddrRStoTS); }
224 if (pps->CtbAddrTStoRS) { free(pps->CtbAddrTStoRS); }
225 if (pps->TileId) { free(pps->TileId); }
226 if (pps->TileIdRS) { free(pps->TileIdRS); }
227 if (pps->MinTbAddrZS) { free(pps->MinTbAddrZS); }
228
229 pps->CtbAddrRStoTS = (int *)malloc( sizeof(int) * sps->PicSizeInCtbsY );
230 pps->CtbAddrTStoRS = (int *)malloc( sizeof(int) * sps->PicSizeInCtbsY );
231 pps->TileId = (int *)malloc( sizeof(int) * sps->PicSizeInCtbsY );
232 pps->TileIdRS = (int *)malloc( sizeof(int) * sps->PicSizeInCtbsY );
233 pps->MinTbAddrZS = (int *)malloc( sizeof(int) * sps->PicSizeInTbsY );
234
235
236 // raster scan (RS) <-> tile scan (TS) conversion
237
238 for (int ctbAddrRS=0 ; ctbAddrRS < sps->PicSizeInCtbsY ; ctbAddrRS++)
239 {
240 int tbX = ctbAddrRS % sps->PicWidthInCtbsY;
241 int tbY = ctbAddrRS / sps->PicWidthInCtbsY;
242 int tileX=-1,tileY=-1;
243
244 for (int i=0;i<pps->num_tile_columns;i++)
245 if (tbX >= pps->colBd[i])
246 tileX=i;
247
248 for (int j=0;j<pps->num_tile_rows;j++)
249 if (tbY >= pps->rowBd[j])
250 tileY=j;
251
252 pps->CtbAddrRStoTS[ctbAddrRS] = 0;
253 for (int i=0;i<tileX;i++)
254 pps->CtbAddrRStoTS[ctbAddrRS] += pps->rowHeight[tileY]*pps->colWidth[i];
255
256 for (int j=0;j<tileY;j++)
257 {
258 //pps->CtbAddrRStoTS[ctbAddrRS] += (tbY - pps->rowBd[tileY])*pps->colWidth[tileX];
259 //pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX];
260
261 pps->CtbAddrRStoTS[ctbAddrRS] += sps->PicWidthInCtbsY * pps->rowHeight[j];
262 }
263
264 assert(tileX>=0 && tileY>=0);
265
266 pps->CtbAddrRStoTS[ctbAddrRS] += (tbY-pps->rowBd[tileY])*pps->colWidth[tileX];
267 pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX];
268
269
270 // inverse mapping
271
272 pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[ctbAddrRS] ] = ctbAddrRS;
273 }
274
275
276 logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n");
277 for (int y=0;y<sps->PicHeightInCtbsY;y++)
278 {
279 for (int x=0;x<sps->PicWidthInCtbsY;x++)
280 {
281 logtrace(LogHeaders,"%3d ", pps->CtbAddrRStoTS[x + y*sps->PicWidthInCtbsY]);
282 }
283
284 logtrace(LogHeaders,"\n");
285 }
286
287
288 // tile id
289
290 for (int j=0, tIdx=0 ; j<pps->num_tile_rows ; j++)
291 for (int i=0 ; i<pps->num_tile_columns;i++)
292 {
293 for (int y=pps->rowBd[j] ; y<pps->rowBd[j+1] ; y++)
294 for (int x=pps->colBd[i] ; x<pps->colBd[i+1] ; x++) {
295 pps->TileId [ pps->CtbAddrRStoTS[y*sps->PicWidthInCtbsY + x] ] = tIdx;
296 pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ] = tIdx;
297
298 //logtrace(LogHeaders,"tileID[%d,%d] = %d\n",x,y,pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ]);
299 }
300
301 tIdx++;
302 }
303
304 logtrace(LogHeaders,"Tile IDs RS:\n");
305 for (int y=0;y<sps->PicHeightInCtbsY;y++) {
306 for (int x=0;x<sps->PicWidthInCtbsY;x++) {
307 logtrace(LogHeaders,"%2d ",pps->TileIdRS[y*sps->PicWidthInCtbsY+x]);
308 }
309 logtrace(LogHeaders,"\n");
310 }
311
312 // 6.5.2 Z-scan order array initialization process
313
314 for (int y=0;y<sps->PicHeightInTbsY;y++)
315 for (int x=0;x<sps->PicWidthInTbsY;x++)
316 {
317 int tbX = (x<<sps->Log2MinTrafoSize)>>sps->Log2CtbSizeY;
318 int tbY = (y<<sps->Log2MinTrafoSize)>>sps->Log2CtbSizeY;
319 int ctbAddrRS = sps->PicWidthInCtbsY*tbY + tbX;
320
321 pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] = pps->CtbAddrRStoTS[ctbAddrRS]
322 << ((sps->Log2CtbSizeY-sps->Log2MinTrafoSize)*2);
323
324 int p=0;
325 for (int i=0 ; i<(sps->Log2CtbSizeY - sps->Log2MinTrafoSize) ; i++) {
326 int m=1<<i;
327 p += (m & x ? m*m : 0) + (m & y ? 2*m*m : 0);
328 }
329
330 pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] += p;
331 }
332
333
334 // --- debug logging ---
335
336 /*
337 logtrace(LogHeaders,"6.5.2 Z-scan order array\n");
338 for (int y=0;y<sps->PicHeightInTbsY;y++)
339 {
340 for (int x=0;x<sps->PicWidthInTbsY;x++)
341 {
342 logtrace(LogHeaders,"%4d ", pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY]);
343 }
344
345 logtrace(LogHeaders,"\n");
346 }
347
348 for (int i=0;i<sps->PicSizeInTbsY;i++)
349 {
350 for (int y=0;y<sps->PicHeightInTbsY;y++)
351 {
352 for (int x=0;x<sps->PicWidthInTbsY;x++)
353 {
354 if (pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] == i) {
355 logtrace(LogHeaders,"%d %d\n",x,y);
356 }
357 }
358 }
359 }
360 */
361
362 // END tiles
363
364
365 pps->Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - pps->diff_cu_qp_delta_depth;
366
367
368 pps->beta_offset = 0; // default value
369 pps->tc_offset = 0; // default value
370
371 pps->pps_loop_filter_across_slices_enabled_flag = get_bits(br,1);
372 pps->deblocking_filter_control_present_flag = get_bits(br,1);
373 if (pps->deblocking_filter_control_present_flag) {
374 pps->deblocking_filter_override_enabled_flag = get_bits(br,1);
375 pps->pic_disable_deblocking_filter_flag = get_bits(br,1);
376 if (!pps->pic_disable_deblocking_filter_flag) {
377 pps->beta_offset = get_svlc(br);
378 if (pps->beta_offset == UVLC_ERROR) {
379 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
380 return false;
381 }
382 pps->beta_offset *= 2;
383
384 pps->tc_offset = get_svlc(br);
385 if (pps->tc_offset == UVLC_ERROR) {
386 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
387 return false;
388 }
389 pps->tc_offset *= 2;
390 }
391 }
392 else {
393 pps->deblocking_filter_override_enabled_flag = 0;
394 pps->pic_disable_deblocking_filter_flag = 0;
395 }
396
397
398 // --- scaling list ---
399
400 pps->pic_scaling_list_data_present_flag = get_bits(br,1);
401
402 // check consistency: if scaling-lists are not enabled, pic_scaling_list_data_present_flag
403 // must be FALSE
404 if (sps->scaling_list_enable_flag==0 &&
405 pps->pic_scaling_list_data_present_flag != 0) {
406 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
407 return false;
408 }
409
410 if (pps->pic_scaling_list_data_present_flag) {
411 de265_error err = read_scaling_list(br, sps, &pps->scaling_list, true);
412 if (err != DE265_OK) {
413 add_warning(ctx, err, false);
414 return false;
415 }
416 }
417 else {
418 memcpy(&pps->scaling_list, &sps->scaling_list, sizeof(scaling_list_data));
419 }
420
421
422
423
424 pps->lists_modification_present_flag = get_bits(br,1);
425 pps->log2_parallel_merge_level = get_uvlc(br);
426 if (pps->log2_parallel_merge_level == UVLC_ERROR) {
427 add_warning(ctx, DE265_WARNING_PPS_HEADER_INVALID, false);
428 return false;
429 }
430 pps->log2_parallel_merge_level += 2;
431
432 pps->slice_segment_header_extension_present_flag = get_bits(br,1);
433 pps->pps_extension_flag = get_bits(br,1);
434
435 if (pps->pps_extension_flag) {
436 //assert(false);
437 /*
438 while( more_rbsp_data() )
439
440 pps_extension_data_flag
441 u(1)
442 rbsp_trailing_bits()
443
444 }
445 */
446 }
447
448
449 pps->pps_read = true;
450
451 return true;
452 }
453
454
455 void dump_pps(pic_parameter_set* pps, int fd)
456 {
457 FILE* fh;
458 if (fd==1) fh=stdout;
459 else if (fd==2) fh=stderr;
460 else { return; }
461
462 #define LOG0(t) log2fh(fh, t)
463 #define LOG1(t,d) log2fh(fh, t,d)
464
465 LOG0("----------------- PPS -----------------\n");
466 LOG1("pic_parameter_set_id : %d\n", pps->pic_parameter_set_id);
467 LOG1("seq_parameter_set_id : %d\n", pps->seq_parameter_set_id);
468 LOG1("dependent_slice_segments_enabled_flag : %d\n", pps->dependent_slice_segments_enabled_flag);
469 LOG1("sign_data_hiding_flag : %d\n", pps->sign_data_hiding_flag);
470 LOG1("cabac_init_present_flag : %d\n", pps->cabac_init_present_flag);
471 LOG1("num_ref_idx_l0_default_active : %d\n", pps->num_ref_idx_l0_default_active);
472 LOG1("num_ref_idx_l1_default_active : %d\n", pps->num_ref_idx_l1_default_active);
473
474 LOG1("pic_init_qp : %d\n", pps->pic_init_qp);
475 LOG1("constrained_intra_pred_flag: %d\n", pps->constrained_intra_pred_flag);
476 LOG1("transform_skip_enabled_flag: %d\n", pps->transform_skip_enabled_flag);
477 LOG1("cu_qp_delta_enabled_flag : %d\n", pps->cu_qp_delta_enabled_flag);
478
479 if (pps->cu_qp_delta_enabled_flag) {
480 LOG1("diff_cu_qp_delta_depth : %d\n", pps->diff_cu_qp_delta_depth);
481 }
482
483 LOG1("pic_cb_qp_offset : %d\n", pps->pic_cb_qp_offset);
484 LOG1("pic_cr_qp_offset : %d\n", pps->pic_cr_qp_offset);
485 LOG1("pps_slice_chroma_qp_offsets_present_flag : %d\n", pps->pps_slice_chroma_qp_offsets_present_flag);
486 LOG1("weighted_pred_flag : %d\n", pps->weighted_pred_flag);
487 LOG1("weighted_bipred_flag : %d\n", pps->weighted_bipred_flag);
488 LOG1("output_flag_present_flag : %d\n", pps->output_flag_present_flag);
489 LOG1("transquant_bypass_enable_flag: %d\n", pps->transquant_bypass_enable_flag);
490 LOG1("tiles_enabled_flag : %d\n", pps->tiles_enabled_flag);
491 LOG1("entropy_coding_sync_enabled_flag: %d\n", pps->entropy_coding_sync_enabled_flag);
492
493 if (pps->tiles_enabled_flag) {
494 LOG1("num_tile_columns : %d\n", pps->num_tile_columns);
495 LOG1("num_tile_rows : %d\n", pps->num_tile_rows);
496 LOG1("uniform_spacing_flag: %d\n", pps->uniform_spacing_flag);
497
498 LOG0("tile column boundaries: ");
499 for (int i=0;i<=pps->num_tile_columns;i++) {
500 LOG1("*%d ",pps->colBd[i]);
501 }
502 LOG0("*\n");
503
504 LOG0("tile row boundaries: ");
505 for (int i=0;i<=pps->num_tile_rows;i++) {
506 LOG1("*%d ",pps->rowBd[i]);
507 }
508 LOG0("*\n");
509
510 //if( !uniform_spacing_flag ) {
511 /*
512 for( i = 0; i < num_tile_columns_minus1; i++ )
513
514 column_width_minus1[i]
515 ue(v)
516 for( i = 0; i < num_tile_rows_minus1; i++ )
517
518 row_height_minus1[i]
519 ue(v)
520 }
521 */
522
523 LOG1("loop_filter_across_tiles_enabled_flag : %d\n", pps->loop_filter_across_tiles_enabled_flag);
524 }
525
526 LOG1("pps_loop_filter_across_slices_enabled_flag: %d\n", pps->pps_loop_filter_across_slices_enabled_flag);
527 LOG1("deblocking_filter_control_present_flag: %d\n", pps->deblocking_filter_control_present_flag);
528
529 if (pps->deblocking_filter_control_present_flag) {
530 LOG1("deblocking_filter_override_enabled_flag: %d\n", pps->deblocking_filter_override_enabled_flag);
531 LOG1("pic_disable_deblocking_filter_flag: %d\n", pps->pic_disable_deblocking_filter_flag);
532
533 LOG1("beta_offset: %d\n", pps->beta_offset);
534 LOG1("tc_offset: %d\n", pps->tc_offset);
535 }
536
537 LOG1("pic_scaling_list_data_present_flag: %d\n", pps->pic_scaling_list_data_present_flag);
538 if (pps->pic_scaling_list_data_present_flag) {
539 //scaling_list_data()
540 }
541
542 LOG1("lists_modification_present_flag: %d\n", pps->lists_modification_present_flag);
543 LOG1("log2_parallel_merge_level : %d\n", pps->log2_parallel_merge_level);
544 LOG1("num_extra_slice_header_bits : %d\n", pps->num_extra_slice_header_bits);
545 LOG1("slice_segment_header_extension_present_flag : %d\n", pps->slice_segment_header_extension_present_flag);
546 LOG1("pps_extension_flag : %d\n", pps->pps_extension_flag);
547 #undef LOG0
548 #undef LOG1
549 }
550
551
552 void init_pps(pic_parameter_set* pps)
553 {
554 pps->CtbAddrRStoTS = NULL;
555 pps->CtbAddrTStoRS = NULL;
556 pps->TileId = NULL;
557 pps->TileIdRS = NULL;
558 pps->MinTbAddrZS = NULL;
559 }
560
561
562 void free_pps(pic_parameter_set* pps)
563 {
564 if (pps->CtbAddrRStoTS) { free(pps->CtbAddrRStoTS); }
565 if (pps->CtbAddrTStoRS) { free(pps->CtbAddrTStoRS); }
566 if (pps->TileId) { free(pps->TileId); }
567 if (pps->TileIdRS) { free(pps->TileIdRS); }
568 if (pps->MinTbAddrZS) { free(pps->MinTbAddrZS); }
569 }
570
571
572 bool is_tile_start_CTB(const pic_parameter_set* pps,int ctbX,int ctbY)
573 {
574 // fast check
575 if (pps->tiles_enabled_flag==0) {
576 return ctbX == 0 && ctbY == 0;
577 }
578
579 for (int i=0;i<pps->num_tile_columns;i++)
580 if (pps->colBd[i]==ctbX)
581 {
582 for (int k=0;k<pps->num_tile_rows;k++)
583 if (pps->rowBd[k]==ctbY)
584 {
585 return true;
586 }
587
588 return false;
589 }
590
591 return false;
592 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "pps.h"
21 #include "decctx.h"
22 #include "util.h"
23
24 #include <assert.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #if defined(_MSC_VER) || defined(__MINGW32__)
28 # include <malloc.h>
29 #else
30 # include <alloca.h>
31 #endif
32
33
34 pic_parameter_set::pic_parameter_set()
35 {
36 pps_read = false;
37 }
38
39
40 pic_parameter_set::~pic_parameter_set()
41 {
42 }
43
44
45 bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
46 {
47 pps_read = false; // incomplete pps
48
49 int uvlc;
50 pic_parameter_set_id = uvlc = get_uvlc(br);
51 if (uvlc >= DE265_MAX_PPS_SETS ||
52 uvlc == UVLC_ERROR) {
53 ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
54 return false;
55 }
56
57 seq_parameter_set_id = uvlc = get_uvlc(br);
58 if (uvlc >= DE265_MAX_PPS_SETS ||
59 uvlc == UVLC_ERROR) {
60 ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
61 return false;
62 }
63
64 dependent_slice_segments_enabled_flag = get_bits(br,1);
65 output_flag_present_flag = get_bits(br,1);
66 num_extra_slice_header_bits = get_bits(br,3);
67 sign_data_hiding_flag = get_bits(br,1);
68 cabac_init_present_flag = get_bits(br,1);
69 num_ref_idx_l0_default_active = uvlc = get_uvlc(br);
70 if (uvlc == UVLC_ERROR) {
71 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
72 return false;
73 }
74 num_ref_idx_l0_default_active++;
75
76 num_ref_idx_l1_default_active = uvlc = get_uvlc(br);
77 if (uvlc == UVLC_ERROR) {
78 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
79 return false;
80 }
81 num_ref_idx_l1_default_active++;
82
83
84 seq_parameter_set* sps = ctx->get_sps(seq_parameter_set_id);
85 if (sps->sps_read==false) {
86 ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
87 return false;
88 }
89
90 if ((pic_init_qp = get_svlc(br)) == UVLC_ERROR) {
91 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
92 return false;
93 }
94 pic_init_qp += 26;
95
96 constrained_intra_pred_flag = get_bits(br,1);
97 transform_skip_enabled_flag = get_bits(br,1);
98 cu_qp_delta_enabled_flag = get_bits(br,1);
99
100 if (cu_qp_delta_enabled_flag) {
101 if ((diff_cu_qp_delta_depth = get_uvlc(br)) == UVLC_ERROR) {
102 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
103 return false;
104 }
105 } else {
106 diff_cu_qp_delta_depth = 0;
107 }
108
109 if ((pic_cb_qp_offset = get_svlc(br)) == UVLC_ERROR) {
110 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
111 return false;
112 }
113
114 if ((pic_cr_qp_offset = get_svlc(br)) == UVLC_ERROR) {
115 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
116 return false;
117 }
118
119 pps_slice_chroma_qp_offsets_present_flag = get_bits(br,1);
120 weighted_pred_flag = get_bits(br,1);
121 weighted_bipred_flag = get_bits(br,1);
122 transquant_bypass_enable_flag = get_bits(br,1);
123 tiles_enabled_flag = get_bits(br,1);
124 entropy_coding_sync_enabled_flag = get_bits(br,1);
125
126
127 // --- tiles ---
128
129 if (tiles_enabled_flag) {
130 num_tile_columns = get_uvlc(br);
131 if (num_tile_columns == UVLC_ERROR ||
132 num_tile_columns+1 > DE265_MAX_TILE_COLUMNS) {
133 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
134 return false;
135 }
136 num_tile_columns++;
137
138 num_tile_rows = get_uvlc(br);
139 if (num_tile_rows == UVLC_ERROR ||
140 num_tile_rows+1 > DE265_MAX_TILE_ROWS) {
141 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
142 return false;
143 }
144 num_tile_rows++;
145
146 uniform_spacing_flag = get_bits(br,1);
147
148 if (uniform_spacing_flag==false) {
149 int lastColumnWidth = sps->PicWidthInCtbsY;
150 int lastRowHeight = sps->PicHeightInCtbsY;
151
152 for (int i=0; i<num_tile_columns-1; i++)
153 {
154 colWidth[i] = get_uvlc(br);
155 if (colWidth[i] == UVLC_ERROR) {
156 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
157 return false;
158 }
159 colWidth[i]++;
160
161 lastColumnWidth -= colWidth[i];
162 }
163
164 colWidth[num_tile_columns-1] = lastColumnWidth;
165
166 for (int i=0; i<num_tile_rows-1; i++)
167 {
168 rowHeight[i] = get_uvlc(br);
169 if (rowHeight[i] == UVLC_ERROR) {
170 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
171 return false;
172 }
173 rowHeight[i]++;
174 lastRowHeight -= rowHeight[i];
175 }
176
177 rowHeight[num_tile_rows-1] = lastRowHeight;
178 }
179
180 loop_filter_across_tiles_enabled_flag = get_bits(br,1);
181
182 } else {
183 num_tile_columns = 1;
184 num_tile_rows = 1;
185 uniform_spacing_flag = 1;
186 loop_filter_across_tiles_enabled_flag = 0;
187 }
188
189
190
191 if (uniform_spacing_flag) {
192
193 // set columns widths
194
195 int *const colPos = (int *)alloca((num_tile_columns+1) * sizeof(int));
196
197 for (int i=0;i<=num_tile_columns;i++) {
198 colPos[i] = i*sps->PicWidthInCtbsY / num_tile_columns;
199 }
200 for (int i=0;i<num_tile_columns;i++) {
201 colWidth[i] = colPos[i+1] - colPos[i];
202 }
203
204 // set row heights
205
206 int *const rowPos = (int *)alloca((num_tile_rows+1) * sizeof(int));
207
208 for (int i=0;i<=num_tile_rows;i++) {
209 rowPos[i] = i*sps->PicHeightInCtbsY / num_tile_rows;
210 }
211 for (int i=0;i<num_tile_rows;i++) {
212 rowHeight[i] = rowPos[i+1] - rowPos[i];
213 }
214 }
215
216
217 // set tile boundaries
218
219 colBd[0]=0;
220 for (int i=0;i<num_tile_columns;i++) {
221 colBd[i+1] = colBd[i] + colWidth[i];
222 }
223
224 rowBd[0]=0;
225 for (int i=0;i<num_tile_rows;i++) {
226 rowBd[i+1] = rowBd[i] + rowHeight[i];
227 }
228
229
230
231 // alloc raster scan arrays
232
233 CtbAddrRStoTS.resize(sps->PicSizeInCtbsY);
234 CtbAddrTStoRS.resize(sps->PicSizeInCtbsY);
235 TileId .resize(sps->PicSizeInCtbsY);
236 TileIdRS .resize(sps->PicSizeInCtbsY);
237 MinTbAddrZS .resize(sps->PicSizeInTbsY );
238
239
240 // raster scan (RS) <-> tile scan (TS) conversion
241
242 for (int ctbAddrRS=0 ; ctbAddrRS < sps->PicSizeInCtbsY ; ctbAddrRS++)
243 {
244 int tbX = ctbAddrRS % sps->PicWidthInCtbsY;
245 int tbY = ctbAddrRS / sps->PicWidthInCtbsY;
246 int tileX=-1,tileY=-1;
247
248 for (int i=0;i<num_tile_columns;i++)
249 if (tbX >= colBd[i])
250 tileX=i;
251
252 for (int j=0;j<num_tile_rows;j++)
253 if (tbY >= rowBd[j])
254 tileY=j;
255
256 CtbAddrRStoTS[ctbAddrRS] = 0;
257 for (int i=0;i<tileX;i++)
258 CtbAddrRStoTS[ctbAddrRS] += rowHeight[tileY]*colWidth[i];
259
260 for (int j=0;j<tileY;j++)
261 {
262 //pps->CtbAddrRStoTS[ctbAddrRS] += (tbY - pps->rowBd[tileY])*pps->colWidth[tileX];
263 //pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX];
264
265 CtbAddrRStoTS[ctbAddrRS] += sps->PicWidthInCtbsY * rowHeight[j];
266 }
267
268 assert(tileX>=0 && tileY>=0);
269
270 CtbAddrRStoTS[ctbAddrRS] += (tbY-rowBd[tileY])*colWidth[tileX];
271 CtbAddrRStoTS[ctbAddrRS] += tbX - colBd[tileX];
272
273
274 // inverse mapping
275
276 CtbAddrTStoRS[ CtbAddrRStoTS[ctbAddrRS] ] = ctbAddrRS;
277 }
278
279
280 logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n");
281 for (int y=0;y<sps->PicHeightInCtbsY;y++)
282 {
283 for (int x=0;x<sps->PicWidthInCtbsY;x++)
284 {
285 logtrace(LogHeaders,"%3d ", CtbAddrRStoTS[x + y*sps->PicWidthInCtbsY]);
286 }
287
288 logtrace(LogHeaders,"\n");
289 }
290
291
292 // tile id
293
294 for (int j=0, tIdx=0 ; j<num_tile_rows ; j++)
295 for (int i=0 ; i<num_tile_columns;i++)
296 {
297 for (int y=rowBd[j] ; y<rowBd[j+1] ; y++)
298 for (int x=colBd[i] ; x<colBd[i+1] ; x++) {
299 TileId [ CtbAddrRStoTS[y*sps->PicWidthInCtbsY + x] ] = tIdx;
300 TileIdRS[ y*sps->PicWidthInCtbsY + x ] = tIdx;
301
302 //logtrace(LogHeaders,"tileID[%d,%d] = %d\n",x,y,pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ]);
303 }
304
305 tIdx++;
306 }
307
308 logtrace(LogHeaders,"Tile IDs RS:\n");
309 for (int y=0;y<sps->PicHeightInCtbsY;y++) {
310 for (int x=0;x<sps->PicWidthInCtbsY;x++) {
311 logtrace(LogHeaders,"%2d ",TileIdRS[y*sps->PicWidthInCtbsY+x]);
312 }
313 logtrace(LogHeaders,"\n");
314 }
315
316 // 6.5.2 Z-scan order array initialization process
317
318 for (int y=0;y<sps->PicHeightInTbsY;y++)
319 for (int x=0;x<sps->PicWidthInTbsY;x++)
320 {
321 int tbX = (x<<sps->Log2MinTrafoSize)>>sps->Log2CtbSizeY;
322 int tbY = (y<<sps->Log2MinTrafoSize)>>sps->Log2CtbSizeY;
323 int ctbAddrRS = sps->PicWidthInCtbsY*tbY + tbX;
324
325 MinTbAddrZS[x + y*sps->PicWidthInTbsY] = CtbAddrRStoTS[ctbAddrRS]
326 << ((sps->Log2CtbSizeY-sps->Log2MinTrafoSize)*2);
327
328 int p=0;
329 for (int i=0 ; i<(sps->Log2CtbSizeY - sps->Log2MinTrafoSize) ; i++) {
330 int m=1<<i;
331 p += (m & x ? m*m : 0) + (m & y ? 2*m*m : 0);
332 }
333
334 MinTbAddrZS[x + y*sps->PicWidthInTbsY] += p;
335 }
336
337
338 // --- debug logging ---
339
340 /*
341 logtrace(LogHeaders,"6.5.2 Z-scan order array\n");
342 for (int y=0;y<sps->PicHeightInTbsY;y++)
343 {
344 for (int x=0;x<sps->PicWidthInTbsY;x++)
345 {
346 logtrace(LogHeaders,"%4d ", pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY]);
347 }
348
349 logtrace(LogHeaders,"\n");
350 }
351
352 for (int i=0;i<sps->PicSizeInTbsY;i++)
353 {
354 for (int y=0;y<sps->PicHeightInTbsY;y++)
355 {
356 for (int x=0;x<sps->PicWidthInTbsY;x++)
357 {
358 if (pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] == i) {
359 logtrace(LogHeaders,"%d %d\n",x,y);
360 }
361 }
362 }
363 }
364 */
365
366 // END tiles
367
368
369 Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth;
370
371
372 beta_offset = 0; // default value
373 tc_offset = 0; // default value
374
375 pps_loop_filter_across_slices_enabled_flag = get_bits(br,1);
376 deblocking_filter_control_present_flag = get_bits(br,1);
377 if (deblocking_filter_control_present_flag) {
378 deblocking_filter_override_enabled_flag = get_bits(br,1);
379 pic_disable_deblocking_filter_flag = get_bits(br,1);
380 if (!pic_disable_deblocking_filter_flag) {
381 beta_offset = get_svlc(br);
382 if (beta_offset == UVLC_ERROR) {
383 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
384 return false;
385 }
386 beta_offset *= 2;
387
388 tc_offset = get_svlc(br);
389 if (tc_offset == UVLC_ERROR) {
390 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
391 return false;
392 }
393 tc_offset *= 2;
394 }
395 }
396 else {
397 deblocking_filter_override_enabled_flag = 0;
398 pic_disable_deblocking_filter_flag = 0;
399 }
400
401
402 // --- scaling list ---
403
404 pic_scaling_list_data_present_flag = get_bits(br,1);
405
406 // check consistency: if scaling-lists are not enabled, pic_scaling_list_data_present_flag
407 // must be FALSE
408 if (sps->scaling_list_enable_flag==0 &&
409 pic_scaling_list_data_present_flag != 0) {
410 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
411 return false;
412 }
413
414 if (pic_scaling_list_data_present_flag) {
415 de265_error err = read_scaling_list(br, sps, &scaling_list, true);
416 if (err != DE265_OK) {
417 ctx->add_warning(err, false);
418 return false;
419 }
420 }
421 else {
422 memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data));
423 }
424
425
426
427
428 lists_modification_present_flag = get_bits(br,1);
429 log2_parallel_merge_level = get_uvlc(br);
430 if (log2_parallel_merge_level == UVLC_ERROR) {
431 ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
432 return false;
433 }
434 log2_parallel_merge_level += 2;
435
436 slice_segment_header_extension_present_flag = get_bits(br,1);
437 pps_extension_flag = get_bits(br,1);
438
439 if (pps_extension_flag) {
440 //assert(false);
441 /*
442 while( more_rbsp_data() )
443
444 pps_extension_data_flag
445 u(1)
446 rbsp_trailing_bits()
447
448 }
449 */
450 }
451
452
453 pps_read = true;
454
455 return true;
456 }
457
458
459 void pic_parameter_set::dump_pps(int fd) const
460 {
461 FILE* fh;
462 if (fd==1) fh=stdout;
463 else if (fd==2) fh=stderr;
464 else { return; }
465
466 #define LOG0(t) log2fh(fh, t)
467 #define LOG1(t,d) log2fh(fh, t,d)
468
469 LOG0("----------------- PPS -----------------\n");
470 LOG1("pic_parameter_set_id : %d\n", pic_parameter_set_id);
471 LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id);
472 LOG1("dependent_slice_segments_enabled_flag : %d\n", dependent_slice_segments_enabled_flag);
473 LOG1("sign_data_hiding_flag : %d\n", sign_data_hiding_flag);
474 LOG1("cabac_init_present_flag : %d\n", cabac_init_present_flag);
475 LOG1("num_ref_idx_l0_default_active : %d\n", num_ref_idx_l0_default_active);
476 LOG1("num_ref_idx_l1_default_active : %d\n", num_ref_idx_l1_default_active);
477
478 LOG1("pic_init_qp : %d\n", pic_init_qp);
479 LOG1("constrained_intra_pred_flag: %d\n", constrained_intra_pred_flag);
480 LOG1("transform_skip_enabled_flag: %d\n", transform_skip_enabled_flag);
481 LOG1("cu_qp_delta_enabled_flag : %d\n", cu_qp_delta_enabled_flag);
482
483 if (cu_qp_delta_enabled_flag) {
484 LOG1("diff_cu_qp_delta_depth : %d\n", diff_cu_qp_delta_depth);
485 }
486
487 LOG1("pic_cb_qp_offset : %d\n", pic_cb_qp_offset);
488 LOG1("pic_cr_qp_offset : %d\n", pic_cr_qp_offset);
489 LOG1("pps_slice_chroma_qp_offsets_present_flag : %d\n", pps_slice_chroma_qp_offsets_present_flag);
490 LOG1("weighted_pred_flag : %d\n", weighted_pred_flag);
491 LOG1("weighted_bipred_flag : %d\n", weighted_bipred_flag);
492 LOG1("output_flag_present_flag : %d\n", output_flag_present_flag);
493 LOG1("transquant_bypass_enable_flag: %d\n", transquant_bypass_enable_flag);
494 LOG1("tiles_enabled_flag : %d\n", tiles_enabled_flag);
495 LOG1("entropy_coding_sync_enabled_flag: %d\n", entropy_coding_sync_enabled_flag);
496
497 if (tiles_enabled_flag) {
498 LOG1("num_tile_columns : %d\n", num_tile_columns);
499 LOG1("num_tile_rows : %d\n", num_tile_rows);
500 LOG1("uniform_spacing_flag: %d\n", uniform_spacing_flag);
501
502 LOG0("tile column boundaries: ");
503 for (int i=0;i<=num_tile_columns;i++) {
504 LOG1("*%d ",colBd[i]);
505 }
506 LOG0("*\n");
507
508 LOG0("tile row boundaries: ");
509 for (int i=0;i<=num_tile_rows;i++) {
510 LOG1("*%d ",rowBd[i]);
511 }
512 LOG0("*\n");
513
514 //if( !uniform_spacing_flag ) {
515 /*
516 for( i = 0; i < num_tile_columns_minus1; i++ )
517
518 column_width_minus1[i]
519 ue(v)
520 for( i = 0; i < num_tile_rows_minus1; i++ )
521
522 row_height_minus1[i]
523 ue(v)
524 }
525 */
526
527 LOG1("loop_filter_across_tiles_enabled_flag : %d\n", loop_filter_across_tiles_enabled_flag);
528 }
529
530 LOG1("pps_loop_filter_across_slices_enabled_flag: %d\n", pps_loop_filter_across_slices_enabled_flag);
531 LOG1("deblocking_filter_control_present_flag: %d\n", deblocking_filter_control_present_flag);
532
533 if (deblocking_filter_control_present_flag) {
534 LOG1("deblocking_filter_override_enabled_flag: %d\n", deblocking_filter_override_enabled_flag);
535 LOG1("pic_disable_deblocking_filter_flag: %d\n", pic_disable_deblocking_filter_flag);
536
537 LOG1("beta_offset: %d\n", beta_offset);
538 LOG1("tc_offset: %d\n", tc_offset);
539 }
540
541 LOG1("pic_scaling_list_data_present_flag: %d\n", pic_scaling_list_data_present_flag);
542 if (pic_scaling_list_data_present_flag) {
543 //scaling_list_data()
544 }
545
546 LOG1("lists_modification_present_flag: %d\n", lists_modification_present_flag);
547 LOG1("log2_parallel_merge_level : %d\n", log2_parallel_merge_level);
548 LOG1("num_extra_slice_header_bits : %d\n", num_extra_slice_header_bits);
549 LOG1("slice_segment_header_extension_present_flag : %d\n", slice_segment_header_extension_present_flag);
550 LOG1("pps_extension_flag : %d\n", pps_extension_flag);
551
552 LOG1("Log2MinCuQpDeltaSize : %d\n", Log2MinCuQpDeltaSize);
553
554 #undef LOG0
555 #undef LOG1
556 }
557
558
559 bool pic_parameter_set::is_tile_start_CTB(int ctbX,int ctbY) const
560 {
561 // fast check
562 if (tiles_enabled_flag==0) {
563 return ctbX == 0 && ctbY == 0;
564 }
565
566 for (int i=0;i<num_tile_columns;i++)
567 if (colBd[i]==ctbX)
568 {
569 for (int k=0;k<num_tile_rows;k++)
570 if (rowBd[k]==ctbY)
571 {
572 return true;
573 }
574
575 return false;
576 }
577
578 return false;
579 }
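To make the uniform_spacing_flag branch of pic_parameter_set::read() above concrete, here is a small stand-alone sketch that repeats the colPos/colWidth/colBd arithmetic. It is not part of the diff; the picture width (10 CTBs) and the number of tile columns (3) are made-up values chosen only to show the rounding behaviour.

#include <cstdio>

int main()
{
  // same formula as pic_parameter_set::read() for uniform tile spacing
  const int PicWidthInCtbsY  = 10;  // illustrative value
  const int num_tile_columns = 3;   // illustrative value

  int colPos[num_tile_columns+1];
  int colWidth[num_tile_columns];
  int colBd[num_tile_columns+1];

  for (int i=0; i<=num_tile_columns; i++)
    colPos[i] = i*PicWidthInCtbsY / num_tile_columns;   // 0, 3, 6, 10

  colBd[0] = 0;
  for (int i=0; i<num_tile_columns; i++) {
    colWidth[i] = colPos[i+1] - colPos[i];              // 3, 3, 4
    colBd[i+1]  = colBd[i] + colWidth[i];               // 3, 6, 10
  }

  for (int i=0; i<num_tile_columns; i++)
    printf("tile column %d: width %d CTBs, starts at CTB x=%d\n",
           i, colWidth[i], colBd[i]);

  return 0;
}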
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2323 #include "libde265/bitstream.h"
2424 #include "libde265/sps.h" // for scaling list only
2525
26 #include <vector>
27
2628 #define DE265_MAX_TILE_COLUMNS 10
2729 #define DE265_MAX_TILE_ROWS 10
2830
2931
30 typedef struct {
32 struct pic_parameter_set {
33 pic_parameter_set();
34 ~pic_parameter_set();
35
36 bool read(bitreader*, struct decoder_context*);
37
38 bool is_tile_start_CTB(int ctbX,int ctbY) const;
39 void dump_pps(int fd) const;
40
41
3142 bool pps_read; // whether this pps has been read from bitstream
3243
3344 char pic_parameter_set_id;
7081 int colBd [ DE265_MAX_TILE_COLUMNS+1 ];
7182 int rowBd [ DE265_MAX_TILE_ROWS+1 ];
7283
73 int* CtbAddrRStoTS; // #CTBs
74 int* CtbAddrTStoRS; // #CTBs
75 int* TileId; // #CTBs
76 int* TileIdRS; // #CTBs
77 int* MinTbAddrZS; // #TBs [x + y*PicWidthInTbsY]
84 std::vector<int> CtbAddrRStoTS; // #CTBs
85 std::vector<int> CtbAddrTStoRS; // #CTBs
86 std::vector<int> TileId; // #CTBs // index in tile-scan order
87 std::vector<int> TileIdRS; // #CTBs // index in raster-scan order
88 std::vector<int> MinTbAddrZS; // #TBs [x + y*PicWidthInTbsY]
7889
7990
8091 // --- QP ---
103114 char slice_segment_header_extension_present_flag;
104115 char pps_extension_flag;
105116
106 } pic_parameter_set;
117 };
107118
108119 #endif
+0
-36
libde265/pps_func.h
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_PPS_FUNC_H
21 #define DE265_PPS_FUNC_H
22
23 #include "libde265/pps.h"
24 #include "libde265/decctx.h"
25
26
27 void init_pps(pic_parameter_set*);
28 void free_pps(pic_parameter_set*);
29
30 bool read_pps(bitreader*, pic_parameter_set*, decoder_context*);
31 void dump_pps(pic_parameter_set*, int fd);
32
33 bool is_tile_start_CTB(const pic_parameter_set* pps,int ctbX,int ctbY);
34
35 #endif
+0
-305
libde265/refpic.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "refpic.h"
21 #include "decctx.h"
22 #include "util.h"
23
24 #include <assert.h>
25 #include <stdlib.h>
26 #if defined(_MSC_VER) || defined(__MINGW32__)
27 # include <malloc.h>
28 #else
29 # include <alloca.h>
30 #endif
31
32
33 static void compute_NumPoc(ref_pic_set* rpset)
34 {
35 rpset->NumPocTotalCurr = 0;
36
37 for (int i=0; i<rpset->NumNegativePics; i++)
38 if (rpset->UsedByCurrPicS0[i])
39 rpset->NumPocTotalCurr++;
40
41 for (int i=0; i<rpset->NumPositivePics; i++)
42 if (rpset->UsedByCurrPicS1[i])
43 rpset->NumPocTotalCurr++;
44
45 /*
46 for (int i = 0; i < num_long_term_sps + num_long_term_pics; i++ )
47 if( UsedByCurrPicLt[i] )
48 NumPocTotalCurr++
49 }
50 */
51 }
52
53
54 /* A ref-pic-set is coded either
55 - as a list of the relative POC deltas themselves, or
56 - by shifting an existing ref-pic-set by some number of frames
57 When shifting an existing set, the frame 0 is also shifted as an additional reference frame.
58 When coding the ref-pic-sets in the SPS, prediction is always from the previous set.
59 In the slice header, the ref-pic-set can use any previous set as reference.
60 */
61 bool read_short_term_ref_pic_set(decoder_context* ctx,
62 const seq_parameter_set* sps,
63 bitreader* br,
64 ref_pic_set* out_set, // where to store the read set
65 int idxRps, // index of the set to be read
66 const ref_pic_set* sets, // previously read sets
67 bool sliceRefPicSet) // is this in the slice header?
68 {
69 // --- is this set coded in prediction mode (not possible for the first set)
70
71 char inter_ref_pic_set_prediction_flag;
72
73 if (idxRps != 0) {
74 inter_ref_pic_set_prediction_flag = get_bits(br,1);
75 }
76 else {
77 inter_ref_pic_set_prediction_flag = 0;
78 }
79
80
81
82 if (inter_ref_pic_set_prediction_flag) {
83 int vlc;
84
85 /* Only for the last ref_pic_set (that's the one coded in the slice header),
86 we can specify relative to which reference set we code the set. */
87
88 int delta_idx;
89 if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) {
90 delta_idx = vlc = get_uvlc(br);
91 delta_idx++;
92 } else {
93 delta_idx = 1;
94 }
95
96 int RIdx = idxRps - delta_idx; // this is our source set, which we will modify
97 assert(RIdx>=0);
98
99 int delta_rps_sign = get_bits(br,1);
100 int abs_delta_rps = vlc = get_uvlc(br);
101 abs_delta_rps++;
102 int DeltaRPS = (delta_rps_sign ? -abs_delta_rps : abs_delta_rps);
103
104 // bits are stored in this order:
105 // - all bits for negative Pocs (forward),
106 // - then all bits for positive Pocs (forward),
107 // - then bits for '0', shifting of the current picture
108 // in total, these are 'nDeltaPocsRIdx'+1 bits
109
110 logtrace(LogHeaders,"predicted from %d with delta %d\n",RIdx,DeltaRPS);
111
112 int nDeltaPocsRIdx= sets[RIdx].NumDeltaPocs; // size of source set
113 char *const used_by_curr_pic_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char));
114 char *const use_delta_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char));
115
116 for (int j=0;j<=nDeltaPocsRIdx;j++) {
117 used_by_curr_pic_flag[j] = get_bits(br,1);
118 if (used_by_curr_pic_flag[j]) {
119 use_delta_flag[j] = 1; // if this frame is used, we also have to apply the delta
120 } else {
121 use_delta_flag[j] = get_bits(br,1); // otherwise, it is only optionally included
122 }
123 }
124
125 logtrace(LogHeaders,"flags: ");
126 for (int j=0;j<=nDeltaPocsRIdx;j++) {
127 logtrace(LogHeaders,"%d ", use_delta_flag[j]);
128 }
129 logtrace(LogHeaders,"\n");
130
131 int nNegativeRIdx = sets[RIdx].NumNegativePics;
132 int nPositiveRIdx = sets[RIdx].NumPositivePics;
133
134 // --- update list 0 (negative Poc) ---
135 // Iterate through all Pocs in decreasing value order (positive reverse, 0, negative forward).
136
137 int i=0; // target index
138
139 // positive list
140 for (int j=nPositiveRIdx-1;j>=0;j--) {
141 int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta
142 if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) {
143 out_set->DeltaPocS0[i] = dPoc;
144 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
145 i++;
146 }
147 }
148
149 // frame 0
150 if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) {
151 out_set->DeltaPocS0[i] = DeltaRPS;
152 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
153 i++;
154 }
155
156 // negative list
157 for (int j=0;j<nNegativeRIdx;j++) {
158 int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
159 if (dPoc<0 && use_delta_flag[j]) {
160 out_set->DeltaPocS0[i] = dPoc;
161 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j];
162 i++;
163 }
164 }
165
166 out_set->NumNegativePics = i;
167
168
169 // --- update list 1 (positive Poc) ---
170 // Iterate through all Pocs in increasing value order (negative reverse, 0, positive forward)
171
172 i=0; // target index
173
174 // negative list
175 for (int j=nNegativeRIdx-1;j>=0;j--) {
176 int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
177 if (dPoc>0 && use_delta_flag[j]) {
178 out_set->DeltaPocS1[i] = dPoc;
179 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j];
180 i++;
181 }
182 }
183
184 // frame 0
185 if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) {
186 out_set->DeltaPocS1[i] = DeltaRPS;
187 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
188 i++;
189 }
190
191 // positive list
192 for (int j=0;j<nPositiveRIdx;j++) {
193 int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS;
194 if (dPoc>0 && use_delta_flag[nNegativeRIdx+j]) {
195 out_set->DeltaPocS1[i] = dPoc;
196 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
197 i++;
198 }
199 }
200
201 out_set->NumPositivePics = i;
202
203 out_set->NumDeltaPocs = out_set->NumNegativePics + out_set->NumPositivePics;
204
205 } else {
206
207 // --- first, read the number of past and future frames in this set ---
208
209 int num_negative_pics = get_uvlc(br);
210 int num_positive_pics = get_uvlc(br);
211
212 // total number of reference pictures may not exceed buffer capacity
213 if (num_negative_pics + num_positive_pics >
214 sps->sps_max_dec_pic_buffering[ sps->sps_max_sub_layers-1 ]) {
215 add_warning(ctx, DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
216 return false;
217 }
218
219
220 out_set->NumNegativePics = num_negative_pics;
221 out_set->NumPositivePics = num_positive_pics;
222 out_set->NumDeltaPocs = num_positive_pics + num_negative_pics;
223
224
225 // --- now, read the deltas between the reference frames to fill the lists ---
226
227 // past frames
228
229 int lastPocS=0;
230 for (int i=0;i<num_negative_pics;i++) {
231 int delta_poc_s0 = get_uvlc(br)+1;
232 char used_by_curr_pic_s0_flag = get_bits(br,1);
233
234 out_set->DeltaPocS0[i] = lastPocS - delta_poc_s0;
235 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_s0_flag;
236 lastPocS = out_set->DeltaPocS0[i];
237 }
238
239 // future frames
240
241 lastPocS=0;
242 for (int i=0;i<num_positive_pics;i++) {
243 int delta_poc_s1 = get_uvlc(br)+1;
244 char used_by_curr_pic_s1_flag = get_bits(br,1);
245
246 out_set->DeltaPocS1[i] = lastPocS + delta_poc_s1;
247 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_s1_flag;
248 lastPocS = out_set->DeltaPocS1[i];
249 }
250 }
251
252
253 compute_NumPoc(out_set);
254
255 return true;
256 }
257
258
259 void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh)
260 {
261 log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs,
262 set->NumNegativePics, set->NumPositivePics);
263
264 log2fh(fh,"DeltaPocS0:");
265 for (int i=0;i<set->NumNegativePics;i++) {
266 if (i) { log2fh(fh,","); }
267 log2fh(fh," %d/%d",set->DeltaPocS0[i],set->UsedByCurrPicS0[i]);
268 }
269 log2fh(fh,"\n");
270
271 log2fh(fh,"DeltaPocS1:");
272 for (int i=0;i<set->NumPositivePics;i++) {
273 if (i) { log2fh(fh,","); }
274 log2fh(fh," %d/%d",set->DeltaPocS1[i],set->UsedByCurrPicS1[i]);
275 }
276 log2fh(fh,"\n");
277 }
278
279
280 void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh)
281 {
282 char *const log = (char *)alloca((range+1+range+1) * sizeof(char));
283 log[2*range+1] = 0;
284 for (int i=0;i<2*range+1;i++) log[i]='.';
285 log[range]='|';
286
287 for (int i=set->NumNegativePics-1;i>=0;i--) {
288 int n = set->DeltaPocS0[i];
289 if (n>=-range) {
290 if (set->UsedByCurrPicS0[i]) log[n+range] = 'X';
291 else log[n+range] = 'o';
292 } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS0[i] ? 'X':'o'); }
293 }
294
295 for (int i=set->NumPositivePics-1;i>=0;i--) {
296 int n = set->DeltaPocS1[i];
297 if (n<=range) {
298 if (set->UsedByCurrPicS1[i]) log[n+range] = 'X';
299 else log[n+range] = 'o';
300 } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS1[i] ? 'X':'o'); }
301 }
302
303 log2fh(fh,"*%s\n",log);
304 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "refpic.h"
21 #include "decctx.h"
22 #include "util.h"
23
24 #include <assert.h>
25 #include <stdlib.h>
26 #if defined(_MSC_VER) || defined(__MINGW32__)
27 # include <malloc.h>
28 #else
29 # include <alloca.h>
30 #endif
31
32
33 static void compute_NumPoc(ref_pic_set* rpset)
34 {
35 rpset->NumPocTotalCurr_shortterm_only = 0;
36
37 for (int i=0; i<rpset->NumNegativePics; i++)
38 if (rpset->UsedByCurrPicS0[i])
39 rpset->NumPocTotalCurr_shortterm_only++;
40
41 for (int i=0; i<rpset->NumPositivePics; i++)
42 if (rpset->UsedByCurrPicS1[i])
43 rpset->NumPocTotalCurr_shortterm_only++;
44
45 /*
46 NOTE: this is done when reading the slice header.
47 The value numPocTotalCurr is then stored in the slice header.
48
49 for (int i = 0; i < num_long_term_sps + num_long_term_pics; i++ )
50 if( UsedByCurrPicLt[i] )
51 NumPocTotalCurr++
52 }
53 */
54 }
55
56
57 /* A ref-pic-set is coded either
58 - as a list of the relative POC deltas themselves, or
59 - by shifting an existing ref-pic-set by some number of frames
60 When shifting an existing set, the frame 0 is also shifted as an additional reference frame.
61 When coding the ref-pic-sets in the SPS, prediction is always from the previous set.
62 In the slice header, the ref-pic-set can use any previous set as reference.
63 */
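/* A small worked example of the shifting mode described above (hypothetical numbers):
   suppose the source set RIdx holds DeltaPocS0 = { -1, -3 } and DeltaPocS1 = { +2 },
   and the new set is coded with DeltaRPS = -1. The candidate deltas are
   -1-1 = -2 and -3-1 = -4 (from S0), +2-1 = +1 (from S1), plus DeltaRPS itself (-1)
   for the shifted "frame 0". With all use_delta_flag bits set, every negative candidate
   goes into the new S0 list (closest first) and every positive one into S1, giving
   DeltaPocS0 = { -1, -2, -4 } and DeltaPocS1 = { +1 }. */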
64 bool read_short_term_ref_pic_set(decoder_context* ctx,
65 const seq_parameter_set* sps,
66 bitreader* br,
67 ref_pic_set* out_set, // where to store the read set
68 int idxRps, // index of the set to be read
69 const std::vector<ref_pic_set>& sets, // previously read sets
70 bool sliceRefPicSet) // is this in the slice header?
71 {
72 // --- is this set coded in prediction mode (not possible for the first set)
73
74 char inter_ref_pic_set_prediction_flag;
75
76 if (idxRps != 0) {
77 inter_ref_pic_set_prediction_flag = get_bits(br,1);
78 }
79 else {
80 inter_ref_pic_set_prediction_flag = 0;
81 }
82
83
84
85 if (inter_ref_pic_set_prediction_flag) {
86 int vlc;
87
88 /* Only for the last ref_pic_set (that's the one coded in the slice header),
89 we can specify relative to which reference set we code the set. */
90
91 int delta_idx;
92 if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) {
93 delta_idx = vlc = get_uvlc(br);
94 delta_idx++;
95 } else {
96 delta_idx = 1;
97 }
98
99 int RIdx = idxRps - delta_idx; // this is our source set, which we will modify
100 assert(RIdx>=0);
101
102 int delta_rps_sign = get_bits(br,1);
103 int abs_delta_rps = vlc = get_uvlc(br);
104 abs_delta_rps++;
105 int DeltaRPS = (delta_rps_sign ? -abs_delta_rps : abs_delta_rps);
106
107 // bits are stored in this order:
108 // - all bits for negative Pocs (forward),
109 // - then all bits for positive Pocs (forward),
110 // - then bits for '0', shifting of the current picture
111 // in total, these are 'nDeltaPocsRIdx'+1 bits
112
113 logtrace(LogHeaders,"predicted from %d with delta %d\n",RIdx,DeltaRPS);
114
115 int nDeltaPocsRIdx= sets[RIdx].NumDeltaPocs; // size of source set
116 char *const used_by_curr_pic_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char));
117 char *const use_delta_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char));
118
119 for (int j=0;j<=nDeltaPocsRIdx;j++) {
120 used_by_curr_pic_flag[j] = get_bits(br,1);
121 if (used_by_curr_pic_flag[j]) {
122 use_delta_flag[j] = 1; // if this frame is used, we also have to apply the delta
123 } else {
124 use_delta_flag[j] = get_bits(br,1); // otherwise, it is only optionally included
125 }
126 }
127
128 logtrace(LogHeaders,"flags: ");
129 for (int j=0;j<=nDeltaPocsRIdx;j++) {
130 logtrace(LogHeaders,"%d ", use_delta_flag[j]);
131 }
132 logtrace(LogHeaders,"\n");
133
134 int nNegativeRIdx = sets[RIdx].NumNegativePics;
135 int nPositiveRIdx = sets[RIdx].NumPositivePics;
136
137 // --- update list 0 (negative Poc) ---
138 // Iterate through all Pocs in decreasing value order (positive reverse, 0, negative forward).
139
140 int i=0; // target index
141
142 // positive list
143 for (int j=nPositiveRIdx-1;j>=0;j--) {
144 int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta
145 if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) {
146 out_set->DeltaPocS0[i] = dPoc;
147 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
148 i++;
149 }
150 }
151
152 // frame 0
153 if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) {
154 out_set->DeltaPocS0[i] = DeltaRPS;
155 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
156 i++;
157 }
158
159 // negative list
160 for (int j=0;j<nNegativeRIdx;j++) {
161 int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
162 if (dPoc<0 && use_delta_flag[j]) {
163 out_set->DeltaPocS0[i] = dPoc;
164 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j];
165 i++;
166 }
167 }
168
169 out_set->NumNegativePics = i;
170
171
172 // --- update list 1 (positive Poc) ---
173 // Iterate through all Pocs in increasing value order (negative reverse, 0, positive forward)
174
175 i=0; // target index
176
177 // negative list
178 for (int j=nNegativeRIdx-1;j>=0;j--) {
179 int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
180 if (dPoc>0 && use_delta_flag[j]) {
181 out_set->DeltaPocS1[i] = dPoc;
182 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j];
183 i++;
184 }
185 }
186
187 // frame 0
188 if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) {
189 out_set->DeltaPocS1[i] = DeltaRPS;
190 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
191 i++;
192 }
193
194 // positive list
195 for (int j=0;j<nPositiveRIdx;j++) {
196 int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS;
197 if (dPoc>0 && use_delta_flag[nNegativeRIdx+j]) {
198 out_set->DeltaPocS1[i] = dPoc;
199 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
200 i++;
201 }
202 }
203
204 out_set->NumPositivePics = i;
205
206 out_set->NumDeltaPocs = out_set->NumNegativePics + out_set->NumPositivePics;
207
208 } else {
209
210 // --- first, read the number of past and future frames in this set ---
211
212 int num_negative_pics = get_uvlc(br);
213 int num_positive_pics = get_uvlc(br);
214
215 // total number of reference pictures may not exceed buffer capacity
216 if (num_negative_pics + num_positive_pics >
217 sps->sps_max_dec_pic_buffering[ sps->sps_max_sub_layers-1 ]) {
218 ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
219 return false;
220 }
221
222
223 out_set->NumNegativePics = num_negative_pics;
224 out_set->NumPositivePics = num_positive_pics;
225 out_set->NumDeltaPocs = num_positive_pics + num_negative_pics;
226
227
228 // --- now, read the deltas between the reference frames to fill the lists ---
229
230 // past frames
231
232 int lastPocS=0;
233 for (int i=0;i<num_negative_pics;i++) {
234 int delta_poc_s0 = get_uvlc(br)+1;
235 char used_by_curr_pic_s0_flag = get_bits(br,1);
236
237 out_set->DeltaPocS0[i] = lastPocS - delta_poc_s0;
238 out_set->UsedByCurrPicS0[i] = used_by_curr_pic_s0_flag;
239 lastPocS = out_set->DeltaPocS0[i];
240 }
241
242 // future frames
243
244 lastPocS=0;
245 for (int i=0;i<num_positive_pics;i++) {
246 int delta_poc_s1 = get_uvlc(br)+1;
247 char used_by_curr_pic_s1_flag = get_bits(br,1);
248
249 out_set->DeltaPocS1[i] = lastPocS + delta_poc_s1;
250 out_set->UsedByCurrPicS1[i] = used_by_curr_pic_s1_flag;
251 lastPocS = out_set->DeltaPocS1[i];
252 }
253 }
254
255
256 compute_NumPoc(out_set);
257
258 return true;
259 }
260
261
262 void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh)
263 {
264 log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs,
265 set->NumNegativePics, set->NumPositivePics);
266
267 log2fh(fh,"DeltaPocS0:");
268 for (int i=0;i<set->NumNegativePics;i++) {
269 if (i) { log2fh(fh,","); }
270 log2fh(fh," %d/%d",set->DeltaPocS0[i],set->UsedByCurrPicS0[i]);
271 }
272 log2fh(fh,"\n");
273
274 log2fh(fh,"DeltaPocS1:");
275 for (int i=0;i<set->NumPositivePics;i++) {
276 if (i) { log2fh(fh,","); }
277 log2fh(fh," %d/%d",set->DeltaPocS1[i],set->UsedByCurrPicS1[i]);
278 }
279 log2fh(fh,"\n");
280 }
281
282
283 void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh)
284 {
285 char *const log = (char *)alloca((range+1+range+1) * sizeof(char));
286 log[2*range+1] = 0;
287 for (int i=0;i<2*range+1;i++) log[i]='.';
288 log[range]='|';
289
290 for (int i=set->NumNegativePics-1;i>=0;i--) {
291 int n = set->DeltaPocS0[i];
292 if (n>=-range) {
293 if (set->UsedByCurrPicS0[i]) log[n+range] = 'X';
294 else log[n+range] = 'o';
295 } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS0[i] ? 'X':'o'); }
296 }
297
298 for (int i=set->NumPositivePics-1;i>=0;i--) {
299 int n = set->DeltaPocS1[i];
300 if (n<=range) {
301 if (set->UsedByCurrPicS1[i]) log[n+range] = 'X';
302 else log[n+range] = 'o';
303 } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS1[i] ? 'X':'o'); }
304 }
305
306 log2fh(fh,"*%s\n",log);
307 }
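To read the compact dump above: the current picture sits at the '|' in the middle, each short-term reference within +-range is drawn at its POC offset ('X' if used for prediction in the current picture, 'o' if only kept for later), and references outside the range are listed in front with their numeric offset. A hypothetical set with DeltaPocS0 = { -1, -2 } (both used) and DeltaPocS1 = { +1 } (kept but unused) would, with range = 4, print as

    *..XX|o...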
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
3030 uint8_t NumPositivePics; // number of future reference pictures
3131 uint8_t NumDeltaPocs; // total number of reference pictures (past + future)
3232
33 uint8_t NumPocTotalCurr; /* Total number of reference pictures that may actually be used
34 for prediction in the current frame. */
33 uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually
34 be used for prediction in the current frame. */
3535
3636 // Lists of pictures that have to be kept in the decoded picture buffer for future
3737 // reference and that may optionally be used for prediction in the current frame.
libde265/sao.c  +0 -266
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sao.h"
21 #include "util.h"
22
23 #include <stdlib.h>
24 #include <string.h>
25
26
27 void apply_sao(decoder_context* ctx, int xCtb,int yCtb,
28 const slice_segment_header* shdr, int cIdx, int nS,
29 const uint8_t* in_img, int in_stride)
30 {
31 const seq_parameter_set* sps = ctx->current_sps;
32 const pic_parameter_set* pps = ctx->current_pps;
33 int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
34 int maxPixelValue = (1<<bitDepth)-1;
35
36 int xC = xCtb*nS;
37 int yC = yCtb*nS;
38
39 const sao_info* saoinfo = get_sao_info(ctx->img,ctx->current_sps,xCtb,yCtb);
40
41 int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
42
43 logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xC,yC,cIdx, SaoTypeIdx, nS,nS);
44
45 if (SaoTypeIdx==0) {
46 return;
47 }
48
49 /*
50 if ((sps->pcm_loop_filter_disable_flag && get_pcm_flag(ctx->img,sps,xC,yC)) ||
51 get_cu_transquant_bypass(ctx->img,sps,xC,yC) ||
52 SaoTypeIdx == 0)
53 {
54 return;
55 }
56 */
57
58 int width = ctx->current_sps->pic_width_in_luma_samples;
59 int height = ctx->current_sps->pic_height_in_luma_samples;
60
61 if (cIdx>0) { width =(width+1)/2; height =(height+1)/2; }
62
63 int ctbSliceAddrRS = get_SliceHeader(ctx,xC,yC)->SliceAddrRS;
64 int* MinTbAddrZS = ctx->current_pps->MinTbAddrZS;
65 int PicWidthInTbsY = ctx->current_sps->PicWidthInTbsY;
66 int Log2MinTrafoSize = ctx->current_sps->Log2MinTrafoSize;
67 int chromaLog2MinTrafoSize = Log2MinTrafoSize;
68 if (cIdx>0) { chromaLog2MinTrafoSize-=1; }
69
70 int picWidthInCtbs = ctx->current_sps->PicWidthInCtbsY;
71 int ctbshift = ctx->current_sps->Log2CtbSizeY;
72 int chromashift = 0;
73 if (cIdx>0) { ctbshift-=1; chromashift=1; }
74
75
76 uint8_t* out_img;
77 int out_stride;
78 get_image_plane(ctx->img, cIdx, &out_img,&out_stride);
79
80
81 for (int i=0;i<5;i++)
82 {
83 logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
84 }
85
86 if (SaoTypeIdx==2) {
87 int hPos[2], vPos[2];
88 int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
89
90 //logtrace(LogSAO,"SaoEoClass = %d\n", SaoEoClass);
91
92 switch (SaoEoClass) {
93 case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
94 case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
95 case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
96 case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
97 }
98
99
100 for (int j=0;j<nS;j++)
101 for (int i=0;i<nS;i++) {
102 int edgeIdx = -1;
103
104 logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
105
106 if (xC+i>=width || yC+j>=height) {
107 continue;
108 }
109
110 if ((sps->pcm_loop_filter_disable_flag &&
111 get_pcm_flag(ctx->img,sps,(xC+i)<<chromashift,(yC+j)<<chromashift)) ||
112 get_cu_transquant_bypass(ctx->img,sps,(xC+i)<<chromashift,(yC+j)<<chromashift)) {
113 continue;
114 }
115
116 for (int k=0;k<2;k++) {
117 int xS = xC+i+hPos[k];
118 int yS = yC+j+vPos[k];
119
120 if (xS<0 || yS<0 || xS>=width || yS>=height) {
121 edgeIdx=0;
122 break;
123 }
124
125
126 // This part seems inefficient with all the get_SliceHeaderIndex() calls,
127 // but removing this part (because the input was known to have only a single
128 // slice anyway) reduced computation time only by 1.3%.
129 // TODO: however, this may still be a big part of SAO itself.
130
131 int sliceAddrRS = get_SliceHeader(ctx,xS<<chromashift,yS<<chromashift)->SliceAddrRS;
132 if (sliceAddrRS != ctbSliceAddrRS &&
133 MinTbAddrZS[( xS >>Log2MinTrafoSize) + (yS >>Log2MinTrafoSize)*PicWidthInTbsY] <
134 MinTbAddrZS[((xC+i)>>Log2MinTrafoSize) + ((yC+j)>>Log2MinTrafoSize)*PicWidthInTbsY] &&
135 get_SliceHeader(ctx,(xC+i)<<chromashift,(yC+j)<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
136 edgeIdx=0;
137 break;
138 }
139
140 if (sliceAddrRS != ctbSliceAddrRS &&
141 MinTbAddrZS[((xC+i)>>Log2MinTrafoSize) + ((yC+j)>>Log2MinTrafoSize)*PicWidthInTbsY] <
142 MinTbAddrZS[( xS >>Log2MinTrafoSize) + (yS >>Log2MinTrafoSize)*PicWidthInTbsY] &&
143 get_SliceHeader(ctx,xS<<chromashift,yS<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
144 edgeIdx=0;
145 break;
146 }
147
148
149 if (pps->loop_filter_across_tiles_enabled_flag==0 &&
150 pps->TileIdRS[(xS>>ctbshift) + (yS>>ctbshift)*picWidthInCtbs] !=
151 pps->TileIdRS[(xC>>ctbshift) + (yC>>ctbshift)*picWidthInCtbs]) {
152 edgeIdx=0;
153 }
154 }
155
156 if (edgeIdx != 0) {
157
158 logtrace(LogSAO,"edge: %x vs %x %x\n",
159 in_img[xC+i+(yC+j)*in_stride],
160 in_img[xC+i+hPos[0]+(yC+j+vPos[0])*in_stride],
161 in_img[xC+i+hPos[1]+(yC+j+vPos[1])*in_stride]);
162
163 edgeIdx = 2 +
164 Sign(in_img[xC+i+(yC+j)*in_stride] - in_img[xC+i+hPos[0]+(yC+j+vPos[0])*in_stride]) +
165 Sign(in_img[xC+i+(yC+j)*in_stride] - in_img[xC+i+hPos[1]+(yC+j+vPos[1])*in_stride]);
166
167 if (edgeIdx<=2) {
168 edgeIdx = (edgeIdx==2) ? 0 : (edgeIdx+1);
169 }
170 }
171
172 if (edgeIdx != 0) {
173 int offset = saoinfo->saoOffsetVal[cIdx][edgeIdx-1];
174
175
176 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
177 in_img[xC+i+(yC+j)*in_stride] + offset);
178
179 logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x = %x\n",xC+i,yC+j,edgeIdx,
180 offset,
181 in_img[xC+i+(yC+j)*in_stride],
182 in_img[xC+i+(yC+j)*in_stride]+offset,
183 out_img[xC+i+(yC+j)*out_stride]);
184 }
185 }
186 }
187 else {
188 int bandShift = bitDepth-5;
189 int saoLeftClass = saoinfo->sao_band_position[cIdx];
190 logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
191
192 int bandTable[32];
193 memset(bandTable, 0, sizeof(int)*32);
194
195 for (int k=0;k<4;k++) {
196 bandTable[ (k+saoLeftClass)&31 ] = k+1;
197 }
198
199
200 for (int j=0;j<nS;j++)
201 for (int i=0;i<nS;i++) {
202
203 if (xC+i>=width || yC+j>=height) {
204 break;
205 }
206
207 if ((sps->pcm_loop_filter_disable_flag &&
208 get_pcm_flag(ctx->img,sps,(xC+i)<<chromashift,(yC+j)<<chromashift)) ||
209 get_cu_transquant_bypass(ctx->img,sps,(xC+i)<<chromashift,(yC+j)<<chromashift)) {
210 continue;
211 }
212
213 int bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
214
215 if (bandIdx>0) {
216 int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
217
218 logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx,
219 offset,
220 in_img[xC+i+(yC+j)*in_stride],
221 in_img[xC+i+(yC+j)*in_stride]+offset);
222
223 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
224 in_img[xC+i+(yC+j)*in_stride] + offset);
225 }
226 }
227 }
228 }
229
230
231 void apply_sample_adaptive_offset(decoder_context* ctx)
232 {
233 if (ctx->current_sps->sample_adaptive_offset_enabled_flag==0) {
234 return;
235 }
236
237 de265_image inputCopy;
238 de265_init_image(&inputCopy);
239 de265_alloc_image(&inputCopy, ctx->img->width, ctx->img->height, de265_chroma_420, NULL);
240
241 de265_copy_image(&inputCopy, ctx->img);
242
243 for (int yCtb=0; yCtb<ctx->current_sps->PicHeightInCtbsY; yCtb++)
244 for (int xCtb=0; xCtb<ctx->current_sps->PicWidthInCtbsY; xCtb++)
245 {
246 const slice_segment_header* shdr = get_SliceHeaderCtb(ctx, xCtb,yCtb);
247
248 if (shdr->slice_sao_luma_flag) {
249 apply_sao(ctx, xCtb,yCtb, shdr, 0, 1<<ctx->current_sps->Log2CtbSizeY,
250 inputCopy.y, inputCopy.stride);
251 }
252
253 if (shdr->slice_sao_chroma_flag) {
254 apply_sao(ctx, xCtb,yCtb, shdr, 1, 1<<(ctx->current_sps->Log2CtbSizeY-1),
255 inputCopy.cb, inputCopy.chroma_stride);
256
257 apply_sao(ctx, xCtb,yCtb, shdr, 2, 1<<(ctx->current_sps->Log2CtbSizeY-1),
258 inputCopy.cr, inputCopy.chroma_stride);
259 }
260 }
261
262 de265_free_image(&inputCopy);
263 }
264
265
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sao.h"
21 #include "util.h"
22
23 #include <stdlib.h>
24 #include <string.h>
25
26
27 void apply_sao(de265_image* img, int xCtb,int yCtb,
28 const slice_segment_header* shdr, int cIdx, int nS,
29 const uint8_t* in_img, int in_stride,
30 /* */ uint8_t* out_img, int out_stride)
31 {
32 const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
33
34 int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
35
36 logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nS,nS);
37
38 if (SaoTypeIdx==0) {
39 return;
40 }
41
42 const seq_parameter_set* sps = &img->sps;
43 const pic_parameter_set* pps = &img->pps;
44 const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
45 const int maxPixelValue = (1<<bitDepth)-1;
46
47 // top left position of CTB in pixels
48 const int xC = xCtb*nS;
49 const int yC = yCtb*nS;
50
51 const int width = img->get_width(cIdx);
52 const int height = img->get_height(cIdx);
53
54 const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
55
56 const int picWidthInCtbs = sps->PicWidthInCtbsY;
57 const int ctbshift = sps->Log2CtbSizeY - (cIdx>0 ? 1 : 0);
58 const int chromashift = (cIdx>0 ? 1 : 0);
59
60
61 for (int i=0;i<5;i++)
62 {
63 logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
64 }
65
66
67 // actual size of CTB to be processed (can be smaller when partially outside of image)
68 const int ctbW = (xC+nS>width) ? width -xC : nS;
69 const int ctbH = (yC+nS>height) ? height-yC : nS;
70
71
72 const bool extendedTests = (img->get_CTB_has_pcm(xCtb,yCtb) ||
73 img->get_CTB_has_cu_transquant_bypass(xCtb,yCtb));
74
75 if (SaoTypeIdx==2) {
76 int hPos[2], vPos[2];
77 int vPosStride[2]; // vPos[] multiplied by image stride
78 int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
79
80 switch (SaoEoClass) {
81 case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
82 case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
83 case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
84 case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
85 }
86
87 vPosStride[0] = vPos[0] * in_stride;
88 vPosStride[1] = vPos[1] * in_stride;
89
90 /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
91 directly with the sum of the two pixel-difference signs. */
92 int8_t saoOffsetVal[5]; // [2] unused
93 saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
94 saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
95 saoOffsetVal[2] = 0;
96 saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
97 saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
98
99
100 for (int j=0;j<ctbH;j++) {
101 const uint8_t* in_ptr = &in_img [xC+(yC+j)*in_stride];
102 /* */ uint8_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
103
104 for (int i=0;i<ctbW;i++) {
105 int edgeIdx = -1;
106
107 logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
108
109 if (extendedTests &&
110 (sps->pcm_loop_filter_disable_flag &&
111 img->get_pcm_flag((xC+i)<<chromashift,(yC+j)<<chromashift)) ||
112 img->get_cu_transquant_bypass((xC+i)<<chromashift,(yC+j)<<chromashift)) {
113 continue;
114 }
115
116 // do the expensive test for boundaries only at the boundaries
117 bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
118
119 if (testBoundary)
120 for (int k=0;k<2;k++) {
121 int xS = xC+i+hPos[k];
122 int yS = yC+j+vPos[k];
123
124 if (xS<0 || yS<0 || xS>=width || yS>=height) {
125 edgeIdx=0;
126 break;
127 }
128
129
130 // This part seems inefficient with all the get_SliceHeaderIndex() calls,
131 // but removing this part (because the input was known to have only a single
132 // slice anyway) reduced computation time only by 1.3%.
133 // TODO: however, this may still be a big part of SAO itself.
134
135 int sliceAddrRS = img->get_SliceHeader(xS<<chromashift,yS<<chromashift)->SliceAddrRS;
136 if (sliceAddrRS < ctbSliceAddrRS &&
137 img->get_SliceHeader((xC+i)<<chromashift,(yC+j)<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
138 edgeIdx=0;
139 break;
140 }
141
142 if (sliceAddrRS > ctbSliceAddrRS &&
143 img->get_SliceHeader(xS<<chromashift,yS<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
144 edgeIdx=0;
145 break;
146 }
147
148
149 if (pps->loop_filter_across_tiles_enabled_flag==0 &&
150 pps->TileIdRS[(xS>>ctbshift) + (yS>>ctbshift)*picWidthInCtbs] !=
151 pps->TileIdRS[(xC>>ctbshift) + (yC>>ctbshift)*picWidthInCtbs]) {
152 edgeIdx=0;
153 break;
154 }
155 }
156
157 if (edgeIdx != 0) {
158
159 edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
160 Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) );
161
162 if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table)
163 int offset = saoOffsetVal[edgeIdx+2];
164
165 out_ptr[i] = Clip3(0,maxPixelValue,
166 in_ptr[i] + offset);
167 }
168 }
169 }
170 }
171 }
172 else {
173 int bandShift = bitDepth-5;
174 int saoLeftClass = saoinfo->sao_band_position[cIdx];
175 logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
176
177 int bandTable[32];
178 memset(bandTable, 0, sizeof(int)*32);
179
180 for (int k=0;k<4;k++) {
181 bandTable[ (k+saoLeftClass)&31 ] = k+1;
182 }
183
184
185 /* If PCM or transquant_bypass is used in this CTB, we have to
186 run all checks (A).
187 Otherwise, we run a simplified version of the code (B).
188
189 NOTE: this whole part of SAO does not seem to be a significant part of the time spent
190 */
191
192 if (extendedTests) {
193
194 // (A) full version with all checks
195
196 for (int j=0;j<ctbH;j++)
197 for (int i=0;i<ctbW;i++) {
198
199 if ((sps->pcm_loop_filter_disable_flag &&
200 img->get_pcm_flag((xC+i)<<chromashift,(yC+j)<<chromashift)) ||
201 img->get_cu_transquant_bypass((xC+i)<<chromashift,(yC+j)<<chromashift)) {
202 continue;
203 }
204
205 int bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
206
207 if (bandIdx>0) {
208 int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
209
210 logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx,
211 offset,
212 in_img[xC+i+(yC+j)*in_stride],
213 in_img[xC+i+(yC+j)*in_stride]+offset);
214
215 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
216 in_img[xC+i+(yC+j)*in_stride] + offset);
217 }
218 }
219 }
220 else
221 {
222 // (B) simplified version (only works if no PCM and transquant_bypass is active)
223
224 for (int j=0;j<ctbH;j++)
225 for (int i=0;i<ctbW;i++) {
226
227 int bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
228
229 if (bandIdx>0) {
230 int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
231
232 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
233 in_img[xC+i+(yC+j)*in_stride] + offset);
234 }
235 }
236 }
237 }
238 }
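The edge-offset path above reorders saoOffsetVal[] so that the sum of the two neighbour-difference signs can index the table directly. A minimal sketch of that indexing (illustrative names, assuming 8-bit samples; 'offsets' stands for the reordered 5-entry table with offsets[2] == 0):

    int edgeIdx  = Sign(p - a) + Sign(p - b);               // a,b = the two neighbours picked by SaoEoClass; result is -2..+2
    int filtered = Clip3(0, 255, p + offsets[edgeIdx + 2]);  // local minima get offsets[0], local maxima offsets[4]

The band-offset path classifies each sample by its top five bits (bandShift = bitDepth - 5) and applies an offset only to the four consecutive bands starting at sao_band_position, which is what the bandTable[] setup encodes.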
239
240
241 void apply_sample_adaptive_offset(de265_image* img)
242 {
243 if (img->sps.sample_adaptive_offset_enabled_flag==0) {
244 return;
245 }
246
247 de265_image inputCopy;
248 de265_error err = inputCopy.copy_image(img);
249 if (err != DE265_OK) {
250 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
251 return;
252 }
253
254 for (int yCtb=0; yCtb<img->sps.PicHeightInCtbsY; yCtb++)
255 for (int xCtb=0; xCtb<img->sps.PicWidthInCtbsY; xCtb++)
256 {
257 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
258
259 if (shdr->slice_sao_luma_flag) {
260 apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY,
261 inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
262 img->get_image_plane(0), img->get_image_stride(0));
263 }
264
265 if (shdr->slice_sao_chroma_flag) {
266 apply_sao(img, xCtb,yCtb, shdr, 1, 1<<(img->sps.Log2CtbSizeY-1),
267 inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
268 img->get_image_plane(1), img->get_image_stride(1));
269
270 apply_sao(img, xCtb,yCtb, shdr, 2, 1<<(img->sps.Log2CtbSizeY-1),
271 inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
272 img->get_image_plane(2), img->get_image_stride(2));
273 }
274 }
275 }
276
277
278 void apply_sample_adaptive_offset_sequential(de265_image* img)
279 {
280 if (img->sps.sample_adaptive_offset_enabled_flag==0) {
281 return;
282 }
283
284
285 uint8_t* inputCopy = new uint8_t[ img->get_image_stride(0) * img->get_height(0) ];
286 if (inputCopy == NULL) {
287 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
288 return;
289 }
290
291
292 for (int cIdx=0;cIdx<3;cIdx++) {
293
294 int stride = img->get_image_stride(cIdx);
295 int height = img->get_height(cIdx);
296
297 memcpy(inputCopy, img->get_image_plane(cIdx), stride * height);
298
299 for (int yCtb=0; yCtb<img->sps.PicHeightInCtbsY; yCtb++)
300 for (int xCtb=0; xCtb<img->sps.PicWidthInCtbsY; xCtb++)
301 {
302 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
303
304 if (cIdx==0 && shdr->slice_sao_luma_flag) {
305 apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY,
306 inputCopy, stride,
307 img->get_image_plane(0), img->get_image_stride(0));
308 }
309
310 if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
311 apply_sao(img, xCtb,yCtb, shdr, cIdx, 1<<(img->sps.Log2CtbSizeY-1),
312 inputCopy, stride,
313 img->get_image_plane(cIdx), img->get_image_stride(cIdx));
314 }
315 }
316 }
317
318 delete[] inputCopy;
319 }
320
321
322
323
324 class thread_task_sao : public thread_task
325 {
326 public:
327 int ctb_y;
328 de265_image* img; /* this is where we get the SPS from
329 (either inputImg or outputImg can be a dummy image)
330 */
331
332 de265_image* inputImg;
333 de265_image* outputImg;
334 int inputProgress;
335
336 virtual void work();
337 };
338
339
340 void thread_task_sao::work()
341 {
342 state = Running;
343 img->thread_run();
344
345 const int rightCtb = img->sps.PicWidthInCtbsY-1;
346 const int ctbSize = (1<<img->sps.Log2CtbSizeY);
347
348
349 // wait until also the CTB-rows below and above are ready
350
351 img->wait_for_progress(this, rightCtb,ctb_y, inputProgress);
352
353 if (ctb_y>0) {
354 img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
355 }
356
357 if (ctb_y+1<img->sps.PicHeightInCtbsY) {
358 img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
359 }
360
361
362 // copy input image to output for this CTB-row
363
364 outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);
365
366
367 // process SAO in the CTB-row
368
369 for (int xCtb=0; xCtb<img->sps.PicWidthInCtbsY; xCtb++)
370 {
371 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
372
373 if (shdr->slice_sao_luma_flag) {
374 apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize,
375 inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
376 outputImg->get_image_plane(0), outputImg->get_image_stride(0));
377 }
378
379 if (shdr->slice_sao_chroma_flag) {
380 apply_sao(img, xCtb,ctb_y, shdr, 1, ctbSize>>1,
381 inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
382 outputImg->get_image_plane(1), outputImg->get_image_stride(1));
383
384 apply_sao(img, xCtb,ctb_y, shdr, 2, ctbSize>>1,
385 inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
386 outputImg->get_image_plane(2), outputImg->get_image_stride(2));
387 }
388 }
389
390
391 // mark SAO progress
392
393 for (int x=0;x<=rightCtb;x++) {
394 const int CtbWidth = img->sps.PicWidthInCtbsY;
395 img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
396 }
397
398
399 state = Finished;
400 img->thread_finishes();
401 }
402
403
404 bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
405 {
406 de265_image* img = imgunit->img;
407
408 if (img->sps.sample_adaptive_offset_enabled_flag==0) {
409 return false;
410 }
411
412
413 decoder_context* ctx = img->decctx;
414
415 de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
416 img->get_chroma_format(), &img->sps, false, img->decctx);
417 if (err != DE265_OK) {
418 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
419 return false;
420 }
421
422 int nRows = img->sps.PicHeightInCtbsY;
423
424 int n=0;
425 img->thread_start(nRows);
426
427 for (int y=0;y<img->sps.PicHeightInCtbsY;y++)
428 {
429 thread_task_sao* task = new thread_task_sao;
430
431 task->inputImg = img;
432 task->outputImg = &imgunit->sao_output;
433 task->img = img;
434 task->ctb_y = y;
435 task->inputProgress = saoInputProgress;
436
437 imgunit->tasks.push_back(task);
438 add_task(&ctx->thread_pool, task);
439 n++;
440 }
441
442 /* Currently we need a barrier here because, when we are finished, we have to swap the pixel
443 data back into the main image. */
444 img->wait_for_completion();
445
446 img->exchange_pixel_data_with(imgunit->sao_output);
447
448 return true;
449 }
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2222
2323 #include "libde265/decctx.h"
2424
25 void apply_sample_adaptive_offset(decoder_context* ctx);
25 void apply_sample_adaptive_offset(de265_image* img);
26
27 /* requires less memory than the function above */
28 void apply_sample_adaptive_offset_sequential(de265_image* img);
29
30 /* saoInputProgress - the CTB progress that SAO will wait for before beginning processing.
31 Returns 'true' if any tasks have been added.
32 */
33 bool add_sao_tasks(image_unit* imgunit, int saoInputProgress);
2634
2735 #endif
libde265/scan.c  +0 -152
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "scan.h"
21
22 static position scan0 = { 0,0 };
23 static position scan_h_1[ 2* 2], scan_v_1[ 2* 2], scan_d_1[ 2* 2];
24 static position scan_h_2[ 4* 4], scan_v_2[ 4* 4], scan_d_2[ 4* 4];
25 static position scan_h_3[ 8* 8], scan_v_3[ 8* 8], scan_d_3[ 8* 8];
26 static position scan_h_4[16*16], scan_v_4[16*16], scan_d_4[16*16];
27 static position scan_h_5[32*32], scan_v_5[32*32], scan_d_5[32*32];
28
29 static position* scan_h[7] = { &scan0,scan_h_1,scan_h_2,scan_h_3,scan_h_4,scan_h_5 };
30 static position* scan_v[7] = { &scan0,scan_v_1,scan_v_2,scan_v_3,scan_v_4,scan_v_5 };
31 static position* scan_d[7] = { &scan0,scan_d_1,scan_d_2,scan_d_3,scan_d_4,scan_d_5 };
32
33 static void init_scan_h(position* scan, int blkSize)
34 {
35 int i=0;
36 for (int y=0;y<blkSize;y++)
37 for (int x=0;x<blkSize;x++)
38 {
39 scan[i].x = x;
40 scan[i].y = y;
41 i++;
42 }
43 }
44
45 static void init_scan_v(position* scan, int blkSize)
46 {
47 int i=0;
48 for (int x=0;x<blkSize;x++)
49 for (int y=0;y<blkSize;y++)
50 {
51 scan[i].x = x;
52 scan[i].y = y;
53 i++;
54 }
55 }
56
57 static void init_scan_d(position* scan, int blkSize)
58 {
59 int i=0;
60 int x=0,y=0;
61
62 do {
63 while (y>=0) {
64 if (x<blkSize && y<blkSize) {
65 scan[i].x = x;
66 scan[i].y = y;
67 i++;
68 }
69 y--;
70 x++;
71 }
72
73 y=x;
74 x=0;
75 } while (i < blkSize*blkSize);
76 }
77
78
79 const position* get_scan_order(int log2BlockSize, int scanIdx)
80 {
81 switch (scanIdx) {
82 case 0: return scan_d[log2BlockSize];
83 case 1: return scan_h[log2BlockSize];
84 case 2: return scan_v[log2BlockSize];
85 default: return 0; // should never happen
86 }
87 }
88
89
90
91 static scan_position scanpos_h_2[ 4* 4], scanpos_v_2[ 4* 4], scanpos_d_2[ 4* 4];
92 static scan_position scanpos_h_3[ 8* 8], scanpos_v_3[ 8* 8], scanpos_d_3[ 8* 8];
93 static scan_position scanpos_h_4[16*16], scanpos_v_4[16*16], scanpos_d_4[16*16];
94 static scan_position scanpos_h_5[32*32], scanpos_v_5[32*32], scanpos_d_5[32*32];
95
96 static scan_position* scanpos[3][6] =
97 { { 0,0,scanpos_d_2,scanpos_d_3,scanpos_d_4,scanpos_d_5 },
98 { 0,0,scanpos_h_2,scanpos_h_3,scanpos_h_4,scanpos_h_5 },
99 { 0,0,scanpos_v_2,scanpos_v_3,scanpos_v_4,scanpos_v_5 } };
100
101
102 scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize)
103 {
104 return scanpos[scanIdx][log2BlkSize][ y*(1<<log2BlkSize) + x ];
105 }
106
107 static void fill_scan_pos(scan_position* pos, int x,int y,int scanIdx, int log2TrafoSize)
108 {
109 int lastScanPos = 16;
110 int lastSubBlock = (1<<(log2TrafoSize-2)) * (1<<(log2TrafoSize-2)) -1;
111
112 const position* ScanOrderSub = get_scan_order(log2TrafoSize-2, scanIdx);
113 const position* ScanOrderPos = get_scan_order(2, scanIdx);
114
115 int xC,yC;
116 do {
117 if (lastScanPos==0) {
118 lastScanPos=16;
119 lastSubBlock--;
120 }
121 lastScanPos--;
122
123 position S = ScanOrderSub[lastSubBlock];
124 xC = (S.x<<2) + ScanOrderPos[lastScanPos].x;
125 yC = (S.y<<2) + ScanOrderPos[lastScanPos].y;
126
127 } while ( (xC != x) || (yC != y));
128
129 pos->subBlock = lastSubBlock;
130 pos->scanPos = lastScanPos;
131 }
132
133
134 void init_scan_orders()
135 {
136 for (int log2size=1;log2size<=5;log2size++)
137 {
138 init_scan_h(scan_h[log2size], 1<<log2size);
139 init_scan_v(scan_v[log2size], 1<<log2size);
140 init_scan_d(scan_d[log2size], 1<<log2size);
141 }
142
143
144 for (int log2size=2;log2size<=5;log2size++)
145 for (int scanIdx=0;scanIdx<3;scanIdx++)
146 for (int y=0;y<(1<<log2size);y++)
147 for (int x=0;x<(1<<log2size);x++)
148 {
149 fill_scan_pos(&scanpos[scanIdx][log2size][ y*(1<<log2size) + x ],x,y,scanIdx,log2size);
150 }
151 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "scan.h"
21
22 static position scan0 = { 0,0 };
23 static position scan_h_1[ 2* 2], scan_v_1[ 2* 2], scan_d_1[ 2* 2];
24 static position scan_h_2[ 4* 4], scan_v_2[ 4* 4], scan_d_2[ 4* 4];
25 static position scan_h_3[ 8* 8], scan_v_3[ 8* 8], scan_d_3[ 8* 8];
26 static position scan_h_4[16*16], scan_v_4[16*16], scan_d_4[16*16];
27 static position scan_h_5[32*32], scan_v_5[32*32], scan_d_5[32*32];
28
29 static position* scan_h[7] = { &scan0,scan_h_1,scan_h_2,scan_h_3,scan_h_4,scan_h_5 };
30 static position* scan_v[7] = { &scan0,scan_v_1,scan_v_2,scan_v_3,scan_v_4,scan_v_5 };
31 static position* scan_d[7] = { &scan0,scan_d_1,scan_d_2,scan_d_3,scan_d_4,scan_d_5 };
32
33 static void init_scan_h(position* scan, int blkSize)
34 {
35 int i=0;
36 for (int y=0;y<blkSize;y++)
37 for (int x=0;x<blkSize;x++)
38 {
39 scan[i].x = x;
40 scan[i].y = y;
41 i++;
42 }
43 }
44
45 static void init_scan_v(position* scan, int blkSize)
46 {
47 int i=0;
48 for (int x=0;x<blkSize;x++)
49 for (int y=0;y<blkSize;y++)
50 {
51 scan[i].x = x;
52 scan[i].y = y;
53 i++;
54 }
55 }
56
57 static void init_scan_d(position* scan, int blkSize)
58 {
59 int i=0;
60 int x=0,y=0;
61
62 do {
63 while (y>=0) {
64 if (x<blkSize && y<blkSize) {
65 scan[i].x = x;
66 scan[i].y = y;
67 i++;
68 }
69 y--;
70 x++;
71 }
72
73 y=x;
74 x=0;
75 } while (i < blkSize*blkSize);
76 }
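Working through init_scan_d() above for blkSize = 4, the diagonal scan traverses each anti-diagonal from its bottom-left end towards its top-right end:

    scan index:  0     1     2     3     4     5     6     7     8     9    ...
    (x,y)     : (0,0) (0,1) (1,0) (0,2) (1,1) (2,0) (0,3) (1,2) (2,1) (3,0) ...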
77
78
79 const position* get_scan_order(int log2BlockSize, int scanIdx)
80 {
81 switch (scanIdx) {
82 case 0: return scan_d[log2BlockSize];
83 case 1: return scan_h[log2BlockSize];
84 case 2: return scan_v[log2BlockSize];
85 default: return 0; // should never happen
86 }
87 }
88
89
90
91 static scan_position scanpos_h_2[ 4* 4], scanpos_v_2[ 4* 4], scanpos_d_2[ 4* 4];
92 static scan_position scanpos_h_3[ 8* 8], scanpos_v_3[ 8* 8], scanpos_d_3[ 8* 8];
93 static scan_position scanpos_h_4[16*16], scanpos_v_4[16*16], scanpos_d_4[16*16];
94 static scan_position scanpos_h_5[32*32], scanpos_v_5[32*32], scanpos_d_5[32*32];
95
96 static scan_position* scanpos[3][6] =
97 { { 0,0,scanpos_d_2,scanpos_d_3,scanpos_d_4,scanpos_d_5 },
98 { 0,0,scanpos_h_2,scanpos_h_3,scanpos_h_4,scanpos_h_5 },
99 { 0,0,scanpos_v_2,scanpos_v_3,scanpos_v_4,scanpos_v_5 } };
100
101
102 scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize)
103 {
104 return scanpos[scanIdx][log2BlkSize][ y*(1<<log2BlkSize) + x ];
105 }
106
107 static void fill_scan_pos(scan_position* pos, int x,int y,int scanIdx, int log2TrafoSize)
108 {
109 int lastScanPos = 16;
110 int lastSubBlock = (1<<(log2TrafoSize-2)) * (1<<(log2TrafoSize-2)) -1;
111
112 const position* ScanOrderSub = get_scan_order(log2TrafoSize-2, scanIdx);
113 const position* ScanOrderPos = get_scan_order(2, scanIdx);
114
115 int xC,yC;
116 do {
117 if (lastScanPos==0) {
118 lastScanPos=16;
119 lastSubBlock--;
120 }
121 lastScanPos--;
122
123 position S = ScanOrderSub[lastSubBlock];
124 xC = (S.x<<2) + ScanOrderPos[lastScanPos].x;
125 yC = (S.y<<2) + ScanOrderPos[lastScanPos].y;
126
127 } while ( (xC != x) || (yC != y));
128
129 pos->subBlock = lastSubBlock;
130 pos->scanPos = lastScanPos;
131 }
132
133
134 void init_scan_orders()
135 {
136 for (int log2size=1;log2size<=5;log2size++)
137 {
138 init_scan_h(scan_h[log2size], 1<<log2size);
139 init_scan_v(scan_v[log2size], 1<<log2size);
140 init_scan_d(scan_d[log2size], 1<<log2size);
141 }
142
143
144 for (int log2size=2;log2size<=5;log2size++)
145 for (int scanIdx=0;scanIdx<3;scanIdx++)
146 for (int y=0;y<(1<<log2size);y++)
147 for (int x=0;x<(1<<log2size);x++)
148 {
149 fill_scan_pos(&scanpos[scanIdx][log2size][ y*(1<<log2size) + x ],x,y,scanIdx,log2size);
150 }
151 }
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
libde265/sei.c  +0 -426
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sei.h"
21 #include "util.h"
22 #include "md5.h"
23
24 #include <assert.h>
25
26
27 static bool read_sei_decoded_picture_hash(bitreader* reader, sei_message* sei,
28 const decoder_context* ctx)
29 {
30 // cannot read hash SEI, because SPS is not defined
31 if (ctx->current_sps == NULL) {
32 return false;
33 }
34
35 sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
36
37 seihash->hash_type = (enum sei_decoded_picture_hash_type)get_bits(reader,8);
38
39 int nHashes = ctx->current_sps->chroma_format_idc==0 ? 1 : 3;
40 for (int i=0;i<nHashes;i++) {
41 switch (seihash->hash_type) {
42 case sei_decoded_picture_hash_type_MD5:
43 for (int b=0;b<16;b++) { seihash->md5[i][b] = get_bits(reader,8); }
44 break;
45
46 case sei_decoded_picture_hash_type_CRC:
47 seihash->crc[i] = get_bits(reader,16);
48 break;
49
50 case sei_decoded_picture_hash_type_checksum:
51 seihash->checksum[i] = get_bits(reader,32);
52 break;
53 }
54 }
55
56 return true;
57 }
58
59
60 static void dump_sei_decoded_picture_hash(const sei_message* sei,
61 const decoder_context* ctx)
62 {
63 const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
64
65 loginfo(LogSEI," hash_type: ");
66 switch (seihash->hash_type) {
67 case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI,"MD5\n"); break;
68 case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI,"CRC\n"); break;
69 case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI,"checksum\n"); break;
70 }
71
72 int nHashes = ctx->current_sps->chroma_format_idc==0 ? 1 : 3;
73 for (int i=0;i<nHashes;i++) {
74 switch (seihash->hash_type) {
75 case sei_decoded_picture_hash_type_MD5:
76 loginfo(LogSEI," MD5[%d]: %02x", i,seihash->md5[i][0]);
77 for (int b=1;b<16;b++) {
78 loginfo(LogSEI,"*:%02x", seihash->md5[i][b]);
79 }
80 loginfo(LogSEI,"*\n");
81 break;
82
83 case sei_decoded_picture_hash_type_CRC:
84 loginfo(LogSEI," CRC[%d]: %02x\n", i,seihash->crc[i]);
85 break;
86
87 case sei_decoded_picture_hash_type_checksum:
88 loginfo(LogSEI," checksum[%d]: %04x\n", i,seihash->checksum[i]);
89 break;
90 }
91 }
92 }
93
94
95 static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride)
96 {
97 uint32_t sum = 0;
98 for (int y=0; y<h; y++)
99 for(int x=0; x<w; x++) {
100 uint8_t xorMask = ( x & 0xFF ) ^ ( y & 0xFF ) ^ ( x >> 8 ) ^ ( y >> 8 );
101 sum += data[y*stride + x] ^ xorMask;
102
103 /*
104 if (compDepth[cIdx] > 8 )
105 sum = ( sum + ( ( component[cIdx][y * compWidth[cIdx] + x] >> 8 ) ^ xorMask ) ) &
106 0xFFFFFFFF
107 }
108 */
109 }
110
111 return sum & 0xFFFFFFFF;
112 }
113
114 static inline uint16_t crc_process_byte(uint16_t crc, uint8_t byte)
115 {
116 for (int bit=0;bit<8;bit++) {
117 int bitVal = (byte >> (7-bit)) & 1;
118
119 int crcMsb = (crc>>15) & 1;
120 crc = (((crc<<1) + bitVal) & 0xFFFF);
121
122 if (crcMsb) { crc ^= 0x1021; }
123 }
124
125 return crc;
126 }
127
128 /*
129 static uint16_t compute_CRC_8bit_old(const uint8_t* data,int w,int h,int stride)
130 {
131 uint16_t crc = 0xFFFF;
132
133 for (int y=0; y<h; y++)
134 for(int x=0; x<w; x++) {
135 crc = crc_process_byte(crc, data[y*stride+x]);
136 }
137
138 crc = crc_process_byte(crc, 0);
139 crc = crc_process_byte(crc, 0);
140
141 return crc;
142 }
143 */
144
145 static inline uint16_t crc_process_byte_parallel(uint16_t crc, uint8_t byte)
146 {
147 uint16_t s = byte ^ (crc >> 8);
148 uint16_t t = s ^ (s >> 4);
149
150 return ((crc << 8) ^
151 t ^
152 (t << 5) ^
153 (t << 12)) & 0xFFFF;
154 }
155
156 static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride)
157 {
158 uint16_t crc = 0xFFFF;
159
160 crc = crc_process_byte_parallel(crc, 0);
161 crc = crc_process_byte_parallel(crc, 0);
162
163 for (int y=0; y<h; y++) {
164 const uint8_t* d = &data[y*stride];
165
166 for(int x=0; x<w; x++) {
167 crc = crc_process_byte_parallel(crc, *d++);
168 }
169 }
170
171 return crc;
172 }
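The CRC helpers above implement the 16-bit CRC used for the decoded-picture-hash SEI (polynomial 0x1021, initial value 0xFFFF, as visible in crc_process_byte()); crc_process_byte_parallel() is the usual table-free byte-at-a-time formulation of the same MSB-first update. A hedged usage sketch mirroring the call in process_sei_decoded_picture_hash() further down (hypothetical buffer):

    uint8_t plane[16*16];                                        // decoded samples of one plane
    memset(plane, 0x80, sizeof(plane));
    uint16_t crc = compute_CRC_8bit_fast(plane, 16, 16, /*stride=*/ 16);
    // 'crc' is then compared against the crc[] value carried in the SEI message.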
173
174 static void compute_MD5_8bit(uint8_t* data,int w,int h,int stride, uint8_t* result)
175 {
176 MD5_CTX md5;
177 MD5_Init(&md5);
178
179 for (int y=0; y<h; y++) {
180 MD5_Update(&md5, &data[y*stride], w);
181 }
182
183 MD5_Final(result, &md5);
184 }
185
186
187 static de265_error process_sei_decoded_picture_hash(const sei_message* sei, decoder_context* ctx)
188 {
189 if (ctx->current_sps == NULL || ctx->last_decoded_image == NULL) {
190 add_warning(ctx,DE265_ERROR_CANNOT_PROCESS_SEI, false);
191 return DE265_OK;
192 }
193
194 const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
195
196 de265_image* img = ctx->last_decoded_image;
197 assert(img != NULL);
198
199 /* Do not check SEI on pictures that are not output.
200 Hash may be wrong, because of a broken link (BLA).
201 This happens, for example in conformance stream RAP_B, where a EOS-NAL
202 appears before a CRA (POC=32). */
203 if (img->PicOutputFlag == false) {
204 return DE265_OK;
205 }
206
207 //write_picture(img);
208
209 int nHashes = ctx->current_sps->chroma_format_idc==0 ? 1 : 3;
210 for (int i=0;i<nHashes;i++) {
211 uint8_t* data;
212 int w,h,stride;
213
214 switch (i) {
215 case 0:
216 w = img->width;
217 h = img->height;
218 stride = img->stride;
219 break;
220
221 case 1:
222 case 2:
223 w = img->chroma_width;
224 h = img->chroma_height;
225 stride = img->chroma_stride;
226 break;
227 }
228
229 /**/ if (i==0) data = img->y;
230 else if (i==1) data = img->cb;
231 else data = img->cr;
232
233 switch (seihash->hash_type) {
234 case sei_decoded_picture_hash_type_MD5:
235 {
236 uint8_t md5[16];
237 compute_MD5_8bit(data,w,h,stride,md5);
238
239 /*
240 fprintf(stderr,"computed MD5: ");
241 for (int b=0;b<16;b++) {
242 fprintf(stderr,"%02x", md5[b]);
243 }
244 fprintf(stderr,"\n");
245 */
246
247 for (int b=0;b<16;b++) {
248 if (md5[b] != seihash->md5[i][b]) {
249 fprintf(stderr,"SEI decoded picture MD5 mismatch (POC=%d)\n", img->PicOrderCntVal);
250 return DE265_ERROR_CHECKSUM_MISMATCH;
251 }
252 }
253 }
254 break;
255
256 case sei_decoded_picture_hash_type_CRC:
257 {
258 uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride);
259
260 logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n",
261 seihash->crc[i], i, crc);
262
263 if (crc != seihash->crc[i]) {
264 fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n",
265 seihash->crc[i], crc, img->PicOrderCntVal);
266 return DE265_ERROR_CHECKSUM_MISMATCH;
267 }
268 }
269 break;
270
271 case sei_decoded_picture_hash_type_checksum:
272 {
273 uint32_t chksum = compute_checksum_8bit(data,w,h,stride);
274
275 if (chksum != seihash->checksum[i]) {
276 fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n",
277 seihash->checksum[i], chksum, img->PicOrderCntVal);
278 return DE265_ERROR_CHECKSUM_MISMATCH;
279 }
280 }
281 break;
282 }
283 }
284
285 loginfo(LogSEI,"decoded picture hash checked: OK\n");
286
287 return DE265_OK;
288 }
289
290
291 bool read_sei(bitreader* reader, sei_message* sei, bool suffix, const decoder_context* ctx)
292 {
293 int payload_type = 0;
294 for (;;)
295 {
296 int byte = get_bits(reader,8);
297 payload_type += byte;
298 if (byte != 0xFF) { break; }
299 }
300
301 int payload_size = 0;
302 for (;;)
303 {
304 int byte = get_bits(reader,8);
305 payload_size += byte;
306 if (byte != 0xFF) { break; }
307 }
308
309 sei->payload_type = (enum sei_payload_type)payload_type;
310 sei->payload_size = payload_size;
311
312
313 // --- sei message dispatch
314
315 bool success=false;
316
317 switch (sei->payload_type) {
318 case sei_payload_type_decoded_picture_hash:
319 success = read_sei_decoded_picture_hash(reader,sei,ctx);
320 break;
321
322 default:
323 // TODO: unknown SEI messages are ignored
324 break;
325 }
326
327 return success;
328 }
329
330 void dump_sei(const sei_message* sei, const decoder_context* ctx)
331 {
332 loginfo(LogHeaders,"SEI message: %s\n", sei_type_name(sei->payload_type));
333
334 switch (sei->payload_type) {
335 case sei_payload_type_decoded_picture_hash:
336 dump_sei_decoded_picture_hash(sei, ctx);
337 break;
338
339 default:
340 // TODO: unknown SEI messages are ignored
341 break;
342 }
343 }
344
345
346 de265_error process_sei(const sei_message* sei, decoder_context* ctx)
347 {
348 de265_error err = DE265_OK;
349
350 switch (sei->payload_type) {
351 case sei_payload_type_decoded_picture_hash:
352 if (ctx->param_sei_check_hash) {
353 err = process_sei_decoded_picture_hash(sei, ctx);
354 }
355
356 break;
357
358 default:
359 // TODO: unknown SEI messages are ignored
360 break;
361 }
362
363 return err;
364 }
365
366
367 const char* sei_type_name(enum sei_payload_type type)
368 {
369 switch (type) {
370 case sei_payload_type_buffering_period:
371 return "buffering_period";
372 case sei_payload_type_pic_timing:
373 return "pic_timing";
374 case sei_payload_type_pan_scan_rect:
375 return "pan_scan_rect";
376 case sei_payload_type_filler_payload:
377 return "filler_payload";
378 case sei_payload_type_user_data_registered_itu_t_t35:
379 return "user_data_registered_itu_t_t35";
380 case sei_payload_type_user_data_unregistered:
381 return "user_data_unregistered";
382 case sei_payload_type_recovery_point:
383 return "recovery_point";
384 case sei_payload_type_scene_info:
385 return "scene_info";
386 case sei_payload_type_picture_snapshot:
387 return "picture_snapshot";
388 case sei_payload_type_progressive_refinement_segment_start:
389 return "progressive_refinement_segment_start";
390 case sei_payload_type_progressive_refinement_segment_end:
391 return "progressive_refinement_segment_end";
392 case sei_payload_type_film_grain_characteristics:
393 return "film_grain_characteristics";
394 case sei_payload_type_post_filter_hint:
395 return "post_filter_hint";
396 case sei_payload_type_tone_mapping_info:
397 return "tone_mapping_info";
398 case sei_payload_type_frame_packing_arrangement:
399 return "frame_packing_arrangement";
400 case sei_payload_type_display_orientation:
401 return "display_orientation";
402 case sei_payload_type_structure_of_pictures_info:
403 return "structure_of_pictures_info";
404 case sei_payload_type_active_parameter_sets:
405 return "active_parameter_sets";
406 case sei_payload_type_decoding_unit_info:
407 return "decoding_unit_info";
408 case sei_payload_type_temporal_sub_layer_zero_index:
409 return "temporal_sub_layer_zero_index";
410 case sei_payload_type_decoded_picture_hash:
411 return "decoded_picture_hash";
412 case sei_payload_type_scalable_nesting:
413 return "scalable_nesting";
414 case sei_payload_type_region_refresh_info:
415 return "region_refresh_info";
416 case sei_payload_type_no_display:
417 return "no_display";
418 case sei_payload_type_motion_constrained_tile_sets:
419 return "motion_constrained_tile_sets";
420
421 default:
422 return "unknown SEI message";
423 }
424 }
425
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sei.h"
21 #include "util.h"
22 #include "md5.h"
23
24 #include "libde265/sps.h"
25 #include "libde265/image.h"
26 #include "libde265/decctx.h"
27
28 #include <assert.h>
29
30
31 static de265_error read_sei_decoded_picture_hash(bitreader* reader, sei_message* sei,
32 const seq_parameter_set* sps)
33 {
34 sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
35
36 seihash->hash_type = (enum sei_decoded_picture_hash_type)get_bits(reader,8);
37
38 if (sps==NULL) {
39 return DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI;
40 }
41
42 int nHashes = sps->chroma_format_idc==0 ? 1 : 3;
43 for (int i=0;i<nHashes;i++) {
44 switch (seihash->hash_type) {
45 case sei_decoded_picture_hash_type_MD5:
46 for (int b=0;b<16;b++) { seihash->md5[i][b] = get_bits(reader,8); }
47 break;
48
49 case sei_decoded_picture_hash_type_CRC:
50 seihash->crc[i] = get_bits(reader,16);
51 break;
52
53 case sei_decoded_picture_hash_type_checksum:
54 seihash->checksum[i] = get_bits(reader,32);
55 break;
56 }
57 }
58
59 return DE265_OK;
60 }
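
The amount of hash data read here depends only on hash_type and the chroma format: one hash for monochrome (chroma_format_idc==0), otherwise one per plane (Y, Cb, Cr). As a quick size check (illustrative arithmetic, not library code): for 4:2:0 content the payload is 1 byte of hash_type plus 3*16 = 48 bytes for MD5, 3*2 = 6 bytes for CRC, or 3*4 = 12 bytes for the checksum, i.e. 49, 7 or 13 bytes in total.
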
61
62
63 static void dump_sei_decoded_picture_hash(const sei_message* sei,
64 const seq_parameter_set* sps)
65 {
66 const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
67
68 loginfo(LogSEI," hash_type: ");
69 switch (seihash->hash_type) {
70 case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI,"MD5\n"); break;
71 case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI,"CRC\n"); break;
72 case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI,"checksum\n"); break;
73 }
74
75 int nHashes = sps->chroma_format_idc==0 ? 1 : 3;
76 for (int i=0;i<nHashes;i++) {
77 switch (seihash->hash_type) {
78 case sei_decoded_picture_hash_type_MD5:
79 loginfo(LogSEI," MD5[%d]: %02x", i,seihash->md5[i][0]);
80 for (int b=1;b<16;b++) {
81 loginfo(LogSEI,"*:%02x", seihash->md5[i][b]);
82 }
83 loginfo(LogSEI,"*\n");
84 break;
85
86 case sei_decoded_picture_hash_type_CRC:
87 loginfo(LogSEI," CRC[%d]: %02x\n", i,seihash->crc[i]);
88 break;
89
90 case sei_decoded_picture_hash_type_checksum:
91 loginfo(LogSEI," checksum[%d]: %04x\n", i,seihash->checksum[i]);
92 break;
93 }
94 }
95 }
96
97
98 static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride)
99 {
100 uint32_t sum = 0;
101 for (int y=0; y<h; y++)
102 for(int x=0; x<w; x++) {
103 uint8_t xorMask = ( x & 0xFF ) ^ ( y & 0xFF ) ^ ( x >> 8 ) ^ ( y >> 8 );
104 sum += data[y*stride + x] ^ xorMask;
105
106 /*
107 if (compDepth[cIdx] > 8 )
108 sum = ( sum + ( ( component[cIdx][y * compWidth[cIdx] + x] >> 8 ) ^ xorMask ) ) &
109 0xFFFFFFFF
110 }
111 */
112 }
113
114 return sum & 0xFFFFFFFF;
115 }
116
117 static inline uint16_t crc_process_byte(uint16_t crc, uint8_t byte)
118 {
119 for (int bit=0;bit<8;bit++) {
120 int bitVal = (byte >> (7-bit)) & 1;
121
122 int crcMsb = (crc>>15) & 1;
123 crc = (((crc<<1) + bitVal) & 0xFFFF);
124
125 if (crcMsb) { crc ^= 0x1021; }
126 }
127
128 return crc;
129 }
130
131 /*
132 static uint16_t compute_CRC_8bit_old(const uint8_t* data,int w,int h,int stride)
133 {
134 uint16_t crc = 0xFFFF;
135
136 for (int y=0; y<h; y++)
137 for(int x=0; x<w; x++) {
138 crc = crc_process_byte(crc, data[y*stride+x]);
139 }
140
141 crc = crc_process_byte(crc, 0);
142 crc = crc_process_byte(crc, 0);
143
144 return crc;
145 }
146 */
147
148 static inline uint16_t crc_process_byte_parallel(uint16_t crc, uint8_t byte)
149 {
150 uint16_t s = byte ^ (crc >> 8);
151 uint16_t t = s ^ (s >> 4);
152
153 return ((crc << 8) ^
154 t ^
155 (t << 5) ^
156 (t << 12)) & 0xFFFF;
157 }
158
159 static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride)
160 {
161 uint16_t crc = 0xFFFF;
162
163 crc = crc_process_byte_parallel(crc, 0);
164 crc = crc_process_byte_parallel(crc, 0);
165
166 for (int y=0; y<h; y++) {
167 const uint8_t* d = &data[y*stride];
168
169 for(int x=0; x<w; x++) {
170 crc = crc_process_byte_parallel(crc, *d++);
171 }
172 }
173
174 return crc;
175 }
176
177 static void compute_MD5_8bit(uint8_t* data,int w,int h,int stride, uint8_t* result)
178 {
179 MD5_CTX md5;
180 MD5_Init(&md5);
181
182 for (int y=0; y<h; y++) {
183 MD5_Update(&md5, &data[y*stride], w);
184 }
185
186 MD5_Final(result, &md5);
187 }
188
189
190 static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de265_image* img)
191 {
192 const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash;
193
194 /* Do not check SEI on pictures that are not output.
195 Hash may be wrong, because of a broken link (BLA).
196 This happens, for example, in conformance stream RAP_B, where an EOS-NAL
197 appears before a CRA (POC=32). */
198 if (img->PicOutputFlag == false) {
199 return DE265_OK;
200 }
201
202 //write_picture(img);
203
204 int nHashes = img->sps.chroma_format_idc==0 ? 1 : 3;
205 for (int i=0;i<nHashes;i++) {
206 uint8_t* data;
207 int w,h,stride;
208
209 w = img->get_width(i);
210 h = img->get_height(i);
211
212 data = img->get_image_plane(i);
213 stride = img->get_image_stride(i);
214
215 switch (seihash->hash_type) {
216 case sei_decoded_picture_hash_type_MD5:
217 {
218 uint8_t md5[16];
219 compute_MD5_8bit(data,w,h,stride,md5);
220
221 /*
222 fprintf(stderr,"computed MD5: ");
223 for (int b=0;b<16;b++) {
224 fprintf(stderr,"%02x", md5[b]);
225 }
226 fprintf(stderr,"\n");
227 */
228
229 for (int b=0;b<16;b++) {
230 if (md5[b] != seihash->md5[i][b]) {
231 fprintf(stderr,"SEI decoded picture MD5 mismatch (POC=%d)\n", img->PicOrderCntVal);
232 return DE265_ERROR_CHECKSUM_MISMATCH;
233 }
234 }
235 }
236 break;
237
238 case sei_decoded_picture_hash_type_CRC:
239 {
240 uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride);
241
242 logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n",
243 seihash->crc[i], i, crc);
244
245 if (crc != seihash->crc[i]) {
246 fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n",
247 seihash->crc[i], crc, img->PicOrderCntVal);
248 return DE265_ERROR_CHECKSUM_MISMATCH;
249 }
250 }
251 break;
252
253 case sei_decoded_picture_hash_type_checksum:
254 {
255 uint32_t chksum = compute_checksum_8bit(data,w,h,stride);
256
257 if (chksum != seihash->checksum[i]) {
258 fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n",
259 seihash->checksum[i], chksum, img->PicOrderCntVal);
260 return DE265_ERROR_CHECKSUM_MISMATCH;
261 }
262 }
263 break;
264 }
265 }
266
267 loginfo(LogSEI,"decoded picture hash checked: OK\n");
268 //printf("checked picture %d SEI: OK\n", img->PicOrderCntVal);
269
270 return DE265_OK;
271 }
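
Hash checking is optional at runtime: process_sei() below only calls this routine when param_sei_check_hash is set on the decoder. From the application side this is normally toggled through the public API; a minimal sketch, assuming the de265_set_parameter_bool() / DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH option exposed in de265.h (this is what the dec265 -c switch in the CI scripts enables):

    #include "libde265/de265.h"

    // Sketch: create a decoder that verifies decoded_picture_hash SEI messages.
    // Mismatches then surface as DE265_ERROR_CHECKSUM_MISMATCH.
    static de265_decoder_context* new_checking_decoder(void)
    {
      de265_decoder_context* dctx = de265_new_decoder();
      de265_set_parameter_bool(dctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, 1);
      return dctx;
    }
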
272
273
274 de265_error read_sei(bitreader* reader, sei_message* sei, bool suffix, const seq_parameter_set* sps)
275 {
276 int payload_type = 0;
277 for (;;)
278 {
279 int byte = get_bits(reader,8);
280 payload_type += byte;
281 if (byte != 0xFF) { break; }
282 }
283
284 int payload_size = 0;
285 for (;;)
286 {
287 int byte = get_bits(reader,8);
288 payload_size += byte;
289 if (byte != 0xFF) { break; }
290 }
291
292 sei->payload_type = (enum sei_payload_type)payload_type;
293 sei->payload_size = payload_size;
294
295
296 // --- sei message dispatch
297
298 de265_error err = DE265_OK;
299
300 switch (sei->payload_type) {
301 case sei_payload_type_decoded_picture_hash:
302 err = read_sei_decoded_picture_hash(reader,sei,sps);
303 break;
304
305 default:
306 // TODO: unknown SEI messages are ignored
307 break;
308 }
309
310 return err;
311 }
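
The two loops at the top of read_sei() implement the SEI header coding from the spec: payload type and payload size are each the sum of a run of 0xFF bytes plus one terminating byte below 0xFF. A value below 255 therefore takes a single byte, while e.g. a payload size of 300 is coded as 0xFF 0x2D (255 + 45). A hypothetical writer-side counterpart of the same coding, for illustration only and not part of libde265:

    // Emit an SEI header value as ff_byte* followed by a final byte < 0xFF;
    // returns the number of bytes written.
    static int write_sei_value(uint8_t* out, int value)
    {
      int n = 0;
      while (value >= 0xFF) { out[n++] = 0xFF; value -= 0xFF; }
      out[n++] = (uint8_t)value;
      return n;
    }
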
312
313 void dump_sei(const sei_message* sei, const seq_parameter_set* sps)
314 {
315 loginfo(LogHeaders,"SEI message: %s\n", sei_type_name(sei->payload_type));
316
317 switch (sei->payload_type) {
318 case sei_payload_type_decoded_picture_hash:
319 dump_sei_decoded_picture_hash(sei, sps);
320 break;
321
322 default:
323 // TODO: unknown SEI messages are ignored
324 break;
325 }
326 }
327
328
329 de265_error process_sei(const sei_message* sei, de265_image* img)
330 {
331 de265_error err = DE265_OK;
332
333 switch (sei->payload_type) {
334 case sei_payload_type_decoded_picture_hash:
335 if (img->decctx->param_sei_check_hash) {
336 err = process_sei_decoded_picture_hash(sei, img);
337 }
338
339 break;
340
341 default:
342 // TODO: unknown SEI messages are ignored
343 break;
344 }
345
346 return err;
347 }
348
349
350 const char* sei_type_name(enum sei_payload_type type)
351 {
352 switch (type) {
353 case sei_payload_type_buffering_period:
354 return "buffering_period";
355 case sei_payload_type_pic_timing:
356 return "pic_timing";
357 case sei_payload_type_pan_scan_rect:
358 return "pan_scan_rect";
359 case sei_payload_type_filler_payload:
360 return "filler_payload";
361 case sei_payload_type_user_data_registered_itu_t_t35:
362 return "user_data_registered_itu_t_t35";
363 case sei_payload_type_user_data_unregistered:
364 return "user_data_unregistered";
365 case sei_payload_type_recovery_point:
366 return "recovery_point";
367 case sei_payload_type_scene_info:
368 return "scene_info";
369 case sei_payload_type_picture_snapshot:
370 return "picture_snapshot";
371 case sei_payload_type_progressive_refinement_segment_start:
372 return "progressive_refinement_segment_start";
373 case sei_payload_type_progressive_refinement_segment_end:
374 return "progressive_refinement_segment_end";
375 case sei_payload_type_film_grain_characteristics:
376 return "film_grain_characteristics";
377 case sei_payload_type_post_filter_hint:
378 return "post_filter_hint";
379 case sei_payload_type_tone_mapping_info:
380 return "tone_mapping_info";
381 case sei_payload_type_frame_packing_arrangement:
382 return "frame_packing_arrangement";
383 case sei_payload_type_display_orientation:
384 return "display_orientation";
385 case sei_payload_type_structure_of_pictures_info:
386 return "structure_of_pictures_info";
387 case sei_payload_type_active_parameter_sets:
388 return "active_parameter_sets";
389 case sei_payload_type_decoding_unit_info:
390 return "decoding_unit_info";
391 case sei_payload_type_temporal_sub_layer_zero_index:
392 return "temporal_sub_layer_zero_index";
393 case sei_payload_type_decoded_picture_hash:
394 return "decoded_picture_hash";
395 case sei_payload_type_scalable_nesting:
396 return "scalable_nesting";
397 case sei_payload_type_region_refresh_info:
398 return "region_refresh_info";
399 case sei_payload_type_no_display:
400 return "no_display";
401 case sei_payload_type_motion_constrained_tile_sets:
402 return "motion_constrained_tile_sets";
403
404 default:
405 return "unknown SEI message";
406 }
407 }
408
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2121 #define DE265_SEI_H
2222
2323 #include "libde265/bitstream.h"
24 #include "libde265/decctx.h"
24 #include "libde265/de265.h"
2525
2626
2727 enum sei_payload_type {
6868 } sei_decoded_picture_hash;
6969
7070
71 typedef struct {
71 struct sei_message {
7272 enum sei_payload_type payload_type;
7373 int payload_size;
7474
7575 union {
7676 sei_decoded_picture_hash decoded_picture_hash;
7777 } data;
78 } sei_message;
78 };
7979
80
81 class seq_parameter_set;
8082
8183 const char* sei_type_name(enum sei_payload_type type);
8284
83 bool read_sei(bitreader* reader, sei_message*, bool suffix, const decoder_context* ctx);
84 void dump_sei(const sei_message*, const decoder_context* ctx);
85 de265_error process_sei(const sei_message*, decoder_context* ctx);
85 de265_error read_sei(bitreader* reader, sei_message*, bool suffix, const seq_parameter_set* sps);
86 void dump_sei(const sei_message*, const seq_parameter_set* sps);
87 de265_error process_sei(const sei_message*, class de265_image* img);
8688
8789 #endif
+0
-3953
libde265/slice.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * Authors: StrukturAG, Dirk Farin <farin@struktur.de>
5 * Min Chen <chenm003@163.com>
6 *
7 * This file is part of libde265.
8 *
9 * libde265 is free software: you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation, either version 3 of
12 * the License, or (at your option) any later version.
13 *
14 * libde265 is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "slice.h"
24 #include "slice_func.h"
25 #include "motion_func.h"
26 #include "util.h"
27 #include "scan.h"
28 #include "intrapred.h"
29 #include "transform.h"
30 #include "threads.h"
31 #include "image.h"
32 #include "pps_func.h"
33
34 #include <assert.h>
35 #include <string.h>
36 #include <stdlib.h>
37
38
39 #define LOCK de265_mutex_lock(&ctx->thread_pool.mutex)
40 #define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex)
41
42 extern bool read_short_term_ref_pic_set(decoder_context* ctx,
43 const seq_parameter_set* sps,
44 bitreader* br,
45 ref_pic_set* out_set,
46 int idxRps, // index of the set to be read
47 const ref_pic_set* sets,
48 bool sliceRefPicSet);
49
50
51 void read_coding_tree_unit(decoder_context* ctx, thread_context* tctx);
52 void read_coding_quadtree(decoder_context* ctx,
53 thread_context* tctx,
54 int xCtb, int yCtb,
55 int Log2CtbSizeY,
56 int ctDepth);
57 int check_CTB_available(decoder_context* ctx,
58 slice_segment_header* shdr,
59 int xC,int yC, int xN,int yN);
60 /*
61 void decode_inter_block(decoder_context* ctx,thread_context* tctx,
62 int xC, int yC, int log2CbSize);
63 */
64
65 bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx)
66 {
67 int vlc;
68
69 pic_parameter_set* pps = &ctx->pps[(int)shdr->slice_pic_parameter_set_id];
70 assert(pps);
71 seq_parameter_set* sps = &ctx->sps[(int)pps->seq_parameter_set_id];
72 assert(sps);
73
74 shdr->luma_log2_weight_denom = vlc = get_uvlc(br);
75 if (vlc<0 || vlc>7) return false;
76
77 if (sps->chroma_format_idc != 0) {
78 vlc = get_svlc(br);
79 vlc += shdr->luma_log2_weight_denom;
80 if (vlc<0 || vlc>7) return false;
81 shdr->ChromaLog2WeightDenom = vlc;
82 }
83
84 int sumWeightFlags = 0;
85
86 for (int l=0;l<=1;l++)
87 if (l==0 || (l==1 && shdr->slice_type == SLICE_TYPE_B))
88 {
89 int num_ref = (l==0 ? shdr->num_ref_idx_l0_active-1 : shdr->num_ref_idx_l1_active-1);
90
91 for (int i=0;i<=num_ref;i++) {
92 shdr->luma_weight_flag[l][i] = get_bits(br,1);
93 if (shdr->luma_weight_flag[l][i]) sumWeightFlags++;
94 }
95
96 if (sps->chroma_format_idc != 0) {
97 for (int i=0;i<=num_ref;i++) {
98 shdr->chroma_weight_flag[l][i] = get_bits(br,1);
99 if (shdr->chroma_weight_flag[l][i]) sumWeightFlags+=2;
100 }
101 }
102
103 for (int i=0;i<=num_ref;i++) {
104 if (shdr->luma_weight_flag[l][i]) {
105
106 // delta_luma_weight
107
108 vlc = get_svlc(br);
109 if (vlc < -128 || vlc > 127) return false;
110
111 shdr->LumaWeight[l][i] = (1<<shdr->luma_log2_weight_denom) + vlc;
112
113 // luma_offset
114
115 vlc = get_svlc(br);
116 if (vlc < -128 || vlc > 127) return false;
117 shdr->luma_offset[l][i] = vlc;
118 }
119 else {
120 shdr->LumaWeight[l][i] = 1<<shdr->luma_log2_weight_denom;
121 shdr->luma_offset[l][i] = 0;
122 }
123
124 if (shdr->chroma_weight_flag[l][i])
125 for (int j=0;j<2;j++) {
126 // delta_chroma_weight
127
128 vlc = get_svlc(br);
129 if (vlc < -128 || vlc > 127) return false;
130
131 shdr->ChromaWeight[l][i][j] = (1<<shdr->ChromaLog2WeightDenom) + vlc;
132
133 // delta_chroma_offset
134
135 vlc = get_svlc(br);
136 if (vlc < -512 || vlc > 511) return false;
137
138 vlc = Clip3(-128,127, (vlc-((128*shdr->ChromaWeight[l][i][j])
139 >> shdr->ChromaLog2WeightDenom) + 128));
140
141 shdr->ChromaOffset[l][i][j] = vlc;
142 }
143 else {
144 for (int j=0;j<2;j++) {
145 shdr->ChromaWeight[l][i][j] = 1<<shdr->ChromaLog2WeightDenom;
146 shdr->ChromaOffset[l][i][j] = 0;
147 }
148 }
149 }
150 }
151
152 // TODO: bitstream conformance requires that 'sumWeightFlags<=24'
153
154 return true;
155 }
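
The chroma offset handling above follows the HEVC derivation: the coded delta_chroma_offset is taken relative to a prediction of -((128*ChromaWeight) >> ChromaLog2WeightDenom) + 128, and the result is clipped to [-128,127]. A worked example with hypothetical coded values: ChromaLog2WeightDenom = 6 and delta_chroma_weight = 6 give ChromaWeight = 64 + 6 = 70; a delta_chroma_offset of -20 then yields ChromaOffset = Clip3(-128, 127, -20 - ((128*70) >> 6) + 128) = Clip3(-128, 127, -20 - 140 + 128) = -32.
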
156
157
158 de265_error read_slice_segment_header(bitreader* br, slice_segment_header* shdr, decoder_context* ctx,
159 bool* continueDecoding)
160 {
161 *continueDecoding = false;
162
163 // set defaults
164
165 shdr->dependent_slice_segment_flag = 0;
166
167
168 // read bitstream
169
170 shdr->first_slice_segment_in_pic_flag = get_bits(br,1);
171
172 if (ctx->RapPicFlag) { // TODO: is this still correct ? Should we drop RapPicFlag ?
173 shdr->no_output_of_prior_pics_flag = get_bits(br,1);
174 }
175
176 shdr->slice_pic_parameter_set_id = get_uvlc(br);
177 if (shdr->slice_pic_parameter_set_id > DE265_MAX_PPS_SETS ||
178 shdr->slice_pic_parameter_set_id == UVLC_ERROR) {
179 add_warning(ctx, DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
180 return DE265_OK;
181 }
182
183 pic_parameter_set* pps = &ctx->pps[(int)shdr->slice_pic_parameter_set_id];
184 if (!pps->pps_read) {
185 add_warning(ctx, DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
186 return DE265_OK;
187 }
188
189 seq_parameter_set* sps = &ctx->sps[(int)pps->seq_parameter_set_id];
190 if (!sps->sps_read) {
191 add_warning(ctx, DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
192 *continueDecoding = false;
193 return DE265_OK;
194 }
195
196 if (!shdr->first_slice_segment_in_pic_flag) {
197 if (pps->dependent_slice_segments_enabled_flag) {
198 shdr->dependent_slice_segment_flag = get_bits(br,1);
199 } else {
200 shdr->dependent_slice_segment_flag = 0;
201 }
202
203 int slice_segment_address = get_bits(br, ceil_log2(sps->PicSizeInCtbsY));
204
205 if (shdr->dependent_slice_segment_flag) {
206 if (slice_segment_address == 0) {
207 *continueDecoding = false;
208 add_warning(ctx, DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false);
209 return DE265_OK;
210 }
211
212 int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[slice_segment_address] -1 ];
213 slice_segment_header* prevCtbHdr = &ctx->slice[ctx->img->ctb_info[prevCtb].SliceHeaderIndex];
214 memcpy(shdr, prevCtbHdr, sizeof(slice_segment_header));
215
216 shdr->first_slice_segment_in_pic_flag = 0;
217 shdr->dependent_slice_segment_flag = 1;
218 }
219
220 shdr->slice_segment_address = slice_segment_address;
221 } else {
222 shdr->dependent_slice_segment_flag = 0;
223 shdr->slice_segment_address = 0;
224 }
225
226 if (shdr->slice_segment_address < 0 ||
227 shdr->slice_segment_address > sps->PicSizeInCtbsY) {
228 add_warning(ctx, DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false);
229 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
230 }
231
232
233
234 if (!shdr->dependent_slice_segment_flag) {
235 for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
236 //slice_reserved_undetermined_flag[i]
237 skip_bits(br,1);
238 }
239
240 shdr->slice_type = get_uvlc(br);
241 if (shdr->slice_type > 2 ||
242 shdr->slice_type == UVLC_ERROR) {
243 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
244 *continueDecoding = false;
245 return DE265_OK;
246 }
247
248 if (pps->output_flag_present_flag) {
249 shdr->pic_output_flag = get_bits(br,1);
250 }
251 else {
252 shdr->pic_output_flag = 1;
253 }
254
255 if (sps->separate_colour_plane_flag == 1) {
256 shdr->colour_plane_id = get_bits(br,1);
257 }
258
259
260 shdr->slice_pic_order_cnt_lsb = 0;
261 shdr->short_term_ref_pic_set_sps_flag = 0;
262
263 if (ctx->nal_unit_type != NAL_UNIT_IDR_W_RADL &&
264 ctx->nal_unit_type != NAL_UNIT_IDR_N_LP) {
265 shdr->slice_pic_order_cnt_lsb = get_bits(br, sps->log2_max_pic_order_cnt_lsb);
266 shdr->short_term_ref_pic_set_sps_flag = get_bits(br,1);
267
268 if (!shdr->short_term_ref_pic_set_sps_flag) {
269 read_short_term_ref_pic_set(ctx, sps,
270 br, &shdr->slice_ref_pic_set,
271 sps->num_short_term_ref_pic_sets,
272 sps->ref_pic_sets,
273 true);
274
275 shdr->CurrRpsIdx = sps->num_short_term_ref_pic_sets;
276 shdr->CurrRps = &shdr->slice_ref_pic_set;
277 }
278 else {
279 int nBits = ceil_log2(sps->num_short_term_ref_pic_sets);
280 if (nBits>0) shdr->short_term_ref_pic_set_idx = get_bits(br,nBits);
281 else shdr->short_term_ref_pic_set_idx = 0;
282
283 if (shdr->short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets) {
284 add_warning(ctx, DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false);
285 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
286 }
287
288 shdr->CurrRpsIdx = shdr->short_term_ref_pic_set_idx;
289 shdr->CurrRps = &sps->ref_pic_sets[shdr->CurrRpsIdx];
290 }
291
292
293 // --- long-term MC ---
294
295 if (sps->long_term_ref_pics_present_flag) {
296 if (sps->num_long_term_ref_pics_sps > 0) {
297 shdr->num_long_term_sps = get_uvlc(br);
298 }
299 else {
300 shdr->num_long_term_sps = 0;
301 }
302
303 shdr->num_long_term_pics= get_uvlc(br);
304
305
306 // check maximum number of reference frames
307
308 if (shdr->num_long_term_sps +
309 shdr->num_long_term_pics +
310 shdr->CurrRps->NumNegativePics +
311 shdr->CurrRps->NumPositivePics
312 > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1])
313 {
314 add_warning(ctx, DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
315 *continueDecoding = false;
316 return DE265_OK;
317 }
318
319 for (int i=0; i<shdr->num_long_term_sps + shdr->num_long_term_pics; i++) {
320 if (i < shdr->num_long_term_sps) {
321 int nBits = ceil_log2(sps->num_long_term_ref_pics_sps);
322 shdr->lt_idx_sps[i] = get_bits(br, nBits);
323
324 // check that the referenced lt-reference really exists
325
326 if (shdr->lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) {
327 add_warning(ctx, DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false);
328 *continueDecoding = false;
329 return DE265_OK;
330 }
331
332 ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ shdr->lt_idx_sps[i] ];
333 ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ shdr->lt_idx_sps[i] ];
334 }
335 else {
336 int nBits = sps->log2_max_pic_order_cnt_lsb;
337 shdr->poc_lsb_lt[i] = get_bits(br, nBits);
338 shdr->used_by_curr_pic_lt_flag[i] = get_bits(br,1);
339
340 ctx->PocLsbLt[i] = shdr->poc_lsb_lt[i];
341 ctx->UsedByCurrPicLt[i] = shdr->used_by_curr_pic_lt_flag[i];
342 }
343
344 shdr->delta_poc_msb_present_flag[i] = get_bits(br,1);
345 if (shdr->delta_poc_msb_present_flag[i]) {
346 shdr->delta_poc_msb_cycle_lt[i] = get_uvlc(br);
347 }
348 else {
349 shdr->delta_poc_msb_cycle_lt[i] = 0;
350 }
351
352 if (i==0 || i==shdr->num_long_term_sps) {
353 ctx->DeltaPocMsbCycleLt[i] = shdr->delta_poc_msb_cycle_lt[i];
354 }
355 else {
356 ctx->DeltaPocMsbCycleLt[i] = (shdr->delta_poc_msb_cycle_lt[i] +
357 ctx->DeltaPocMsbCycleLt[i-1]);
358 }
359 }
360 }
361
362 if (sps->sps_temporal_mvp_enabled_flag) {
363 shdr->slice_temporal_mvp_enabled_flag = get_bits(br,1);
364 }
365 else {
366 shdr->slice_temporal_mvp_enabled_flag = 0;
367 }
368 }
369 else {
370 shdr->slice_pic_order_cnt_lsb = 0;
371 shdr->num_long_term_sps = 0;
372 shdr->num_long_term_pics= 0;
373 }
374
375
376 // --- SAO ---
377
378 if (sps->sample_adaptive_offset_enabled_flag) {
379 shdr->slice_sao_luma_flag = get_bits(br,1);
380 shdr->slice_sao_chroma_flag = get_bits(br,1);
381 }
382 else {
383 shdr->slice_sao_luma_flag = 0;
384 shdr->slice_sao_chroma_flag = 0;
385 }
386
387 if (shdr->slice_type == SLICE_TYPE_P ||
388 shdr->slice_type == SLICE_TYPE_B) {
389 shdr->num_ref_idx_active_override_flag = get_bits(br,1);
390 if (shdr->num_ref_idx_active_override_flag) {
391 shdr->num_ref_idx_l0_active = get_uvlc(br);
392 if (shdr->num_ref_idx_l0_active == UVLC_ERROR) {
393 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
394 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
395 }
396 shdr->num_ref_idx_l0_active++;
397
398 if (shdr->slice_type == SLICE_TYPE_B) {
399 shdr->num_ref_idx_l1_active = get_uvlc(br);
400 if (shdr->num_ref_idx_l1_active == UVLC_ERROR) {
401 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
402 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
403 }
404 shdr->num_ref_idx_l1_active++;
405 }
406 }
407 else {
408 shdr->num_ref_idx_l0_active = pps->num_ref_idx_l0_default_active;
409 shdr->num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active;
410 }
411
412 int NumPocTotalCurr = shdr->CurrRps->NumPocTotalCurr;
413 // TODO: add number of longterm images
414
415 if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) {
416
417 int nBits = ceil_log2(NumPocTotalCurr);
418
419 shdr->ref_pic_list_modification_flag_l0 = get_bits(br,1);
420 if (shdr->ref_pic_list_modification_flag_l0) {
421 for (int i=0;i<shdr->num_ref_idx_l0_active;i++) {
422 shdr->list_entry_l0[i] = get_bits(br, nBits);
423 }
424 }
425
426 if (shdr->slice_type == SLICE_TYPE_B) {
427 shdr->ref_pic_list_modification_flag_l1 = get_bits(br,1);
428 if (shdr->ref_pic_list_modification_flag_l1) {
429 for (int i=0;i<shdr->num_ref_idx_l1_active;i++) {
430 shdr->list_entry_l1[i] = get_bits(br, nBits);
431 }
432 }
433 }
434 else {
435 shdr->ref_pic_list_modification_flag_l1 = 0;
436 }
437 }
438 else {
439 shdr->ref_pic_list_modification_flag_l0 = 0;
440 shdr->ref_pic_list_modification_flag_l1 = 0;
441 }
442
443 if (shdr->slice_type == SLICE_TYPE_B) {
444 shdr->mvd_l1_zero_flag = get_bits(br,1);
445 }
446
447 if (pps->cabac_init_present_flag) {
448 shdr->cabac_init_flag = get_bits(br,1);
449 }
450 else {
451 shdr->cabac_init_flag = 0;
452 }
453
454 if (shdr->slice_temporal_mvp_enabled_flag) {
455 if (shdr->slice_type == SLICE_TYPE_B)
456 shdr->collocated_from_l0_flag = get_bits(br,1);
457 else
458 shdr->collocated_from_l0_flag = 1;
459
460 if (( shdr->collocated_from_l0_flag && shdr->num_ref_idx_l0_active > 1) ||
461 (!shdr->collocated_from_l0_flag && shdr->num_ref_idx_l1_active > 1)) {
462 shdr->collocated_ref_idx = get_uvlc(br);
463 if (shdr->collocated_ref_idx == UVLC_ERROR) {
464 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
465 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
466 }
467 }
468 else {
469 shdr->collocated_ref_idx = 0;
470 }
471 }
472
473 if ((pps->weighted_pred_flag && shdr->slice_type == SLICE_TYPE_P) ||
474 (pps->weighted_bipred_flag && shdr->slice_type == SLICE_TYPE_B)) {
475
476 if (!read_pred_weight_table(br,shdr,ctx))
477 {
478 add_warning(ctx, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
479 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
480 }
481 }
482
483 shdr->five_minus_max_num_merge_cand = get_uvlc(br);
484 if (shdr->five_minus_max_num_merge_cand == UVLC_ERROR) {
485 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
486 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
487 }
488 shdr->MaxNumMergeCand = 5-shdr->five_minus_max_num_merge_cand;
489 }
490
491 shdr->slice_qp_delta = get_svlc(br);
492 if (shdr->slice_qp_delta == UVLC_ERROR) {
493 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
494 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
495 }
496 //logtrace(LogSlice,"slice_qp_delta: %d\n",shdr->slice_qp_delta);
497
498 if (pps->pps_slice_chroma_qp_offsets_present_flag) {
499 shdr->slice_cb_qp_offset = get_svlc(br);
500 if (shdr->slice_cb_qp_offset == UVLC_ERROR) {
501 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
502 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
503 }
504
505 shdr->slice_cr_qp_offset = get_svlc(br);
506 if (shdr->slice_cr_qp_offset == UVLC_ERROR) {
507 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
508 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
509 }
510 }
511 else {
512 shdr->slice_cb_qp_offset = 0;
513 shdr->slice_cr_qp_offset = 0;
514 }
515
516 if (pps->deblocking_filter_override_enabled_flag) {
517 shdr->deblocking_filter_override_flag = get_bits(br,1);
518 }
519 else {
520 shdr->deblocking_filter_override_flag = 0;
521 }
522
523 shdr->slice_beta_offset = pps->beta_offset;
524 shdr->slice_tc_offset = pps->tc_offset;
525
526 if (shdr->deblocking_filter_override_flag) {
527 shdr->slice_deblocking_filter_disabled_flag = get_bits(br,1);
528 if (!shdr->slice_deblocking_filter_disabled_flag) {
529 shdr->slice_beta_offset = get_svlc(br);
530 if (shdr->slice_beta_offset == UVLC_ERROR) {
531 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
532 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
533 }
534 shdr->slice_beta_offset *= 2;
535
536 shdr->slice_tc_offset = get_svlc(br);
537 if (shdr->slice_tc_offset == UVLC_ERROR) {
538 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
539 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
540 }
541 shdr->slice_tc_offset *= 2;
542 }
543 }
544 else {
545 shdr->slice_deblocking_filter_disabled_flag = pps->pic_disable_deblocking_filter_flag;
546 }
547
548 if (pps->pps_loop_filter_across_slices_enabled_flag &&
549 (shdr->slice_sao_luma_flag || shdr->slice_sao_chroma_flag ||
550 !shdr->slice_deblocking_filter_disabled_flag )) {
551 shdr->slice_loop_filter_across_slices_enabled_flag = get_bits(br,1);
552 }
553 else {
554 shdr->slice_loop_filter_across_slices_enabled_flag =
555 pps->pps_loop_filter_across_slices_enabled_flag;
556 }
557 }
558
559 if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) {
560 shdr->num_entry_point_offsets = get_uvlc(br);
561 if (shdr->num_entry_point_offsets == UVLC_ERROR ||
562 shdr->num_entry_point_offsets > MAX_ENTRY_POINTS) { // TODO: make entry points array dynamic
563 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
564 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
565 }
566
567 if (shdr->num_entry_point_offsets > 0) {
568 shdr->offset_len = get_uvlc(br);
569 if (shdr->offset_len == UVLC_ERROR) {
570 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
571 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
572 }
573 shdr->offset_len++;
574
575 for (int i=0; i<shdr->num_entry_point_offsets; i++) {
576 {
577 shdr->entry_point_offset[i] = get_bits(br,shdr->offset_len)+1;
578 }
579
580 if (i>0) {
581 shdr->entry_point_offset[i] += shdr->entry_point_offset[i-1];
582 }
583 }
584 }
585 }
586
587 if (pps->slice_segment_header_extension_present_flag) {
588 shdr->slice_segment_header_extension_length = get_uvlc(br);
589 if (shdr->slice_segment_header_extension_length == UVLC_ERROR ||
590 shdr->slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values
591 add_warning(ctx, DE265_WARNING_SLICEHEADER_INVALID, false);
592 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
593 }
594
595 for (int i=0; i<shdr->slice_segment_header_extension_length; i++) {
596 //slice_segment_header_extension_data_byte[i]
597 get_bits(br,8);
598 }
599 }
600
601 //byte_alignment();
602 //skip_to_byte_boundary(br);
603
604
605 // --- init variables ---
606
607 shdr->SliceQPY = pps->pic_init_qp + shdr->slice_qp_delta;
608 //shdr->CuQpDelta = 0;
609
610 switch (shdr->slice_type)
611 {
612 case SLICE_TYPE_I: shdr->initType = 0; break;
613 case SLICE_TYPE_P: shdr->initType = shdr->cabac_init_flag + 1/*shdr->cabac_init_flag ? 2 : 1*/; break;
614 case SLICE_TYPE_B: shdr->initType = 2 - shdr->cabac_init_flag/*shdr->cabac_init_flag ? 1 : 2*/; break;
615 }
616
617 *continueDecoding = true;
618 return DE265_OK;
619 }
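
Two of the derived variables at the end of the header parser are worth a quick numeric check. SliceQPY is simply pic_init_qp plus the coded slice_qp_delta (e.g. pic_init_qp = 26 and slice_qp_delta = -3 give SliceQPY = 23). initType selects one of three CABAC initialization tables: 0 for I slices, cabac_init_flag + 1 (1 or 2) for P slices, and 2 - cabac_init_flag (2 or 1) for B slices, so a B slice with cabac_init_flag = 1 borrows the P-slice initialization values.
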
620
621
622
623 //-----------------------------------------------------------------------
624
625
626 void dump_slice_segment_header(const slice_segment_header* shdr, const decoder_context* ctx, int fd)
627 {
628 FILE* fh;
629 if (fd==1) fh=stdout;
630 else if (fd==2) fh=stderr;
631 else { return; }
632
633 #define LOG0(t) log2fh(fh, t)
634 #define LOG1(t,d) log2fh(fh, t,d)
635 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
636 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
637 #define LOG4(t,d1,d2,d3,d4) log2fh(fh, t,d1,d2,d3,d4)
638
639 const pic_parameter_set* pps = &ctx->pps[shdr->slice_pic_parameter_set_id];
640 assert(pps->pps_read); // TODO: error handling
641
642 const seq_parameter_set* sps = &ctx->sps[(int)pps->seq_parameter_set_id];
643 assert(sps->sps_read); // TODO: error handling
644
645
646 LOG0("----------------- SLICE -----------------\n");
647 LOG1("first_slice_segment_in_pic_flag : %d\n", shdr->first_slice_segment_in_pic_flag);
648 if (ctx->nal_unit_type >= NAL_UNIT_BLA_W_LP &&
649 ctx->nal_unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23) {
650 LOG1("no_output_of_prior_pics_flag : %d\n", shdr->no_output_of_prior_pics_flag);
651 }
652
653 LOG1("slice_pic_parameter_set_id : %d\n", shdr->slice_pic_parameter_set_id);
654
655 if (!shdr->first_slice_segment_in_pic_flag) {
656 if (pps->dependent_slice_segments_enabled_flag) {
657 LOG1("dependent_slice_segment_flag : %d\n", shdr->dependent_slice_segment_flag);
658 }
659 LOG1("slice_segment_address : %d\n", shdr->slice_segment_address);
660 }
661
662 if (!shdr->dependent_slice_segment_flag) {
663 //for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
664 //slice_reserved_flag[i]
665
666 LOG1("slice_type : %c\n",
667 shdr->slice_type == 0 ? 'B' :
668 shdr->slice_type == 1 ? 'P' : 'I');
669
670 if (pps->output_flag_present_flag) {
671 LOG1("pic_output_flag : %d\n", shdr->pic_output_flag);
672 }
673
674 if (sps->separate_colour_plane_flag == 1) {
675 LOG1("colour_plane_id : %d\n", shdr->colour_plane_id);
676 }
677
678 LOG1("slice_pic_order_cnt_lsb : %d\n", shdr->slice_pic_order_cnt_lsb);
679
680 if (ctx->nal_unit_type != NAL_UNIT_IDR_W_RADL &&
681 ctx->nal_unit_type != NAL_UNIT_IDR_N_LP) {
682 LOG1("short_term_ref_pic_set_sps_flag : %d\n", shdr->short_term_ref_pic_set_sps_flag);
683
684 if (!shdr->short_term_ref_pic_set_sps_flag) {
685 LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets);
686 dump_compact_short_term_ref_pic_set(&shdr->slice_ref_pic_set, 16, fh);
687 }
688 else if (sps->num_short_term_ref_pic_sets > 1) {
689 LOG1("short_term_ref_pic_set_idx : %d\n", shdr->short_term_ref_pic_set_idx);
690 dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[shdr->short_term_ref_pic_set_idx], 16, fh);
691 }
692
693 if (sps->long_term_ref_pics_present_flag) {
694 if (sps->num_long_term_ref_pics_sps > 0) {
695 LOG1("num_long_term_sps : %d\n", shdr->num_long_term_sps);
696 }
697
698 LOG1("num_long_term_pics : %d\n", shdr->num_long_term_pics);
699
700 for (int i=0; i<shdr->num_long_term_sps + shdr->num_long_term_pics; i++) {
701 LOG2("PocLsbLt[%d] : %d\n", i, ctx->PocLsbLt[i]);
702 LOG2("UsedByCurrPicLt[%d] : %d\n", i, ctx->UsedByCurrPicLt[i]);
703 LOG2("DeltaPocMsbCycleLt[%d] : %d\n", i, ctx->DeltaPocMsbCycleLt[i]);
704 }
705 }
706
707 if (sps->sps_temporal_mvp_enabled_flag) {
708 LOG1("slice_temporal_mvp_enabled_flag : %d\n", shdr->slice_temporal_mvp_enabled_flag);
709 }
710 }
711
712
713 if (sps->sample_adaptive_offset_enabled_flag) {
714 LOG1("slice_sao_luma_flag : %d\n", shdr->slice_sao_luma_flag);
715 LOG1("slice_sao_chroma_flag : %d\n", shdr->slice_sao_chroma_flag);
716 }
717
718
719 if (shdr->slice_type == SLICE_TYPE_P || shdr->slice_type == SLICE_TYPE_B) {
720 LOG1("num_ref_idx_active_override_flag : %d\n", shdr->num_ref_idx_active_override_flag);
721
722 LOG2("num_ref_idx_l0_active : %d %s\n", shdr->num_ref_idx_l0_active,
723 shdr->num_ref_idx_active_override_flag ? "" : "(from PPS)");
724
725 if (shdr->slice_type == SLICE_TYPE_B) {
726 LOG2("num_ref_idx_l1_active : %d %s\n", shdr->num_ref_idx_l1_active,
727 shdr->num_ref_idx_active_override_flag ? "" : "(from PPS)");
728 }
729
730 int NumPocTotalCurr = shdr->CurrRps->NumPocTotalCurr;
731 // TODO: add number of longterm images
732
733 if (pps->lists_modification_present_flag && NumPocTotalCurr > 1)
734 {
735 LOG1("ref_pic_list_modification_flag_l0 : %d\n", shdr->ref_pic_list_modification_flag_l0);
736 if (shdr->ref_pic_list_modification_flag_l0) {
737 for (int i=0;i<shdr->num_ref_idx_l0_active;i++) {
738 LOG2(" %d: %d\n",i,shdr->list_entry_l0[i]);
739 }
740 }
741
742 LOG1("ref_pic_list_modification_flag_l1 : %d\n", shdr->ref_pic_list_modification_flag_l1);
743 if (shdr->ref_pic_list_modification_flag_l1) {
744 for (int i=0;i<shdr->num_ref_idx_l1_active;i++) {
745 LOG2(" %d: %d\n",i,shdr->list_entry_l1[i]);
746 }
747 }
748 }
749
750 if (shdr->slice_type == SLICE_TYPE_B) {
751 LOG1("mvd_l1_zero_flag : %d\n", shdr->mvd_l1_zero_flag);
752 }
753
754 LOG1("cabac_init_flag : %d\n", shdr->cabac_init_flag);
755
756 if (shdr->slice_temporal_mvp_enabled_flag) {
757 LOG1("collocated_from_l0_flag : %d\n", shdr->collocated_from_l0_flag);
758 LOG1("collocated_ref_idx : %d\n", shdr->collocated_ref_idx);
759 }
760
761 if ((pps->weighted_pred_flag && shdr->slice_type == SLICE_TYPE_P) ||
762 (pps->weighted_bipred_flag && shdr->slice_type == SLICE_TYPE_B))
763 {
764 LOG1("luma_log2_weight_denom : %d\n", shdr->luma_log2_weight_denom);
765 if (sps->chroma_format_idc != 0) {
766 LOG1("ChromaLog2WeightDenom : %d\n", shdr->ChromaLog2WeightDenom);
767 }
768
769 for (int l=0;l<=1;l++)
770 if (l==0 || (l==1 && shdr->slice_type == SLICE_TYPE_B))
771 {
772 int num_ref = (l==0 ?
773 shdr->num_ref_idx_l0_active-1 :
774 shdr->num_ref_idx_l1_active-1);
775
776 if (false) { // do not show these flags
777 for (int i=0;i<=num_ref;i++) {
778 LOG3("luma_weight_flag_l%d[%d] : %d\n",l,i,shdr->luma_weight_flag[l][i]);
779 }
780
781 if (sps->chroma_format_idc != 0) {
782 for (int i=0;i<=num_ref;i++) {
783 LOG3("chroma_weight_flag_l%d[%d] : %d\n",l,i,shdr->chroma_weight_flag[l][i]);
784 }
785 }
786 }
787
788 for (int i=0;i<=num_ref;i++) {
789 LOG3("LumaWeight_L%d[%d] : %d\n",l,i,shdr->LumaWeight[l][i]);
790 LOG3("luma_offset_l%d[%d] : %d\n",l,i,shdr->luma_offset[l][i]);
791
792 for (int j=0;j<2;j++) {
793 LOG4("ChromaWeight_L%d[%d][%d] : %d\n",l,i,j,shdr->ChromaWeight[l][i][j]);
794 LOG4("ChromaOffset_L%d[%d][%d] : %d\n",l,i,j,shdr->ChromaOffset[l][i][j]);
795 }
796 }
797 }
798 }
799
800 LOG1("five_minus_max_num_merge_cand : %d\n", shdr->five_minus_max_num_merge_cand);
801 }
802
803
804 LOG1("slice_qp_delta : %d\n", shdr->slice_qp_delta);
805 if (pps->pps_slice_chroma_qp_offsets_present_flag) {
806 LOG1("slice_cb_qp_offset : %d\n", shdr->slice_cb_qp_offset);
807 LOG1("slice_cr_qp_offset : %d\n", shdr->slice_cr_qp_offset);
808 }
809
810 if (pps->deblocking_filter_override_enabled_flag) {
811 LOG1("deblocking_filter_override_flag : %d\n", shdr->deblocking_filter_override_flag);
812 }
813
814 LOG2("slice_deblocking_filter_disabled_flag : %d %s\n",
815 shdr->slice_deblocking_filter_disabled_flag,
816 (shdr->deblocking_filter_override_flag ? "(override)" : "(from pps)"));
817
818 if (shdr->deblocking_filter_override_flag) {
819
820 if (!shdr->slice_deblocking_filter_disabled_flag) {
821 LOG1("slice_beta_offset : %d\n", shdr->slice_beta_offset);
822 LOG1("slice_tc_offset : %d\n", shdr->slice_tc_offset);
823 }
824 }
825
826 if (pps->pps_loop_filter_across_slices_enabled_flag &&
827 (shdr->slice_sao_luma_flag || shdr->slice_sao_chroma_flag ||
828 !shdr->slice_deblocking_filter_disabled_flag)) {
829 LOG1("slice_loop_filter_across_slices_enabled_flag : %d\n",
830 shdr->slice_loop_filter_across_slices_enabled_flag);
831 }
832 }
833
834 if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) {
835 LOG1("num_entry_point_offsets : %d\n", shdr->num_entry_point_offsets);
836
837 if (shdr->num_entry_point_offsets > 0) {
838 LOG1("offset_len : %d\n", shdr->offset_len);
839
840 for (int i=0; i<shdr->num_entry_point_offsets; i++) {
841 LOG2("entry point [%i] : %d\n", i, shdr->entry_point_offset[i]);
842 }
843 }
844 }
845
846 /*
847 if( slice_segment_header_extension_present_flag ) {
848 slice_segment_header_extension_length
849 for( i = 0; i < slice_segment_header_extension_length; i++)
850 slice_segment_header_extension_data_byte[i]
851 }
852 byte_alignment()
853 }
854 */
855
856 #undef LOG0
857 #undef LOG1
858 #undef LOG2
859 #undef LOG3
860 #undef LOG4
861 //#endif
862 }
863
864
865
866
867
868 static void set_initValue(decoder_context* ctx, slice_segment_header* shdr,
869 context_model* model, int initValue)
870 {
871 int slopeIdx = initValue >> 4;
872 int intersecIdx = initValue & 0xF;
873 int m = slopeIdx*5 - 45;
874 int n = (intersecIdx<<3) - 16;
875 int preCtxState = Clip3(1,126, ((m*Clip3(0,51, shdr->SliceQPY))>>4)+n);
876
877 logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",shdr->SliceQPY,slopeIdx,intersecIdx,m,n);
878
879 model->MPSbit=(preCtxState<=63) ? 0 : 1;
880 model->state = model->MPSbit ? (preCtxState-64) : (63-preCtxState);
881
882 // model state will always be between [0;62]
883
884 assert(model->state >= 0);
885 assert(model->state <= 62);
886 }
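
set_initValue() is the standard HEVC context-variable initialization: initValue splits into a slope index (upper 4 bits) and an offset index (intersecIdx, lower 4 bits), mapped to m = slopeIdx*5 - 45 and n = (intersecIdx<<3) - 16; preCtxState = Clip3(1, 126, ((m*Clip3(0,51,SliceQPY))>>4) + n) is then folded into an MPS bit and a state in [0,62]. Worked example: initValue = 154 with SliceQPY = 26 gives slopeIdx = 9 and intersecIdx = 10, hence m = 0 and n = 64; preCtxState = 64, so MPSbit = 1 and state = 0. In other words, 154 is the QP-independent "uniform" initialization that appears in many of the tables below.
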
887
888
889 static const int initValue_split_cu_flag[3][3] = {
890 { 139,141,157 },
891 { 107,139,126 },
892 { 107,139,126 },
893 };
894 static const int initValue_cu_skip_flag[2][3] = {
895 { 197,185,201 },
896 { 197,185,201 },
897 };
898 static const int initValue_part_mode[9] = { 184,154,139, 154,154,154, 139,154,154 };
899 static const int initValue_prev_intra_luma_pred_flag[3] = { 184,154,183 };
900 static const int initValue_intra_chroma_pred_mode[3] = { 63,152,152 };
901 static const int initValue_cbf_luma[4] = { 111,141,153,111 };
902 static const int initValue_cbf_chroma[12] = { 94,138,182,154,149,107,167,154,149,92,167,154 };
903 static const int initValue_split_transform_flag[9] = { 153,138,138, 124,138,94, 224,167,122 }; // FIX712
904 static const int initValue_last_significant_coefficient_prefix[54] = {
905 110,110,124,125,140,153,125,127,140,109,111,143,127,111, 79,108,123, 63,
906 125,110, 94,110, 95, 79,125,111,110, 78,110,111,111, 95, 94,108,123,108,
907 125,110,124,110, 95, 94,125,111,111, 79,125,126,111,111, 79,108,123, 93
908 };
909 static const int initValue_coded_sub_block_flag[12] = { 91,171,134,141,121,140,61,154,121,140,61,154 };
910 static const int initValue_significant_coeff_flag[3][42] = {
911 {
912 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, 125, 107,
913 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, 139, 182, 182, 152,
914 136, 152, 136, 153, 136, 139, 111, 136, 139, 111
915 },
916 {
917 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, 154, 166,
918 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 123, 123, 107,
919 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
920 },
921 {
922 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, 154, 166,
923 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 138, 138, 122,
924 121, 122, 121, 167, 151, 183, 140, 151, 183, 140
925 },
926 };
927 static const int initValue_coeff_abs_level_greater1_flag[72] = {
928 140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,
929 140,179,166,182,140,227,122,197,154,196,196,167,154,152,167,182,
930 182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182,
931 154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,
932 169,208,166,167,154,152,167,182
933 };
934 static const int initValue_coeff_abs_level_greater2_flag[18] = {
935 138,153,136,167,152,152,107,167, 91,122,107,167,
936 107,167, 91,107,107,167
937 };
938 static const int initValue_sao_merge_leftUp_flag[3] = { 153,153,153 };
939 static const int initValue_sao_type_idx_lumaChroma_flag[3] = { 200,185,160 };
940 static const int initValue_cu_qp_delta_abs[2] = { 154,154 };
941 static const int initValue_transform_skip_flag[2] = { 139,139 };
942 static const int initValue_merge_flag[2] = { 110,154 };
943 static const int initValue_merge_idx[2] = { 122,137 };
944 static const int initValue_pred_mode_flag[2] = { 149,134 };
945 static const int initValue_abs_mvd_greater01_flag[4] = { 140,198,169,198 };
946 static const int initValue_mvp_lx_flag[1] = { 168 };
947 static const int initValue_rqt_root_cbf[1] = { 79 };
948 static const int initValue_ref_idx_lX[2] = { 153,153 };
949 static const int initValue_inter_pred_idc[5] = { 95,79,63,31,31 };
950 static const int initValue_cu_transquant_bypass_flag[3] = { 154,154,154 };
951
952
953 static void init_context(decoder_context* ctx,
954 thread_context* tctx,
955 enum context_model_indices idx,
956 const int* initValues, int len)
957 {
958 for (int i=0;i<len;i++)
959 {
960 set_initValue(ctx,tctx->shdr,
961 &tctx->ctx_model[idx+i],
962 initValues[i]);
963 }
964 }
965
966
967 static int decode_transform_skip_flag(thread_context* tctx, int cIdx)
968 {
969 const int context = (cIdx==0) ? 0 : 1;
970
971 logtrace(LogSlice,"# transform_skip_flag (context=%d)\n",context);
972
973 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
974 &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG+context]);
975 return bit;
976 }
977
978
979 static int decode_sao_merge_flag(thread_context* tctx)
980 {
981 logtrace(LogSlice,"# sao_merge_left/up_flag\n");
982 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
983 &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]);
984 return bit;
985 }
986
987
988
989 static int decode_sao_type_idx(thread_context* tctx)
990 {
991 logtrace(LogSlice,"# sao_type_idx_luma/chroma\n");
992
993 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder,
994 &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]);
995
996 if (bit0==0) {
997 return 0;
998 }
999 else {
1000 int bit1 = decode_CABAC_bypass(&tctx->cabac_decoder);
1001 if (bit1==0) {
1002 return 1;
1003 }
1004 else {
1005 return 2;
1006 }
1007 }
1008 }
1009
1010
1011 static int decode_sao_offset_abs(thread_context* tctx)
1012 {
1013 logtrace(LogSlice,"# sao_offset_abs\n");
1014 int bitDepth = 8;
1015 int cMax = (1<<(libde265_min(bitDepth,10)-5))-1;
1016 int value = decode_CABAC_TU_bypass(&tctx->cabac_decoder, cMax);
1017 return value;
1018 }
1019
1020
1021 static int decode_sao_class(thread_context* tctx)
1022 {
1023 logtrace(LogSlice,"# sao_class\n");
1024 int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2);
1025 return value;
1026 }
1027
1028
1029 static int decode_sao_offset_sign(thread_context* tctx)
1030 {
1031 logtrace(LogSlice,"# sao_offset_sign\n");
1032 int value = decode_CABAC_bypass(&tctx->cabac_decoder);
1033 return value;
1034 }
1035
1036
1037 static int decode_sao_band_position(thread_context* tctx)
1038 {
1039 logtrace(LogSlice,"# sao_band_position\n");
1040 int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder,5);
1041 return value;
1042 }
1043
1044
1045 static int decode_transquant_bypass_flag(thread_context* tctx)
1046 {
1047 logtrace(LogSlice,"# cu_transquant_bypass_enable_flag\n");
1048 int value = decode_CABAC_bit(&tctx->cabac_decoder,
1049 &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]);
1050 return value;
1051 }
1052
1053
1054 static int decode_split_cu_flag(thread_context* tctx,
1055 int x0, int y0, int ctDepth)
1056 {
1057 decoder_context* ctx = tctx->decctx;
1058
1059 // check if neighbors are available
1060
1061 int availableL = check_CTB_available(ctx,tctx->shdr, x0,y0, x0-1,y0);
1062 int availableA = check_CTB_available(ctx,tctx->shdr, x0,y0, x0,y0-1);
1063
1064 int condL = 0;
1065 int condA = 0;
1066
1067 if (availableL && get_ctDepth(ctx->img,ctx->current_sps,x0-1,y0) > ctDepth) condL=1;
1068 if (availableA && get_ctDepth(ctx->img,ctx->current_sps,x0,y0-1) > ctDepth) condA=1;
1069
1070 int contextOffset = condL + condA;
1071 int context = contextOffset;
1072
1073 // decode bit
1074
1075 logtrace(LogSlice,"# split_cu_flag context=%d R=%x\n", context, tctx->cabac_decoder.range);
1076
1077 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_CU_FLAG + context]);
1078
1079 logtrace(LogSlice,"> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
1080
1081 return bit;
1082 }
1083
1084
1085 static int decode_cu_skip_flag(thread_context* tctx,
1086 int x0, int y0, int ctDepth)
1087 {
1088 decoder_context* ctx = tctx->decctx;
1089
1090 // check if neighbors are available
1091
1092 int availableL = check_CTB_available(ctx,tctx->shdr, x0,y0, x0-1,y0);
1093 int availableA = check_CTB_available(ctx,tctx->shdr, x0,y0, x0,y0-1);
1094
1095 int condL = 0;
1096 int condA = 0;
1097
1098 if (availableL && get_cu_skip_flag(ctx->current_sps,ctx->img,x0-1,y0)) condL=1;
1099 if (availableA && get_cu_skip_flag(ctx->current_sps,ctx->img,x0,y0-1)) condA=1;
1100
1101 int contextOffset = condL + condA;
1102 int context = contextOffset;
1103
1104 // decode bit
1105
1106 logtrace(LogSlice,"# cu_skip_flag context=%d R=%x\n", context, tctx->cabac_decoder.range);
1107
1108 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CU_SKIP_FLAG + context]);
1109
1110 logtrace(LogSlice,"> cu_skip_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
1111
1112 return bit;
1113 }
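
Both decode_split_cu_flag() and decode_cu_skip_flag() above pick one of three context models from the left and above neighbours: each neighbour that is available and satisfies the respective condition (greater coding-tree depth for split_cu_flag, skip flag set for cu_skip_flag) contributes 1, so the context index is 0, 1 or 2. A neighbour outside the picture or in a different slice/tile simply contributes 0; for example, if only the above neighbour is available and is skipped, cu_skip_flag uses context 1.
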
1114
1115
1116 static enum PartMode decode_part_mode(thread_context* tctx,
1117 enum PredMode pred_mode, int cLog2CbSize)
1118 {
1119 decoder_context* ctx = tctx->decctx;
1120
1121 if (pred_mode == MODE_INTRA) {
1122 logtrace(LogSlice,"# part_mode (INTRA)\n");
1123
1124 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE]);
1125
1126 logtrace(LogSlice,"> %s\n",bit ? "2Nx2N" : "NxN");
1127
1128 return bit ? PART_2Nx2N : PART_NxN;
1129 }
1130 else {
1131 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+0]);
1132 if (bit0) { return PART_2Nx2N; }
1133
1134 // CHECK_ME: optimized the code and fixed a bug here; needs more verification!
1135 int bit1 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+1]);
1136 if (cLog2CbSize > ctx->current_sps->Log2MinCbSizeY) {
1137 if (!ctx->current_sps->amp_enabled_flag) {
1138 return bit1 ? PART_2NxN : PART_Nx2N;
1139 }
1140 else {
1141 int bit3 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+3]);
1142 if (bit3) {
1143 return bit1 ? PART_2NxN : PART_Nx2N;
1144 }
1145
1146 int bit4 = decode_CABAC_bypass(&tctx->cabac_decoder);
1147 if ( bit1 && bit4) return PART_2NxnD;
1148 if ( bit1 && !bit4) return PART_2NxnU;
1149 if (!bit1 && !bit4) return PART_nLx2N;
1150 if (!bit1 && bit4) return PART_nRx2N;
1151 }
1152 }
1153 else {
1154 // TODO: we could save one 'if' here by first decoding the next bin and then
1155 // checking cLog2CbSize==3 when it is '0'
1156
1157 if (bit1) return PART_2NxN;
1158
1159 if (cLog2CbSize==3) {
1160 return PART_Nx2N;
1161 }
1162 else {
1163 int bit2 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+2]);
1164 return (enum PartMode)((int)PART_NxN - bit2)/*bit2 ? PART_Nx2N : PART_NxN*/;
1165 }
1166 }
1167 }
1168
1169 assert(false); // should never be reached
1170 return PART_2Nx2N;
1171 }
1172
1173
1174 static int decode_prev_intra_luma_pred_flag(thread_context* tctx)
1175 {
1176 logtrace(LogSlice,"# prev_intra_luma_pred_flag\n");
1177 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]);
1178 return bit;
1179 }
1180
1181
1182 static int decode_mpm_idx(thread_context* tctx)
1183 {
1184 logtrace(LogSlice,"# mpm_idx (TU:2)\n");
1185 int mpm = decode_CABAC_TU_bypass(&tctx->cabac_decoder, 2);
1186 logtrace(LogSlice,"> mpm_idx = %d\n",mpm);
1187 return mpm;
1188 }
1189
1190
1191 static int decode_rem_intra_luma_pred_mode(thread_context* tctx)
1192 {
1193 logtrace(LogSlice,"# rem_intra_luma_pred_mode (5 bits)\n");
1194 return decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5);
1195 }
1196
1197
1198 static int decode_intra_chroma_pred_mode(thread_context* tctx)
1199 {
1200 logtrace(LogSlice,"# intra_chroma_pred_mode\n");
1201
1202 int prefix = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE]);
1203
1204 int mode;
1205 if (prefix==0) {
1206 mode=4;
1207 }
1208 else {
1209 mode = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2);
1210 }
1211
1212 logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode);
1213
1214 return mode;
1215 }
1216
1217
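// split_transform_flag uses one of three contexts selected by the transform size:
// context = 5 - log2TrafoSize (32x32 -> 0, 16x16 -> 1, 8x8 -> 2).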
1218 static int decode_split_transform_flag(thread_context* tctx,
1219 int log2TrafoSize)
1220 {
1221 logtrace(LogSlice,"# split_transform_flag (log2TrafoSize=%d)\n",log2TrafoSize);
1222
1223 int context = 5-log2TrafoSize;
1224 assert(context >= 0 && context <= 2);
1225
1226 logtrace(LogSlice,"# context: %d\n",context);
1227
1228 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]);
1229 return bit;
1230 }
1231
1232
1233 static int decode_cbf_chroma(thread_context* tctx,
1234 int trafoDepth)
1235 {
1236 logtrace(LogSlice,"# cbf_chroma\n");
1237
1238 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]);
1239
1240 return bit;
1241 }
1242
1243
1244 static int decode_cbf_luma(thread_context* tctx,
1245 int trafoDepth)
1246 {
1247 logtrace(LogSlice,"# cbf_luma\n");
1248
1249 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_LUMA + (trafoDepth==0)]);
1250
1251 logtrace(LogSlice,"> cbf_luma = %d\n",bit);
1252
1253 return bit;
1254 }
1255
1256
1257 static inline int decode_coded_sub_block_flag(thread_context* tctx,
1258 int cIdx,
1259 uint8_t coded_sub_block_neighbors)
1260 {
1261 logtrace(LogSlice,"# coded_sub_block_flag\n");
1262
1263 // tricky computation of csbfCtx
1264 int csbfCtx = ((coded_sub_block_neighbors & 1) | // right neighbor set or
1265 (coded_sub_block_neighbors >> 1)); // bottom neighbor set -> csbfCtx=1
1266
1267 int ctxIdxInc = csbfCtx;
1268 if (cIdx!=0) {
1269 ctxIdxInc += 2;
1270 }
1271
1272 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1273 &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]);
1274
1275 return bit;
1276 }
1277
1278
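// cu_qp_delta_abs binarization: a context-coded truncated-unary prefix with cMax=5
// (the first bin has its own context, bins 1..4 share a second one), followed by an
// EG0 bypass suffix when the prefix reaches 5 (value = suffix + 5).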
1279 static int decode_cu_qp_delta_abs(thread_context* tctx)
1280 {
1281 logtrace(LogSlice,"# cu_qp_delta_abs\n");
1282
1283 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1284 &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]);
1285 if (bit==0) {
1286 return 0;
1287 }
1288
1289 int prefix=1;
1290 for (int i=0;i<4;i++) {
1291 bit = decode_CABAC_bit(&tctx->cabac_decoder,
1292 &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 1]);
1293 if (bit==0) { break; }
1294 else { prefix++; }
1295 }
1296
1297 if (prefix==5) {
1298 int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 0);
1299 return value + 5;
1300 }
1301 else {
1302 return prefix;
1303 }
1304 }
1305
1306
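// Truncated-unary prefix (each bin context-coded) with cMax = 2*log2TrafoSize - 1.
// Luma and chroma use different context offsets and bin-to-context shifts, derived
// below from log2TrafoSize.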
1307 static int decode_last_significant_coeff_prefix(thread_context* tctx,
1308 int log2TrafoSize,
1309 int cIdx,
1310 context_model* model)
1311 {
1312 logtrace(LogSlice,"# last_significant_coeff_prefix log2TrafoSize:%d cIdx:%d\n",log2TrafoSize,cIdx);
1313
1314 int cMax = (log2TrafoSize<<1)-1;
1315
1316 int ctxOffset, ctxShift;
1317 if (cIdx==0) {
1318 ctxOffset = 3*(log2TrafoSize-2) + ((log2TrafoSize-1)>>2);
1319 ctxShift = (log2TrafoSize+1)>>2;
1320 }
1321 else {
1322 ctxOffset = 15;
1323 ctxShift = log2TrafoSize-2;
1324 }
1325
1326 int binIdx;
1327 int value = cMax;
1328 for (binIdx=0;binIdx<cMax;binIdx++)
1329 {
1330 int ctxIdxInc = (binIdx >> ctxShift);
1331
1332 logtrace(LogSlice,"context: %d+%d\n",ctxOffset,ctxIdxInc);
1333
1334 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &model[ctxOffset + ctxIdxInc]);
1335 if (bit==0) {
1336 value=binIdx;
1337 break;
1338 }
1339 }
1340
1341 logtrace(LogSlice,"> last_significant_coeff_prefix: %d\n", value);
1342
1343 return value;
1344 }
1345
1346
1347 static const uint8_t ctxIdxMap[16] = {
1348 0,1,4,5,
1349 2,3,4,5,
1350 6,6,8,8,
1351 7,7,8,99
1352 };
1353
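// Lookup tables for the significant_coeff_flag context increment, indexed by
// [log2TrafoSize-2][cIdx!=0][scanIdx!=0][prevCsbf] and, within each table, by the
// coefficient position xC + (yC << log2TrafoSize). Parameter combinations that map
// to the same context values share memory (see the allocation code below).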
1354 uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */];
1355
1356 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable()
1357 {
1358 int tableSize = 4*4*(2) + 8*8*(2*2*4) + 16*16*(2*4) + 32*32*(2*4);
1359
1360 uint8_t* p = (uint8_t*)malloc(tableSize);
1361 if (p==NULL) {
1362 return false;
1363 }
1364
1365 memset(p,0xFF,tableSize); // just for debugging
1366
1367
1368 // --- Set pointers to memory areas. Note that some parameters share the same memory. ---
1369
1370 // 4x4
1371
1372 for (int cIdx=0;cIdx<2;cIdx++) {
1373 for (int scanIdx=0;scanIdx<2;scanIdx++)
1374 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1375 ctxIdxLookup[0][cIdx][scanIdx][prevCsbf] = p;
1376
1377 p += 4*4;
1378 }
1379
1380 // 8x8
1381
1382 for (int cIdx=0;cIdx<2;cIdx++)
1383 for (int scanIdx=0;scanIdx<2;scanIdx++)
1384 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1385 ctxIdxLookup[1][cIdx][scanIdx][prevCsbf] = p;
1386 p += 8*8;
1387 }
1388
1389 // 16x16
1390
1391 for (int cIdx=0;cIdx<2;cIdx++)
1392 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1393 for (int scanIdx=0;scanIdx<2;scanIdx++) {
1394 ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p;
1395 }
1396
1397 p += 16*16;
1398 }
1399
1400 // 32x32
1401
1402 for (int cIdx=0;cIdx<2;cIdx++)
1403 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1404 for (int scanIdx=0;scanIdx<2;scanIdx++) {
1405 ctxIdxLookup[3][cIdx][scanIdx][prevCsbf] = p;
1406 }
1407
1408 p += 32*32;
1409 }
1410
1411
1412 // --- precompute ctxIdx tables ---
1413
1414 for (int log2w=2; log2w<=5 ; log2w++)
1415 for (int cIdx=0;cIdx<2;cIdx++)
1416 for (int scanIdx=0;scanIdx<2;scanIdx++)
1417 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1418 {
1419 for (int yC=0;yC<(1<<log2w);yC++)
1420 for (int xC=0;xC<(1<<log2w);xC++)
1421 {
1422 int w = 1<<log2w;
1423 int sbWidth = w>>2;
1424
1425 int sigCtx;
1426
1427 // if log2TrafoSize==2
1428 if (sbWidth==1) {
1429 sigCtx = ctxIdxMap[(yC<<2) + xC];
1430 }
1431 else if (xC+yC==0) {
1432 sigCtx = 0;
1433 }
1434 else {
1435 int xS = xC>>2;
1436 int yS = yC>>2;
1437 /*
1438 int prevCsbf = 0;
1439
1440 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1441 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1442 */
1443 int xP = xC & 3;
1444 int yP = yC & 3;
1445
1446 //logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1447 //logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1448
1449 switch (prevCsbf) {
1450 case 0:
1451 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1452 break;
1453 case 1:
1454 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1455 break;
1456 case 2:
1457 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1458 break;
1459 default:
1460 sigCtx = 2;
1461 break;
1462 }
1463
1464 //logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1465
1466 if (cIdx==0) {
1467 if (xS+yS > 0) sigCtx+=3;
1468
1469 //logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1470
1471 // if log2TrafoSize==3
1472 if (sbWidth==2) { // 8x8 block
1473 sigCtx += (scanIdx==0) ? 9 : 15;
1474 } else {
1475 sigCtx += 21;
1476 }
1477
1478 //logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1479 }
1480 else {
1481 // if log2TrafoSize==3
1482 if (sbWidth==2) { // 8x8 block
1483 sigCtx+=9;
1484 }
1485 else {
1486 sigCtx+=12;
1487 }
1488 }
1489
1490 }
1491
1492 int ctxIdxInc;
1493 if (cIdx==0) { ctxIdxInc=sigCtx; }
1494 else { ctxIdxInc=27+sigCtx; }
1495
1496 if (ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] != 0xFF) {
1497 assert(ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] == ctxIdxInc);
1498 }
1499
1500 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] = ctxIdxInc;
1501
1502 //NOTE: when using this option, we have to include all three scanIdx in the table
1503 //ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][s] = ctxIdxInc;
1504 }
1505 }
1506
1507 return true;
1508 }
1509
1510
1511 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable_OLD()
1512 {
1513 int tableSize = 2*2*4*(4*4 + 8*8 + 16*16 + 32*32);
1514 uint8_t* p = (uint8_t*)malloc(tableSize);
1515 if (p==NULL) {
1516 return false;
1517 }
1518
1519 for (int log2w=2; log2w<=5 ; log2w++)
1520 for (int cIdx=0;cIdx<2;cIdx++)
1521 for (int scanIdx=0;scanIdx<2;scanIdx++)
1522 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1523 {
1524 // assign pointer into reserved memory area
1525
1526 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf] = p;
1527 p += (1<<log2w)*(1<<log2w);
1528
1529 const position* ScanOrderSub = get_scan_order(log2w-2, scanIdx);
1530 const position* ScanOrderPos = get_scan_order(2, scanIdx);
1531
1532 //for (int yC=0;yC<(1<<log2w);yC++)
1533 // for (int xC=0;xC<(1<<log2w);xC++)
1534 for (int s=0;s<(1<<log2w)*(1<<log2w);s++)
1535 {
1536 position S = ScanOrderSub[s>>4];
1537 int x0 = S.x<<2;
1538 int y0 = S.y<<2;
1539
1540 int subX = ScanOrderPos[s & 0xF].x;
1541 int subY = ScanOrderPos[s & 0xF].y;
1542 int xC = x0 + subX;
1543 int yC = y0 + subY;
1544
1545
1546 int w = 1<<log2w;
1547 int sbWidth = w>>2;
1548
1549 int sigCtx;
1550
1551 // if log2TrafoSize==2
1552 if (sbWidth==1) {
1553 sigCtx = ctxIdxMap[(yC<<2) + xC];
1554 }
1555 else if (xC+yC==0) {
1556 sigCtx = 0;
1557 }
1558 else {
1559 int xS = xC>>2;
1560 int yS = yC>>2;
1561 /*
1562 int prevCsbf = 0;
1563
1564 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1565 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1566 */
1567 int xP = xC & 3;
1568 int yP = yC & 3;
1569
1570 logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1571 logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1572
1573 //printf("%d | %d %d\n",prevCsbf,xP,yP);
1574
1575 switch (prevCsbf) {
1576 case 0:
1577 //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0;
1578 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1579 break;
1580 case 1:
1581 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1582 break;
1583 case 2:
1584 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1585 break;
1586 default:
1587 sigCtx = 2;
1588 break;
1589 }
1590
1591 logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1592
1593 if (cIdx==0) {
1594 if (xS+yS > 0) sigCtx+=3;
1595
1596 logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1597
1598 // if log2TrafoSize==3
1599 if (sbWidth==2) { // 8x8 block
1600 sigCtx += (scanIdx==0) ? 9 : 15;
1601 } else {
1602 sigCtx += 21;
1603 }
1604
1605 logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1606 }
1607 else {
1608 // if log2TrafoSize==3
1609 if (sbWidth==2) { // 8x8 block
1610 sigCtx+=9;
1611 }
1612 else {
1613 sigCtx+=12;
1614 }
1615 }
1616 }
1617
1618 int ctxIdxInc;
1619 if (cIdx==0) { ctxIdxInc=sigCtx; }
1620 else { ctxIdxInc=27+sigCtx; }
1621
1622
1623 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] = ctxIdxInc;
1624
1625 //NOTE: when using this option, we have to include all three scanIdx in the table
1626 //ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][s] = ctxIdxInc;
1627 }
1628 }
1629
1630 return true;
1631 }
1632
1633 void free_significant_coeff_ctxIdx_lookupTable()
1634 {
1635 free(ctxIdxLookup[0][0][0][0]);
1636 ctxIdxLookup[0][0][0][0]=NULL;
1637 }
1638
1639
1640
1641
1642 #if 0
1643 static int decode_significant_coeff_flag(thread_context* tctx,
1644 int xC,int yC,
1645 const uint8_t* coded_sub_block_flag,
1646 int sbWidth,
1647 int cIdx,
1648 int scanIdx)
1649 {
1650 logtrace(LogSlice,"# significant_coeff_flag (xC:%d yC:%d sbWidth:%d cIdx:%d scanIdx:%d)\n",
1651 xC,yC,sbWidth,cIdx,scanIdx);
1652
1653 int sigCtx;
1654
1655 // if log2TrafoSize==2
1656 if (sbWidth==1) {
1657 sigCtx = ctxIdxMap[(yC<<2) + xC];
1658 }
1659 else if (xC+yC==0) {
1660 sigCtx = 0;
1661 }
1662 else {
1663 int xS = xC>>2;
1664 int yS = yC>>2;
1665 int prevCsbf = 0;
1666 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1667 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1668
1669 int xP = xC & 3;
1670 int yP = yC & 3;
1671
1672 logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1673 logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1674
1675 //printf("%d | %d %d\n",prevCsbf,xP,yP);
1676
1677 switch (prevCsbf) {
1678 case 0:
1679 //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0;
1680 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1681 break;
1682 case 1:
1683 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1684 break;
1685 case 2:
1686 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1687 break;
1688 default:
1689 sigCtx = 2;
1690 break;
1691 }
1692
1693 logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1694
1695 if (cIdx==0) {
1696 if (xS+yS > 0) sigCtx+=3;
1697
1698 logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1699
1700 // if log2TrafoSize==3
1701 if (sbWidth==2) {
1702 sigCtx += (scanIdx==0) ? 9 : 15;
1703 } else {
1704 sigCtx += 21;
1705 }
1706
1707 logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1708 }
1709 else {
1710 // if log2TrafoSize==3
1711 if (sbWidth==2) {
1712 sigCtx+=9;
1713 }
1714 else {
1715 sigCtx+=12;
1716 }
1717 }
1718 }
1719
1720 int ctxIdxInc;
1721 if (cIdx==0) { ctxIdxInc=sigCtx; }
1722 else { ctxIdxInc=27+sigCtx; }
1723
1724 int context = tctx->shdr->initType*42 + ctxIdxInc;
1725 logtrace(LogSlice,"context: %d\n",context);
1726
1727 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1728 &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + context]);
1729 return bit;
1730 }
1731 #endif
1732
1733
1734
1735 static inline int decode_significant_coeff_flag_lookup(thread_context* tctx,
1736 uint8_t ctxIdxInc)
1737 {
1738 logtrace(LogSlice,"# significant_coeff_flag\n");
1739 logtrace(LogSlice,"context: %d\n",ctxIdxInc);
1740
1741 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1742 &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]);
1743 return bit;
1744 }
1745
1746
1747
1748
1749
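// coeff_abs_level_greater1_flag: ctxIdxInc = ctxSet*4 + min(greater1Ctx,3), plus 16
// for chroma. Note that the locally derived ctxSet is overridden below with the
// value passed in from residual_coding (parameter c1, "HM algorithm"), so the
// spec-style derivation above that assignment is effectively bypassed.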
1750 static inline int decode_coeff_abs_level_greater1(thread_context* tctx,
1751 int cIdx, int i,
1752 bool firstCoeffInSubblock,
1753 bool firstSubblock,
1754 int lastSubblock_greater1Ctx,
1755 int* lastInvocation_greater1Ctx,
1756 int* lastInvocation_coeff_abs_level_greater1_flag,
1757 int* lastInvocation_ctxSet, int c1)
1758 {
1759 logtrace(LogSlice,"# coeff_abs_level_greater1\n");
1760
1761 logtrace(LogSlice," cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx,i,firstCoeffInSubblock,firstSubblock,lastSubblock_greater1Ctx,
1762 *lastInvocation_greater1Ctx,
1763 *lastInvocation_coeff_abs_level_greater1_flag,
1764 *lastInvocation_ctxSet);
1765
1766 int lastGreater1Ctx;
1767 int greater1Ctx;
1768 int ctxSet;
1769
1770 logtrace(LogSlice,"c1: %d\n",c1);
1771
1772 if (firstCoeffInSubblock) {
1773 // block with real DC -> ctx 0
1774 if (i==0 || cIdx>0) { ctxSet=0; }
1775 else { ctxSet=2; }
1776
1777 if (firstSubblock) { lastGreater1Ctx=1; }
1778 else { lastGreater1Ctx = lastSubblock_greater1Ctx; }
1779
1780 if (lastGreater1Ctx==0) { ctxSet++; }
1781
1782 logtrace(LogSlice,"ctxSet: %d\n",ctxSet);
1783
1784 greater1Ctx=1;
1785 }
1786 else { // !firstCoeffInSubblock
1787 ctxSet = *lastInvocation_ctxSet;
1788 logtrace(LogSlice,"ctxSet (old): %d\n",ctxSet);
1789
1790 greater1Ctx = *lastInvocation_greater1Ctx;
1791 if (greater1Ctx>0) {
1792 int lastGreater1Flag=*lastInvocation_coeff_abs_level_greater1_flag;
1793 if (lastGreater1Flag==1) greater1Ctx=0;
1794 else { /*if (greater1Ctx>0)*/ greater1Ctx++; }
1795 }
1796 }
1797
1798 ctxSet = c1; // use HM algo
1799
1800 int ctxIdxInc = (ctxSet*4) + (greater1Ctx>=3 ? 3 : greater1Ctx);
1801
1802 if (cIdx>0) { ctxIdxInc+=16; }
1803
1804 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1805 &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc]);
1806
1807 *lastInvocation_greater1Ctx = greater1Ctx;
1808 *lastInvocation_coeff_abs_level_greater1_flag = bit;
1809 *lastInvocation_ctxSet = ctxSet;
1810
1811 return bit;
1812 }
1813
1814
1815 static int decode_coeff_abs_level_greater2(thread_context* tctx,
1816 int cIdx, // int i,int n,
1817 int ctxSet)
1818 {
1819 logtrace(LogSlice,"# coeff_abs_level_greater2\n");
1820
1821 int ctxIdxInc = ctxSet;
1822
1823 if (cIdx>0) ctxIdxInc+=4;
1824
1825 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1826 &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]);
1827
1828 return bit;
1829 }
1830
1831
1832 /*
1833 static int decode_coeff_abs_level_remaining(thread_context* tctx,
1834 int cRiceParam, int cTRMax)
1835 {
1836 logtrace(LogSlice,"# decode_coeff_abs_level_remaining\n");
1837
1838 int prefix = decode_CABAC_TR_bypass(&tctx->cabac_decoder,cRiceParam,cTRMax);
1839
1840 if (prefix==cTRMax) {
1841 int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder,cRiceParam+1);
1842 value += cTRMax;
1843 return value;
1844 }
1845 else {
1846 return prefix;
1847 }
1848 }
1849 */
1850
1851
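// HM-style binarization of coeff_abs_level_remaining: a bypass-coded unary prefix
// (number of leading 1-bits), then for prefix < 3 a fixed-length Rice suffix of
// 'param' bits, otherwise an exponential-Golomb-like suffix of (prefix-3+param) bits.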
1852 static int decode_coeff_abs_level_remaining_HM(thread_context* tctx,
1853 int param)
1854 {
1855 logtrace(LogSlice,"# decode_coeff_abs_level_remaining_HM\n");
1856
1857 int prefix=0;
1858 int codeword=0;
1859 do {
1860 prefix++;
1861 codeword = decode_CABAC_bypass(&tctx->cabac_decoder);
1862 }
1863 while (codeword);
1864 codeword = 1-codeword;
1865 prefix -= codeword;
1866 codeword=0;
1867
1868 int value;
1869
1870 if (prefix < /* COEF_REMAIN_BIN_REDUCTION */ 3) {
1871 codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, param);
1872 value = (prefix<<param) + codeword;
1873 }
1874 else {
1875 codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, prefix-3+param);
1876 value = (((1<<(prefix-3))+3-1)<<param)+codeword;
1877 }
1878
1879 return value;
1880 }
1881
1882
1883 static int decode_merge_flag(thread_context* tctx)
1884 {
1885 logtrace(LogSlice,"# merge_flag\n");
1886
1887 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1888 &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]);
1889
1890 return bit;
1891 }
1892
1893
1894 static int decode_merge_idx(thread_context* tctx)
1895 {
1896 logtrace(LogSlice,"# merge_idx\n");
1897
1898 // TU coding, first bin is CABAC, remaining are bypass.
1899 // cMax = MaxNumMergeCand-1
1900
1901 int idx = decode_CABAC_bit(&tctx->cabac_decoder,
1902 &tctx->ctx_model[CONTEXT_MODEL_MERGE_IDX]);
1903
1904 if (idx==0) {
1905 // nothing
1906 }
1907 else {
1908 idx=1;
1909
1910 while (idx<tctx->shdr->MaxNumMergeCand-1) {
1911 if (decode_CABAC_bypass(&tctx->cabac_decoder)) {
1912 idx++;
1913 }
1914 else {
1915 break;
1916 }
1917 }
1918 }
1919
1920 logtrace(LogSlice,"> merge_idx = %d\n",idx);
1921
1922 return idx;
1923 }
1924
1925
1926 static int decode_pred_mode_flag(thread_context* tctx)
1927 {
1928 logtrace(LogSlice,"# pred_mode_flag\n");
1929
1930 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1931 &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]);
1932
1933 return bit;
1934 }
1935
1936 static int decode_mvp_lx_flag(thread_context* tctx)
1937 {
1938 logtrace(LogSlice,"# mvp_lx_flag\n");
1939
1940 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1941 &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]);
1942
1943 return bit;
1944 }
1945
1946 static int decode_rqt_root_cbf(thread_context* tctx)
1947 {
1948 logtrace(LogSlice,"# rqt_root_cbf\n");
1949
1950 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1951 &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]);
1952
1953 return bit;
1954 }
1955
1956 static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive)
1957 {
1958 logtrace(LogSlice,"# ref_idx_lX\n");
1959
1960 int cMax = numRefIdxLXActive-1;
1961
1962 if (cMax==0) {
1963 logtrace(LogSlice,"> ref_idx = 0 (cMax==0)\n");
1964 return 0;
1965   } // only a single reference frame -> ref_idx is always 0, nothing to decode
1966
1967 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1968 &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 0]);
1969
1970 int idx=0;
1971
1972 while (bit) {
1973 idx++;
1974 if (idx==cMax) { break; }
1975
1976 if (idx==1) {
1977 bit = decode_CABAC_bit(&tctx->cabac_decoder,
1978 &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 1]);
1979 }
1980 else {
1981 bit = decode_CABAC_bypass(&tctx->cabac_decoder);
1982 }
1983 }
1984
1985 logtrace(LogSlice,"> ref_idx = %d\n",idx);
1986
1987 return idx;
1988 }
1989
1990
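// inter_pred_idc: for 8x4/4x8 PUs (nPbW+nPbH==12) bi-prediction is not allowed, so
// only a single L0/L1 bin is decoded. Otherwise the first bin (context index =
// coding-tree depth) selects bi-prediction; if it is 0, a second bin selects L0 or L1.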
1991 static enum InterPredIdc decode_inter_pred_idc(thread_context* tctx,
1992 int x0, int y0,
1993 int nPbW, int nPbH,
1994 int ctDepth)
1995 {
1996 logtrace(LogSlice,"# inter_pred_idc\n");
1997
1998 int value;
1999
2000 context_model* model = &tctx->ctx_model[CONTEXT_MODEL_INTER_PRED_IDC];
2001
2002 if (nPbW+nPbH==12) {
2003 value = decode_CABAC_bit(&tctx->cabac_decoder,
2004 &model[4]);
2005 }
2006 else {
2007 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder,
2008 &model[ctDepth]);
2009 if (bit0==0) {
2010 value = decode_CABAC_bit(&tctx->cabac_decoder,
2011 &model[4]);
2012 }
2013 else {
2014 value = 2;
2015 }
2016 }
2017
2018 logtrace(LogSlice,"> inter_pred_idc = %d (%s)\n",value,
2019 value==0 ? "L0" : (value==1 ? "L1" : "BI"));
2020
2021 return (enum InterPredIdc) value;
2022 }
2023
2024
2025
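// Initialize all CABAC context models for the current slice. initType is derived
// from the slice type (0 for I slices, 1 or 2 for P/B slices depending on
// cabac_init_flag); contexts that only occur in inter slices are skipped for
// initType 0 or indexed with [initType-1].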
2026 void initialize_CABAC(decoder_context* ctx, thread_context* tctx)
2027 {
2028 const int initType = tctx->shdr->initType;
2029 assert(initType >= 0 && initType <= 2);
2030
2031 init_context(ctx,tctx, CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3);
2032 if (initType > 0) {
2033 init_context(ctx,tctx, CONTEXT_MODEL_CU_SKIP_FLAG, initValue_cu_skip_flag[initType-1], 3);
2034 }
2035 init_context(ctx,tctx, CONTEXT_MODEL_PART_MODE, &initValue_part_mode[(initType!=2 ? initType : 5)], 4);
2036 init_context(ctx,tctx, CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1);
2037 init_context(ctx,tctx, CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, &initValue_intra_chroma_pred_mode[initType], 1);
2038 init_context(ctx,tctx, CONTEXT_MODEL_CBF_LUMA, &initValue_cbf_luma[initType == 0 ? 0 : 2], 2);
2039 init_context(ctx,tctx, CONTEXT_MODEL_CBF_CHROMA, &initValue_cbf_chroma[initType * 4], 4);
2040 init_context(ctx,tctx, CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG, &initValue_split_transform_flag[initType * 3], 3);
2041 init_context(ctx,tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
2042 init_context(ctx,tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
2043 init_context(ctx,tctx, CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG, &initValue_coded_sub_block_flag[initType * 4], 4);
2044 init_context(ctx,tctx, CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG, initValue_significant_coeff_flag[initType], 42);
2045 init_context(ctx,tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG, &initValue_coeff_abs_level_greater1_flag[initType * 24], 24);
2046 init_context(ctx,tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG, &initValue_coeff_abs_level_greater2_flag[initType * 6], 6);
2047 init_context(ctx,tctx, CONTEXT_MODEL_SAO_MERGE_FLAG, &initValue_sao_merge_leftUp_flag[initType], 1);
2048 init_context(ctx,tctx, CONTEXT_MODEL_SAO_TYPE_IDX, &initValue_sao_type_idx_lumaChroma_flag[initType], 1);
2049 init_context(ctx,tctx, CONTEXT_MODEL_CU_QP_DELTA_ABS, initValue_cu_qp_delta_abs, 2);
2050 init_context(ctx,tctx, CONTEXT_MODEL_TRANSFORM_SKIP_FLAG, initValue_transform_skip_flag, 2);
2051 init_context(ctx,tctx, CONTEXT_MODEL_MERGE_FLAG, &initValue_merge_flag[initType-1],1);
2052 init_context(ctx,tctx, CONTEXT_MODEL_MERGE_IDX, &initValue_merge_idx[initType-1], 1);
2053 init_context(ctx,tctx, CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1);
2054 init_context(ctx,tctx, CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 0 : 2], 2);
2055 init_context(ctx,tctx, CONTEXT_MODEL_MVP_LX_FLAG, initValue_mvp_lx_flag, 1);
2056 init_context(ctx,tctx, CONTEXT_MODEL_RQT_ROOT_CBF, initValue_rqt_root_cbf, 1);
2057 init_context(ctx,tctx, CONTEXT_MODEL_REF_IDX_LX, initValue_ref_idx_lX, 2);
2058 init_context(ctx,tctx, CONTEXT_MODEL_INTER_PRED_IDC, initValue_inter_pred_idc, 5);
2059 init_context(ctx,tctx, CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1);
2060 }
2061
2062
2063 /* Take CtbAddrInTS and compute
2064 -> CtbAddrInRS, CtbX, CtbY
2065 */
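// Returns true when CtbAddrInTS is past the last CTB of the picture (== PicSizeInCtbsY).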
2066 bool setCtbAddrFromTS(thread_context* tctx)
2067 {
2068 const seq_parameter_set* sps = tctx->decctx->current_sps;
2069
2070 if (tctx->CtbAddrInTS < tctx->decctx->current_sps->PicSizeInCtbsY) {
2071 tctx->CtbAddrInRS = tctx->decctx->current_pps->CtbAddrTStoRS[tctx->CtbAddrInTS];
2072
2073 tctx->CtbX = tctx->CtbAddrInRS % sps->PicWidthInCtbsY;
2074 tctx->CtbY = tctx->CtbAddrInRS / sps->PicWidthInCtbsY;
2075 return false;
2076 }
2077 else {
2078 tctx->CtbAddrInRS = tctx->decctx->current_sps->PicSizeInCtbsY;
2079
2080 tctx->CtbX = tctx->CtbAddrInRS % sps->PicWidthInCtbsY;
2081 tctx->CtbY = tctx->CtbAddrInRS / sps->PicWidthInCtbsY;
2082 return true;
2083 }
2084 }
2085
2086 // returns true when we reached the end of the image (ctbAddr==picSizeInCtbsY)
2087 bool advanceCtbAddr(thread_context* tctx)
2088 {
2089 tctx->CtbAddrInTS++;
2090
2091 return setCtbAddrFromTS(tctx);
2092 }
2093
2094
2095 void read_sao(decoder_context* ctx, thread_context* tctx, int xCtb,int yCtb,
2096 int CtbAddrInSliceSeg)
2097 {
2098 slice_segment_header* shdr = tctx->shdr;
2099 const seq_parameter_set* sps = ctx->current_sps;
2100 const pic_parameter_set* pps = ctx->current_pps;
2101
2102 logtrace(LogSlice,"# read_sao(%d,%d)\n",xCtb,yCtb);
2103
2104 sao_info saoinfo;
2105 memset(&saoinfo,0,sizeof(sao_info));
2106   logtrace(LogSlice,"sizeof saoinfo: %d\n",(int)sizeof(sao_info));
2107
2108
2109 char sao_merge_left_flag = 0;
2110 char sao_merge_up_flag = 0;
2111
2112 if (xCtb>0) {
2113 //char leftCtbInSliceSeg = (CtbAddrInSliceSeg>0);
2114 char leftCtbInSliceSeg = (tctx->CtbAddrInRS > shdr->SliceAddrRS);
2115 char leftCtbInTile = (pps->TileIdRS[xCtb + yCtb * sps->PicWidthInCtbsY] ==
2116 pps->TileIdRS[xCtb-1 + yCtb * sps->PicWidthInCtbsY]);
2117
2118 if (leftCtbInSliceSeg && leftCtbInTile) {
2119 sao_merge_left_flag = decode_sao_merge_flag(tctx);
2120 logtrace(LogSlice,"sao_merge_left_flag: %d\n",sao_merge_left_flag);
2121 }
2122 }
2123
2124 if (yCtb>0 && sao_merge_left_flag==0) {
2125 logtrace(LogSlice,"CtbAddrInRS:%d PicWidthInCtbsY:%d slice_segment_address:%d\n",
2126 tctx->CtbAddrInRS,
2127 ctx->current_sps->PicWidthInCtbsY,
2128 shdr->slice_segment_address);
2129 char upCtbInSliceSeg = (tctx->CtbAddrInRS - ctx->current_sps->PicWidthInCtbsY) >= shdr->SliceAddrRS;
2130 char upCtbInTile = (pps->TileIdRS[xCtb + yCtb * sps->PicWidthInCtbsY] ==
2131 pps->TileIdRS[xCtb + (yCtb-1) * sps->PicWidthInCtbsY]);
2132
2133 if (upCtbInSliceSeg && upCtbInTile) {
2134 sao_merge_up_flag = decode_sao_merge_flag(tctx);
2135 logtrace(LogSlice,"sao_merge_up_flag: %d\n",sao_merge_up_flag);
2136 }
2137 }
2138
2139 if (!sao_merge_up_flag && !sao_merge_left_flag) {
2140 for (int cIdx=0; cIdx<3; cIdx++) {
2141 if ((shdr->slice_sao_luma_flag && cIdx==0) ||
2142 (shdr->slice_sao_chroma_flag && cIdx>0)) {
2143
2144 uint8_t SaoTypeIdx = 0;
2145
2146 if (cIdx==0) {
2147 char sao_type_idx_luma = decode_sao_type_idx(tctx);
2148 logtrace(LogSlice,"sao_type_idx_luma: %d\n", sao_type_idx_luma);
2149 saoinfo.SaoTypeIdx = SaoTypeIdx = sao_type_idx_luma;
2150 }
2151 else if (cIdx==1) {
2152 char sao_type_idx_chroma = decode_sao_type_idx(tctx);
2153 logtrace(LogSlice,"sao_type_idx_chroma: %d\n", sao_type_idx_chroma);
2154 SaoTypeIdx = sao_type_idx_chroma;
2155 saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*1);
2156 saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*2); // set for both chroma components
2157 }
2158 else {
2159         // cIdx==2: reuse the SaoTypeIdx already decoded for the chroma components
2160
2161 SaoTypeIdx = (saoinfo.SaoTypeIdx >> (2*cIdx)) & 0x3;
2162 }
2163
2164 if (SaoTypeIdx != 0) {
2165 for (int i=0;i<4;i++) {
2166 saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx);
2167 logtrace(LogSlice,"saoOffsetVal[%d][%d] = %d\n",cIdx,i, saoinfo.saoOffsetVal[cIdx][i]);
2168 }
2169
2170 int sign[4];
2171 if (SaoTypeIdx==1) {
2172 for (int i=0;i<4;i++) {
2173 if (saoinfo.saoOffsetVal[cIdx][i] != 0) {
2174 sign[i] = decode_sao_offset_sign(tctx) ? -1 : 1;
2175 }
2176 else {
2177 sign[i] = 0; // not really required, but compiler warns about uninitialized values
2178 }
2179 }
2180
2181 saoinfo.sao_band_position[cIdx] = decode_sao_band_position(tctx);
2182 }
2183 else {
2184 uint8_t SaoEoClass = 0;
2185
2186 sign[0] = sign[1] = 1;
2187 sign[2] = sign[3] = -1;
2188
2189 if (cIdx==0) {
2190 saoinfo.SaoEoClass = SaoEoClass = decode_sao_class(tctx);
2191 }
2192 else if (cIdx==1) {
2193 SaoEoClass = decode_sao_class(tctx);
2194 saoinfo.SaoEoClass |= SaoEoClass << (2*1);
2195 saoinfo.SaoEoClass |= SaoEoClass << (2*2);
2196 }
2197
2198 logtrace(LogSlice,"SaoEoClass[%d] = %d\n",cIdx,SaoEoClass);
2199 }
2200
2201 int bitDepth = (cIdx==0 ?
2202 ctx->current_sps->BitDepth_Y :
2203 ctx->current_sps->BitDepth_C);
2204 int shift = bitDepth-libde265_min(bitDepth,10);
2205
2206 for (int i=0;i<4;i++) {
2207 saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << shift);
2208 }
2209 }
2210 }
2211 }
2212
2213 set_sao_info(ctx->img,ctx->current_sps, xCtb,yCtb, &saoinfo);
2214 }
2215
2216
2217 if (sao_merge_left_flag) {
2218 set_sao_info(ctx->img,ctx->current_sps, xCtb,yCtb, get_sao_info(ctx->img,ctx->current_sps,xCtb-1,yCtb));
2219 }
2220
2221 if (sao_merge_up_flag) {
2222 set_sao_info(ctx->img,ctx->current_sps, xCtb,yCtb, get_sao_info(ctx->img,ctx->current_sps,xCtb,yCtb-1));
2223 }
2224 }
2225
2226
2227 void read_coding_tree_unit(decoder_context* ctx, thread_context* tctx)
2228 {
2229 slice_segment_header* shdr = tctx->shdr;
2230 seq_parameter_set* sps = ctx->current_sps;
2231
2232 int xCtb = (tctx->CtbAddrInRS % sps->PicWidthInCtbsY);
2233 int yCtb = (tctx->CtbAddrInRS / sps->PicWidthInCtbsY);
2234 int xCtbPixels = xCtb << sps->Log2CtbSizeY;
2235 int yCtbPixels = yCtb << sps->Log2CtbSizeY;
2236
2237 logtrace(LogSlice,"----- decode CTB %d;%d (%d;%d) POC=%d\n",xCtbPixels,yCtbPixels, xCtb,yCtb,
2238 ctx->img->PicOrderCntVal);
2239
2240 set_SliceAddrRS(ctx->img, sps, xCtb, yCtb,
2241 tctx->shdr->SliceAddrRS);
2242
2243 set_SliceHeaderIndex(ctx->img,sps, xCtbPixels,yCtbPixels, shdr->slice_index);
2244 ctx->slice[ shdr->slice_index ].inUse=true; // mark that we are using this header
2245
2246 int CtbAddrInSliceSeg = tctx->CtbAddrInRS - shdr->slice_segment_address;
2247
2248 if (shdr->slice_sao_luma_flag || shdr->slice_sao_chroma_flag)
2249 {
2250 read_sao(ctx,tctx, xCtb,yCtb, CtbAddrInSliceSeg);
2251 }
2252
2253 read_coding_quadtree(ctx,tctx, xCtbPixels, yCtbPixels, sps->Log2CtbSizeY, 0);
2254 }
2255
2256
2257 int luma_pos_to_ctbAddrRS(decoder_context* ctx, int x,int y)
2258 {
2259 int ctbX = x >> (ctx->current_sps->Log2CtbSizeY);
2260 int ctbY = y >> (ctx->current_sps->Log2CtbSizeY);
2261
2262 return ctbY * ctx->current_sps->PicWidthInCtbsY + ctbX;
2263 }
2264
2265
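// Availability check (cf. 6.4.1): the neighboring position (xN,yN) is available only
// if it lies inside the picture and in the same slice and tile as (xC,yC).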
2266 int check_CTB_available(decoder_context* ctx,
2267 slice_segment_header* shdr,
2268 int xC,int yC, int xN,int yN)
2269 {
2270 // check whether neighbor is outside of frame
2271
2272 if (xN < 0 || yN < 0) { return 0; }
2273 if (xN >= ctx->current_sps->pic_width_in_luma_samples) { return 0; }
2274 if (yN >= ctx->current_sps->pic_height_in_luma_samples) { return 0; }
2275
2276
2277 int current_ctbAddrRS = luma_pos_to_ctbAddrRS(ctx, xC,yC);
2278 int neighbor_ctbAddrRS = luma_pos_to_ctbAddrRS(ctx, xN,yN);
2279
2280 // TODO: check if this is correct (6.4.1)
2281
2282 #if 0
2283 int neighbor_ctbAddrTS = ctx->current_pps->CtbAddrRStoTS[ neighbor_ctbAddrRS ];
2284
2285
2286 // check whether neighbor is in the same slice
2287
2288 int first_ctb_in_slice_TS = ctx->current_pps->CtbAddrRStoTS[ shdr->slice_segment_address ];
2289
2290 if (neighbor_ctbAddrTS < first_ctb_in_slice_TS) {
2291 return 0;
2292 }
2293 #else
2294 if (get_SliceAddrRS_atCtbRS(ctx->img,ctx->current_sps, current_ctbAddrRS) !=
2295 get_SliceAddrRS_atCtbRS(ctx->img,ctx->current_sps, neighbor_ctbAddrRS)) {
2296 return 0;
2297 }
2298 #endif
2299
2300 // check if both CTBs are in the same tile.
2301
2302 if (ctx->current_pps->TileIdRS[current_ctbAddrRS] !=
2303 ctx->current_pps->TileIdRS[neighbor_ctbAddrRS]) {
2304 return 0;
2305 }
2306
2307 return 1;
2308 }
2309
2310
2311 int residual_coding(decoder_context* ctx,
2312 thread_context* tctx,
2313 int x0, int y0, // position of TU in frame
2314 int xL, int yL, // position of TU in local CU
2315 int log2TrafoSize,
2316 int cIdx)
2317 {
2318 logtrace(LogSlice,"- residual_coding x0:%d y0:%d log2TrafoSize:%d cIdx:%d\n",x0,y0,log2TrafoSize,cIdx);
2319
2320 //slice_segment_header* shdr = tctx->shdr;
2321
2322 const seq_parameter_set* sps = ctx->current_sps;
2323
2324
2325 if (cIdx==0) {
2326 set_nonzero_coefficient(ctx->img,sps,x0,y0,log2TrafoSize);
2327 }
2328
2329
2330 //tctx->cu_transquant_bypass_flag=0; // TODO
2331
2332 if (ctx->current_pps->transform_skip_enabled_flag &&
2333 !tctx->cu_transquant_bypass_flag &&
2334 (log2TrafoSize==2))
2335 {
2336 tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx,cIdx);
2337 }
2338 else
2339 {
2340 tctx->transform_skip_flag[cIdx] = 0;
2341 }
2342
2343
2344 // --- decode position of last coded coefficient ---
2345
2346 int last_significant_coeff_x_prefix =
2347 decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx,
2348 &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX]);
2349
2350 int last_significant_coeff_y_prefix =
2351 decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx,
2352 &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX]);
2353
2354
2355 // TODO: we can combine both FL-bypass calls into one, but the gain may be limited...
2356
2357 int LastSignificantCoeffX;
2358 if (last_significant_coeff_x_prefix > 3) {
2359 int nBits = (last_significant_coeff_x_prefix>>1)-1;
2360 int last_significant_coeff_x_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits);
2361
2362 LastSignificantCoeffX = (1<<nBits) *
2363 (2+(last_significant_coeff_x_prefix & 1)) + last_significant_coeff_x_suffix;
2364 }
2365 else {
2366 LastSignificantCoeffX = last_significant_coeff_x_prefix;
2367 }
2368
2369 int LastSignificantCoeffY;
2370 if (last_significant_coeff_y_prefix > 3) {
2371 int nBits = (last_significant_coeff_y_prefix>>1)-1;
2372 int last_significant_coeff_y_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits);
2373
2374 LastSignificantCoeffY = (1<<nBits) *
2375 (2+(last_significant_coeff_y_prefix & 1)) + last_significant_coeff_y_suffix;
2376 }
2377 else {
2378 LastSignificantCoeffY = last_significant_coeff_y_prefix;
2379 }
2380
2381
2382
2383 // --- determine scanIdx ---
2384
2385 int scanIdx;
2386
2387 enum PredMode PredMode = get_pred_mode(ctx->img,sps,x0,y0);
2388
2389
2390 if (PredMode == MODE_INTRA) {
2391 if (cIdx==0) {
2392 if (log2TrafoSize==2 || log2TrafoSize==3) {
2393 int PUidx = (x0>>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize) * sps->PicWidthInMinPUs;
2394
2395 enum IntraPredMode predMode = (enum IntraPredMode) ctx->img->intraPredMode[PUidx];
2396 logtrace(LogSlice,"IntraPredMode[%d,%d] = %d\n",x0,y0,predMode);
2397
2398 if (predMode >= 6 && predMode <= 14) scanIdx=2;
2399 else if (predMode >= 22 && predMode <= 30) scanIdx=1;
2400 else scanIdx=0;
2401 }
2402 else { scanIdx=0; }
2403 }
2404 else {
2405 if (log2TrafoSize==1 || log2TrafoSize==2) {
2406 enum IntraPredMode predMode = tctx->IntraPredModeC;
2407
2408 if (predMode >= 6 && predMode <= 14) scanIdx=2;
2409 else if (predMode >= 22 && predMode <= 30) scanIdx=1;
2410 else scanIdx=0;
2411 }
2412 else { scanIdx=0; }
2413 }
2414
2415 logtrace(LogSlice,"pred: %d -> scan: %d\n",PredMode,scanIdx);
2416 }
2417 else {
2418 scanIdx=0;
2419 }
2420
2421
2422 // HM 9 only ?
2423 if (scanIdx==2) {
2424 int t = LastSignificantCoeffX;
2425 LastSignificantCoeffX = LastSignificantCoeffY;
2426 LastSignificantCoeffY = t;
2427 }
2428
2429 logtrace(LogSlice,"LastSignificantCoeff: x=%d;y=%d\n",LastSignificantCoeffX,LastSignificantCoeffY);
2430
2431 const position* ScanOrderSub = get_scan_order(log2TrafoSize-2, scanIdx);
2432 const position* ScanOrderPos = get_scan_order(2, scanIdx);
2433
2434 logtrace(LogSlice,"ScanOrderPos: ");
2435 for (int n=0;n<4*4;n++)
2436 logtrace(LogSlice,"*%d,%d ", ScanOrderPos[n].x, ScanOrderPos[n].y);
2437 logtrace(LogSlice,"*\n");
2438
2439
2440 // --- find last sub block and last scan pos ---
2441
2442 int xC,yC;
2443
2444 scan_position lastScanP = get_scan_position(LastSignificantCoeffX, LastSignificantCoeffY,
2445 scanIdx, log2TrafoSize);
2446
2447 int lastScanPos = lastScanP.scanPos;
2448 int lastSubBlock = lastScanP.subBlock;
2449
2450
2451 int sbWidth = 1<<(log2TrafoSize-2);
2452
2453 uint8_t coded_sub_block_neighbors[32/4*32/4];
2454 memset(coded_sub_block_neighbors,0,sbWidth*sbWidth);
2455
2456 int c1 = 1;
2457 bool firstSubblock = true; // for coeff_abs_level_greater1_flag context model
2458 int lastSubblock_greater1Ctx=false; /* for coeff_abs_level_greater1_flag context model
2459 (initialization not strictly needed)
2460 */
2461
2462 #ifdef DE265_LOG_TRACE
2463 int16_t TransCoeffLevel[32 * 32];
2464   memset(TransCoeffLevel,0, sizeof(int16_t)*32*32);
2465 #endif
2466
2467 int CoeffStride = 1<<log2TrafoSize;
2468
2469 int lastInvocation_greater1Ctx=0;
2470 int lastInvocation_coeff_abs_level_greater1_flag=0;
2471 int lastInvocation_ctxSet=0;
2472
2473
2474
2475 // ----- decode coefficients -----
2476
2477 tctx->nCoeff[cIdx] = 0;
2478
2479
2480 // i - subblock index
2481 // n - coefficient index in subblock
2482
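  // Sub-blocks are processed in reverse scan order, starting at the sub-block that
  // contains the last significant coefficient. Within each coded sub-block the
  // significance flags are decoded first into a coefficient list, followed by
  // separate passes for the greater-1/greater-2 flags, the signs and the remaining
  // absolute levels.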
2483 for (int i=lastSubBlock;i>=0;i--) {
2484 position S = ScanOrderSub[i];
2485 int inferSbDcSigCoeffFlag=0;
2486
2487 logtrace(LogSlice,"sub block scan idx: %d\n",i);
2488
2489
2490 // --- check whether this sub-block is coded ---
2491
2492 int sub_block_is_coded = 0;
2493
2494 if ((i<lastSubBlock) && (i>0)) {
2495 sub_block_is_coded = decode_coded_sub_block_flag(tctx, cIdx,
2496 coded_sub_block_neighbors[S.x+S.y*sbWidth]);
2497 inferSbDcSigCoeffFlag=1;
2498 }
2499 else if (i==0 || i==lastSubBlock) {
2500 // first (DC) and last sub-block are always coded
2501 // - the first will most probably contain coefficients
2502 // - the last obviously contains the last coded coefficient
2503
2504 sub_block_is_coded = 1;
2505 }
2506
2507 if (sub_block_is_coded) {
2508 if (S.x > 0) coded_sub_block_neighbors[S.x-1 + S.y *sbWidth] |= 1;
2509 if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y-1)*sbWidth] |= 2;
2510 }
2511
2512
2513 // ----- find significant coefficients in this sub-block -----
2514
2515 int16_t coeff_value[16];
2516 int8_t coeff_scan_pos[16];
2517 int8_t coeff_sign[16];
2518 int8_t coeff_has_max_base_level[16];
2519 int nCoefficients=0;
2520
2521
2522 if (sub_block_is_coded) {
2523 int x0 = S.x<<2;
2524 int y0 = S.y<<2;
2525
2526 int log2w = log2TrafoSize-2;
2527 int prevCsbf = coded_sub_block_neighbors[S.x+S.y*sbWidth];
2528 uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf];
2529
2530
2531 // set the last coded coefficient in the last subblock
2532
2533 int last_coeff = (i==lastSubBlock) ? lastScanPos-1 : 15;
2534
2535 if (i==lastSubBlock) {
2536 coeff_value[nCoefficients] = 1;
2537 coeff_has_max_base_level[nCoefficients] = 1;
2538 coeff_scan_pos[nCoefficients] = lastScanPos;
2539 nCoefficients++;
2540 }
2541
2542
2543 // --- decode all coefficients' significant_coeff flags except for the DC coefficient ---
2544
2545 for (int n= last_coeff ; n>0 ; n--) {
2546 int subX = ScanOrderPos[n].x;
2547 int subY = ScanOrderPos[n].y;
2548 xC = x0 + subX;
2549 yC = y0 + subY;
2550
2551
2552 // for all AC coefficients in sub-block, a significant_coeff flag is coded
2553
2554 int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
2555 ctxIdxMap[xC+(yC<<log2TrafoSize)]);
2556 //ctxIdxMap[(i<<4)+n]);
2557
2558 if (significant_coeff) {
2559 coeff_value[nCoefficients] = 1;
2560 coeff_has_max_base_level[nCoefficients] = 1;
2561 coeff_scan_pos[nCoefficients] = n;
2562 nCoefficients++;
2563
2564 // since we have a coefficient in the sub-block,
2565 // we cannot infer the DC coefficient anymore
2566 inferSbDcSigCoeffFlag = 0;
2567 }
2568 }
2569
2570
2571 // --- decode DC coefficient significance ---
2572
2573       if (last_coeff>=0) // only if the last coded coefficient (which is always significant) is not the DC coefficient itself
2574 {
2575 if (inferSbDcSigCoeffFlag==0) {
2576           // if we cannot infer the DC coefficient, it is coded
2577 int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
2578 ctxIdxMap[x0+(y0<<log2TrafoSize)]);
2579 //ctxIdxMap[(i<<4)+0]);
2580
2581
2582 if (significant_coeff) {
2583 coeff_value[nCoefficients] = 1;
2584 coeff_has_max_base_level[nCoefficients] = 1;
2585 coeff_scan_pos[nCoefficients] = 0;
2586 nCoefficients++;
2587 }
2588 }
2589 else {
2590 // we can infer that the DC coefficient must be present
2591 coeff_value[nCoefficients] = 1;
2592 coeff_has_max_base_level[nCoefficients] = 1;
2593 coeff_scan_pos[nCoefficients] = 0;
2594 nCoefficients++;
2595 }
2596 }
2597
2598 }
2599
2600
2601 /*
2602 logtrace(LogSlice,"significant_coeff_flags:\n");
2603 for (int y=0;y<4;y++) {
2604 logtrace(LogSlice," ");
2605 for (int x=0;x<4;x++) {
2606 logtrace(LogSlice,"*%d ",significant_coeff_flag[y][x]);
2607 }
2608 logtrace(LogSlice,"*\n");
2609 }
2610 */
2611
2612
2613 if (nCoefficients) {
2614 int ctxSet;
2615 if (i==0 || cIdx>0) { ctxSet=0; }
2616 else { ctxSet=2; }
2617
2618 if (c1==0) { ctxSet++; }
2619 c1=1;
2620
2621
2622 // --- decode greater-1 flags ---
2623
2624 int newLastGreater1ScanPos=-1;
2625
2626 int lastGreater1Coefficient = libde265_min(8,nCoefficients);
2627 for (int c=0;c<lastGreater1Coefficient;c++) {
2628 int greater1_flag =
2629 decode_coeff_abs_level_greater1(tctx, cIdx,i,
2630 c==0,
2631 firstSubblock,
2632 lastSubblock_greater1Ctx,
2633 &lastInvocation_greater1Ctx,
2634 &lastInvocation_coeff_abs_level_greater1_flag,
2635 &lastInvocation_ctxSet, ctxSet);
2636
2637 if (greater1_flag) {
2638 coeff_value[c]++;
2639
2640 c1=0;
2641
2642 if (newLastGreater1ScanPos == -1) {
2643 newLastGreater1ScanPos=c;
2644 }
2645 }
2646 else {
2647 coeff_has_max_base_level[c] = 0;
2648
2649 if (c1<3 && c1>0) {
2650 c1++;
2651 }
2652 }
2653 }
2654
2655 firstSubblock = false;
2656 lastSubblock_greater1Ctx = lastInvocation_greater1Ctx;
2657
2658
2659 // --- decode greater-2 flag ---
2660
2661 if (newLastGreater1ScanPos != -1) {
2662 int flag = decode_coeff_abs_level_greater2(tctx,cIdx, lastInvocation_ctxSet);
2663 coeff_value[newLastGreater1ScanPos] += flag;
2664 coeff_has_max_base_level[newLastGreater1ScanPos] = flag;
2665 }
2666
2667
2668 // --- decode coefficient signs ---
2669
2670 int signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3 &&
2671 !tctx->cu_transquant_bypass_flag);
2672
2673 for (int n=0;n<nCoefficients-1;n++) {
2674 coeff_sign[n] = decode_CABAC_bypass(&tctx->cabac_decoder);
2675 logtrace(LogSlice,"sign[%d] = %d\n", n, coeff_sign[n]);
2676 }
2677
2678 // n==nCoefficients-1
2679 if (!ctx->current_pps->sign_data_hiding_flag || !signHidden) {
2680 coeff_sign[nCoefficients-1] = decode_CABAC_bypass(&tctx->cabac_decoder);
2681 logtrace(LogSlice,"sign[%d] = %d\n", nCoefficients-1, coeff_sign[nCoefficients-1]);
2682 }
2683 else {
2684 coeff_sign[nCoefficients-1] = 0;
2685 }
2686
2687
2688 // --- decode coefficient value ---
2689
2690 int sumAbsLevel=0;
2691 int uiGoRiceParam=0;
2692
2693 for (int n=0;n<nCoefficients;n++) {
2694 int baseLevel = coeff_value[n];
2695
2696 int coeff_abs_level_remaining;
2697
2698 if (coeff_has_max_base_level[n]) {
2699 coeff_abs_level_remaining =
2700 decode_coeff_abs_level_remaining_HM(tctx, uiGoRiceParam);
2701
2702 if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam)) {
2703 uiGoRiceParam++;
2704 if (uiGoRiceParam>4) uiGoRiceParam=4;
2705 }
2706 }
2707 else {
2708 coeff_abs_level_remaining = 0;
2709 }
2710
2711
2712 int16_t currCoeff = baseLevel + coeff_abs_level_remaining;
2713 if (coeff_sign[n]) {
2714 currCoeff = -currCoeff;
2715 }
2716
2717 if (ctx->current_pps->sign_data_hiding_flag && signHidden) {
2718 sumAbsLevel += baseLevel + coeff_abs_level_remaining;
2719
2720 if (n==nCoefficients-1 && (sumAbsLevel & 1)) {
2721 currCoeff = -currCoeff;
2722 }
2723 }
2724
2725 #ifdef DE265_LOG_TRACE
2726 //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff;
2727 #endif
2728
2729 // put coefficient in list
2730 int p = coeff_scan_pos[n];
2731 xC = (S.x<<2) + ScanOrderPos[p].x;
2732 yC = (S.y<<2) + ScanOrderPos[p].y;
2733
2734 tctx->coeffList[cIdx][ tctx->nCoeff[cIdx] ] = currCoeff;
2735 tctx->coeffPos [cIdx][ tctx->nCoeff[cIdx] ] = xC + yC*CoeffStride;
2736 tctx->nCoeff[cIdx]++;
2737 } // iterate through coefficients in sub-block
2738 } // if nonZero
2739 } // next sub-block
2740
2741
2742
2743 #ifdef DE265_LOG_TRACE
2744 /*
2745 int xB = x0;
2746 int yB = y0;
2747 if (cIdx>0) { xB/=2; yB/=2; }
2748
2749 logtrace(LogSlice,"coefficients [cIdx=%d,at %d,%d] ----------------------------------------\n",cIdx,xB,yB);
2750
2751 for (int y=0;y<(1<<log2TrafoSize);y++) {
2752 logtrace(LogSlice," ");
2753 for (int x=0;x<(1<<log2TrafoSize);x++) {
2754 logtrace(LogSlice,"*%3d ", TransCoeffLevel[y*CoeffStride + x]);
2755 }
2756 logtrace(LogSlice,"*\n");
2757 }
2758 */
2759 #endif
2760
2761 return DE265_OK;
2762 }
2763
2764
2765 int read_transform_unit(decoder_context* ctx,
2766 thread_context* tctx,
2767 int x0, int y0, // position of TU in frame
2768 int xBase, int yBase, // position of parent TU in frame
2769 int xCUBase,int yCUBase, // position of CU in frame
2770 int log2TrafoSize,
2771 int trafoDepth,
2772 int blkIdx,
2773 int cbf_luma, int cbf_cb, int cbf_cr)
2774 {
2775 logtrace(LogSlice,"- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n",
2776 x0,y0,xBase,yBase, 1<<log2TrafoSize, cbf_luma, cbf_cb, cbf_cr);
2777
2778 //slice_segment_header* shdr = tctx->shdr;
2779
2780 assert(cbf_cb != -1);
2781 assert(cbf_cr != -1);
2782 assert(cbf_luma != -1);
2783
2784 tctx->transform_skip_flag[0]=0;
2785 tctx->transform_skip_flag[1]=0;
2786 tctx->transform_skip_flag[2]=0;
2787
2788
2789 if (cbf_luma || cbf_cb || cbf_cr)
2790 {
2791 if (ctx->current_pps->cu_qp_delta_enabled_flag &&
2792 !tctx->IsCuQpDeltaCoded) {
2793
2794 int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx);
2795 int cu_qp_delta_sign=0;
2796 if (cu_qp_delta_abs) {
2797 cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder);
2798 }
2799
2800 tctx->IsCuQpDeltaCoded = 1;
2801 tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign);
2802
2803 //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta);
2804
2805 logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs);
2806 logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign);
2807 logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta);
2808
2809 decode_quantization_parameters(ctx,tctx, x0,y0, xCUBase, yCUBase);
2810 }
2811 }
2812
2813 /*
2814 if (x0 == xCUBase && y0 == yCUBase)
2815 decode_quantization_parameters(ctx,tctx, x0,y0, xCUBase, yCUBase);
2816 */
2817
2818 if (cbf_luma || cbf_cb || cbf_cr)
2819 {
2820 // position of TU in local CU
2821 int xL = x0 - xCUBase;
2822 int yL = y0 - yCUBase;
2823
2824 int err;
2825 if (cbf_luma) {
2826 if ((err=residual_coding(ctx,tctx,x0,y0, xL,yL,log2TrafoSize,0)) != DE265_OK) return err;
2827 }
2828
2829 if (log2TrafoSize>2) {
2830 if (cbf_cb) {
2831 if ((err=residual_coding(ctx,tctx,x0,y0,xL,yL,log2TrafoSize-1,1)) != DE265_OK) return err;
2832 }
2833
2834 if (cbf_cr) {
2835 if ((err=residual_coding(ctx,tctx,x0,y0,xL,yL,log2TrafoSize-1,2)) != DE265_OK) return err;
2836 }
2837 }
2838 else if (blkIdx==3) {
2839 if (cbf_cb) {
2840 if ((err=residual_coding(ctx,tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
2841 log2TrafoSize,1)) != DE265_OK) return err;
2842 }
2843
2844 if (cbf_cr) {
2845 if ((err=residual_coding(ctx,tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
2846 log2TrafoSize,2)) != DE265_OK) return err;
2847 }
2848 }
2849 }
2850
2851 return DE265_OK;
2852 }
2853
2854
2855 void read_transform_tree(decoder_context* ctx,
2856 thread_context* tctx,
2857 int x0, int y0, // position of TU in frame
2858 int xBase, int yBase, // position of parent TU in frame
2859 int xCUBase, int yCUBase, // position of CU in frame
2860 int log2TrafoSize,
2861 int trafoDepth,
2862 int blkIdx,
2863 int MaxTrafoDepth,
2864 int IntraSplitFlag,
2865 enum PredMode cuPredMode,
2866 bool parent_cbf_cb,bool parent_cbf_cr)
2867 {
2868 logtrace(LogSlice,"- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d "
2869 "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d\n",
2870 x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth);
2871
2872 const seq_parameter_set* sps = ctx->current_sps;
2873
2874 enum PredMode PredMode = get_pred_mode(ctx->img,sps,x0,y0);
2875 enum PartMode PartMode = get_PartMode(ctx->img,sps,x0,y0);
2876
2877 int split_transform_flag;
2878
2879 int interSplitFlag= (sps->max_transform_hierarchy_depth_inter==0 &&
2880 PredMode == MODE_INTER &&
2881 PartMode != PART_2Nx2N &&
2882 trafoDepth == 0);
2883
2884
2885 /* If TrafoSize is larger than maximum size -> split automatically
2886 If TrafoSize is at minimum size -> do not split
2887 If maximum transformation depth is reached -> do not split
2888 If intra-prediction is NxN mode -> split automatically (only at level 0)
2889 Otherwise -> read split flag
2890 */
2891 if (log2TrafoSize <= ctx->current_sps->Log2MaxTrafoSize &&
2892 log2TrafoSize > ctx->current_sps->Log2MinTrafoSize &&
2893 trafoDepth < MaxTrafoDepth &&
2894 !(IntraSplitFlag && trafoDepth==0))
2895 {
2896 split_transform_flag = decode_split_transform_flag(tctx, log2TrafoSize);
2897 }
2898 else
2899 {
2900 split_transform_flag = (log2TrafoSize > ctx->current_sps->Log2MaxTrafoSize ||
2901 (IntraSplitFlag==1 && trafoDepth==0) ||
2902 interSplitFlag==1) ? 1:0;
2903 }
2904
2905
2906 if (split_transform_flag) {
2907 logtrace(LogSlice,"set_split_transform_flag(%d,%d, %d)\n",x0,y0,trafoDepth);
2908 set_split_transform_flag(ctx->img,sps,x0,y0,trafoDepth);
2909 }
2910
2911
2912 int cbf_cb=-1;
2913 int cbf_cr=-1;
2914
2915 if (log2TrafoSize>2) {
2916 // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0
2917 if (/*trafoDepth==0 ||*/ parent_cbf_cb) {
2918 cbf_cb = decode_cbf_chroma(tctx,trafoDepth);
2919 }
2920
2921     // we do not have to test for trafoDepth==0, because parent_cbf_cr is 1 at depth 0
2922 if (/*trafoDepth==0 ||*/ parent_cbf_cr) {
2923 cbf_cr = decode_cbf_chroma(tctx,trafoDepth);
2924 }
2925 }
2926
2927
2928 // cbf_cr/cbf_cb not present in bitstream -> induce values
2929
2930 if (cbf_cb<0) {
2931 if (trafoDepth>0 && log2TrafoSize==2) {
2932 cbf_cb = parent_cbf_cb;
2933 } else {
2934 cbf_cb=0;
2935 }
2936 }
2937
2938 if (cbf_cr<0) {
2939 if (trafoDepth>0 && log2TrafoSize==2) {
2940 cbf_cr = parent_cbf_cr;
2941 } else {
2942 cbf_cr=0;
2943 }
2944 }
2945
2946 if (split_transform_flag) {
2947 int x1 = x0 + (1<<(log2TrafoSize-1));
2948 int y1 = y0 + (1<<(log2TrafoSize-1));
2949
2950 logtrace(LogSlice,"transform split.\n");
2951
2952 read_transform_tree(ctx,tctx, x0,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 0,
2953 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2954 read_transform_tree(ctx,tctx, x1,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 1,
2955 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2956 read_transform_tree(ctx,tctx, x0,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 2,
2957 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2958 read_transform_tree(ctx,tctx, x1,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 3,
2959 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2960 }
2961 else {
2962 int cbf_luma=1;
2963
2964 if (PredMode==MODE_INTRA || trafoDepth!=0 || cbf_cb || cbf_cr) {
2965 cbf_luma = decode_cbf_luma(tctx,trafoDepth);
2966 }
2967
2968 logtrace(LogSlice,"call read_transform_unit %d/%d\n",x0,y0);
2969
2970 read_transform_unit(ctx,tctx, x0,y0,xBase,yBase, xCUBase,yCUBase, log2TrafoSize,trafoDepth, blkIdx,
2971 cbf_luma, cbf_cb, cbf_cr);
2972
2973
2974 int nT = 1<<log2TrafoSize;
2975
2976
2977 if (cuPredMode == MODE_INTRA) // if intra mode
2978 {
2979 int PUidx = (x0>>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize) * sps->PicWidthInMinPUs;
2980
2981 enum IntraPredMode intraPredMode = (enum IntraPredMode) ctx->img->intraPredMode[PUidx];
2982
2983 decode_intra_prediction(ctx, x0,y0, intraPredMode, nT, 0);
2984
2985 enum IntraPredMode chromaPredMode = tctx->IntraPredModeC;
2986
2987 if (nT>=8) {
2988 decode_intra_prediction(ctx, x0/2,y0/2, chromaPredMode, nT/2, 1);
2989 decode_intra_prediction(ctx, x0/2,y0/2, chromaPredMode, nT/2, 2);
2990 }
2991 else if (blkIdx==3) {
2992 decode_intra_prediction(ctx, xBase/2,yBase/2, chromaPredMode, nT, 1);
2993 decode_intra_prediction(ctx, xBase/2,yBase/2, chromaPredMode, nT, 2);
2994 }
2995 }
2996
2997 if (cbf_luma) {
2998 scale_coefficients(ctx, tctx, x0,y0, xCUBase,yCUBase, nT, 0,
2999 tctx->transform_skip_flag[0], PredMode==MODE_INTRA);
3000 }
3001
3002 if (nT>=8) {
3003 if (cbf_cb) {
3004 scale_coefficients(ctx, tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 1,
3005 tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
3006 }
3007 if (cbf_cr) {
3008 scale_coefficients(ctx, tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 2,
3009 tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
3010 }
3011 }
3012 else if (blkIdx==3) {
3013 if (cbf_cb) {
3014 scale_coefficients(ctx, tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 1,
3015 tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
3016 }
3017 if (cbf_cr) {
3018 scale_coefficients(ctx, tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 2,
3019 tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
3020 }
3021 }
3022 }
3023 }
3024
3025
3026 #if DE265_LOG_TRACE
3027 static const char* part_mode_name(enum PartMode pm)
3028 {
3029 switch (pm) {
3030 case PART_2Nx2N: return "2Nx2N";
3031 case PART_2NxN: return "2NxN";
3032 case PART_Nx2N: return "Nx2N";
3033 case PART_NxN: return "NxN";
3034 case PART_2NxnU: return "2NxnU";
3035 case PART_2NxnD: return "2NxnD";
3036 case PART_nLx2N: return "nLx2N";
3037 case PART_nRx2N: return "nRx2N";
3038 }
3039
3040 return "undefined part mode";
3041 }
3042 #endif
3043
3044
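// MVD binarization per component: abs_mvd_greater0_flag and abs_mvd_greater1_flag are
// context-coded; the remaining magnitude (abs_mvd_minus2) is bypass-coded with EG1,
// followed by a bypass-coded sign flag.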
3045 void read_mvd_coding(thread_context* tctx,
3046 int x0,int y0, int refList)
3047 {
3048 //slice_segment_header* shdr = tctx->shdr;
3049
3050 int abs_mvd_greater0_flag[2];
3051 abs_mvd_greater0_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder,
3052 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]);
3053 abs_mvd_greater0_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder,
3054 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]);
3055
3056 int abs_mvd_greater1_flag[2];
3057 if (abs_mvd_greater0_flag[0]) {
3058 abs_mvd_greater1_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder,
3059 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]);
3060 }
3061 else {
3062 abs_mvd_greater1_flag[0]=0;
3063 }
3064
3065 if (abs_mvd_greater0_flag[1]) {
3066 abs_mvd_greater1_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder,
3067 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]);
3068 }
3069 else {
3070 abs_mvd_greater1_flag[1]=0;
3071 }
3072
3073
3074 int abs_mvd_minus2[2];
3075 int mvd_sign_flag[2];
3076 int value[2];
3077
3078 for (int c=0;c<2;c++) {
3079 if (abs_mvd_greater0_flag[c]) {
3080 if (abs_mvd_greater1_flag[c]) {
3081 abs_mvd_minus2[c] = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 1);
3082 }
3083 else {
3084 abs_mvd_minus2[c] = abs_mvd_greater1_flag[c] -1;
3085 }
3086
3087 mvd_sign_flag[c] = decode_CABAC_bypass(&tctx->cabac_decoder);
3088
3089 value[c] = abs_mvd_minus2[c]+2;
3090 if (mvd_sign_flag[c]) { value[c] = -value[c]; }
3091 }
3092 else {
3093 value[c] = 0;
3094 }
3095 }
3096
3097 //set_mvd(tctx->decctx, x0,y0, refList, value[0],value[1]);
3098 tctx->mvd[refList][0] = value[0];
3099 tctx->mvd[refList][1] = value[1];
3100
3101 logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n",x0,y0,refList, value[0],value[1]);
3102 }
3103
3104
3105 void read_prediction_unit_SKIP(decoder_context* ctx,
3106 thread_context* tctx,
3107 int x0, int y0,
3108 int nPbW, int nPbH)
3109 {
3110 slice_segment_header* shdr = tctx->shdr;
3111
3112 int merge_idx;
3113 if (shdr->MaxNumMergeCand>1) {
3114 merge_idx = decode_merge_idx(tctx);
3115 }
3116 else {
3117 merge_idx = 0;
3118 }
3119
3120 tctx->merge_idx = merge_idx;
3121 tctx->merge_flag = true;
3122
3123 logtrace(LogSlice,"prediction skip 2Nx2N, merge_idx: %d\n",merge_idx);
3124 }
3125
3126
3127 void read_prediction_unit(decoder_context* ctx,
3128 thread_context* tctx,
3129 int xC,int yC, int xB,int yB,
3130 int nPbW, int nPbH,
3131 int ctDepth, int nCS,int partIdx)
3132 {
3133   logtrace(LogSlice,"read_prediction_unit %d;%d %dx%d\n",xC+xB,yC+yB,nPbW,nPbH);
3134
3135 int x0 = xC+xB;
3136 int y0 = yC+yB;
3137
3138 slice_segment_header* shdr = tctx->shdr;
3139
3140 int merge_flag = decode_merge_flag(tctx);
3141 tctx->merge_flag = merge_flag;
3142
3143 if (merge_flag) {
3144 int merge_idx;
3145
3146 if (shdr->MaxNumMergeCand>1) {
3147 merge_idx = decode_merge_idx(tctx);
3148 }
3149 else {
3150 merge_idx = 0;
3151 }
3152
3153 logtrace(LogSlice,"prediction unit %d,%d, merge mode, index: %d\n",x0,y0,merge_idx);
3154
3155 tctx->merge_idx = merge_idx;
3156 }
3157 else { // no merge flag
3158 enum InterPredIdc inter_pred_idc;
3159
3160 if (shdr->slice_type == SLICE_TYPE_B) {
3161 inter_pred_idc = decode_inter_pred_idc(tctx,x0,y0,nPbW,nPbH,ctDepth);
3162 }
3163 else {
3164 inter_pred_idc = PRED_L0;
3165 }
3166
3167 tctx->inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc);
3168
3169 if (inter_pred_idc != PRED_L1) {
3170 int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active);
3171
3172       // NOTE: the case of only one reference frame is handled in decode_ref_idx_lX()
3173 tctx->refIdx[0] = ref_idx_l0;
3174
3175 read_mvd_coding(tctx,x0,y0, 0);
3176
3177 int mvp_l0_flag = decode_mvp_lx_flag(tctx); // l0
3178 tctx->mvp_lX_flag[0] = mvp_l0_flag;
3179
3180 logtrace(LogSlice,"prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n",
3181 x0,y0, tctx->refIdx[0], mvp_l0_flag);
3182 }
3183
3184 if (inter_pred_idc != PRED_L0) {
3185 int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active);
3186
3187       // NOTE: the case of only one reference frame is handled in decode_ref_idx_lX()
3188 tctx->refIdx[1] = ref_idx_l1;
3189
3190 if (shdr->mvd_l1_zero_flag &&
3191 inter_pred_idc == PRED_BI) {
3192 tctx->mvd[1][0] = 0;
3193 tctx->mvd[1][1] = 0;
3194 }
3195 else {
3196 read_mvd_coding(tctx,x0,y0, 1);
3197 }
3198
3199 int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1
3200 tctx->mvp_lX_flag[1] = mvp_l1_flag;
3201
3202 logtrace(LogSlice,"prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n",
3203 x0,y0, tctx->refIdx[1], mvp_l1_flag);
3204 }
3205 }
3206
3207
3208
3209 decode_prediction_unit(ctx,tctx, xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx);
3210 }
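
/* Informal summary of the parsing order implemented above: merge_flag; if the
   PU is not merged, inter_pred_idc (B slices only, otherwise PRED_L0), then per
   active reference list: ref_idx, the MVD (skipped for L1 when
   mvd_l1_zero_flag is set and the PU is bi-predicted), and the MVP flag. */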
3211
3212
3213
3214
3215 static void read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSize)
3216 {
3217 bitreader br;
3218 br.data = tctx->cabac_decoder.bitstream_curr;
3219 br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr;
3220 br.nextbits = 0;
3221 br.nextbits_cnt = 0;
3222
3223 const seq_parameter_set* sps = tctx->decctx->current_sps;
3224 //fprintf(stderr,"PCM pos: %d %d (POC=%d)\n",x0,y0,tctx->decctx->img->PicOrderCntVal);
3225
3226 int nBitsY = sps->pcm_sample_bit_depth_luma;
3227 int nBitsC = sps->pcm_sample_bit_depth_chroma;
3228
3229 int wY = 1<<log2CbSize;
3230 int wC = 1<<(log2CbSize-1);
3231
3232 uint8_t* yPtr;
3233 uint8_t* cbPtr;
3234 uint8_t* crPtr;
3235 int stride;
3236 int chroma_stride;
3237 get_image_plane(tctx->decctx->img, 0, &yPtr, &stride);
3238 get_image_plane(tctx->decctx->img, 1, &cbPtr, &chroma_stride);
3239 get_image_plane(tctx->decctx->img, 2, &crPtr, &chroma_stride);
3240
3241 yPtr = &yPtr [y0*stride + x0];
3242 cbPtr = &cbPtr[y0/2*chroma_stride + x0/2];
3243 crPtr = &crPtr[y0/2*chroma_stride + x0/2];
3244
3245 int shiftY = sps->BitDepth_Y - nBitsY;
3246 int shiftC = sps->BitDepth_C - nBitsC;
3247
3248 for (int y=0;y<wY;y++)
3249 for (int x=0;x<wY;x++)
3250 {
3251 int value = get_bits(&br, nBitsY);
3252 yPtr[y*stride+x] = value << shiftY;
3253 }
3254
3255 for (int y=0;y<wC;y++)
3256 for (int x=0;x<wC;x++)
3257 {
3258 int value = get_bits(&br, nBitsC);
3259 cbPtr[y*chroma_stride+x] = value << shiftC;
3260 }
3261
3262 for (int y=0;y<wC;y++)
3263 for (int x=0;x<wC;x++)
3264 {
3265 int value = get_bits(&br, nBitsC);
3266 crPtr[y*chroma_stride+x] = value << shiftC;
3267 }
3268
3269 prepare_for_CABAC(&br);
3270 tctx->cabac_decoder.bitstream_curr = br.data;
3271 init_CABAC_decoder_2(&tctx->cabac_decoder);
3272 }
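
/* Note (informal, assuming 4:2:0 as implied by the /2 chroma offsets above):
   PCM samples are stored in the bitstream at pcm_sample_bit_depth and shifted
   up to the image bit depth. E.g. with BitDepth_Y = 8 and
   pcm_sample_bit_depth_luma = 6, shiftY = 2 and a coded value of 63 is
   reconstructed as 252. */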
3273
3274
3275 void read_coding_unit(decoder_context* ctx,
3276 thread_context* tctx,
3277 int x0, int y0, // position of coding unit in frame
3278 int log2CbSize,
3279 int ctDepth)
3280 {
3281 const seq_parameter_set* sps = ctx->current_sps;
3282
3283 //int nS = 1 << log2CbSize;
3284
3285 logtrace(LogSlice,"- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<<log2CbSize);
3286
3287
3288 slice_segment_header* shdr = tctx->shdr;
3289
3290 set_log2CbSize(ctx->img,sps, x0,y0, log2CbSize);
3291
3292 int nCbS = 1<<log2CbSize; // number of coding block samples
3293
3294 decode_quantization_parameters(ctx,tctx, x0,y0, x0, y0);
3295
3296
3297 if (ctx->current_pps->transquant_bypass_enable_flag)
3298 {
3299 int transquant_bypass = decode_transquant_bypass_flag(tctx);
3300
3301 tctx->cu_transquant_bypass_flag = transquant_bypass;
3302
3303 if (transquant_bypass) {
3304 set_cu_transquant_bypass(ctx->img,sps,x0,y0,log2CbSize);
3305 }
3306 }
3307
3308 uint8_t cu_skip_flag = 0;
3309 if (shdr->slice_type != SLICE_TYPE_I) {
3310 cu_skip_flag = decode_cu_skip_flag(tctx,x0,y0,ctDepth);
3311 }
3312
3313 set_cu_skip_flag(ctx->current_sps,ctx->img,x0,y0,log2CbSize, cu_skip_flag);
3314
3315 int IntraSplitFlag = 0;
3316
3317 enum PredMode cuPredMode;
3318
3319 if (cu_skip_flag) {
3320 read_prediction_unit_SKIP(ctx,tctx,x0,y0,nCbS,nCbS);
3321
3322 set_PartMode(ctx->img, ctx->current_sps, x0,y0, PART_2Nx2N); // need this for deblocking filter
3323 set_pred_mode(ctx->img,sps,x0,y0,log2CbSize, MODE_SKIP);
3324 cuPredMode = MODE_SKIP;
3325
3326 logtrace(LogSlice,"CU pred mode: SKIP\n");
3327
3328
3329 // DECODE
3330
3331 //UNIFY decode_quantization_parameters(ctx, tctx, x0, y0, x0, y0);
3332
3333 int nCS_L = 1<<log2CbSize;
3334 decode_prediction_unit(ctx,tctx,x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0);
3335 }
3336 else /* not skipped */ {
3337 if (shdr->slice_type != SLICE_TYPE_I) {
3338 int pred_mode_flag = decode_pred_mode_flag(tctx);
3339 cuPredMode = pred_mode_flag ? MODE_INTRA : MODE_INTER;
3340 }
3341 else {
3342 cuPredMode = MODE_INTRA;
3343 }
3344
3345 set_pred_mode(ctx->img,sps,x0,y0,log2CbSize, cuPredMode);
3346
3347 logtrace(LogSlice,"CU pred mode: %s\n", cuPredMode==MODE_INTRA ? "INTRA" : "INTER");
3348
3349
3350 enum PartMode PartMode;
3351
3352 if (cuPredMode != MODE_INTRA ||
3353 log2CbSize == sps->Log2MinCbSizeY) {
3354 PartMode = decode_part_mode(tctx, cuPredMode, log2CbSize);
3355
3356 if (PartMode==PART_NxN && cuPredMode==MODE_INTRA) {
3357 IntraSplitFlag=1;
3358 }
3359 } else {
3360 PartMode = PART_2Nx2N;
3361 }
3362
3363 set_PartMode(ctx->img,ctx->current_sps, x0,y0, PartMode); // needed for deblocking ?
3364
3365 logtrace(LogSlice, "PartMode: %s\n", part_mode_name(PartMode));
3366
3367
3368 bool pcm_flag = false;
3369
3370 if (cuPredMode == MODE_INTRA) {
3371 if (PartMode == PART_2Nx2N && sps->pcm_enabled_flag &&
3372 log2CbSize >= sps->Log2MinIpcmCbSizeY &&
3373 log2CbSize <= sps->Log2MaxIpcmCbSizeY) {
3374 pcm_flag = decode_CABAC_term_bit(&tctx->cabac_decoder);
3375 }
3376
3377 if (pcm_flag) {
3378 set_pcm_flag(ctx->img, ctx->current_sps, x0,y0,log2CbSize);
3379
3380 //UNIFY decode_quantization_parameters(ctx,tctx, x0,y0, x0, y0);
3381
3382 read_pcm_samples(tctx, x0,y0, log2CbSize);
3383 }
3384 else {
3385 int pbOffset = (PartMode == PART_NxN) ? (nCbS/2) : nCbS;
3386 int log2IntraPredSize = (PartMode == PART_NxN) ? (log2CbSize-1) : log2CbSize;
3387
3388 logtrace(LogSlice,"nCbS:%d pbOffset:%d\n",nCbS,pbOffset);
3389
3390 int prev_intra_luma_pred_flag[4];
3391
3392 int idx=0;
3393 for (int j=0;j<nCbS;j+=pbOffset)
3394 for (int i=0;i<nCbS;i+=pbOffset)
3395 {
3396 prev_intra_luma_pred_flag[idx++] = decode_prev_intra_luma_pred_flag(tctx);
3397 }
3398
3399 int mpm_idx[4], rem_intra_luma_pred_mode[4];
3400 idx=0;
3401
3402 for (int j=0;j<nCbS;j+=pbOffset)
3403 for (int i=0;i<nCbS;i+=pbOffset)
3404 {
3405 if (prev_intra_luma_pred_flag[idx]) {
3406 mpm_idx[idx] = decode_mpm_idx(tctx);
3407 }
3408 else {
3409 rem_intra_luma_pred_mode[idx] = decode_rem_intra_luma_pred_mode(tctx);
3410 }
3411
3412
3413 int x = x0+i;
3414 int y = y0+j;
3415
3416 // --- find intra prediction mode ---
3417
3418 int IntraPredMode;
3419
3420 int availableA = check_CTB_available(ctx, shdr, x,y, x-1,y);
3421 int availableB = check_CTB_available(ctx, shdr, x,y, x,y-1);
3422
3423 int PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
3424
3425 // block on left side
3426
3427 enum IntraPredMode candIntraPredModeA, candIntraPredModeB;
3428 if (availableA==false) {
3429 candIntraPredModeA=INTRA_DC;
3430 }
3431 else if (get_pred_mode(ctx->img,sps, x-1,y) != MODE_INTRA ||
3432 get_pcm_flag(ctx->img,sps, x-1,y)) {
3433 candIntraPredModeA=INTRA_DC;
3434 }
3435 else {
3436 candIntraPredModeA = (enum IntraPredMode) ctx->img->intraPredMode[PUidx-1];
3437 }
3438
3439 // block above
3440
3441 if (availableB==false) {
3442 candIntraPredModeB=INTRA_DC;
3443 }
3444 else if (get_pred_mode(ctx->img,sps, x,y-1) != MODE_INTRA ||
3445 get_pcm_flag(ctx->img,sps, x,y-1)) {
3446 candIntraPredModeB=INTRA_DC;
3447 }
3448 else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) {
3449 candIntraPredModeB=INTRA_DC;
3450 }
3451 else {
3452 candIntraPredModeB = (enum IntraPredMode) ctx->img->intraPredMode[PUidx-sps->PicWidthInMinPUs];
3453 }
3454
3455 // build candidate list
3456
3457 int candModeList[3];
3458
3459 logtrace(LogSlice,"availableA:%d candA:%d & availableB:%d candB:%d\n",
3460 availableA, candIntraPredModeA,
3461 availableB, candIntraPredModeB);
3462
3463 if (candIntraPredModeA == candIntraPredModeB) {
3464 if (candIntraPredModeA < 2) {
3465 candModeList[0] = INTRA_PLANAR;
3466 candModeList[1] = INTRA_DC;
3467 candModeList[2] = INTRA_ANGULAR_26;
3468 }
3469 else {
3470 candModeList[0] = candIntraPredModeA;
3471 candModeList[1] = 2 + ((candIntraPredModeA-2 -1 +32) % 32);
3472 candModeList[2] = 2 + ((candIntraPredModeA-2 +1 ) % 32);
3473 }
3474 }
3475 else {
3476 candModeList[0] = candIntraPredModeA;
3477 candModeList[1] = candIntraPredModeB;
3478
3479 if (candIntraPredModeA != INTRA_PLANAR &&
3480 candIntraPredModeB != INTRA_PLANAR) {
3481 candModeList[2] = INTRA_PLANAR;
3482 }
3483 else if (candIntraPredModeA != INTRA_DC &&
3484 candIntraPredModeB != INTRA_DC) {
3485 candModeList[2] = INTRA_DC;
3486 }
3487 else {
3488 candModeList[2] = INTRA_ANGULAR_26;
3489 }
3490 }
3491
3492 for (int i=0;i<3;i++)
3493 logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]);
3494
3495 if (prev_intra_luma_pred_flag[idx]==1) {
3496 IntraPredMode = candModeList[ mpm_idx[idx] ];
3497 }
3498 else {
3499 // sort candModeList
3500
3501 if (candModeList[0] > candModeList[1]) {
3502 int t = candModeList[0]; candModeList[0]=candModeList[1]; candModeList[1]=t;
3503 }
3504 if (candModeList[0] > candModeList[2]) {
3505 int t = candModeList[0]; candModeList[0]=candModeList[2]; candModeList[2]=t;
3506 }
3507 if (candModeList[1] > candModeList[2]) {
3508 int t = candModeList[1]; candModeList[1]=candModeList[2]; candModeList[2]=t;
3509 }
3510
3511 // skip modes in the list
3512 // (we have 35 modes. skipping the 3 in the list gives us 32, which can be selected by 5 bits)
3513 IntraPredMode = rem_intra_luma_pred_mode[idx];
3514 for (int n=0;n<=2;n++) {
3515 if (IntraPredMode >= candModeList[n]) { IntraPredMode++; }
3516 }
3517 }
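          /* Worked example (informal): with the sorted candidate list
             {PLANAR(0), DC(1), ANGULAR_26(26)} and rem_intra_luma_pred_mode = 9,
             the loop above increments 9 -> 10 (>= 0), 10 -> 11 (>= 1) and then
             stops (11 < 26), giving IntraPredMode = 11. */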
3518
3519 logtrace(LogSlice,"IntraPredMode[%d][%d] = %d (log2blk:%d)\n",x,y,IntraPredMode, log2IntraPredSize);
3520
3521 int pbSize = 1<<(log2IntraPredSize - sps->Log2MinPUSize);
3522
3523 for (int y=0;y<pbSize;y++)
3524 for (int x=0;x<pbSize;x++)
3525 ctx->img->intraPredMode[PUidx + x + y*sps->PicWidthInMinPUs] = IntraPredMode;
3526
3527 idx++;
3528 }
3529
3530
3531 // set chroma intra prediction mode
3532
3533 int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx);
3534
3535 int IntraPredMode = ctx->img->intraPredMode[(x0>>sps->Log2MinPUSize) +
3536 (y0>>sps->Log2MinPUSize) * sps->PicWidthInMinPUs];
3537 logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode);
3538
3539 int IntraPredModeC;
3540 if (intra_chroma_pred_mode==4) {
3541 IntraPredModeC = IntraPredMode;
3542 }
3543 else {
3544 static enum IntraPredMode IntraPredModeCCand[4] = {
3545 INTRA_PLANAR,
3546 INTRA_ANGULAR_26, // vertical
3547 INTRA_ANGULAR_10, // horizontal
3548 INTRA_DC
3549 };
3550
3551 IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode];
3552 if (IntraPredModeC == IntraPredMode) {
3553 IntraPredModeC = INTRA_ANGULAR_34;
3554 }
3555 }
3556
3557 logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d\n",x0,y0,IntraPredModeC);
3558
3559 tctx->IntraPredModeC = (enum IntraPredMode) IntraPredModeC;
3560 }
3561 }
3562 else { // INTER
3563 int nCS = 1<<log2CbSize;
3564
3565 if (PartMode == PART_2Nx2N) {
3566 read_prediction_unit(ctx,tctx,x0,y0,0,0,nCbS,nCbS,ctDepth,nCS,0);
3567 }
3568 else if (PartMode == PART_2NxN) {
3569 read_prediction_unit(ctx,tctx,x0,y0,0,0 ,nCbS,nCbS/2,ctDepth,nCS,0);
3570 read_prediction_unit(ctx,tctx,x0,y0,0,nCbS/2,nCbS,nCbS/2,ctDepth,nCS,1);
3571 }
3572 else if (PartMode == PART_Nx2N) {
3573 read_prediction_unit(ctx,tctx,x0,y0,0,0 , nCbS/2,nCbS,ctDepth,nCS,0);
3574 read_prediction_unit(ctx,tctx,x0,y0,nCbS/2,0,nCbS/2,nCbS,ctDepth,nCS,1);
3575 }
3576 else if (PartMode == PART_2NxnU) {
3577 read_prediction_unit(ctx,tctx,x0,y0,0,0, nCbS,nCbS/4,ctDepth,nCS,0);
3578 read_prediction_unit(ctx,tctx,x0,y0,0,nCbS/4,nCbS,nCbS*3/4,ctDepth,nCS,1);
3579 }
3580 else if (PartMode == PART_2NxnD) {
3581 read_prediction_unit(ctx,tctx,x0,y0,0,0, nCbS,nCbS*3/4,ctDepth,nCS,0);
3582 read_prediction_unit(ctx,tctx,x0,y0,0,nCbS*3/4,nCbS,nCbS/4,ctDepth,nCS,1);
3583 }
3584 else if (PartMode == PART_nLx2N) {
3585 read_prediction_unit(ctx,tctx,x0,y0,0,0, nCbS/4,nCbS,ctDepth,nCS,0);
3586 read_prediction_unit(ctx,tctx,x0,y0,nCbS/4,0,nCbS*3/4,nCbS,ctDepth,nCS,1);
3587 }
3588 else if (PartMode == PART_nRx2N) {
3589 read_prediction_unit(ctx,tctx,x0,y0,0,0, nCbS*3/4,nCbS,ctDepth,nCS,0);
3590 read_prediction_unit(ctx,tctx,x0,y0,nCbS*3/4,0,nCbS/4,nCbS,ctDepth,nCS,1);
3591 }
3592 else if (PartMode == PART_NxN) {
3593 read_prediction_unit(ctx,tctx,x0,y0,0,0, nCbS/2,nCbS/2,ctDepth,nCS,0);
3594 read_prediction_unit(ctx,tctx,x0,y0,nCbS/2,0, nCbS/2,nCbS/2,ctDepth,nCS,1);
3595 read_prediction_unit(ctx,tctx,x0,y0,0,nCbS/2, nCbS/2,nCbS/2,ctDepth,nCS,2);
3596 read_prediction_unit(ctx,tctx,x0,y0,nCbS/2,nCbS/2,nCbS/2,nCbS/2,ctDepth,nCS,3);
3597 }
3598 else {
3599 assert(0); // undefined PartMode
3600 }
3601 } // INTER
3602
3603
3604 // decode residual
3605
3606 //decode_quantization_parameters(ctx,tctx, x0,y0);
3607
3608
3609 if (!pcm_flag) { // !pcm
3610 bool rqt_root_cbf;
3611
3612 uint8_t merge_flag = tctx->merge_flag; // !!get_merge_flag(ctx,x0,y0);
3613
3614 if (cuPredMode != MODE_INTRA &&
3615 !(PartMode == PART_2Nx2N && merge_flag)) {
3616
3617 rqt_root_cbf = !!decode_rqt_root_cbf(tctx);
3618 }
3619 else {
3620 rqt_root_cbf = true;
3621 }
3622
3623 //set_rqt_root_cbf(ctx,x0,y0, log2CbSize, rqt_root_cbf);
3624
3625 if (rqt_root_cbf) {
3626 int MaxTrafoDepth;
3627
3628 if (cuPredMode==MODE_INTRA) {
3629 MaxTrafoDepth = ctx->current_sps->max_transform_hierarchy_depth_intra + IntraSplitFlag;
3630 }
3631 else {
3632 MaxTrafoDepth = ctx->current_sps->max_transform_hierarchy_depth_inter;
3633 }
3634
3635 logtrace(LogSlice,"MaxTrafoDepth: %d\n",MaxTrafoDepth);
3636
3637 //UNIFY decode_quantization_parameters(ctx,tctx, x0,y0, x0, y0);
3638
3639 read_transform_tree(ctx,tctx, x0,y0, x0,y0, x0,y0, log2CbSize, 0,0,
3640 MaxTrafoDepth, IntraSplitFlag, cuPredMode, 1,1);
3641 }
3642 else {
3643 //UNIFY decode_quantization_parameters(ctx,tctx, x0,y0, x0, y0);
3644 }
3645 } // !pcm
3646 }
3647 }
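
/* Informal outline of read_coding_unit(): optional cu_transquant_bypass_flag,
   cu_skip_flag (non-I slices), then either a skipped 2Nx2N prediction unit, or
   pred_mode / part_mode followed by PCM samples, intra prediction modes, or
   inter prediction units; finally, for non-PCM CUs, the residual transform
   tree when rqt_root_cbf is set (or inferred). */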
3648
3649
3650 // ------------------------------------------------------------------------------------------
3651
3652
3653 void read_coding_quadtree(decoder_context* ctx,
3654 thread_context* tctx,
3655 int x0, int y0,
3656 int log2CbSize,
3657 int ctDepth)
3658 {
3659 logtrace(LogSlice,"- read_coding_quadtree %d;%d cbsize:%d depth:%d POC:%d\n",x0,y0,1<<log2CbSize,ctDepth,ctx->img->PicOrderCntVal);
3660
3661 //slice_segment_header* shdr = tctx->shdr;
3662 seq_parameter_set* sps = ctx->current_sps;
3663
3664 int split_flag;
3665
3666 // We only send a split flag if CU is larger than minimum size and
3667 // completely contained within the image area.
3668 // If it is partly outside the image area and not at minimum size,
3669 // it is split. If already at minimum size, it is not split further.
3670 if (x0+(1<<log2CbSize) <= sps->pic_width_in_luma_samples &&
3671 y0+(1<<log2CbSize) <= sps->pic_height_in_luma_samples &&
3672 log2CbSize > sps->Log2MinCbSizeY) {
3673 split_flag = decode_split_cu_flag(tctx, x0,y0, ctDepth);
3674 } else {
3675 if (log2CbSize > sps->Log2MinCbSizeY) { split_flag=1; }
3676 else { split_flag=0; }
3677 }
3678
3679
3680 if (ctx->current_pps->cu_qp_delta_enabled_flag &&
3681 log2CbSize >= ctx->current_pps->Log2MinCuQpDeltaSize)
3682 {
3683 tctx->IsCuQpDeltaCoded = 0;
3684 tctx->CuQpDelta = 0;
3685 }
3686 else
3687 {
3688 // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ?
3689 }
3690
3691 if (split_flag) {
3692 int x1 = x0 + (1<<(log2CbSize-1));
3693 int y1 = y0 + (1<<(log2CbSize-1));
3694
3695 read_coding_quadtree(ctx,tctx,x0,y0, log2CbSize-1, ctDepth+1);
3696
3697 if (x1<sps->pic_width_in_luma_samples)
3698 read_coding_quadtree(ctx,tctx,x1,y0, log2CbSize-1, ctDepth+1);
3699
3700 if (y1<sps->pic_height_in_luma_samples)
3701 read_coding_quadtree(ctx,tctx,x0,y1, log2CbSize-1, ctDepth+1);
3702
3703 if (x1<sps->pic_width_in_luma_samples &&
3704 y1<sps->pic_height_in_luma_samples)
3705 read_coding_quadtree(ctx,tctx,x1,y1, log2CbSize-1, ctDepth+1);
3706 }
3707 else {
3708 // set ctDepth of this CU
3709
3710 set_ctDepth(ctx->img,ctx->current_sps, x0,y0, log2CbSize, ctDepth);
3711
3712 read_coding_unit(ctx,tctx, x0,y0, log2CbSize, ctDepth);
3713 }
3714
3715 logtrace(LogSlice,"-\n");
3716 }
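
/* Example of the split inference above (informal): a CTB that extends past the
   right or bottom picture border is split without a coded flag, and the
   recursion only descends into child blocks whose top-left corner lies inside
   the picture; blocks already at Log2MinCbSizeY are not split further. */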
3717
3718
3719 // ---------------------------------------------------------------------------
3720
3721 enum DecodeResult {
3722 Decode_EndOfSliceSegment,
3723 Decode_EndOfSubstream,
3724 Decode_Error
3725 };
3726
3727 /* Decode CTBs until the end of sub-stream, the end-of-slice, or some error occurs.
3728 */
3729 enum DecodeResult decode_substream(thread_context* tctx,
3730 bool block_wpp, // block on WPP dependencies
3731 int context_copy_ctbx, // copy CABAC-context after decoding this CTB
3732 context_model* context_storage) // copy CABAC-context to this storage space
3733 {
3734 decoder_context* ctx = tctx->decctx;
3735 const pic_parameter_set* pps = ctx->current_pps;
3736 const seq_parameter_set* sps = ctx->current_sps;
3737
3738 const int ctbW = sps->PicWidthInCtbsY;
3739
3740 do {
3741 const int ctbx = tctx->CtbX;
3742 const int ctby = tctx->CtbY;
3743
3744 if (block_wpp && ctby>0 && ctbx < ctbW-1) {
3745 //printf("wait on %d/%d\n",ctbx+1,ctby-1);
3746
3747 // TODO: ctx->img should be tctx->img
3748 de265_wait_for_progress(&ctx->img->ctb_progress[ctbx+1+(ctby-1)*ctbW],
3749 CTB_PROGRESS_PREFILTER);
3750 }
3751
3752 //printf("%p: decode %d|%d\n", tctx, tctx->CtbY,tctx->CtbX);
3753
3754
3755 // read and decode CTB
3756
3757 read_coding_tree_unit(ctx, tctx);
3758
3759 if (pps->entropy_coding_sync_enabled_flag &&
3760 ctbx == context_copy_ctbx &&
3761 ctby+1 < sps->PicHeightInCtbsY)
3762 {
3763 assert(context_storage);
3764 memcpy(context_storage,
3765 &tctx->ctx_model,
3766 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3767 }
3768
3769 // TODO: ctx->img should be tctx->img
3770 de265_announce_progress(&ctx->img->ctb_progress[ctbx+ctby*ctbW], CTB_PROGRESS_PREFILTER);
3771
3772 //printf("%p: decoded %d|%d\n",tctx, ctby,ctbx);
3773
3774
3775 // end of slice segment ?
3776
3777 int end_of_slice_segment_flag = decode_CABAC_term_bit(&tctx->cabac_decoder);
3778
3779 logtrace(LogSlice,"read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag);
3780
3781 const int lastCtbY = tctx->CtbY;
3782
3783 bool endOfPicture = advanceCtbAddr(tctx); // true if we read past the end of the image
3784
3785 if (endOfPicture &&
3786 end_of_slice_segment_flag == false)
3787 {
3788 add_warning(ctx, DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA, false);
3789 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
3790 return Decode_Error;
3791 }
3792
3793
3794 if (end_of_slice_segment_flag) {
3795 return Decode_EndOfSliceSegment;
3796 }
3797
3798
3799 if (!end_of_slice_segment_flag) {
3800 bool end_of_sub_stream = false;
3801 end_of_sub_stream |= (pps->tiles_enabled_flag &&
3802 pps->TileId[tctx->CtbAddrInTS] != pps->TileId[tctx->CtbAddrInTS-1]);
3803 end_of_sub_stream |= (pps->entropy_coding_sync_enabled_flag &&
3804 lastCtbY != tctx->CtbY);
3805
3806 if (end_of_sub_stream) {
3807 int end_of_sub_stream_one_bit = decode_CABAC_term_bit(&tctx->cabac_decoder);
3808 if (!end_of_sub_stream_one_bit) {
3809 add_warning(ctx, DE265_WARNING_EOSS_BIT_NOT_SET, false);
3810 ctx->img->integrity = INTEGRITY_DECODING_ERRORS;
3811 return Decode_Error;
3812 }
3813
3814 init_CABAC_decoder_2(&tctx->cabac_decoder); // byte alignment
3815 return Decode_EndOfSubstream;
3816 }
3817 }
3818
3819 } while (true);
3820 }
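
/* Note (informal): with block_wpp set, decoding of CTB (x,y) waits until CTB
   (x+1,y-1) in the row above has reached CTB_PROGRESS_PREFILTER, and after
   decoding the CTB in column context_copy_ctbx the CABAC context models are
   copied to context_storage so that the next row can start from them (WPP). */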
3821
3822
3823
3824 void thread_decode_slice_segment(void* d)
3825 {
3826 struct thread_task_ctb_row* data = (struct thread_task_ctb_row*)d;
3827 decoder_context* ctx = data->ctx;
3828 thread_context* tctx = &ctx->thread_context[data->thread_context_id];
3829
3830 setCtbAddrFromTS(tctx);
3831
3832 //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY);
3833
3834 initialize_CABAC(ctx,tctx);
3835 init_CABAC_decoder_2(&tctx->cabac_decoder);
3836
3837 /*enum DecodeResult result =*/ decode_substream(tctx, false, -1,NULL);
3838
3839 decrease_pending_tasks(ctx->img, 1);
3840
3841 return; // DE265_OK;
3842 }
3843
3844
3845 void thread_decode_CTB_row(void* d)
3846 {
3847 struct thread_task_ctb_row* data = (struct thread_task_ctb_row*)d;
3848 decoder_context* ctx = data->ctx;
3849 thread_context* tctx = &ctx->thread_context[data->thread_context_id];
3850
3851 seq_parameter_set* sps = ctx->current_sps;
3852 int ctbW = sps->PicWidthInCtbsY;
3853
3854 setCtbAddrFromTS(tctx);
3855
3856 int ctby = tctx->CtbAddrInRS / ctbW;
3857 int myCtbRow = ctby;
3858
3859 // printf("start decoding at %d/%d\n", ctbx,ctby);
3860
3861 if (data->initCABAC) {
3862 initialize_CABAC(ctx,tctx);
3863 }
3864
3865 init_CABAC_decoder_2(&tctx->cabac_decoder);
3866
3867 int destThreadContext = 0;
3868 if (ctby+1 < sps->PicHeightInCtbsY) {
3869 destThreadContext = ctx->img->ctb_info[0 + (ctby+1)*ctbW].thread_context_id;
3870 }
3871
3872 /*enum DecodeResult result =*/ decode_substream(tctx, true, 1,
3873 ctx->thread_context[destThreadContext].ctx_model);
3874
3875 // mark progress on remaining CTBs in row (in case of decoder error and early termination)
3876
3877 if (tctx->CtbY == myCtbRow) {
3878 int lastCtbX = sps->PicWidthInCtbsY; // assume no tiles when WPP is on
3879 for (int x = tctx->CtbX; x<lastCtbX ; x++) {
3880 de265_announce_progress(&ctx->img->ctb_progress[myCtbRow*ctbW + x], CTB_PROGRESS_PREFILTER);
3881 }
3882 }
3883
3884 decrease_pending_tasks(ctx->img, 1);
3885 }
3886
3887
3888 de265_error read_slice_segment_data(decoder_context* ctx, thread_context* tctx)
3889 {
3890 setCtbAddrFromTS(tctx);
3891
3892 const pic_parameter_set* pps = ctx->current_pps;
3893 slice_segment_header* shdr = tctx->shdr;
3894
3895 if (shdr->dependent_slice_segment_flag) {
3896 int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[shdr->slice_segment_address] -1 ];
3897
3898 slice_segment_header* prevCtbHdr = &ctx->slice[ ctx->img->ctb_info[prevCtb ].SliceHeaderIndex ];
3899
3900 if (is_tile_start_CTB(pps,
3901 shdr->slice_segment_address % ctx->current_sps->PicWidthInCtbsY,
3902 shdr->slice_segment_address / ctx->current_sps->PicWidthInCtbsY
3903 )) {
3904 initialize_CABAC(ctx,tctx);
3905 }
3906 else {
3907 memcpy(tctx->ctx_model,
3908 prevCtbHdr->ctx_model_storage,
3909 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3910 }
3911 }
3912 else {
3913 initialize_CABAC(ctx,tctx);
3914 }
3915
3916 init_CABAC_decoder_2(&tctx->cabac_decoder);
3917
3918 // printf("-----\n");
3919
3920 enum DecodeResult result;
3921 do {
3922 result = decode_substream(tctx, false, 1,
3923 shdr->ctx_model_storage);
3924
3925 if (result == Decode_EndOfSliceSegment ||
3926 result == Decode_Error) {
3927
3928 if (pps->dependent_slice_segments_enabled_flag) {
3929 memcpy(shdr->ctx_model_storage,
3930 tctx->ctx_model,
3931 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3932 }
3933 break;
3934 }
3935
3936 if (ctx->current_pps->entropy_coding_sync_enabled_flag) {
3937 memcpy(tctx->ctx_model,
3938 shdr->ctx_model_storage,
3939 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3940 }
3941
3942 if (ctx->current_pps->tiles_enabled_flag) {
3943 initialize_CABAC(ctx,tctx);
3944 }
3945 } while (true);
3946
3947 return DE265_OK;
3948 }
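
/* Note (informal): between sub-streams the CABAC state is re-established as
   coded above: for WPP the contexts saved in shdr->ctx_model_storage are
   restored, for tiles CABAC is re-initialized, and dependent slice segments
   start from the context models stored by the previous segment. */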
3949
3950
3951
3952 // &ctx->thread_context[destThreadContext].ctx_model,
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * Authors: struktur AG, Dirk Farin <farin@struktur.de>
5 * Min Chen <chenm003@163.com>
6 *
7 * This file is part of libde265.
8 *
9 * libde265 is free software: you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation, either version 3 of
12 * the License, or (at your option) any later version.
13 *
14 * libde265 is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "slice.h"
24 #include "motion.h"
25 #include "util.h"
26 #include "scan.h"
27 #include "intrapred.h"
28 #include "transform.h"
29 #include "threads.h"
30 #include "image.h"
31
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h>
35
36
37 #define LOCK de265_mutex_lock(&ctx->thread_pool.mutex)
38 #define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex)
39
40 extern bool read_short_term_ref_pic_set(decoder_context* ctx,
41 const seq_parameter_set* sps,
42 bitreader* br,
43 ref_pic_set* out_set,
44 int idxRps, // index of the set to be read
45 const std::vector<ref_pic_set>& sets,
46 bool sliceRefPicSet);
47
48
49 void read_coding_tree_unit(thread_context* tctx);
50 void read_coding_quadtree(thread_context* tctx,
51 int xCtb, int yCtb,
52 int Log2CtbSizeY,
53 int ctDepth);
54 int check_CTB_available(de265_image* img,
55 slice_segment_header* shdr,
56 int xC,int yC, int xN,int yN);
57 /*
58 void decode_inter_block(decoder_context* ctx,thread_context* tctx,
59 int xC, int yC, int log2CbSize);
60 */
61
62 bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx)
63 {
64 int vlc;
65
66 pic_parameter_set* pps = ctx->get_pps((int)shdr->slice_pic_parameter_set_id);
67 assert(pps);
68 seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id);
69 assert(sps);
70
71 shdr->luma_log2_weight_denom = vlc = get_uvlc(br);
72 if (vlc<0 || vlc>7) return false;
73
74 if (sps->chroma_format_idc != 0) {
75 vlc = get_svlc(br);
76 vlc += shdr->luma_log2_weight_denom;
77 if (vlc<0 || vlc>7) return false;
78 shdr->ChromaLog2WeightDenom = vlc;
79 }
80
81 int sumWeightFlags = 0;
82
83 for (int l=0;l<=1;l++)
84 if (l==0 || (l==1 && shdr->slice_type == SLICE_TYPE_B))
85 {
86 int num_ref = (l==0 ? shdr->num_ref_idx_l0_active-1 : shdr->num_ref_idx_l1_active-1);
87
88 for (int i=0;i<=num_ref;i++) {
89 shdr->luma_weight_flag[l][i] = get_bits(br,1);
90 if (shdr->luma_weight_flag[l][i]) sumWeightFlags++;
91 }
92
93 if (sps->chroma_format_idc != 0) {
94 for (int i=0;i<=num_ref;i++) {
95 shdr->chroma_weight_flag[l][i] = get_bits(br,1);
96 if (shdr->chroma_weight_flag[l][i]) sumWeightFlags+=2;
97 }
98 }
99
100 for (int i=0;i<=num_ref;i++) {
101 if (shdr->luma_weight_flag[l][i]) {
102
103 // delta_luma_weight
104
105 vlc = get_svlc(br);
106 if (vlc < -128 || vlc > 127) return false;
107
108 shdr->LumaWeight[l][i] = (1<<shdr->luma_log2_weight_denom) + vlc;
109
110 // luma_offset
111
112 vlc = get_svlc(br);
113 if (vlc < -128 || vlc > 127) return false;
114 shdr->luma_offset[l][i] = vlc;
115 }
116 else {
117 shdr->LumaWeight[l][i] = 1<<shdr->luma_log2_weight_denom;
118 shdr->luma_offset[l][i] = 0;
119 }
120
121 if (shdr->chroma_weight_flag[l][i])
122 for (int j=0;j<2;j++) {
123 // delta_chroma_weight
124
125 vlc = get_svlc(br);
126 if (vlc < -128 || vlc > 127) return false;
127
128 shdr->ChromaWeight[l][i][j] = (1<<shdr->ChromaLog2WeightDenom) + vlc;
129
130 // delta_chroma_offset
131
132 vlc = get_svlc(br);
133 if (vlc < -512 || vlc > 511) return false;
134
135 vlc = Clip3(-128,127, (vlc-((128*shdr->ChromaWeight[l][i][j])
136 >> shdr->ChromaLog2WeightDenom) + 128));
137
138 shdr->ChromaOffset[l][i][j] = vlc;
139 }
140 else {
141 for (int j=0;j<2;j++) {
142 shdr->ChromaWeight[l][i][j] = 1<<shdr->ChromaLog2WeightDenom;
143 shdr->ChromaOffset[l][i][j] = 0;
144 }
145 }
146 }
147 }
148
149 // TODO: bitstream conformance requires that 'sumWeightFlags<=24'
150
151 return true;
152 }
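
/* Worked example (informal): with luma_log2_weight_denom = 6 and a coded
   delta_luma_weight of -2, LumaWeight becomes (1<<6) - 2 = 62; entries whose
   weight flag is 0 keep the default weight 1<<denom and offset 0, i.e. no
   effective weighting. */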
153
154
155 de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
156 bool* continueDecoding)
157 {
158 *continueDecoding = false;
159
160 // set defaults
161
162 dependent_slice_segment_flag = 0;
163
164
165 // read bitstream
166
167 first_slice_segment_in_pic_flag = get_bits(br,1);
168
169 if (ctx->get_RapPicFlag()) { // TODO: is this still correct ? Should we drop RapPicFlag ?
170 no_output_of_prior_pics_flag = get_bits(br,1);
171 }
172
173 slice_pic_parameter_set_id = get_uvlc(br);
174 if (slice_pic_parameter_set_id > DE265_MAX_PPS_SETS ||
175 slice_pic_parameter_set_id == UVLC_ERROR) {
176 ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
177 return DE265_OK;
178 }
179
180 pic_parameter_set* pps = ctx->get_pps((int)slice_pic_parameter_set_id);
181 if (!pps->pps_read) {
182 ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
183 return DE265_OK;
184 }
185
186 seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id);
187 if (!sps->sps_read) {
188 ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
189 *continueDecoding = false;
190 return DE265_OK;
191 }
192
193 if (!first_slice_segment_in_pic_flag) {
194 if (pps->dependent_slice_segments_enabled_flag) {
195 dependent_slice_segment_flag = get_bits(br,1);
196 } else {
197 dependent_slice_segment_flag = 0;
198 }
199
200 int slice_segment_address = get_bits(br, ceil_log2(sps->PicSizeInCtbsY));
201
202 if (dependent_slice_segment_flag) {
203 if (slice_segment_address == 0) {
204 *continueDecoding = false;
205 ctx->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false);
206 return DE265_OK;
207 }
208
209 *this = *ctx->previous_slice_header;
210
211 first_slice_segment_in_pic_flag = 0;
212 dependent_slice_segment_flag = 1;
213 }
214
215 this->slice_segment_address = slice_segment_address;
216 } else {
217 dependent_slice_segment_flag = 0;
218 slice_segment_address = 0;
219 }
220
221 if (slice_segment_address < 0 ||
222 slice_segment_address > sps->PicSizeInCtbsY) {
223 ctx->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false);
224 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
225 }
226
227
228
229 if (!dependent_slice_segment_flag) {
230 for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
231 //slice_reserved_undetermined_flag[i]
232 skip_bits(br,1);
233 }
234
235 slice_type = get_uvlc(br);
236 if (slice_type > 2 ||
237 slice_type == UVLC_ERROR) {
238 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
239 *continueDecoding = false;
240 return DE265_OK;
241 }
242
243 if (pps->output_flag_present_flag) {
244 pic_output_flag = get_bits(br,1);
245 }
246 else {
247 pic_output_flag = 1;
248 }
249
250 if (sps->separate_colour_plane_flag == 1) {
251 colour_plane_id = get_bits(br,1);
252 }
253
254
255 slice_pic_order_cnt_lsb = 0;
256 short_term_ref_pic_set_sps_flag = 0;
257
258 int NumLtPics = 0;
259
260 if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL &&
261 ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) {
262 slice_pic_order_cnt_lsb = get_bits(br, sps->log2_max_pic_order_cnt_lsb);
263 short_term_ref_pic_set_sps_flag = get_bits(br,1);
264
265 if (!short_term_ref_pic_set_sps_flag) {
266 read_short_term_ref_pic_set(ctx, sps,
267 br, &slice_ref_pic_set,
268 sps->num_short_term_ref_pic_sets,
269 sps->ref_pic_sets,
270 true);
271
272 CurrRpsIdx = sps->num_short_term_ref_pic_sets;
273 CurrRps = slice_ref_pic_set;
274 }
275 else {
276 int nBits = ceil_log2(sps->num_short_term_ref_pic_sets);
277 if (nBits>0) short_term_ref_pic_set_idx = get_bits(br,nBits);
278 else short_term_ref_pic_set_idx = 0;
279
280 if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets) {
281 ctx->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false);
282 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
283 }
284
285 CurrRpsIdx = short_term_ref_pic_set_idx;
286 CurrRps = sps->ref_pic_sets[CurrRpsIdx];
287 }
288
289
290 // --- long-term MC ---
291
292 if (sps->long_term_ref_pics_present_flag) {
293 if (sps->num_long_term_ref_pics_sps > 0) {
294 num_long_term_sps = get_uvlc(br);
295 }
296 else {
297 num_long_term_sps = 0;
298 }
299
300 num_long_term_pics= get_uvlc(br);
301
302
303 // check maximum number of reference frames
304
305 if (num_long_term_sps +
306 num_long_term_pics +
307 CurrRps.NumNegativePics +
308 CurrRps.NumPositivePics
309 > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1])
310 {
311 ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
312 *continueDecoding = false;
313 return DE265_OK;
314 }
315
316 for (int i=0; i<num_long_term_sps + num_long_term_pics; i++) {
317 if (i < num_long_term_sps) {
318 int nBits = ceil_log2(sps->num_long_term_ref_pics_sps);
319 lt_idx_sps[i] = get_bits(br, nBits);
320
321 // check that the referenced lt-reference really exists
322
323 if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) {
324 ctx->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false);
325 *continueDecoding = false;
326 return DE265_OK;
327 }
328
329 // delta_poc_msb_present_flag[i] = 0; // TODO ?
330
331 ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ];
332 ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ];
333 }
334 else {
335 int nBits = sps->log2_max_pic_order_cnt_lsb;
336 poc_lsb_lt[i] = get_bits(br, nBits);
337 used_by_curr_pic_lt_flag[i] = get_bits(br,1);
338
339 ctx->PocLsbLt[i] = poc_lsb_lt[i];
340 ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i];
341 }
342
343 if (ctx->UsedByCurrPicLt[i]) {
344 NumLtPics++;
345 }
346
347 delta_poc_msb_present_flag[i] = get_bits(br,1);
348 if (delta_poc_msb_present_flag[i]) {
349 delta_poc_msb_cycle_lt[i] = get_uvlc(br);
350 }
351 else {
352 delta_poc_msb_cycle_lt[i] = 0;
353 }
354
355 if (i==0 || i==num_long_term_sps) {
356 ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i];
357 }
358 else {
359 ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] +
360 ctx->DeltaPocMsbCycleLt[i-1]);
361 }
362 }
363 }
364 else {
365 num_long_term_sps = 0;
366 num_long_term_pics= 0;
367 }
368
369 if (sps->sps_temporal_mvp_enabled_flag) {
370 slice_temporal_mvp_enabled_flag = get_bits(br,1);
371 }
372 else {
373 slice_temporal_mvp_enabled_flag = 0;
374 }
375 }
376 else {
377 slice_pic_order_cnt_lsb = 0;
378 num_long_term_sps = 0;
379 num_long_term_pics= 0;
380 }
381
382
383 // --- SAO ---
384
385 if (sps->sample_adaptive_offset_enabled_flag) {
386 slice_sao_luma_flag = get_bits(br,1);
387 slice_sao_chroma_flag = get_bits(br,1);
388 }
389 else {
390 slice_sao_luma_flag = 0;
391 slice_sao_chroma_flag = 0;
392 }
393
394 if (slice_type == SLICE_TYPE_P ||
395 slice_type == SLICE_TYPE_B) {
396 num_ref_idx_active_override_flag = get_bits(br,1);
397 if (num_ref_idx_active_override_flag) {
398 num_ref_idx_l0_active = get_uvlc(br);
399 if (num_ref_idx_l0_active == UVLC_ERROR) {
400 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
401 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
402 }
403       num_ref_idx_l0_active++;
404
405 if (slice_type == SLICE_TYPE_B) {
406 num_ref_idx_l1_active = get_uvlc(br);
407 if (num_ref_idx_l1_active == UVLC_ERROR) {
408 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
409 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
410 }
411 num_ref_idx_l1_active++;
412 }
413 }
414 else {
415 num_ref_idx_l0_active = pps->num_ref_idx_l0_default_active;
416 num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active;
417 }
418
419 NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics;
420
421 if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) {
422
423 int nBits = ceil_log2(NumPocTotalCurr);
424
425 ref_pic_list_modification_flag_l0 = get_bits(br,1);
426 if (ref_pic_list_modification_flag_l0) {
427 for (int i=0;i<num_ref_idx_l0_active;i++) {
428 list_entry_l0[i] = get_bits(br, nBits);
429 }
430 }
431
432 if (slice_type == SLICE_TYPE_B) {
433 ref_pic_list_modification_flag_l1 = get_bits(br,1);
434 if (ref_pic_list_modification_flag_l1) {
435 for (int i=0;i<num_ref_idx_l1_active;i++) {
436 list_entry_l1[i] = get_bits(br, nBits);
437 }
438 }
439 }
440 else {
441 ref_pic_list_modification_flag_l1 = 0;
442 }
443 }
444 else {
445 ref_pic_list_modification_flag_l0 = 0;
446 ref_pic_list_modification_flag_l1 = 0;
447 }
448
449 if (slice_type == SLICE_TYPE_B) {
450 mvd_l1_zero_flag = get_bits(br,1);
451 }
452
453 if (pps->cabac_init_present_flag) {
454 cabac_init_flag = get_bits(br,1);
455 }
456 else {
457 cabac_init_flag = 0;
458 }
459
460 if (slice_temporal_mvp_enabled_flag) {
461 if (slice_type == SLICE_TYPE_B)
462 collocated_from_l0_flag = get_bits(br,1);
463 else
464 collocated_from_l0_flag = 1;
465
466 if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) ||
467 (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) {
468 collocated_ref_idx = get_uvlc(br);
469 if (collocated_ref_idx == UVLC_ERROR) {
470 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
471 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
472 }
473 }
474 else {
475 collocated_ref_idx = 0;
476 }
477 }
478
479 if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) ||
480 (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) {
481
482 if (!read_pred_weight_table(br,this,ctx))
483 {
484 ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
485 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
486 }
487 }
488
489 five_minus_max_num_merge_cand = get_uvlc(br);
490 if (five_minus_max_num_merge_cand == UVLC_ERROR) {
491 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
492 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
493 }
494 MaxNumMergeCand = 5-five_minus_max_num_merge_cand;
495 }
496
497 slice_qp_delta = get_svlc(br);
498 if (slice_qp_delta == UVLC_ERROR) {
499 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
500 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
501 }
502 //logtrace(LogSlice,"slice_qp_delta: %d\n",shdr->slice_qp_delta);
503
504 if (pps->pps_slice_chroma_qp_offsets_present_flag) {
505 slice_cb_qp_offset = get_svlc(br);
506 if (slice_cb_qp_offset == UVLC_ERROR) {
507 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
508 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
509 }
510
511 slice_cr_qp_offset = get_svlc(br);
512 if (slice_cr_qp_offset == UVLC_ERROR) {
513 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
514 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
515 }
516 }
517 else {
518 slice_cb_qp_offset = 0;
519 slice_cr_qp_offset = 0;
520 }
521
522 if (pps->deblocking_filter_override_enabled_flag) {
523 deblocking_filter_override_flag = get_bits(br,1);
524 }
525 else {
526 deblocking_filter_override_flag = 0;
527 }
528
529 slice_beta_offset = pps->beta_offset;
530 slice_tc_offset = pps->tc_offset;
531
532 if (deblocking_filter_override_flag) {
533 slice_deblocking_filter_disabled_flag = get_bits(br,1);
534 if (!slice_deblocking_filter_disabled_flag) {
535 slice_beta_offset = get_svlc(br);
536 if (slice_beta_offset == UVLC_ERROR) {
537 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
538 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
539 }
540 slice_beta_offset *= 2;
541
542 slice_tc_offset = get_svlc(br);
543 if (slice_tc_offset == UVLC_ERROR) {
544 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
545 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
546 }
547 slice_tc_offset *= 2;
548 }
549 }
550 else {
551 slice_deblocking_filter_disabled_flag = pps->pic_disable_deblocking_filter_flag;
552 }
553
554 if (pps->pps_loop_filter_across_slices_enabled_flag &&
555 (slice_sao_luma_flag || slice_sao_chroma_flag ||
556 !slice_deblocking_filter_disabled_flag )) {
557 slice_loop_filter_across_slices_enabled_flag = get_bits(br,1);
558 }
559 else {
560 slice_loop_filter_across_slices_enabled_flag =
561 pps->pps_loop_filter_across_slices_enabled_flag;
562 }
563 }
564
565 if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) {
566 num_entry_point_offsets = get_uvlc(br);
567 if (num_entry_point_offsets == UVLC_ERROR) {
568 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
569 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
570 }
571
572 entry_point_offset.resize( num_entry_point_offsets );
573
574 if (num_entry_point_offsets > 0) {
575 offset_len = get_uvlc(br);
576 if (offset_len == UVLC_ERROR) {
577 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
578 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
579 }
580 offset_len++;
581
582 for (int i=0; i<num_entry_point_offsets; i++) {
583 {
584 entry_point_offset[i] = get_bits(br,offset_len)+1;
585 }
586
587 if (i>0) {
588 entry_point_offset[i] += entry_point_offset[i-1];
589 }
590 }
591 }
592 }
593 else {
594 num_entry_point_offsets = 0;
595 }
596
597 if (pps->slice_segment_header_extension_present_flag) {
598 slice_segment_header_extension_length = get_uvlc(br);
599 if (slice_segment_header_extension_length == UVLC_ERROR ||
600 slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values
601 ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
602 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
603 }
604
605 for (int i=0; i<slice_segment_header_extension_length; i++) {
606 //slice_segment_header_extension_data_byte[i]
607 get_bits(br,8);
608 }
609 }
610
611
612 // --- init variables ---
613
614 SliceQPY = pps->pic_init_qp + slice_qp_delta;
615
616 switch (slice_type)
617 {
618 case SLICE_TYPE_I: initType = 0; break;
619 case SLICE_TYPE_P: initType = cabac_init_flag + 1; break;
620 case SLICE_TYPE_B: initType = 2 - cabac_init_flag; break;
621 }
622
623 *continueDecoding = true;
624 return DE265_OK;
625 }
626
627
628
629 //-----------------------------------------------------------------------
630
631
632 void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx, int fd) const
633 {
634 FILE* fh;
635 if (fd==1) fh=stdout;
636 else if (fd==2) fh=stderr;
637 else { return; }
638
639 #define LOG0(t) log2fh(fh, t)
640 #define LOG1(t,d) log2fh(fh, t,d)
641 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
642 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
643 #define LOG4(t,d1,d2,d3,d4) log2fh(fh, t,d1,d2,d3,d4)
644
645 const pic_parameter_set* pps = ctx->get_pps(slice_pic_parameter_set_id);
646 assert(pps->pps_read); // TODO: error handling
647
648 const seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id);
649 assert(sps->sps_read); // TODO: error handling
650
651
652 LOG0("----------------- SLICE -----------------\n");
653 LOG1("first_slice_segment_in_pic_flag : %d\n", first_slice_segment_in_pic_flag);
654 if (ctx->get_nal_unit_type() >= NAL_UNIT_BLA_W_LP &&
655 ctx->get_nal_unit_type() <= NAL_UNIT_RESERVED_IRAP_VCL23) {
656 LOG1("no_output_of_prior_pics_flag : %d\n", no_output_of_prior_pics_flag);
657 }
658
659 LOG1("slice_pic_parameter_set_id : %d\n", slice_pic_parameter_set_id);
660
661 if (!first_slice_segment_in_pic_flag) {
662 if (pps->dependent_slice_segments_enabled_flag) {
663 LOG1("dependent_slice_segment_flag : %d\n", dependent_slice_segment_flag);
664 }
665 LOG1("slice_segment_address : %d\n", slice_segment_address);
666 }
667
668 //if (!dependent_slice_segment_flag)
669 {
670 //for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
671 //slice_reserved_flag[i]
672
673 LOG1("slice_type : %c\n",
674 slice_type == 0 ? 'B' :
675 slice_type == 1 ? 'P' : 'I');
676
677 if (pps->output_flag_present_flag) {
678 LOG1("pic_output_flag : %d\n", pic_output_flag);
679 }
680
681 if (sps->separate_colour_plane_flag == 1) {
682 LOG1("colour_plane_id : %d\n", colour_plane_id);
683 }
684
685 LOG1("slice_pic_order_cnt_lsb : %d\n", slice_pic_order_cnt_lsb);
686
687 if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL &&
688 ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) {
689 LOG1("short_term_ref_pic_set_sps_flag : %d\n", short_term_ref_pic_set_sps_flag);
690
691 if (!short_term_ref_pic_set_sps_flag) {
692 LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets);
693 dump_compact_short_term_ref_pic_set(&slice_ref_pic_set, 16, fh);
694 }
695 else if (sps->num_short_term_ref_pic_sets > 1) {
696 LOG1("short_term_ref_pic_set_idx : %d\n", short_term_ref_pic_set_idx);
697 dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[short_term_ref_pic_set_idx], 16, fh);
698 }
699
700 if (sps->long_term_ref_pics_present_flag) {
701 if (sps->num_long_term_ref_pics_sps > 0) {
702 LOG1("num_long_term_sps : %d\n", num_long_term_sps);
703 }
704
705 LOG1("num_long_term_pics : %d\n", num_long_term_pics);
706
707 #if 0
708 for (int i=0; i<num_long_term_sps + num_long_term_pics; i++) {
709 LOG2("PocLsbLt[%d] : %d\n", i, ctx->PocLsbLt[i]);
710 LOG2("UsedByCurrPicLt[%d] : %d\n", i, ctx->UsedByCurrPicLt[i]);
711 LOG2("DeltaPocMsbCycleLt[%d] : %d\n", i, ctx->DeltaPocMsbCycleLt[i]);
712 }
713 #endif
714 }
715
716 if (sps->sps_temporal_mvp_enabled_flag) {
717 LOG1("slice_temporal_mvp_enabled_flag : %d\n", slice_temporal_mvp_enabled_flag);
718 }
719 }
720
721
722 if (sps->sample_adaptive_offset_enabled_flag) {
723 LOG1("slice_sao_luma_flag : %d\n", slice_sao_luma_flag);
724 LOG1("slice_sao_chroma_flag : %d\n", slice_sao_chroma_flag);
725 }
726
727
728 if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) {
729 LOG1("num_ref_idx_active_override_flag : %d\n", num_ref_idx_active_override_flag);
730
731 LOG2("num_ref_idx_l0_active : %d %s\n", num_ref_idx_l0_active,
732 num_ref_idx_active_override_flag ? "" : "(from PPS)");
733
734 if (slice_type == SLICE_TYPE_B) {
735 LOG2("num_ref_idx_l1_active : %d %s\n", num_ref_idx_l1_active,
736 num_ref_idx_active_override_flag ? "" : "(from PPS)");
737 }
738
739 if (pps->lists_modification_present_flag && NumPocTotalCurr > 1)
740 {
741 LOG1("ref_pic_list_modification_flag_l0 : %d\n", ref_pic_list_modification_flag_l0);
742 if (ref_pic_list_modification_flag_l0) {
743 for (int i=0;i<num_ref_idx_l0_active;i++) {
744 LOG2(" %d: %d\n",i,list_entry_l0[i]);
745 }
746 }
747
748 LOG1("ref_pic_list_modification_flag_l1 : %d\n", ref_pic_list_modification_flag_l1);
749 if (ref_pic_list_modification_flag_l1) {
750 for (int i=0;i<num_ref_idx_l1_active;i++) {
751 LOG2(" %d: %d\n",i,list_entry_l1[i]);
752 }
753 }
754 }
755
756 if (slice_type == SLICE_TYPE_B) {
757 LOG1("mvd_l1_zero_flag : %d\n", mvd_l1_zero_flag);
758 }
759
760 LOG1("cabac_init_flag : %d\n", cabac_init_flag);
761
762 if (slice_temporal_mvp_enabled_flag) {
763 LOG1("collocated_from_l0_flag : %d\n", collocated_from_l0_flag);
764 LOG1("collocated_ref_idx : %d\n", collocated_ref_idx);
765 }
766
767 if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) ||
768 (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B))
769 {
770 LOG1("luma_log2_weight_denom : %d\n", luma_log2_weight_denom);
771 if (sps->chroma_format_idc != 0) {
772 LOG1("ChromaLog2WeightDenom : %d\n", ChromaLog2WeightDenom);
773 }
774
775 for (int l=0;l<=1;l++)
776 if (l==0 || (l==1 && slice_type == SLICE_TYPE_B))
777 {
778 int num_ref = (l==0 ?
779 num_ref_idx_l0_active-1 :
780 num_ref_idx_l1_active-1);
781
782 if (false) { // do not show these flags
783 for (int i=0;i<=num_ref;i++) {
784 LOG3("luma_weight_flag_l%d[%d] : %d\n",l,i,luma_weight_flag[l][i]);
785 }
786
787 if (sps->chroma_format_idc != 0) {
788 for (int i=0;i<=num_ref;i++) {
789 LOG3("chroma_weight_flag_l%d[%d] : %d\n",l,i,chroma_weight_flag[l][i]);
790 }
791 }
792 }
793
794 for (int i=0;i<=num_ref;i++) {
795 LOG3("LumaWeight_L%d[%d] : %d\n",l,i,LumaWeight[l][i]);
796 LOG3("luma_offset_l%d[%d] : %d\n",l,i,luma_offset[l][i]);
797
798 for (int j=0;j<2;j++) {
799 LOG4("ChromaWeight_L%d[%d][%d] : %d\n",l,i,j,ChromaWeight[l][i][j]);
800 LOG4("ChromaOffset_L%d[%d][%d] : %d\n",l,i,j,ChromaOffset[l][i][j]);
801 }
802 }
803 }
804 }
805
806 LOG1("five_minus_max_num_merge_cand : %d\n", five_minus_max_num_merge_cand);
807 }
808
809
810 LOG1("slice_qp_delta : %d\n", slice_qp_delta);
811 if (pps->pps_slice_chroma_qp_offsets_present_flag) {
812 LOG1("slice_cb_qp_offset : %d\n", slice_cb_qp_offset);
813 LOG1("slice_cr_qp_offset : %d\n", slice_cr_qp_offset);
814 }
815
816 if (pps->deblocking_filter_override_enabled_flag) {
817 LOG1("deblocking_filter_override_flag : %d\n", deblocking_filter_override_flag);
818 }
819
820 LOG2("slice_deblocking_filter_disabled_flag : %d %s\n",
821 slice_deblocking_filter_disabled_flag,
822 (deblocking_filter_override_flag ? "(override)" : "(from pps)"));
823
824 if (deblocking_filter_override_flag) {
825
826 if (!slice_deblocking_filter_disabled_flag) {
827 LOG1("slice_beta_offset : %d\n", slice_beta_offset);
828 LOG1("slice_tc_offset : %d\n", slice_tc_offset);
829 }
830 }
831
832 if (pps->pps_loop_filter_across_slices_enabled_flag &&
833 (slice_sao_luma_flag || slice_sao_chroma_flag ||
834 !slice_deblocking_filter_disabled_flag)) {
835 LOG1("slice_loop_filter_across_slices_enabled_flag : %d\n",
836 slice_loop_filter_across_slices_enabled_flag);
837 }
838 }
839
840 if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) {
841 LOG1("num_entry_point_offsets : %d\n", num_entry_point_offsets);
842
843 if (num_entry_point_offsets > 0) {
844 LOG1("offset_len : %d\n", offset_len);
845
846 for (int i=0; i<num_entry_point_offsets; i++) {
847 LOG2("entry point [%i] : %d\n", i, entry_point_offset[i]);
848 }
849 }
850 }
851
852 /*
853 if( slice_segment_header_extension_present_flag ) {
854 slice_segment_header_extension_length
855 for( i = 0; i < slice_segment_header_extension_length; i++)
856 slice_segment_header_extension_data_byte[i]
857 }
858 byte_alignment()
859 }
860 */
861
862 #undef LOG0
863 #undef LOG1
864 #undef LOG2
865 #undef LOG3
866 #undef LOG4
867 //#endif
868 }
869
870
871
872
873
874 static void set_initValue(slice_segment_header* shdr,
875 context_model* model, int initValue)
876 {
877 int slopeIdx = initValue >> 4;
878 int intersecIdx = initValue & 0xF;
879 int m = slopeIdx*5 - 45;
880 int n = (intersecIdx<<3) - 16;
881 int preCtxState = Clip3(1,126, ((m*Clip3(0,51, shdr->SliceQPY))>>4)+n);
882
883 logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",shdr->SliceQPY,slopeIdx,intersecIdx,m,n);
884
885 model->MPSbit=(preCtxState<=63) ? 0 : 1;
886 model->state = model->MPSbit ? (preCtxState-64) : (63-preCtxState);
887
888 // model state will always be between [0;62]
889
890 assert(model->state <= 62);
891 }
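
/* Worked example (informal): initValue = 154 gives slopeIdx = 9 and
   intersecIdx = 10, hence m = 0 and n = 64, so preCtxState = 64 for every
   SliceQPY; this yields MPSbit = 1 and state = 0, i.e. a near-equiprobable
   context independent of the slice QP. */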
892
893
894 static const int initValue_split_cu_flag[3][3] = {
895 { 139,141,157 },
896 { 107,139,126 },
897 { 107,139,126 },
898 };
899 static const int initValue_cu_skip_flag[2][3] = {
900 { 197,185,201 },
901 { 197,185,201 },
902 };
903 static const int initValue_part_mode[9] = { 184,154,139, 154,154,154, 139,154,154 };
904 static const int initValue_prev_intra_luma_pred_flag[3] = { 184,154,183 };
905 static const int initValue_intra_chroma_pred_mode[3] = { 63,152,152 };
906 static const int initValue_cbf_luma[4] = { 111,141,153,111 };
907 static const int initValue_cbf_chroma[12] = { 94,138,182,154,149,107,167,154,149,92,167,154 };
908 static const int initValue_split_transform_flag[9] = { 153,138,138, 124,138,94, 224,167,122 }; // FIX712
909 static const int initValue_last_significant_coefficient_prefix[54] = {
910 110,110,124,125,140,153,125,127,140,109,111,143,127,111, 79,108,123, 63,
911 125,110, 94,110, 95, 79,125,111,110, 78,110,111,111, 95, 94,108,123,108,
912 125,110,124,110, 95, 94,125,111,111, 79,125,126,111,111, 79,108,123, 93
913 };
914 static const int initValue_coded_sub_block_flag[12] = { 91,171,134,141,121,140,61,154,121,140,61,154 };
915 static const int initValue_significant_coeff_flag[3][42] = {
916 {
917 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, 125, 107,
918 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, 139, 182, 182, 152,
919 136, 152, 136, 153, 136, 139, 111, 136, 139, 111
920 },
921 {
922 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, 154, 166,
923 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 123, 123, 107,
924 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
925 },
926 {
927 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, 154, 166,
928 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 138, 138, 122,
929 121, 122, 121, 167, 151, 183, 140, 151, 183, 140
930 },
931 };
932 static const int initValue_coeff_abs_level_greater1_flag[72] = {
933 140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,
934 140,179,166,182,140,227,122,197,154,196,196,167,154,152,167,182,
935 182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182,
936 154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,
937 169,208,166,167,154,152,167,182
938 };
939 static const int initValue_coeff_abs_level_greater2_flag[18] = {
940 138,153,136,167,152,152,107,167, 91,122,107,167,
941 107,167, 91,107,107,167
942 };
943 static const int initValue_sao_merge_leftUp_flag[3] = { 153,153,153 };
944 static const int initValue_sao_type_idx_lumaChroma_flag[3] = { 200,185,160 };
945 static const int initValue_cu_qp_delta_abs[2] = { 154,154 };
946 static const int initValue_transform_skip_flag[2] = { 139,139 };
947 static const int initValue_merge_flag[2] = { 110,154 };
948 static const int initValue_merge_idx[2] = { 122,137 };
949 static const int initValue_pred_mode_flag[2] = { 149,134 };
950 static const int initValue_abs_mvd_greater01_flag[4] = { 140,198,169,198 };
951 static const int initValue_mvp_lx_flag[1] = { 168 };
952 static const int initValue_rqt_root_cbf[1] = { 79 };
953 static const int initValue_ref_idx_lX[2] = { 153,153 };
954 static const int initValue_inter_pred_idc[5] = { 95,79,63,31,31 };
955 static const int initValue_cu_transquant_bypass_flag[3] = { 154,154,154 };
956
957
958 static void init_context(thread_context* tctx,
959 enum context_model_indices idx,
960 const int* initValues, int len)
961 {
962 for (int i=0;i<len;i++)
963 {
964 set_initValue(tctx->shdr,
965 &tctx->ctx_model[idx+i],
966 initValues[i]);
967 }
968 }
969
970
971 static int decode_transform_skip_flag(thread_context* tctx, int cIdx)
972 {
973 const int context = (cIdx==0) ? 0 : 1;
974
975 logtrace(LogSlice,"# transform_skip_flag (context=%d)\n",context);
976
977 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
978 &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG+context]);
979 return bit;
980 }
981
982
983 static int decode_sao_merge_flag(thread_context* tctx)
984 {
985 logtrace(LogSlice,"# sao_merge_left/up_flag\n");
986 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
987 &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]);
988 return bit;
989 }
990
991
992
993 static int decode_sao_type_idx(thread_context* tctx)
994 {
995 logtrace(LogSlice,"# sao_type_idx_luma/chroma\n");
996
997 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder,
998 &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]);
999
1000 if (bit0==0) {
1001 return 0;
1002 }
1003 else {
1004 int bit1 = decode_CABAC_bypass(&tctx->cabac_decoder);
1005 if (bit1==0) {
1006 return 1;
1007 }
1008 else {
1009 return 2;
1010 }
1011 }
1012 }
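
/* Note (informal): the bins read above form a truncated code: "0" -> 0,
   "10" -> 1, "11" -> 2; only the first bin is context-coded, the second is a
   bypass bin. Per the HEVC SAO semantics these values select SAO off, band
   offset and edge offset, respectively. */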
1013
1014
1015 static int decode_sao_offset_abs(thread_context* tctx)
1016 {
1017 logtrace(LogSlice,"# sao_offset_abs\n");
1018 int bitDepth = 8;
1019 int cMax = (1<<(libde265_min(bitDepth,10)-5))-1;
1020 int value = decode_CABAC_TU_bypass(&tctx->cabac_decoder, cMax);
1021 return value;
1022 }
1023
1024
1025 static int decode_sao_class(thread_context* tctx)
1026 {
1027 logtrace(LogSlice,"# sao_class\n");
1028 int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2);
1029 return value;
1030 }
1031
1032
1033 static int decode_sao_offset_sign(thread_context* tctx)
1034 {
1035 logtrace(LogSlice,"# sao_offset_sign\n");
1036 int value = decode_CABAC_bypass(&tctx->cabac_decoder);
1037 return value;
1038 }
1039
1040
1041 static int decode_sao_band_position(thread_context* tctx)
1042 {
1043 logtrace(LogSlice,"# sao_band_position\n");
1044 int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder,5);
1045 return value;
1046 }
1047
1048
1049 static int decode_transquant_bypass_flag(thread_context* tctx)
1050 {
1051 logtrace(LogSlice,"# cu_transquant_bypass_enable_flag\n");
1052 int value = decode_CABAC_bit(&tctx->cabac_decoder,
1053 &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]);
1054 return value;
1055 }
1056
1057
1058 #include <sys/types.h>
1059 #include <signal.h>
1060
1061 static int decode_split_cu_flag(thread_context* tctx,
1062 int x0, int y0, int ctDepth)
1063 {
1064 //decoder_context* ctx = tctx->decctx;
1065
1069
1070 // check if neighbors are available
1071
1072 int availableL = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0-1,y0);
1073 int availableA = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0,y0-1);
1074
1075 int condL = 0;
1076 int condA = 0;
1077
1078 if (availableL && tctx->img->get_ctDepth(x0-1,y0) > ctDepth) condL=1;
1079 if (availableA && tctx->img->get_ctDepth(x0,y0-1) > ctDepth) condA=1;
1080
1081 int contextOffset = condL + condA;
1082 int context = contextOffset;
1083
1084 // decode bit
1085
1086 logtrace(LogSlice,"# split_cu_flag context=%d R=%x\n", context, tctx->cabac_decoder.range);
1087
1088 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_CU_FLAG + context]);
1089
1090 logtrace(LogSlice,"> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
1091
1092 return bit;
1093 }
1094
1095
1096 static int decode_cu_skip_flag(thread_context* tctx,
1097 int x0, int y0, int ctDepth)
1098 {
1099 decoder_context* ctx = tctx->decctx;
1100
1101 // check if neighbors are available
1102
1103 int availableL = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0-1,y0);
1104 int availableA = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0,y0-1);
1105
1106 int condL = 0;
1107 int condA = 0;
1108
1109 if (availableL && tctx->img->get_cu_skip_flag(x0-1,y0)) condL=1;
1110 if (availableA && tctx->img->get_cu_skip_flag(x0,y0-1)) condA=1;
1111
1112 int contextOffset = condL + condA;
1113 int context = contextOffset;
1114
1115 // decode bit
1116
1117 logtrace(LogSlice,"# cu_skip_flag context=%d R=%x\n", context, tctx->cabac_decoder.range);
1118
1119 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CU_SKIP_FLAG + context]);
1120
1121 logtrace(LogSlice,"> cu_skip_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
1122
1123 return bit;
1124 }
1125
1126
1127 static enum PartMode decode_part_mode(thread_context* tctx,
1128 enum PredMode pred_mode, int cLog2CbSize)
1129 {
1130 de265_image* img = tctx->img;
1131
1132 if (pred_mode == MODE_INTRA) {
1133 logtrace(LogSlice,"# part_mode (INTRA)\n");
1134
1135 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE]);
1136
1137 logtrace(LogSlice,"> %s\n",bit ? "2Nx2N" : "NxN");
1138
1139 return bit ? PART_2Nx2N : PART_NxN;
1140 }
1141 else {
1142 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+0]);
1143 if (bit0) { return PART_2Nx2N; }
1144
1145 // CHECK_ME: the code was optimized and a bug fixed here; needs more verification!
1146 int bit1 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+1]);
1147 if (cLog2CbSize > img->sps.Log2MinCbSizeY) {
1148 if (!img->sps.amp_enabled_flag) {
1149 return bit1 ? PART_2NxN : PART_Nx2N;
1150 }
1151 else {
1152 int bit3 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+3]);
1153 if (bit3) {
1154 return bit1 ? PART_2NxN : PART_Nx2N;
1155 }
1156
1157 int bit4 = decode_CABAC_bypass(&tctx->cabac_decoder);
1158 if ( bit1 && bit4) return PART_2NxnD;
1159 if ( bit1 && !bit4) return PART_2NxnU;
1160 if (!bit1 && !bit4) return PART_nLx2N;
1161 if (!bit1 && bit4) return PART_nRx2N;
1162 }
1163 }
1164 else {
1165 // TODO: we could save one 'if' here by first decoding the next bin and then
1166 // checking cLog2CbSize==3 when it is '0'
1167
1168 if (bit1) return PART_2NxN;
1169
1170 if (cLog2CbSize==3) {
1171 return PART_Nx2N;
1172 }
1173 else {
1174 int bit2 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+2]);
1175 return (enum PartMode)((int)PART_NxN - bit2)/*bit2 ? PART_Nx2N : PART_NxN*/;
1176 }
1177 }
1178 }
1179
1180 assert(false); // should never be reached
1181 return PART_2Nx2N;
1182 }
1183
1184
1185 static int decode_prev_intra_luma_pred_flag(thread_context* tctx)
1186 {
1187 logtrace(LogSlice,"# prev_intra_luma_pred_flag\n");
1188 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]);
1189 return bit;
1190 }
1191
1192
1193 static int decode_mpm_idx(thread_context* tctx)
1194 {
1195 logtrace(LogSlice,"# mpm_idx (TU:2)\n");
1196 int mpm = decode_CABAC_TU_bypass(&tctx->cabac_decoder, 2);
1197 logtrace(LogSlice,"> mpm_idx = %d\n",mpm);
1198 return mpm;
1199 }
1200
1201
1202 static int decode_rem_intra_luma_pred_mode(thread_context* tctx)
1203 {
1204 logtrace(LogSlice,"# rem_intra_luma_pred_mode (5 bits)\n");
1205 return decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5);
1206 }
1207
1208
1209 static int decode_intra_chroma_pred_mode(thread_context* tctx)
1210 {
1211 logtrace(LogSlice,"# intra_chroma_pred_mode\n");
1212
1213 int prefix = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE]);
1214
1215 int mode;
1216 if (prefix==0) {
1217 mode=4;
1218 }
1219 else {
1220 mode = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2);
1221 }
1222
1223 logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode);
1224
1225 return mode;
1226 }
1227
1228
1229 static int decode_split_transform_flag(thread_context* tctx,
1230 int log2TrafoSize)
1231 {
1232 logtrace(LogSlice,"# split_transform_flag (log2TrafoSize=%d)\n",log2TrafoSize);
1233
1234 int context = 5-log2TrafoSize;
1235 assert(context >= 0 && context <= 2);
1236
1237 logtrace(LogSlice,"# context: %d\n",context);
1238
1239 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]);
1240 return bit;
1241 }
1242
1243
1244 static int decode_cbf_chroma(thread_context* tctx,
1245 int trafoDepth)
1246 {
1247 logtrace(LogSlice,"# cbf_chroma\n");
1248
1249 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]);
1250
1251 return bit;
1252 }
1253
1254
1255 static int decode_cbf_luma(thread_context* tctx,
1256 int trafoDepth)
1257 {
1258 logtrace(LogSlice,"# cbf_luma\n");
1259
1260 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_LUMA + (trafoDepth==0)]);
1261
1262 logtrace(LogSlice,"> cbf_luma = %d\n",bit);
1263
1264 return bit;
1265 }
1266
1267
1268 static inline int decode_coded_sub_block_flag(thread_context* tctx,
1269 int cIdx,
1270 uint8_t coded_sub_block_neighbors)
1271 {
1272 logtrace(LogSlice,"# coded_sub_block_flag\n");
1273
1274 // tricky computation of csbfCtx
1275 int csbfCtx = ((coded_sub_block_neighbors & 1) | // right neighbor set or
1276 (coded_sub_block_neighbors >> 1)); // bottom neighbor set -> csbfCtx=1
1277
1278 int ctxIdxInc = csbfCtx;
1279 if (cIdx!=0) {
1280 ctxIdxInc += 2;
1281 }
1282
1283 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1284 &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]);
1285
1286 return bit;
1287 }
1288
1289
1290 static int decode_cu_qp_delta_abs(thread_context* tctx)
1291 {
1292 logtrace(LogSlice,"# cu_qp_delta_abs\n");
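  // Binarization as implemented below: a truncated-unary prefix of up to 5
  // context-coded bins; if the prefix reaches 5, a 0-th order Exp-Golomb bypass
  // remainder is added to 5. E.g. a decoded value of 7 corresponds to prefix 5
  // plus an EG0 remainder of 2.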
1293
1294 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1295 &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]);
1296 if (bit==0) {
1297 return 0;
1298 }
1299
1300 int prefix=1;
1301 for (int i=0;i<4;i++) {
1302 bit = decode_CABAC_bit(&tctx->cabac_decoder,
1303 &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 1]);
1304 if (bit==0) { break; }
1305 else { prefix++; }
1306 }
1307
1308 if (prefix==5) {
1309 int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 0);
1310 return value + 5;
1311 }
1312 else {
1313 return prefix;
1314 }
1315 }
1316
1317
1318 static int decode_last_significant_coeff_prefix(thread_context* tctx,
1319 int log2TrafoSize,
1320 int cIdx,
1321 context_model* model)
1322 {
1323 logtrace(LogSlice,"# last_significant_coeff_prefix log2TrafoSize:%d cIdx:%d\n",log2TrafoSize,cIdx);
1324
1325 int cMax = (log2TrafoSize<<1)-1;
1326
1327 int ctxOffset, ctxShift;
1328 if (cIdx==0) {
1329 ctxOffset = 3*(log2TrafoSize-2) + ((log2TrafoSize-1)>>2);
1330 ctxShift = (log2TrafoSize+1)>>2;
1331 }
1332 else {
1333 ctxOffset = 15;
1334 ctxShift = log2TrafoSize-2;
1335 }
1336
1337 int binIdx;
1338 int value = cMax;
1339 for (binIdx=0;binIdx<cMax;binIdx++)
1340 {
1341 int ctxIdxInc = (binIdx >> ctxShift);
1342
1343 logtrace(LogSlice,"context: %d+%d\n",ctxOffset,ctxIdxInc);
1344
1345 int bit = decode_CABAC_bit(&tctx->cabac_decoder, &model[ctxOffset + ctxIdxInc]);
1346 if (bit==0) {
1347 value=binIdx;
1348 break;
1349 }
1350 }
1351
1352 logtrace(LogSlice,"> last_significant_coeff_prefix: %d\n", value);
1353
1354 return value;
1355 }
1356
1357
1358 static const uint8_t ctxIdxMap[16] = {
1359 0,1,4,5,
1360 2,3,4,5,
1361 6,6,8,8,
1362 7,7,8,99
1363 };
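// Note: the last entry maps position (xC,yC)=(3,3). The last significant
// coefficient is never coded explicitly, so this context is presumably never
// read and 99 serves as a placeholder value.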
1364
1365 uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */];
1366
1367 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable()
1368 {
1369 int tableSize = 4*4*(2) + 8*8*(2*2*4) + 16*16*(2*4) + 32*32*(2*4);
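  // Table sizes per block size (see the pointer setup below): 4x4 contexts depend
  // only on the position, so all scanIdx/prevCsbf variants share one table per cIdx;
  // 8x8 contexts depend on cIdx, scanIdx and prevCsbf; 16x16 and 32x32 contexts do
  // not depend on scanIdx.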
1370
1371 uint8_t* p = (uint8_t*)malloc(tableSize);
1372 if (p==NULL) {
1373 return false;
1374 }
1375
1376 memset(p,0xFF,tableSize); // just for debugging
1377
1378
1379 // --- Set pointers to memory areas. Note that some parameters share the same memory. ---
1380
1381 // 4x4
1382
1383 for (int cIdx=0;cIdx<2;cIdx++) {
1384 for (int scanIdx=0;scanIdx<2;scanIdx++)
1385 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1386 ctxIdxLookup[0][cIdx][scanIdx][prevCsbf] = p;
1387
1388 p += 4*4;
1389 }
1390
1391 // 8x8
1392
1393 for (int cIdx=0;cIdx<2;cIdx++)
1394 for (int scanIdx=0;scanIdx<2;scanIdx++)
1395 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1396 ctxIdxLookup[1][cIdx][scanIdx][prevCsbf] = p;
1397 p += 8*8;
1398 }
1399
1400 // 16x16
1401
1402 for (int cIdx=0;cIdx<2;cIdx++)
1403 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1404 for (int scanIdx=0;scanIdx<2;scanIdx++) {
1405 ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p;
1406 }
1407
1408 p += 16*16;
1409 }
1410
1411 // 32x32
1412
1413 for (int cIdx=0;cIdx<2;cIdx++)
1414 for (int prevCsbf=0;prevCsbf<4;prevCsbf++) {
1415 for (int scanIdx=0;scanIdx<2;scanIdx++) {
1416 ctxIdxLookup[3][cIdx][scanIdx][prevCsbf] = p;
1417 }
1418
1419 p += 32*32;
1420 }
1421
1422
1423 // --- precompute ctxIdx tables ---
1424
1425 for (int log2w=2; log2w<=5 ; log2w++)
1426 for (int cIdx=0;cIdx<2;cIdx++)
1427 for (int scanIdx=0;scanIdx<2;scanIdx++)
1428 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1429 {
1430 for (int yC=0;yC<(1<<log2w);yC++)
1431 for (int xC=0;xC<(1<<log2w);xC++)
1432 {
1433 int w = 1<<log2w;
1434 int sbWidth = w>>2;
1435
1436 int sigCtx;
1437
1438 // if log2TrafoSize==2
1439 if (sbWidth==1) {
1440 sigCtx = ctxIdxMap[(yC<<2) + xC];
1441 }
1442 else if (xC+yC==0) {
1443 sigCtx = 0;
1444 }
1445 else {
1446 int xS = xC>>2;
1447 int yS = yC>>2;
1448 /*
1449 int prevCsbf = 0;
1450
1451 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1452 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1453 */
1454 int xP = xC & 3;
1455 int yP = yC & 3;
1456
1457 //logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1458 //logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1459
1460 switch (prevCsbf) {
1461 case 0:
1462 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1463 break;
1464 case 1:
1465 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1466 break;
1467 case 2:
1468 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1469 break;
1470 default:
1471 sigCtx = 2;
1472 break;
1473 }
1474
1475 //logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1476
1477 if (cIdx==0) {
1478 if (xS+yS > 0) sigCtx+=3;
1479
1480 //logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1481
1482 // if log2TrafoSize==3
1483 if (sbWidth==2) { // 8x8 block
1484 sigCtx += (scanIdx==0) ? 9 : 15;
1485 } else {
1486 sigCtx += 21;
1487 }
1488
1489 //logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1490 }
1491 else {
1492 // if log2TrafoSize==3
1493 if (sbWidth==2) { // 8x8 block
1494 sigCtx+=9;
1495 }
1496 else {
1497 sigCtx+=12;
1498 }
1499 }
1500
1501 }
1502
1503 int ctxIdxInc;
1504 if (cIdx==0) { ctxIdxInc=sigCtx; }
1505 else { ctxIdxInc=27+sigCtx; }
1506
1507 if (ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] != 0xFF) {
1508 assert(ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] == ctxIdxInc);
1509 }
1510
1511 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] = ctxIdxInc;
1512
1513 //NOTE: when using this option, we have to include all three scanIdx in the table
1514 //ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][s] = ctxIdxInc;
1515 }
1516 }
1517
1518 return true;
1519 }
1520
1521
1522 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable_OLD()
1523 {
1524 int tableSize = 2*2*4*(4*4 + 8*8 + 16*16 + 32*32);
1525 uint8_t* p = (uint8_t*)malloc(tableSize);
1526 if (p==NULL) {
1527 return false;
1528 }
1529
1530 for (int log2w=2; log2w<=5 ; log2w++)
1531 for (int cIdx=0;cIdx<2;cIdx++)
1532 for (int scanIdx=0;scanIdx<2;scanIdx++)
1533 for (int prevCsbf=0;prevCsbf<4;prevCsbf++)
1534 {
1535 // assign pointer into reserved memory area
1536
1537 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf] = p;
1538 p += (1<<log2w)*(1<<log2w);
1539
1540 const position* ScanOrderSub = get_scan_order(log2w-2, scanIdx);
1541 const position* ScanOrderPos = get_scan_order(2, scanIdx);
1542
1543 //for (int yC=0;yC<(1<<log2w);yC++)
1544 // for (int xC=0;xC<(1<<log2w);xC++)
1545 for (int s=0;s<(1<<log2w)*(1<<log2w);s++)
1546 {
1547 position S = ScanOrderSub[s>>4];
1548 int x0 = S.x<<2;
1549 int y0 = S.y<<2;
1550
1551 int subX = ScanOrderPos[s & 0xF].x;
1552 int subY = ScanOrderPos[s & 0xF].y;
1553 int xC = x0 + subX;
1554 int yC = y0 + subY;
1555
1556
1557 int w = 1<<log2w;
1558 int sbWidth = w>>2;
1559
1560 int sigCtx;
1561
1562 // if log2TrafoSize==2
1563 if (sbWidth==1) {
1564 sigCtx = ctxIdxMap[(yC<<2) + xC];
1565 }
1566 else if (xC+yC==0) {
1567 sigCtx = 0;
1568 }
1569 else {
1570 int xS = xC>>2;
1571 int yS = yC>>2;
1572 /*
1573 int prevCsbf = 0;
1574
1575 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1576 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1577 */
1578 int xP = xC & 3;
1579 int yP = yC & 3;
1580
1581 logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1582 logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1583
1584 //printf("%d | %d %d\n",prevCsbf,xP,yP);
1585
1586 switch (prevCsbf) {
1587 case 0:
1588 //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0;
1589 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1590 break;
1591 case 1:
1592 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1593 break;
1594 case 2:
1595 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1596 break;
1597 default:
1598 sigCtx = 2;
1599 break;
1600 }
1601
1602 logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1603
1604 if (cIdx==0) {
1605 if (xS+yS > 0) sigCtx+=3;
1606
1607 logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1608
1609 // if log2TrafoSize==3
1610 if (sbWidth==2) { // 8x8 block
1611 sigCtx += (scanIdx==0) ? 9 : 15;
1612 } else {
1613 sigCtx += 21;
1614 }
1615
1616 logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1617 }
1618 else {
1619 // if log2TrafoSize==3
1620 if (sbWidth==2) { // 8x8 block
1621 sigCtx+=9;
1622 }
1623 else {
1624 sigCtx+=12;
1625 }
1626 }
1627 }
1628
1629 int ctxIdxInc;
1630 if (cIdx==0) { ctxIdxInc=sigCtx; }
1631 else { ctxIdxInc=27+sigCtx; }
1632
1633
1634 ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<<log2w)] = ctxIdxInc;
1635
1636 //NOTE: when using this option, we have to include all three scanIdx in the table
1637 //ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][s] = ctxIdxInc;
1638 }
1639 }
1640
1641 return true;
1642 }
1643
1644 void free_significant_coeff_ctxIdx_lookupTable()
1645 {
1646 free(ctxIdxLookup[0][0][0][0]);
1647 ctxIdxLookup[0][0][0][0]=NULL;
1648 }
1649
1650
1651
1652
1653 #if 0
1654 static int decode_significant_coeff_flag(thread_context* tctx,
1655 int xC,int yC,
1656 const uint8_t* coded_sub_block_flag,
1657 int sbWidth,
1658 int cIdx,
1659 int scanIdx)
1660 {
1661 logtrace(LogSlice,"# significant_coeff_flag (xC:%d yC:%d sbWidth:%d cIdx:%d scanIdx:%d)\n",
1662 xC,yC,sbWidth,cIdx,scanIdx);
1663
1664 int sigCtx;
1665
1666 // if log2TrafoSize==2
1667 if (sbWidth==1) {
1668 sigCtx = ctxIdxMap[(yC<<2) + xC];
1669 }
1670 else if (xC+yC==0) {
1671 sigCtx = 0;
1672 }
1673 else {
1674 int xS = xC>>2;
1675 int yS = yC>>2;
1676 int prevCsbf = 0;
1677 if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; }
1678 if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; }
1679
1680 int xP = xC & 3;
1681 int yP = yC & 3;
1682
1683 logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP);
1684 logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf);
1685
1686 //printf("%d | %d %d\n",prevCsbf,xP,yP);
1687
1688 switch (prevCsbf) {
1689 case 0:
1690 //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0;
1691 sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2;
1692 break;
1693 case 1:
1694 sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0;
1695 break;
1696 case 2:
1697 sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0;
1698 break;
1699 default:
1700 sigCtx = 2;
1701 break;
1702 }
1703
1704 logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx);
1705
1706 if (cIdx==0) {
1707 if (xS+yS > 0) sigCtx+=3;
1708
1709 logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx);
1710
1711 // if log2TrafoSize==3
1712 if (sbWidth==2) {
1713 sigCtx += (scanIdx==0) ? 9 : 15;
1714 } else {
1715 sigCtx += 21;
1716 }
1717
1718 logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx);
1719 }
1720 else {
1721 // if log2TrafoSize==3
1722 if (sbWidth==2) {
1723 sigCtx+=9;
1724 }
1725 else {
1726 sigCtx+=12;
1727 }
1728 }
1729 }
1730
1731 int ctxIdxInc;
1732 if (cIdx==0) { ctxIdxInc=sigCtx; }
1733 else { ctxIdxInc=27+sigCtx; }
1734
1735 int context = tctx->shdr->initType*42 + ctxIdxInc;
1736 logtrace(LogSlice,"context: %d\n",context);
1737
1738 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1739 &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + context]);
1740 return bit;
1741 }
1742 #endif
1743
1744
1745
1746 static inline int decode_significant_coeff_flag_lookup(thread_context* tctx,
1747 uint8_t ctxIdxInc)
1748 {
1749 logtrace(LogSlice,"# significant_coeff_flag\n");
1750 logtrace(LogSlice,"context: %d\n",ctxIdxInc);
1751
1752 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1753 &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]);
1754 return bit;
1755 }
1756
1757
1758
1759
1760
1761 static inline int decode_coeff_abs_level_greater1(thread_context* tctx,
1762 int cIdx, int i,
1763 bool firstCoeffInSubblock,
1764 bool firstSubblock,
1765 int lastSubblock_greater1Ctx,
1766 int* lastInvocation_greater1Ctx,
1767 int* lastInvocation_coeff_abs_level_greater1_flag,
1768 int* lastInvocation_ctxSet, int c1)
1769 {
1770 logtrace(LogSlice,"# coeff_abs_level_greater1\n");
1771
1772 logtrace(LogSlice," cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx,i,firstCoeffInSubblock,firstSubblock,lastSubblock_greater1Ctx,
1773 *lastInvocation_greater1Ctx,
1774 *lastInvocation_coeff_abs_level_greater1_flag,
1775 *lastInvocation_ctxSet);
1776
1777 int lastGreater1Ctx;
1778 int greater1Ctx;
1779 int ctxSet;
1780
1781 logtrace(LogSlice,"c1: %d\n",c1);
1782
1783 if (firstCoeffInSubblock) {
1784 // block with real DC -> ctx 0
1785 if (i==0 || cIdx>0) { ctxSet=0; }
1786 else { ctxSet=2; }
1787
1788 if (firstSubblock) { lastGreater1Ctx=1; }
1789 else { lastGreater1Ctx = lastSubblock_greater1Ctx; }
1790
1791 if (lastGreater1Ctx==0) { ctxSet++; }
1792
1793 logtrace(LogSlice,"ctxSet: %d\n",ctxSet);
1794
1795 greater1Ctx=1;
1796 }
1797 else { // !firstCoeffInSubblock
1798 ctxSet = *lastInvocation_ctxSet;
1799 logtrace(LogSlice,"ctxSet (old): %d\n",ctxSet);
1800
1801 greater1Ctx = *lastInvocation_greater1Ctx;
1802 if (greater1Ctx>0) {
1803 int lastGreater1Flag=*lastInvocation_coeff_abs_level_greater1_flag;
1804 if (lastGreater1Flag==1) greater1Ctx=0;
1805 else { /*if (greater1Ctx>0)*/ greater1Ctx++; }
1806 }
1807 }
1808
1809 ctxSet = c1; // use HM algo
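  // Note: the caller passes its HM-style ctxSet in the parameter named 'c1',
  // so the ctxSet derived above is effectively overridden here.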
1810
1811 int ctxIdxInc = (ctxSet*4) + (greater1Ctx>=3 ? 3 : greater1Ctx);
1812
1813 if (cIdx>0) { ctxIdxInc+=16; }
1814
1815 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1816 &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc]);
1817
1818 *lastInvocation_greater1Ctx = greater1Ctx;
1819 *lastInvocation_coeff_abs_level_greater1_flag = bit;
1820 *lastInvocation_ctxSet = ctxSet;
1821
1822 return bit;
1823 }
1824
1825
1826 static int decode_coeff_abs_level_greater2(thread_context* tctx,
1827 int cIdx, // int i,int n,
1828 int ctxSet)
1829 {
1830 logtrace(LogSlice,"# coeff_abs_level_greater2\n");
1831
1832 int ctxIdxInc = ctxSet;
1833
1834 if (cIdx>0) ctxIdxInc+=4;
1835
1836 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1837 &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]);
1838
1839 return bit;
1840 }
1841
1842
1843 static int decode_coeff_abs_level_remaining(thread_context* tctx,
1844 int cRiceParam)
1845 {
1846 logtrace(LogSlice,"# decode_coeff_abs_level_remaining\n");
1847
1848 int prefix=-1;
1849 int codeword=0;
1850 do {
1851 prefix++;
1852 codeword = decode_CABAC_bypass(&tctx->cabac_decoder);
1853 }
1854 while (codeword);
1855
1856 // prefix = number of 1 bits read before the terminating 0
1857
1858 int value;
1859
1860 if (prefix <= 3) {
1861 // only the truncated-Rice (TR) part is coded (level < TRMax)
1862
1863 codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, cRiceParam);
1864 value = (prefix<<cRiceParam) + codeword;
1865 }
1866 else {
1867 // Suffix coded with EGk. Note that the unary part of EGk is already
1868 // included in the 'prefix' counter above.
1869
1870 codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, prefix-3+cRiceParam);
1871 value = (((1<<(prefix-3))+3-1)<<cRiceParam)+codeword;
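    // e.g. with cRiceParam==0: prefix 4 -> 1 suffix bit, values 4..5;
    //      prefix 5 -> 2 suffix bits, values 6..9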
1872 }
1873
1874 return value;
1875 }
1876
1877
1878 static int decode_merge_flag(thread_context* tctx)
1879 {
1880 logtrace(LogSlice,"# merge_flag\n");
1881
1882 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1883 &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]);
1884
1885 return bit;
1886 }
1887
1888
1889 static int decode_merge_idx(thread_context* tctx)
1890 {
1891 logtrace(LogSlice,"# merge_idx\n");
1892
1893 // TU coding, first bin is CABAC, remaining are bypass.
1894 // cMax = MaxNumMergeCand-1
1895
1896 int idx = decode_CABAC_bit(&tctx->cabac_decoder,
1897 &tctx->ctx_model[CONTEXT_MODEL_MERGE_IDX]);
1898
1899 if (idx==0) {
1900 // nothing
1901 }
1902 else {
1903 idx=1;
1904
1905 while (idx<tctx->shdr->MaxNumMergeCand-1) {
1906 if (decode_CABAC_bypass(&tctx->cabac_decoder)) {
1907 idx++;
1908 }
1909 else {
1910 break;
1911 }
1912 }
1913 }
1914
1915 logtrace(LogSlice,"> merge_idx = %d\n",idx);
1916
1917 return idx;
1918 }
1919
1920
1921 static int decode_pred_mode_flag(thread_context* tctx)
1922 {
1923 logtrace(LogSlice,"# pred_mode_flag\n");
1924
1925 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1926 &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]);
1927
1928 return bit;
1929 }
1930
1931 static int decode_mvp_lx_flag(thread_context* tctx)
1932 {
1933 logtrace(LogSlice,"# mvp_lx_flag\n");
1934
1935 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1936 &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]);
1937
1938 return bit;
1939 }
1940
1941 static int decode_rqt_root_cbf(thread_context* tctx)
1942 {
1943 logtrace(LogSlice,"# rqt_root_cbf\n");
1944
1945 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1946 &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]);
1947
1948 return bit;
1949 }
1950
1951 static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive)
1952 {
1953 logtrace(LogSlice,"# ref_idx_lX\n");
1954
1955 int cMax = numRefIdxLXActive-1;
1956
1957 if (cMax==0) {
1958 logtrace(LogSlice,"> ref_idx = 0 (cMax==0)\n");
1959 return 0;
1960 } // only a single reference frame -> ref_idx is inferred to be 0
1961
1962 int bit = decode_CABAC_bit(&tctx->cabac_decoder,
1963 &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 0]);
1964
1965 int idx=0;
1966
1967 while (bit) {
1968 idx++;
1969 if (idx==cMax) { break; }
1970
1971 if (idx==1) {
1972 bit = decode_CABAC_bit(&tctx->cabac_decoder,
1973 &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 1]);
1974 }
1975 else {
1976 bit = decode_CABAC_bypass(&tctx->cabac_decoder);
1977 }
1978 }
1979
1980 logtrace(LogSlice,"> ref_idx = %d\n",idx);
1981
1982 return idx;
1983 }
1984
1985
1986 static enum InterPredIdc decode_inter_pred_idc(thread_context* tctx,
1987 int x0, int y0,
1988 int nPbW, int nPbH,
1989 int ctDepth)
1990 {
1991 logtrace(LogSlice,"# inter_pred_idc\n");
1992
1993 int value;
1994
1995 context_model* model = &tctx->ctx_model[CONTEXT_MODEL_INTER_PRED_IDC];
1996
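  // 8x4 and 4x8 PUs (nPbW+nPbH==12) may not use bi-prediction, so only the
  // L0/L1 decision is coded for them.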
1997 if (nPbW+nPbH==12) {
1998 value = decode_CABAC_bit(&tctx->cabac_decoder,
1999 &model[4]);
2000 }
2001 else {
2002 int bit0 = decode_CABAC_bit(&tctx->cabac_decoder,
2003 &model[ctDepth]);
2004 if (bit0==0) {
2005 value = decode_CABAC_bit(&tctx->cabac_decoder,
2006 &model[4]);
2007 }
2008 else {
2009 value = 2;
2010 }
2011 }
2012
2013 logtrace(LogSlice,"> inter_pred_idc = %d (%s)\n",value,
2014 value==0 ? "L0" : (value==1 ? "L1" : "BI"));
2015
2016 return (enum InterPredIdc) value;
2017 }
2018
2019
2020
2021 void initialize_CABAC(thread_context* tctx)
2022 {
2023 const int initType = tctx->shdr->initType;
2024 assert(initType >= 0 && initType <= 2);
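  // initType 0 is used for I slices; 1 and 2 for P/B slices (selection depends on
  // cabac_init_flag). Hence the inter-related contexts below are only initialized
  // for initType > 0.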
2025
2026 init_context(tctx, CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3);
2027 if (initType > 0) {
2028 init_context(tctx, CONTEXT_MODEL_CU_SKIP_FLAG, initValue_cu_skip_flag[initType-1], 3);
2029 init_context(tctx, CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1);
2030 init_context(tctx, CONTEXT_MODEL_MERGE_FLAG, &initValue_merge_flag[initType-1],1);
2031 init_context(tctx, CONTEXT_MODEL_MERGE_IDX, &initValue_merge_idx[initType-1], 1);
2032 init_context(tctx, CONTEXT_MODEL_INTER_PRED_IDC, initValue_inter_pred_idc, 5);
2033 init_context(tctx, CONTEXT_MODEL_REF_IDX_LX, initValue_ref_idx_lX, 2);
2034 init_context(tctx, CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 0 : 2], 2);
2035 init_context(tctx, CONTEXT_MODEL_MVP_LX_FLAG, initValue_mvp_lx_flag, 1);
2036 init_context(tctx, CONTEXT_MODEL_RQT_ROOT_CBF, initValue_rqt_root_cbf, 1);
2037 }
2038
2039 init_context(tctx, CONTEXT_MODEL_PART_MODE, &initValue_part_mode[(initType!=2 ? initType : 5)], 4);
2040 init_context(tctx, CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1);
2041 init_context(tctx, CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, &initValue_intra_chroma_pred_mode[initType], 1);
2042 init_context(tctx, CONTEXT_MODEL_CBF_LUMA, &initValue_cbf_luma[initType == 0 ? 0 : 2], 2);
2043 init_context(tctx, CONTEXT_MODEL_CBF_CHROMA, &initValue_cbf_chroma[initType * 4], 4);
2044 init_context(tctx, CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG, &initValue_split_transform_flag[initType * 3], 3);
2045 init_context(tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
2046 init_context(tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
2047 init_context(tctx, CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG, &initValue_coded_sub_block_flag[initType * 4], 4);
2048 init_context(tctx, CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG, initValue_significant_coeff_flag[initType], 42);
2049 init_context(tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG, &initValue_coeff_abs_level_greater1_flag[initType * 24], 24);
2050 init_context(tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG, &initValue_coeff_abs_level_greater2_flag[initType * 6], 6);
2051 init_context(tctx, CONTEXT_MODEL_SAO_MERGE_FLAG, &initValue_sao_merge_leftUp_flag[initType], 1);
2052 init_context(tctx, CONTEXT_MODEL_SAO_TYPE_IDX, &initValue_sao_type_idx_lumaChroma_flag[initType], 1);
2053 init_context(tctx, CONTEXT_MODEL_CU_QP_DELTA_ABS, initValue_cu_qp_delta_abs, 2);
2054 init_context(tctx, CONTEXT_MODEL_TRANSFORM_SKIP_FLAG, initValue_transform_skip_flag, 2);
2055 init_context(tctx, CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1);
2056 }
2057
2058
2059 /* Take CtbAddrInTS and compute
2060 -> CtbAddrInRS, CtbX, CtbY
2061 */
2062 bool setCtbAddrFromTS(thread_context* tctx)
2063 {
2064 const seq_parameter_set* sps = &tctx->img->sps;
2065
2066 if (tctx->CtbAddrInTS < sps->PicSizeInCtbsY) {
2067 tctx->CtbAddrInRS = tctx->img->pps.CtbAddrTStoRS[tctx->CtbAddrInTS];
2068
2069 tctx->CtbX = tctx->CtbAddrInRS % sps->PicWidthInCtbsY;
2070 tctx->CtbY = tctx->CtbAddrInRS / sps->PicWidthInCtbsY;
2071 return false;
2072 }
2073 else {
2074 tctx->CtbAddrInRS = sps->PicSizeInCtbsY;
2075
2076 tctx->CtbX = tctx->CtbAddrInRS % sps->PicWidthInCtbsY;
2077 tctx->CtbY = tctx->CtbAddrInRS / sps->PicWidthInCtbsY;
2078 return true;
2079 }
2080 }
2081
2082 // returns true when we reached the end of the image (ctbAddr==picSizeInCtbsY)
2083 bool advanceCtbAddr(thread_context* tctx)
2084 {
2085 tctx->CtbAddrInTS++;
2086
2087 return setCtbAddrFromTS(tctx);
2088 }
2089
2090
2091 void read_sao(thread_context* tctx, int xCtb,int yCtb,
2092 int CtbAddrInSliceSeg)
2093 {
2094 slice_segment_header* shdr = tctx->shdr;
2095 de265_image* img = tctx->img;
2096 const seq_parameter_set* sps = &img->sps;
2097 const pic_parameter_set* pps = &img->pps;
2098
2099 logtrace(LogSlice,"# read_sao(%d,%d)\n",xCtb,yCtb);
2100
2101 sao_info saoinfo;
2102 memset(&saoinfo,0,sizeof(sao_info));
2103 logtrace(LogSlice,"sizeof saoinfo: %d\n",sizeof(sao_info));
2104
2105
2106 char sao_merge_left_flag = 0;
2107 char sao_merge_up_flag = 0;
2108
2109 if (xCtb>0) {
2110 //char leftCtbInSliceSeg = (CtbAddrInSliceSeg>0);
2111 char leftCtbInSliceSeg = (tctx->CtbAddrInRS > shdr->SliceAddrRS);
2112 char leftCtbInTile = (pps->TileIdRS[xCtb + yCtb * sps->PicWidthInCtbsY] ==
2113 pps->TileIdRS[xCtb-1 + yCtb * sps->PicWidthInCtbsY]);
2114
2115 if (leftCtbInSliceSeg && leftCtbInTile) {
2116 sao_merge_left_flag = decode_sao_merge_flag(tctx);
2117 logtrace(LogSlice,"sao_merge_left_flag: %d\n",sao_merge_left_flag);
2118 }
2119 }
2120
2121 if (yCtb>0 && sao_merge_left_flag==0) {
2122 logtrace(LogSlice,"CtbAddrInRS:%d PicWidthInCtbsY:%d slice_segment_address:%d\n",
2123 tctx->CtbAddrInRS,
2124 sps->PicWidthInCtbsY,
2125 shdr->slice_segment_address);
2126 char upCtbInSliceSeg = (tctx->CtbAddrInRS - sps->PicWidthInCtbsY) >= shdr->SliceAddrRS;
2127 char upCtbInTile = (pps->TileIdRS[xCtb + yCtb * sps->PicWidthInCtbsY] ==
2128 pps->TileIdRS[xCtb + (yCtb-1) * sps->PicWidthInCtbsY]);
2129
2130 if (upCtbInSliceSeg && upCtbInTile) {
2131 sao_merge_up_flag = decode_sao_merge_flag(tctx);
2132 logtrace(LogSlice,"sao_merge_up_flag: %d\n",sao_merge_up_flag);
2133 }
2134 }
2135
2136 if (!sao_merge_up_flag && !sao_merge_left_flag) {
2137 for (int cIdx=0; cIdx<3; cIdx++) {
2138 if ((shdr->slice_sao_luma_flag && cIdx==0) ||
2139 (shdr->slice_sao_chroma_flag && cIdx>0)) {
2140
2141 uint8_t SaoTypeIdx = 0;
2142
2143 if (cIdx==0) {
2144 char sao_type_idx_luma = decode_sao_type_idx(tctx);
2145 logtrace(LogSlice,"sao_type_idx_luma: %d\n", sao_type_idx_luma);
2146 saoinfo.SaoTypeIdx = SaoTypeIdx = sao_type_idx_luma;
2147 }
2148 else if (cIdx==1) {
2149 char sao_type_idx_chroma = decode_sao_type_idx(tctx);
2150 logtrace(LogSlice,"sao_type_idx_chroma: %d\n", sao_type_idx_chroma);
2151 SaoTypeIdx = sao_type_idx_chroma;
2152 saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*1);
2153 saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*2); // set for both chroma components
2154 }
2155 else {
2156 // cIdx==2: reuse the chroma SaoTypeIdx already decoded for cIdx==1
2157
2158 SaoTypeIdx = (saoinfo.SaoTypeIdx >> (2*cIdx)) & 0x3;
2159 }
2160
2161 if (SaoTypeIdx != 0) {
2162 for (int i=0;i<4;i++) {
2163 saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx);
2164 logtrace(LogSlice,"saoOffsetVal[%d][%d] = %d\n",cIdx,i, saoinfo.saoOffsetVal[cIdx][i]);
2165 }
2166
2167 int sign[4];
2168 if (SaoTypeIdx==1) {
2169 for (int i=0;i<4;i++) {
2170 if (saoinfo.saoOffsetVal[cIdx][i] != 0) {
2171 sign[i] = decode_sao_offset_sign(tctx) ? -1 : 1;
2172 }
2173 else {
2174 sign[i] = 0; // not really required, but compiler warns about uninitialized values
2175 }
2176 }
2177
2178 saoinfo.sao_band_position[cIdx] = decode_sao_band_position(tctx);
2179 }
2180 else {
2181 uint8_t SaoEoClass = 0;
2182
2183 sign[0] = sign[1] = 1;
2184 sign[2] = sign[3] = -1;
2185
2186 if (cIdx==0) {
2187 saoinfo.SaoEoClass = SaoEoClass = decode_sao_class(tctx);
2188 }
2189 else if (cIdx==1) {
2190 SaoEoClass = decode_sao_class(tctx);
2191 saoinfo.SaoEoClass |= SaoEoClass << (2*1);
2192 saoinfo.SaoEoClass |= SaoEoClass << (2*2);
2193 }
2194
2195 logtrace(LogSlice,"SaoEoClass[%d] = %d\n",cIdx,SaoEoClass);
2196 }
2197
2198 int bitDepth = (cIdx==0 ?
2199 sps->BitDepth_Y :
2200 sps->BitDepth_C);
2201 int shift = bitDepth-libde265_min(bitDepth,10);
2202
2203 for (int i=0;i<4;i++) {
2204 saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << shift);
2205 }
2206 }
2207 }
2208 }
2209
2210 img->set_sao_info(xCtb,yCtb, &saoinfo);
2211 }
2212
2213
2214 if (sao_merge_left_flag) {
2215 img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb-1,yCtb));
2216 }
2217
2218 if (sao_merge_up_flag) {
2219 img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb,yCtb-1));
2220 }
2221 }
2222
2223
2224 void read_coding_tree_unit(thread_context* tctx)
2225 {
2226 slice_segment_header* shdr = tctx->shdr;
2227 de265_image* img = tctx->img;
2228 seq_parameter_set* sps = &img->sps;
2229
2230 int xCtb = (tctx->CtbAddrInRS % sps->PicWidthInCtbsY);
2231 int yCtb = (tctx->CtbAddrInRS / sps->PicWidthInCtbsY);
2232 int xCtbPixels = xCtb << sps->Log2CtbSizeY;
2233 int yCtbPixels = yCtb << sps->Log2CtbSizeY;
2234
2235 logtrace(LogSlice,"----- decode CTB %d;%d (%d;%d) POC=%d, SliceAddrRS=%d\n",
2236 xCtbPixels,yCtbPixels, xCtb,yCtb,
2237 tctx->img->PicOrderCntVal, tctx->shdr->SliceAddrRS);
2238
2239 img->set_SliceAddrRS(xCtb, yCtb, tctx->shdr->SliceAddrRS);
2240
2241 img->set_SliceHeaderIndex(xCtbPixels,yCtbPixels, shdr->slice_index);
2242
2243 int CtbAddrInSliceSeg = tctx->CtbAddrInRS - shdr->slice_segment_address;
2244
2245 if (shdr->slice_sao_luma_flag || shdr->slice_sao_chroma_flag)
2246 {
2247 read_sao(tctx, xCtb,yCtb, CtbAddrInSliceSeg);
2248 }
2249
2250 read_coding_quadtree(tctx, xCtbPixels, yCtbPixels, sps->Log2CtbSizeY, 0);
2251 }
2252
2253
2254 LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(seq_parameter_set* sps, int x,int y)
2255 {
2256 int ctbX = x >> sps->Log2CtbSizeY;
2257 int ctbY = y >> sps->Log2CtbSizeY;
2258
2259 return ctbY * sps->PicWidthInCtbsY + ctbX;
2260 }
2261
2262
2263 int check_CTB_available(de265_image* img,
2264 slice_segment_header* shdr,
2265 int xC,int yC, int xN,int yN)
2266 {
2267 // check whether neighbor is outside of frame
2268
2269 if (xN < 0 || yN < 0) { return 0; }
2270 if (xN >= img->sps.pic_width_in_luma_samples) { return 0; }
2271 if (yN >= img->sps.pic_height_in_luma_samples) { return 0; }
2272
2273
2274 int current_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->sps, xC,yC);
2275 int neighbor_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->sps, xN,yN);
2276
2277 // TODO: check if this is correct (6.4.1)
2278
2279 if (img->get_SliceAddrRS_atCtbRS(current_ctbAddrRS) !=
2280 img->get_SliceAddrRS_atCtbRS(neighbor_ctbAddrRS)) {
2281 return 0;
2282 }
2283
2284 // check if both CTBs are in the same tile.
2285
2286 if (img->pps.TileIdRS[current_ctbAddrRS] !=
2287 img->pps.TileIdRS[neighbor_ctbAddrRS]) {
2288 return 0;
2289 }
2290
2291 return 1;
2292 }
2293
2294
2295 int residual_coding(thread_context* tctx,
2296 int x0, int y0, // position of TU in frame
2297 int xL, int yL, // position of TU in local CU
2298 int log2TrafoSize,
2299 int cIdx)
2300 {
2301 logtrace(LogSlice,"- residual_coding x0:%d y0:%d log2TrafoSize:%d cIdx:%d\n",x0,y0,log2TrafoSize,cIdx);
2302
2303 //slice_segment_header* shdr = tctx->shdr;
2304
2305 de265_image* img = tctx->img;
2306 const seq_parameter_set* sps = &img->sps;
2307 const pic_parameter_set* pps = &img->pps;
2308
2309
2310 if (cIdx==0) {
2311 img->set_nonzero_coefficient(x0,y0,log2TrafoSize);
2312 }
2313
2314
2315 if (pps->transform_skip_enabled_flag &&
2316 !tctx->cu_transquant_bypass_flag &&
2317 (log2TrafoSize==2))
2318 {
2319 tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx,cIdx);
2320 }
2321 else
2322 {
2323 tctx->transform_skip_flag[cIdx] = 0;
2324 }
2325
2326
2327 // --- decode position of last coded coefficient ---
2328
2329 int last_significant_coeff_x_prefix =
2330 decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx,
2331 &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX]);
2332
2333 int last_significant_coeff_y_prefix =
2334 decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx,
2335 &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX]);
2336
2337
2338 // TODO: we can combine both FL-bypass calls into one, but the gain may be limited...
2339
2340 int LastSignificantCoeffX;
2341 if (last_significant_coeff_x_prefix > 3) {
2342 int nBits = (last_significant_coeff_x_prefix>>1)-1;
2343 int last_significant_coeff_x_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits);
2344
2345 LastSignificantCoeffX =
2346 ((2+(last_significant_coeff_x_prefix & 1)) << nBits) + last_significant_coeff_x_suffix;
2347 }
2348 else {
2349 LastSignificantCoeffX = last_significant_coeff_x_prefix;
2350 }
2351
2352 int LastSignificantCoeffY;
2353 if (last_significant_coeff_y_prefix > 3) {
2354 int nBits = (last_significant_coeff_y_prefix>>1)-1;
2355 int last_significant_coeff_y_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits);
2356
2357 LastSignificantCoeffY =
2358 ((2+(last_significant_coeff_y_prefix & 1)) << nBits) + last_significant_coeff_y_suffix;
2359 }
2360 else {
2361 LastSignificantCoeffY = last_significant_coeff_y_prefix;
2362 }
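  // e.g. a prefix <= 3 is the coordinate itself; prefix 5 gives nBits=1 and
  // LastSignificantCoeff = (2+1)<<1 + suffix = 6 or 7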
2363
2364
2365
2366 // --- determine scanIdx ---
2367
2368 int scanIdx;
2369
2370 enum PredMode PredMode = img->get_pred_mode(x0,y0);
2371
2372
2373 if (PredMode == MODE_INTRA) {
2374 if (cIdx==0) {
2375 if (log2TrafoSize==2 || log2TrafoSize==3) {
2376 enum IntraPredMode predMode = img->get_IntraPredMode(x0,y0);
2377 logtrace(LogSlice,"IntraPredMode[%d,%d] = %d\n",x0,y0,predMode);
2378
2379 if (predMode >= 6 && predMode <= 14) scanIdx=2;
2380 else if (predMode >= 22 && predMode <= 30) scanIdx=1;
2381 else scanIdx=0;
2382 }
2383 else { scanIdx=0; }
2384 }
2385 else {
2386 if (log2TrafoSize==1 || log2TrafoSize==2) {
2387 enum IntraPredMode predMode = tctx->IntraPredModeC;
2388
2389 if (predMode >= 6 && predMode <= 14) scanIdx=2;
2390 else if (predMode >= 22 && predMode <= 30) scanIdx=1;
2391 else scanIdx=0;
2392 }
2393 else { scanIdx=0; }
2394 }
2395
2396 logtrace(LogSlice,"pred: %d -> scan: %d\n",PredMode,scanIdx);
2397 }
2398 else {
2399 scanIdx=0;
2400 }
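  // scanIdx selects the coefficient scan (0: diagonal, 1: horizontal, 2: vertical).
  // Near-horizontal intra modes (6..14) use the vertical scan, near-vertical modes
  // (22..30) the horizontal scan; all other cases use the diagonal scan.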
2401
2402
2403 // HM 9 only ?
2404 if (scanIdx==2) {
2405 int t = LastSignificantCoeffX;
2406 LastSignificantCoeffX = LastSignificantCoeffY;
2407 LastSignificantCoeffY = t;
2408 }
2409
2410 logtrace(LogSlice,"LastSignificantCoeff: x=%d;y=%d\n",LastSignificantCoeffX,LastSignificantCoeffY);
2411
2412 const position* ScanOrderSub = get_scan_order(log2TrafoSize-2, scanIdx);
2413 const position* ScanOrderPos = get_scan_order(2, scanIdx);
2414
2415 logtrace(LogSlice,"ScanOrderPos: ");
2416 for (int n=0;n<4*4;n++)
2417 logtrace(LogSlice,"*%d,%d ", ScanOrderPos[n].x, ScanOrderPos[n].y);
2418 logtrace(LogSlice,"*\n");
2419
2420
2421 // --- find last sub block and last scan pos ---
2422
2423 int xC,yC;
2424
2425 scan_position lastScanP = get_scan_position(LastSignificantCoeffX, LastSignificantCoeffY,
2426 scanIdx, log2TrafoSize);
2427
2428 int lastScanPos = lastScanP.scanPos;
2429 int lastSubBlock = lastScanP.subBlock;
2430
2431
2432 int sbWidth = 1<<(log2TrafoSize-2);
2433
2434 uint8_t coded_sub_block_neighbors[32/4*32/4];
2435 memset(coded_sub_block_neighbors,0,sbWidth*sbWidth);
2436
2437 int c1 = 1;
2438 bool firstSubblock = true; // for coeff_abs_level_greater1_flag context model
2439 int lastSubblock_greater1Ctx=false; /* for coeff_abs_level_greater1_flag context model
2440 (initialization not strictly needed)
2441 */
2442
2443 #ifdef DE265_LOG_TRACE
2444 int16_t TransCoeffLevel[32 * 32];
2445 memset(TransCoeffLevel,0, sizeof(int16_t)*32*32);
2446 #endif
2447
2448 int CoeffStride = 1<<log2TrafoSize;
2449
2450 int lastInvocation_greater1Ctx=0;
2451 int lastInvocation_coeff_abs_level_greater1_flag=0;
2452 int lastInvocation_ctxSet=0;
2453
2454
2455
2456 // ----- decode coefficients -----
2457
2458 tctx->nCoeff[cIdx] = 0;
2459
2460
2461 // i - subblock index
2462 // n - coefficient index in subblock
2463
2464 for (int i=lastSubBlock;i>=0;i--) {
2465 position S = ScanOrderSub[i];
2466 int inferSbDcSigCoeffFlag=0;
2467
2468 logtrace(LogSlice,"sub block scan idx: %d\n",i);
2469
2470
2471 // --- check whether this sub-block is coded ---
2472
2473 int sub_block_is_coded = 0;
2474
2475 if ((i<lastSubBlock) && (i>0)) {
2476 sub_block_is_coded = decode_coded_sub_block_flag(tctx, cIdx,
2477 coded_sub_block_neighbors[S.x+S.y*sbWidth]);
2478 inferSbDcSigCoeffFlag=1;
2479 }
2480 else if (i==0 || i==lastSubBlock) {
2481 // first (DC) and last sub-block are always coded
2482 // - the first will most probably contain coefficients
2483 // - the last obviously contains the last coded coefficient
2484
2485 sub_block_is_coded = 1;
2486 }
2487
2488 if (sub_block_is_coded) {
2489 if (S.x > 0) coded_sub_block_neighbors[S.x-1 + S.y *sbWidth] |= 1;
2490 if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y-1)*sbWidth] |= 2;
2491 }
2492
2493
2494 // ----- find significant coefficients in this sub-block -----
2495
2496 int16_t coeff_value[16];
2497 int8_t coeff_scan_pos[16];
2498 int8_t coeff_sign[16];
2499 int8_t coeff_has_max_base_level[16];
2500 int nCoefficients=0;
2501
2502
2503 if (sub_block_is_coded) {
2504 int x0 = S.x<<2;
2505 int y0 = S.y<<2;
2506
2507 int log2w = log2TrafoSize-2;
2508 int prevCsbf = coded_sub_block_neighbors[S.x+S.y*sbWidth];
2509 uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf];
2510
2511
2512 // set the last coded coefficient in the last subblock
2513
2514 int last_coeff = (i==lastSubBlock) ? lastScanPos-1 : 15;
2515
2516 if (i==lastSubBlock) {
2517 coeff_value[nCoefficients] = 1;
2518 coeff_has_max_base_level[nCoefficients] = 1;
2519 coeff_scan_pos[nCoefficients] = lastScanPos;
2520 nCoefficients++;
2521 }
2522
2523
2524 // --- decode all coefficients' significant_coeff flags except for the DC coefficient ---
2525
2526 for (int n= last_coeff ; n>0 ; n--) {
2527 int subX = ScanOrderPos[n].x;
2528 int subY = ScanOrderPos[n].y;
2529 xC = x0 + subX;
2530 yC = y0 + subY;
2531
2532
2533 // for all AC coefficients in sub-block, a significant_coeff flag is coded
2534
2535 int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
2536 ctxIdxMap[xC+(yC<<log2TrafoSize)]);
2537 //ctxIdxMap[(i<<4)+n]);
2538
2539 if (significant_coeff) {
2540 coeff_value[nCoefficients] = 1;
2541 coeff_has_max_base_level[nCoefficients] = 1;
2542 coeff_scan_pos[nCoefficients] = n;
2543 nCoefficients++;
2544
2545 // since we have a coefficient in the sub-block,
2546 // we cannot infer the DC coefficient anymore
2547 inferSbDcSigCoeffFlag = 0;
2548 }
2549 }
2550
2551
2552 // --- decode DC coefficient significance ---
2553
2554 if (last_coeff>=0) // last coded coefficient (always set to 1) is not the DC coefficient
2555 {
2556 if (inferSbDcSigCoeffFlag==0) {
2557 // if we cannot infer the DC coefficient, it is coded explicitly
2558 int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
2559 ctxIdxMap[x0+(y0<<log2TrafoSize)]);
2560 //ctxIdxMap[(i<<4)+0]);
2561
2562
2563 if (significant_coeff) {
2564 coeff_value[nCoefficients] = 1;
2565 coeff_has_max_base_level[nCoefficients] = 1;
2566 coeff_scan_pos[nCoefficients] = 0;
2567 nCoefficients++;
2568 }
2569 }
2570 else {
2571 // we can infer that the DC coefficient must be present
2572 coeff_value[nCoefficients] = 1;
2573 coeff_has_max_base_level[nCoefficients] = 1;
2574 coeff_scan_pos[nCoefficients] = 0;
2575 nCoefficients++;
2576 }
2577 }
2578
2579 }
2580
2581
2582 /*
2583 logtrace(LogSlice,"significant_coeff_flags:\n");
2584 for (int y=0;y<4;y++) {
2585 logtrace(LogSlice," ");
2586 for (int x=0;x<4;x++) {
2587 logtrace(LogSlice,"*%d ",significant_coeff_flag[y][x]);
2588 }
2589 logtrace(LogSlice,"*\n");
2590 }
2591 */
2592
2593
2594 if (nCoefficients) {
2595 int ctxSet;
2596 if (i==0 || cIdx>0) { ctxSet=0; }
2597 else { ctxSet=2; }
2598
2599 if (c1==0) { ctxSet++; }
2600 c1=1;
2601
2602
2603 // --- decode greater-1 flags ---
2604
2605 int newLastGreater1ScanPos=-1;
2606
2607 int lastGreater1Coefficient = libde265_min(8,nCoefficients);
2608 for (int c=0;c<lastGreater1Coefficient;c++) {
2609 int greater1_flag =
2610 decode_coeff_abs_level_greater1(tctx, cIdx,i,
2611 c==0,
2612 firstSubblock,
2613 lastSubblock_greater1Ctx,
2614 &lastInvocation_greater1Ctx,
2615 &lastInvocation_coeff_abs_level_greater1_flag,
2616 &lastInvocation_ctxSet, ctxSet);
2617
2618 if (greater1_flag) {
2619 coeff_value[c]++;
2620
2621 c1=0;
2622
2623 if (newLastGreater1ScanPos == -1) {
2624 newLastGreater1ScanPos=c;
2625 }
2626 }
2627 else {
2628 coeff_has_max_base_level[c] = 0;
2629
2630 if (c1<3 && c1>0) {
2631 c1++;
2632 }
2633 }
2634 }
2635
2636 firstSubblock = false;
2637 lastSubblock_greater1Ctx = lastInvocation_greater1Ctx;
2638
2639
2640 // --- decode greater-2 flag ---
2641
2642 if (newLastGreater1ScanPos != -1) {
2643 int flag = decode_coeff_abs_level_greater2(tctx,cIdx, lastInvocation_ctxSet);
2644 coeff_value[newLastGreater1ScanPos] += flag;
2645 coeff_has_max_base_level[newLastGreater1ScanPos] = flag;
2646 }
2647
2648
2649 // --- decode coefficient signs ---
2650
2651 int signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3 &&
2652 !tctx->cu_transquant_bypass_flag);
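      // sign data hiding: if enabled and the sub-block spans more than 3 scan
      // positions between its first and last significant coefficient, the sign of
      // the coefficient closest to DC is not coded; it is inferred further below
      // from the parity of the sum of absolute levels.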
2653
2654 for (int n=0;n<nCoefficients-1;n++) {
2655 coeff_sign[n] = decode_CABAC_bypass(&tctx->cabac_decoder);
2656 logtrace(LogSlice,"sign[%d] = %d\n", n, coeff_sign[n]);
2657 }
2658
2659 // n==nCoefficients-1
2660 if (!pps->sign_data_hiding_flag || !signHidden) {
2661 coeff_sign[nCoefficients-1] = decode_CABAC_bypass(&tctx->cabac_decoder);
2662 logtrace(LogSlice,"sign[%d] = %d\n", nCoefficients-1, coeff_sign[nCoefficients-1]);
2663 }
2664 else {
2665 coeff_sign[nCoefficients-1] = 0;
2666 }
2667
2668
2669 // --- decode coefficient value ---
2670
2671 int sumAbsLevel=0;
2672 int uiGoRiceParam=0;
2673
2674 for (int n=0;n<nCoefficients;n++) {
2675 int baseLevel = coeff_value[n];
2676
2677 int coeff_abs_level_remaining;
2678
2679 if (coeff_has_max_base_level[n]) {
2680 coeff_abs_level_remaining =
2681 decode_coeff_abs_level_remaining(tctx, uiGoRiceParam);
2682
2683 // (9-462)
2684 if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam)) {
2685 uiGoRiceParam++;
2686 if (uiGoRiceParam>4) uiGoRiceParam=4;
2687 }
2688 }
2689 else {
2690 coeff_abs_level_remaining = 0;
2691 }
2692
2693
2694 int16_t currCoeff = baseLevel + coeff_abs_level_remaining;
2695 if (coeff_sign[n]) {
2696 currCoeff = -currCoeff;
2697 }
2698
2699 if (pps->sign_data_hiding_flag && signHidden) {
2700 sumAbsLevel += baseLevel + coeff_abs_level_remaining;
2701
2702 if (n==nCoefficients-1 && (sumAbsLevel & 1)) {
2703 currCoeff = -currCoeff;
2704 }
2705 }
2706
2707 #ifdef DE265_LOG_TRACE
2708 //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff;
2709 #endif
2710
2711 // put coefficient in list
2712 int p = coeff_scan_pos[n];
2713 xC = (S.x<<2) + ScanOrderPos[p].x;
2714 yC = (S.y<<2) + ScanOrderPos[p].y;
2715
2716 tctx->coeffList[cIdx][ tctx->nCoeff[cIdx] ] = currCoeff;
2717 tctx->coeffPos [cIdx][ tctx->nCoeff[cIdx] ] = xC + yC*CoeffStride;
2718 tctx->nCoeff[cIdx]++;
2719 } // iterate through coefficients in sub-block
2720 } // if nonZero
2721 } // next sub-block
2722
2723 return DE265_OK;
2724 }
2725
2726
2727 int read_transform_unit(thread_context* tctx,
2728 int x0, int y0, // position of TU in frame
2729 int xBase, int yBase, // position of parent TU in frame
2730 int xCUBase,int yCUBase, // position of CU in frame
2731 int log2TrafoSize,
2732 int trafoDepth,
2733 int blkIdx,
2734 int cbf_luma, int cbf_cb, int cbf_cr)
2735 {
2736 logtrace(LogSlice,"- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n",
2737 x0,y0,xBase,yBase, 1<<log2TrafoSize, cbf_luma, cbf_cb, cbf_cr);
2738
2739 assert(cbf_cb != -1);
2740 assert(cbf_cr != -1);
2741 assert(cbf_luma != -1);
2742
2743 tctx->transform_skip_flag[0]=0;
2744 tctx->transform_skip_flag[1]=0;
2745 tctx->transform_skip_flag[2]=0;
2746
2747
2748 if (cbf_luma || cbf_cb || cbf_cr)
2749 {
2750 if (tctx->img->pps.cu_qp_delta_enabled_flag &&
2751 !tctx->IsCuQpDeltaCoded) {
2752
2753 int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx);
2754 int cu_qp_delta_sign=0;
2755 if (cu_qp_delta_abs) {
2756 cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder);
2757 }
2758
2759 tctx->IsCuQpDeltaCoded = 1;
2760 tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign);
2761
2762 //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta);
2763
2764 logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs);
2765 logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign);
2766 logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta);
2767
2768 decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase);
2769 }
2770 }
2771
2772
2773 if (cbf_luma || cbf_cb || cbf_cr)
2774 {
2775 // position of TU in local CU
2776 int xL = x0 - xCUBase;
2777 int yL = y0 - yCUBase;
2778
2779 int err;
2780 if (cbf_luma) {
2781 if ((err=residual_coding(tctx,x0,y0, xL,yL,log2TrafoSize,0)) != DE265_OK) return err;
2782 }
2783
2784 if (log2TrafoSize>2) {
2785 if (cbf_cb) {
2786 if ((err=residual_coding(tctx,x0,y0,xL,yL,log2TrafoSize-1,1)) != DE265_OK) return err;
2787 }
2788
2789 if (cbf_cr) {
2790 if ((err=residual_coding(tctx,x0,y0,xL,yL,log2TrafoSize-1,2)) != DE265_OK) return err;
2791 }
2792 }
2793 else if (blkIdx==3) {
2794 if (cbf_cb) {
2795 if ((err=residual_coding(tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
2796 log2TrafoSize,1)) != DE265_OK) return err;
2797 }
2798
2799 if (cbf_cr) {
2800 if ((err=residual_coding(tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
2801 log2TrafoSize,2)) != DE265_OK) return err;
2802 }
2803 }
2804 }
2805
2806 return DE265_OK;
2807 }
2808
2809
2810 void read_transform_tree(thread_context* tctx,
2811 int x0, int y0, // position of TU in frame
2812 int xBase, int yBase, // position of parent TU in frame
2813 int xCUBase, int yCUBase, // position of CU in frame
2814 int log2TrafoSize,
2815 int trafoDepth,
2816 int blkIdx,
2817 int MaxTrafoDepth,
2818 int IntraSplitFlag,
2819 enum PredMode cuPredMode,
2820 bool parent_cbf_cb,bool parent_cbf_cr)
2821 {
2822 logtrace(LogSlice,"- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d "
2823 "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d\n",
2824 x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth);
2825
2826 de265_image* img = tctx->img;
2827 const seq_parameter_set* sps = &img->sps;
2828
2829 enum PredMode PredMode = img->get_pred_mode(x0,y0);
2830 enum PartMode PartMode = img->get_PartMode(x0,y0);
2831
2832 int split_transform_flag;
2833
2834 int interSplitFlag= (sps->max_transform_hierarchy_depth_inter==0 &&
2835 PredMode == MODE_INTER &&
2836 PartMode != PART_2Nx2N &&
2837 trafoDepth == 0);
2838
2839
2840 /* If TrafoSize is larger than maximum size -> split automatically
2841 If TrafoSize is at minimum size -> do not split
2842 If maximum transformation depth is reached -> do not split
2843 If intra-prediction is NxN mode -> split automatically (only at level 0)
2844 Otherwise -> read split flag
2845 */
2846 if (log2TrafoSize <= sps->Log2MaxTrafoSize &&
2847 log2TrafoSize > sps->Log2MinTrafoSize &&
2848 trafoDepth < MaxTrafoDepth &&
2849 !(IntraSplitFlag && trafoDepth==0))
2850 {
2851 split_transform_flag = decode_split_transform_flag(tctx, log2TrafoSize);
2852 }
2853 else
2854 {
2855 split_transform_flag = (log2TrafoSize > sps->Log2MaxTrafoSize ||
2856 (IntraSplitFlag==1 && trafoDepth==0) ||
2857 interSplitFlag==1) ? 1:0;
2858 }
2859
2860
2861 if (split_transform_flag) {
2862 logtrace(LogSlice,"set_split_transform_flag(%d,%d, %d)\n",x0,y0,trafoDepth);
2863 img->set_split_transform_flag(x0,y0,trafoDepth);
2864 }
2865
2866
2867 int cbf_cb=-1;
2868 int cbf_cr=-1;
2869
2870 if (log2TrafoSize>2) {
2871 // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0
2872 if (/*trafoDepth==0 ||*/ parent_cbf_cb) {
2873 cbf_cb = decode_cbf_chroma(tctx,trafoDepth);
2874 }
2875
2876 // we do not have to test for trafoDepth==0, because parent_cbf_cr is 1 at depth 0
2877 if (/*trafoDepth==0 ||*/ parent_cbf_cr) {
2878 cbf_cr = decode_cbf_chroma(tctx,trafoDepth);
2879 }
2880 }
2881
2882
2883 // cbf_cr/cbf_cb not present in bitstream -> induce values
2884
2885 if (cbf_cb<0) {
2886 if (trafoDepth>0 && log2TrafoSize==2) {
2887 cbf_cb = parent_cbf_cb;
2888 } else {
2889 cbf_cb=0;
2890 }
2891 }
2892
2893 if (cbf_cr<0) {
2894 if (trafoDepth>0 && log2TrafoSize==2) {
2895 cbf_cr = parent_cbf_cr;
2896 } else {
2897 cbf_cr=0;
2898 }
2899 }
2900
2901 if (split_transform_flag) {
2902 int x1 = x0 + (1<<(log2TrafoSize-1));
2903 int y1 = y0 + (1<<(log2TrafoSize-1));
2904
2905 logtrace(LogSlice,"transform split.\n");
2906
2907 read_transform_tree(tctx, x0,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 0,
2908 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2909 read_transform_tree(tctx, x1,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 1,
2910 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2911 read_transform_tree(tctx, x0,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 2,
2912 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2913 read_transform_tree(tctx, x1,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 3,
2914 MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
2915 }
2916 else {
2917 int cbf_luma=1;
2918
2919 if (PredMode==MODE_INTRA || trafoDepth!=0 || cbf_cb || cbf_cr) {
2920 cbf_luma = decode_cbf_luma(tctx,trafoDepth);
2921 }
2922
2923 logtrace(LogSlice,"call read_transform_unit %d/%d\n",x0,y0);
2924
2925 read_transform_unit(tctx, x0,y0,xBase,yBase, xCUBase,yCUBase, log2TrafoSize,trafoDepth, blkIdx,
2926 cbf_luma, cbf_cb, cbf_cr);
2927
2928
2929 int nT = 1<<log2TrafoSize;
2930
2931
2932 if (cuPredMode == MODE_INTRA) // if intra mode
2933 {
2934 enum IntraPredMode intraPredMode = img->get_IntraPredMode(x0,y0);
2935
2936 decode_intra_prediction(img, x0,y0, intraPredMode, nT, 0);
2937
2938 enum IntraPredMode chromaPredMode = tctx->IntraPredModeC;
2939
2940 if (nT>=8) {
2941 decode_intra_prediction(img, x0/2,y0/2, chromaPredMode, nT/2, 1);
2942 decode_intra_prediction(img, x0/2,y0/2, chromaPredMode, nT/2, 2);
2943 }
2944 else if (blkIdx==3) {
2945 decode_intra_prediction(img, xBase/2,yBase/2, chromaPredMode, nT, 1);
2946 decode_intra_prediction(img, xBase/2,yBase/2, chromaPredMode, nT, 2);
2947 }
2948 }
2949
2950 // NOTE: the intra-only restriction below (which would disable MC-mode residuals) is commented out:
2951 { //if (cuPredMode == MODE_INTRA) {
2952 if (cbf_luma) {
2953 scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, 0,
2954 tctx->transform_skip_flag[0], PredMode==MODE_INTRA);
2955 }
2956
2957 if (nT>=8) {
2958 if (cbf_cb) {
2959 scale_coefficients(tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 1,
2960 tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
2961 }
2962 if (cbf_cr) {
2963 scale_coefficients(tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 2,
2964 tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
2965 }
2966 }
2967 else if (blkIdx==3) {
2968 if (cbf_cb) {
2969 scale_coefficients(tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 1,
2970 tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
2971 }
2972 if (cbf_cr) {
2973 scale_coefficients(tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 2,
2974 tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
2975 }
2976 }
2977 }
2978 }
2979 }
2980
2981
2982 #if DE265_LOG_TRACE
2983 static const char* part_mode_name(enum PartMode pm)
2984 {
2985 switch (pm) {
2986 case PART_2Nx2N: return "2Nx2N";
2987 case PART_2NxN: return "2NxN";
2988 case PART_Nx2N: return "Nx2N";
2989 case PART_NxN: return "NxN";
2990 case PART_2NxnU: return "2NxnU";
2991 case PART_2NxnD: return "2NxnD";
2992 case PART_nLx2N: return "nLx2N";
2993 case PART_nRx2N: return "nRx2N";
2994 }
2995
2996 return "undefined part mode";
2997 }
2998 #endif
2999
3000
3001 void read_mvd_coding(thread_context* tctx,
3002 int x0,int y0, int refList)
3003 {
3004 int abs_mvd_greater0_flag[2];
3005 abs_mvd_greater0_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder,
3006 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]);
3007 abs_mvd_greater0_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder,
3008 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]);
3009
3010 int abs_mvd_greater1_flag[2];
3011 if (abs_mvd_greater0_flag[0]) {
3012 abs_mvd_greater1_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder,
3013 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]);
3014 }
3015 else {
3016 abs_mvd_greater1_flag[0]=0;
3017 }
3018
3019 if (abs_mvd_greater0_flag[1]) {
3020 abs_mvd_greater1_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder,
3021 &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]);
3022 }
3023 else {
3024 abs_mvd_greater1_flag[1]=0;
3025 }
3026
3027
3028 int abs_mvd_minus2[2];
3029 int mvd_sign_flag[2];
3030 int value[2];
3031
3032 for (int c=0;c<2;c++) {
3033 if (abs_mvd_greater0_flag[c]) {
3034 if (abs_mvd_greater1_flag[c]) {
3035 abs_mvd_minus2[c] = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 1);
3036 }
3037 else {
3038 abs_mvd_minus2[c] = abs_mvd_greater1_flag[c] -1;
3039 }
3040
3041 mvd_sign_flag[c] = decode_CABAC_bypass(&tctx->cabac_decoder);
3042
3043 value[c] = abs_mvd_minus2[c]+2;
3044 if (mvd_sign_flag[c]) { value[c] = -value[c]; }
3045 }
3046 else {
3047 value[c] = 0;
3048 }
3049 }
3050
3051 //set_mvd(tctx->decctx, x0,y0, refList, value[0],value[1]);
3052 tctx->mvd[refList][0] = value[0];
3053 tctx->mvd[refList][1] = value[1];
3054
3055 logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n",x0,y0,refList, value[0],value[1]);
3056 }
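/* Summary of the reconstruction above (illustrative sketch only; it mirrors what
   the code does rather than adding to it):

     int mvd_component(int greater0, int greater1, int abs_minus2, int sign)
     {
       if (!greater0) return 0;                 // component is zero
       int a = greater1 ? abs_minus2 + 2 : 1;   // abs_minus2 comes from the EG1 bypass bins
       return sign ? -a : a;
     }

   i.e. greater0/greater1 code the magnitudes 0 and 1 directly; larger magnitudes
   use first-order Exp-Golomb plus a bypass-coded sign. */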
3057
3058
3059 void read_prediction_unit_SKIP(thread_context* tctx,
3060 int x0, int y0,
3061 int nPbW, int nPbH)
3062 {
3063 slice_segment_header* shdr = tctx->shdr;
3064
3065 int merge_idx;
3066 if (shdr->MaxNumMergeCand>1) {
3067 merge_idx = decode_merge_idx(tctx);
3068 }
3069 else {
3070 merge_idx = 0;
3071 }
3072
3073 tctx->merge_idx = merge_idx;
3074 tctx->merge_flag = true;
3075
3076 logtrace(LogSlice,"prediction skip 2Nx2N, merge_idx: %d\n",merge_idx);
3077 }
3078
3079
3080 void read_prediction_unit(thread_context* tctx,
3081 int xC,int yC, int xB,int yB,
3082 int nPbW, int nPbH,
3083 int ctDepth, int nCS,int partIdx)
3084 {
3085   logtrace(LogSlice,"read_prediction_unit %d;%d %dx%d\n",xC+xB,yC+yB,nPbW,nPbH);
3086
3087 int x0 = xC+xB;
3088 int y0 = yC+yB;
3089
3090 slice_segment_header* shdr = tctx->shdr;
3091
3092 int merge_flag = decode_merge_flag(tctx);
3093 tctx->merge_flag = merge_flag;
3094
3095 if (merge_flag) {
3096 int merge_idx;
3097
3098 if (shdr->MaxNumMergeCand>1) {
3099 merge_idx = decode_merge_idx(tctx);
3100 }
3101 else {
3102 merge_idx = 0;
3103 }
3104
3105 logtrace(LogSlice,"prediction unit %d,%d, merge mode, index: %d\n",x0,y0,merge_idx);
3106
3107 tctx->merge_idx = merge_idx;
3108 }
3109 else { // no merge flag
3110 enum InterPredIdc inter_pred_idc;
3111
3112 if (shdr->slice_type == SLICE_TYPE_B) {
3113 inter_pred_idc = decode_inter_pred_idc(tctx,x0,y0,nPbW,nPbH,ctDepth);
3114 }
3115 else {
3116 inter_pred_idc = PRED_L0;
3117 }
3118
3119 tctx->inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc);
3120
3121 if (inter_pred_idc != PRED_L1) {
3122 int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active);
3123
3124       // NOTE: the case of only one reference frame is handled in decode_ref_idx_lX()
3125 tctx->refIdx[0] = ref_idx_l0;
3126
3127 read_mvd_coding(tctx,x0,y0, 0);
3128
3129 int mvp_l0_flag = decode_mvp_lx_flag(tctx); // l0
3130 tctx->mvp_lX_flag[0] = mvp_l0_flag;
3131
3132 logtrace(LogSlice,"prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n",
3133 x0,y0, tctx->refIdx[0], mvp_l0_flag);
3134 }
3135
3136 if (inter_pred_idc != PRED_L0) {
3137 int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active);
3138
3139       // NOTE: the case of only one reference frame is handled in decode_ref_idx_lX()
3140 tctx->refIdx[1] = ref_idx_l1;
3141
3142 if (shdr->mvd_l1_zero_flag &&
3143 inter_pred_idc == PRED_BI) {
3144 tctx->mvd[1][0] = 0;
3145 tctx->mvd[1][1] = 0;
3146 }
3147 else {
3148 read_mvd_coding(tctx,x0,y0, 1);
3149 }
3150
3151 int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1
3152 tctx->mvp_lX_flag[1] = mvp_l1_flag;
3153
3154 logtrace(LogSlice,"prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n",
3155 x0,y0, tctx->refIdx[1], mvp_l1_flag);
3156 }
3157 }
3158
3159
3160
3161 decode_prediction_unit(tctx, xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx);
3162 }
3163
3164
3165
3166
3167 static void read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSize)
3168 {
3169 bitreader br;
3170 br.data = tctx->cabac_decoder.bitstream_curr;
3171 br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr;
3172 br.nextbits = 0;
3173 br.nextbits_cnt = 0;
3174
3175 const seq_parameter_set* sps = &tctx->img->sps;
3176 //fprintf(stderr,"PCM pos: %d %d (POC=%d)\n",x0,y0,tctx->decctx->img->PicOrderCntVal);
3177
3178 int nBitsY = sps->pcm_sample_bit_depth_luma;
3179 int nBitsC = sps->pcm_sample_bit_depth_chroma;
3180
3181 int wY = 1<<log2CbSize;
3182 int wC = 1<<(log2CbSize-1);
3183
3184 uint8_t* yPtr;
3185 uint8_t* cbPtr;
3186 uint8_t* crPtr;
3187 int stride;
3188 int chroma_stride;
3189 yPtr = tctx->img->get_image_plane(0);
3190 cbPtr = tctx->img->get_image_plane(1);
3191 crPtr = tctx->img->get_image_plane(2);
3192 stride = tctx->img->get_image_stride(0);
3193 chroma_stride = tctx->img->get_image_stride(1);
3194
3195 yPtr = &yPtr [y0*stride + x0];
3196 cbPtr = &cbPtr[y0/2*chroma_stride + x0/2];
3197 crPtr = &crPtr[y0/2*chroma_stride + x0/2];
3198
3199 int shiftY = sps->BitDepth_Y - nBitsY;
3200 int shiftC = sps->BitDepth_C - nBitsC;
3201
3202 for (int y=0;y<wY;y++)
3203 for (int x=0;x<wY;x++)
3204 {
3205 int value = get_bits(&br, nBitsY);
3206 yPtr[y*stride+x] = value << shiftY;
3207 }
3208
3209 for (int y=0;y<wC;y++)
3210 for (int x=0;x<wC;x++)
3211 {
3212 int value = get_bits(&br, nBitsC);
3213 cbPtr[y*chroma_stride+x] = value << shiftC;
3214 }
3215
3216 for (int y=0;y<wC;y++)
3217 for (int x=0;x<wC;x++)
3218 {
3219 int value = get_bits(&br, nBitsC);
3220 crPtr[y*chroma_stride+x] = value << shiftC;
3221 }
3222
3223 prepare_for_CABAC(&br);
3224 tctx->cabac_decoder.bitstream_curr = br.data;
3225 init_CABAC_decoder_2(&tctx->cabac_decoder);
3226 }
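/* PCM samples are coded at pcm_sample_bit_depth (<= the internal bit depth) and
   are left-shifted up to the internal bit depth above. For example, with
   BitDepth_Y = 8 and pcm_sample_bit_depth_luma = 7, shiftY = 1 and a coded sample
   value of 100 is stored as 200. After the raw PCM payload, the CABAC decoder is
   restarted at the following byte-aligned position. */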
3227
3228
3229 void read_coding_unit(thread_context* tctx,
3230 int x0, int y0, // position of coding unit in frame
3231 int log2CbSize,
3232 int ctDepth)
3233 {
3234 de265_image* img = tctx->img;
3235 const seq_parameter_set* sps = &img->sps;
3236 const pic_parameter_set* pps = &img->pps;
3237 slice_segment_header* shdr = tctx->shdr;
3238
3239 logtrace(LogSlice,"- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<<log2CbSize);
3240
3241
3242 img->set_log2CbSize(x0,y0, log2CbSize);
3243
3244 int nCbS = 1<<log2CbSize; // number of coding block samples
3245
3246 decode_quantization_parameters(tctx, x0,y0, x0, y0);
3247
3248
3249 if (pps->transquant_bypass_enable_flag)
3250 {
3251 int transquant_bypass = decode_transquant_bypass_flag(tctx);
3252
3253 tctx->cu_transquant_bypass_flag = transquant_bypass;
3254
3255 if (transquant_bypass) {
3256 img->set_cu_transquant_bypass(x0,y0,log2CbSize);
3257 }
3258 }
3259 else {
3260 tctx->cu_transquant_bypass_flag = 0;
3261 }
3262
3263 uint8_t cu_skip_flag = 0;
3264 if (shdr->slice_type != SLICE_TYPE_I) {
3265 cu_skip_flag = decode_cu_skip_flag(tctx,x0,y0,ctDepth);
3266 }
3267
3268 int IntraSplitFlag = 0;
3269
3270 enum PredMode cuPredMode;
3271
3272 if (cu_skip_flag) {
3273 read_prediction_unit_SKIP(tctx,x0,y0,nCbS,nCbS);
3274
3275 img->set_PartMode(x0,y0, PART_2Nx2N); // need this for deblocking filter
3276 img->set_pred_mode(x0,y0,log2CbSize, MODE_SKIP);
3277 cuPredMode = MODE_SKIP;
3278
3279 logtrace(LogSlice,"CU pred mode: SKIP\n");
3280
3281
3282 // DECODE
3283
3284 int nCS_L = 1<<log2CbSize;
3285 decode_prediction_unit(tctx,x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0);
3286 }
3287 else /* not skipped */ {
3288 if (shdr->slice_type != SLICE_TYPE_I) {
3289 int pred_mode_flag = decode_pred_mode_flag(tctx);
3290 cuPredMode = pred_mode_flag ? MODE_INTRA : MODE_INTER;
3291 }
3292 else {
3293 cuPredMode = MODE_INTRA;
3294 }
3295
3296 img->set_pred_mode(x0,y0,log2CbSize, cuPredMode);
3297
3298 logtrace(LogSlice,"CU pred mode: %s\n", cuPredMode==MODE_INTRA ? "INTRA" : "INTER");
3299
3300
3301 enum PartMode PartMode;
3302
3303 if (cuPredMode != MODE_INTRA ||
3304 log2CbSize == sps->Log2MinCbSizeY) {
3305 PartMode = decode_part_mode(tctx, cuPredMode, log2CbSize);
3306
3307 if (PartMode==PART_NxN && cuPredMode==MODE_INTRA) {
3308 IntraSplitFlag=1;
3309 }
3310 } else {
3311 PartMode = PART_2Nx2N;
3312 }
3313
3314 img->set_PartMode(x0,y0, PartMode); // needed for deblocking ?
3315
3316 logtrace(LogSlice, "PartMode: %s\n", part_mode_name(PartMode));
3317
3318
3319 bool pcm_flag = false;
3320
3321 if (cuPredMode == MODE_INTRA) {
3322 if (PartMode == PART_2Nx2N && sps->pcm_enabled_flag &&
3323 log2CbSize >= sps->Log2MinIpcmCbSizeY &&
3324 log2CbSize <= sps->Log2MaxIpcmCbSizeY) {
3325 pcm_flag = decode_CABAC_term_bit(&tctx->cabac_decoder);
3326 }
3327
3328 if (pcm_flag) {
3329 img->set_pcm_flag(x0,y0,log2CbSize);
3330
3331 read_pcm_samples(tctx, x0,y0, log2CbSize);
3332 }
3333 else {
3334 int pbOffset = (PartMode == PART_NxN) ? (nCbS/2) : nCbS;
3335 int log2IntraPredSize = (PartMode == PART_NxN) ? (log2CbSize-1) : log2CbSize;
3336
3337 logtrace(LogSlice,"nCbS:%d pbOffset:%d\n",nCbS,pbOffset);
3338
3339 int prev_intra_luma_pred_flag[4];
3340
3341 int idx=0;
3342 for (int j=0;j<nCbS;j+=pbOffset)
3343 for (int i=0;i<nCbS;i+=pbOffset)
3344 {
3345 prev_intra_luma_pred_flag[idx++] = decode_prev_intra_luma_pred_flag(tctx);
3346 }
3347
3348 int mpm_idx[4], rem_intra_luma_pred_mode[4];
3349 idx=0;
3350
3351 for (int j=0;j<nCbS;j+=pbOffset)
3352 for (int i=0;i<nCbS;i+=pbOffset)
3353 {
3354 if (prev_intra_luma_pred_flag[idx]) {
3355 mpm_idx[idx] = decode_mpm_idx(tctx);
3356 }
3357 else {
3358 rem_intra_luma_pred_mode[idx] = decode_rem_intra_luma_pred_mode(tctx);
3359 }
3360
3361
3362 int x = x0+i;
3363 int y = y0+j;
3364
3365 // --- find intra prediction mode ---
3366
3367 int IntraPredMode;
3368
3369 int availableA = check_CTB_available(img, shdr, x,y, x-1,y);
3370 int availableB = check_CTB_available(img, shdr, x,y, x,y-1);
3371
3372 int PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
3373
3374 // block on left side
3375
3376 enum IntraPredMode candIntraPredModeA, candIntraPredModeB;
3377 if (availableA==false) {
3378 candIntraPredModeA=INTRA_DC;
3379 }
3380 else if (img->get_pred_mode(x-1,y) != MODE_INTRA ||
3381 img->get_pcm_flag (x-1,y)) {
3382 candIntraPredModeA=INTRA_DC;
3383 }
3384 else {
3385 candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1);
3386 }
3387
3388 // block above
3389
3390 if (availableB==false) {
3391 candIntraPredModeB=INTRA_DC;
3392 }
3393 else if (img->get_pred_mode(x,y-1) != MODE_INTRA ||
3394 img->get_pcm_flag (x,y-1)) {
3395 candIntraPredModeB=INTRA_DC;
3396 }
3397 else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) {
3398 candIntraPredModeB=INTRA_DC;
3399 }
3400 else {
3401 candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs);
3402 }
3403
3404 // build candidate list
3405
3406 int candModeList[3];
3407
3408 logtrace(LogSlice,"availableA:%d candA:%d & availableB:%d candB:%d\n",
3409 availableA, candIntraPredModeA,
3410 availableB, candIntraPredModeB);
3411
3412 if (candIntraPredModeA == candIntraPredModeB) {
3413 if (candIntraPredModeA < 2) {
3414 candModeList[0] = INTRA_PLANAR;
3415 candModeList[1] = INTRA_DC;
3416 candModeList[2] = INTRA_ANGULAR_26;
3417 }
3418 else {
3419 candModeList[0] = candIntraPredModeA;
3420 candModeList[1] = 2 + ((candIntraPredModeA-2 -1 +32) % 32);
3421 candModeList[2] = 2 + ((candIntraPredModeA-2 +1 ) % 32);
3422 }
3423 }
3424 else {
3425 candModeList[0] = candIntraPredModeA;
3426 candModeList[1] = candIntraPredModeB;
3427
3428 if (candIntraPredModeA != INTRA_PLANAR &&
3429 candIntraPredModeB != INTRA_PLANAR) {
3430 candModeList[2] = INTRA_PLANAR;
3431 }
3432 else if (candIntraPredModeA != INTRA_DC &&
3433 candIntraPredModeB != INTRA_DC) {
3434 candModeList[2] = INTRA_DC;
3435 }
3436 else {
3437 candModeList[2] = INTRA_ANGULAR_26;
3438 }
3439 }
3440
3441 for (int i=0;i<3;i++)
3442 logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]);
3443
3444 if (prev_intra_luma_pred_flag[idx]==1) {
3445 IntraPredMode = candModeList[ mpm_idx[idx] ];
3446 }
3447 else {
3448 // sort candModeList
3449
3450 if (candModeList[0] > candModeList[1]) {
3451 int t = candModeList[0]; candModeList[0]=candModeList[1]; candModeList[1]=t;
3452 }
3453 if (candModeList[0] > candModeList[2]) {
3454 int t = candModeList[0]; candModeList[0]=candModeList[2]; candModeList[2]=t;
3455 }
3456 if (candModeList[1] > candModeList[2]) {
3457 int t = candModeList[1]; candModeList[1]=candModeList[2]; candModeList[2]=t;
3458 }
3459
3460             // skip the modes already in the candidate list
3461             // (there are 35 intra modes; excluding the 3 candidates leaves 32, which fit into the 5-bit rem_intra_luma_pred_mode)
3462 IntraPredMode = rem_intra_luma_pred_mode[idx];
3463 for (int n=0;n<=2;n++) {
3464 if (IntraPredMode >= candModeList[n]) { IntraPredMode++; }
3465 }
3466 }
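            /* Worked example of the mapping above: with the sorted candidate list
               {0 (PLANAR), 1 (DC), 26 (vertical)} and rem_intra_luma_pred_mode = 24,
               the loop increments past 0 and 1 (24 -> 26) and again past 26
               (26 -> 27), so IntraPredMode = 27. This is how the 32 remaining modes
               are spread over the 35-mode range. */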
3467
3468 logtrace(LogSlice,"IntraPredMode[%d][%d] = %d (log2blk:%d)\n",x,y,IntraPredMode, log2IntraPredSize);
3469
3470 img->set_IntraPredMode(PUidx, log2IntraPredSize,
3471 (enum IntraPredMode)IntraPredMode);
3472
3473 idx++;
3474 }
3475
3476
3477 // set chroma intra prediction mode
3478
3479 int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx);
3480
3481 int IntraPredMode = img->get_IntraPredMode(x0,y0);
3482 logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode);
3483
3484 int IntraPredModeC;
3485 if (intra_chroma_pred_mode==4) {
3486 IntraPredModeC = IntraPredMode;
3487 }
3488 else {
3489 static enum IntraPredMode IntraPredModeCCand[4] = {
3490 INTRA_PLANAR,
3491 INTRA_ANGULAR_26, // vertical
3492 INTRA_ANGULAR_10, // horizontal
3493 INTRA_DC
3494 };
3495
3496 IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode];
3497 if (IntraPredModeC == IntraPredMode) {
3498 IntraPredModeC = INTRA_ANGULAR_34;
3499 }
3500 }
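          /* Example of the substitution above: if intra_chroma_pred_mode selects
             INTRA_ANGULAR_26 but the luma mode is already INTRA_ANGULAR_26, the
             chroma mode becomes INTRA_ANGULAR_34 instead, so the four coded chroma
             candidates always denote distinct modes (mode 4 = "same as luma" covers
             the duplicate case). */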
3501
3502 logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d\n",x0,y0,IntraPredModeC);
3503
3504 tctx->IntraPredModeC = (enum IntraPredMode) IntraPredModeC;
3505 }
3506 }
3507 else { // INTER
3508 int nCS = 1<<log2CbSize;
3509
3510 if (PartMode == PART_2Nx2N) {
3511 read_prediction_unit(tctx,x0,y0,0,0,nCbS,nCbS,ctDepth,nCS,0);
3512 }
3513 else if (PartMode == PART_2NxN) {
3514 read_prediction_unit(tctx,x0,y0,0,0 ,nCbS,nCbS/2,ctDepth,nCS,0);
3515 read_prediction_unit(tctx,x0,y0,0,nCbS/2,nCbS,nCbS/2,ctDepth,nCS,1);
3516 }
3517 else if (PartMode == PART_Nx2N) {
3518 read_prediction_unit(tctx,x0,y0,0,0 , nCbS/2,nCbS,ctDepth,nCS,0);
3519 read_prediction_unit(tctx,x0,y0,nCbS/2,0,nCbS/2,nCbS,ctDepth,nCS,1);
3520 }
3521 else if (PartMode == PART_2NxnU) {
3522 read_prediction_unit(tctx,x0,y0,0,0, nCbS,nCbS/4,ctDepth,nCS,0);
3523 read_prediction_unit(tctx,x0,y0,0,nCbS/4,nCbS,nCbS*3/4,ctDepth,nCS,1);
3524 }
3525 else if (PartMode == PART_2NxnD) {
3526 read_prediction_unit(tctx,x0,y0,0,0, nCbS,nCbS*3/4,ctDepth,nCS,0);
3527 read_prediction_unit(tctx,x0,y0,0,nCbS*3/4,nCbS,nCbS/4,ctDepth,nCS,1);
3528 }
3529 else if (PartMode == PART_nLx2N) {
3530 read_prediction_unit(tctx,x0,y0,0,0, nCbS/4,nCbS,ctDepth,nCS,0);
3531 read_prediction_unit(tctx,x0,y0,nCbS/4,0,nCbS*3/4,nCbS,ctDepth,nCS,1);
3532 }
3533 else if (PartMode == PART_nRx2N) {
3534 read_prediction_unit(tctx,x0,y0,0,0, nCbS*3/4,nCbS,ctDepth,nCS,0);
3535 read_prediction_unit(tctx,x0,y0,nCbS*3/4,0,nCbS/4,nCbS,ctDepth,nCS,1);
3536 }
3537 else if (PartMode == PART_NxN) {
3538 read_prediction_unit(tctx,x0,y0,0,0, nCbS/2,nCbS/2,ctDepth,nCS,0);
3539 read_prediction_unit(tctx,x0,y0,nCbS/2,0, nCbS/2,nCbS/2,ctDepth,nCS,1);
3540 read_prediction_unit(tctx,x0,y0,0,nCbS/2, nCbS/2,nCbS/2,ctDepth,nCS,2);
3541 read_prediction_unit(tctx,x0,y0,nCbS/2,nCbS/2,nCbS/2,nCbS/2,ctDepth,nCS,3);
3542 }
3543 else {
3544 assert(0); // undefined PartMode
3545 }
3546 } // INTER
3547
3548
3549 // decode residual
3550
3551 if (!pcm_flag) { // !pcm
3552 bool rqt_root_cbf;
3553
3554 uint8_t merge_flag = tctx->merge_flag; // !!get_merge_flag(ctx,x0,y0);
3555
3556 if (cuPredMode != MODE_INTRA &&
3557 !(PartMode == PART_2Nx2N && merge_flag)) {
3558
3559 rqt_root_cbf = !!decode_rqt_root_cbf(tctx);
3560 }
3561 else {
3562 rqt_root_cbf = true;
3563 }
3564
3565 //set_rqt_root_cbf(ctx,x0,y0, log2CbSize, rqt_root_cbf);
3566
3567 if (rqt_root_cbf) {
3568 int MaxTrafoDepth;
3569
3570 if (cuPredMode==MODE_INTRA) {
3571 MaxTrafoDepth = sps->max_transform_hierarchy_depth_intra + IntraSplitFlag;
3572 }
3573 else {
3574 MaxTrafoDepth = sps->max_transform_hierarchy_depth_inter;
3575 }
3576
3577 logtrace(LogSlice,"MaxTrafoDepth: %d\n",MaxTrafoDepth);
3578
3579 read_transform_tree(tctx, x0,y0, x0,y0, x0,y0, log2CbSize, 0,0,
3580 MaxTrafoDepth, IntraSplitFlag, cuPredMode, 1,1);
3581 }
3582 } // !pcm
3583 }
3584 }
3585
3586
3587 // ------------------------------------------------------------------------------------------
3588
3589
3590 void read_coding_quadtree(thread_context* tctx,
3591 int x0, int y0,
3592 int log2CbSize,
3593 int ctDepth)
3594 {
3595 logtrace(LogSlice,"- read_coding_quadtree %d;%d cbsize:%d depth:%d POC:%d\n",x0,y0,1<<log2CbSize,ctDepth,tctx->img->PicOrderCntVal);
3596
3597 de265_image* img = tctx->img;
3598 const seq_parameter_set* sps = &img->sps;
3599
3600 int split_flag;
3601
3602   // A split flag is only coded if the CU is larger than the minimum size and
3603   // is completely contained within the image area.
3604   // If it is partly outside the image area and not at minimum size,
3605   // it is split implicitly. If already at minimum size, it is not split further.
3606 if (x0+(1<<log2CbSize) <= sps->pic_width_in_luma_samples &&
3607 y0+(1<<log2CbSize) <= sps->pic_height_in_luma_samples &&
3608 log2CbSize > sps->Log2MinCbSizeY) {
3609 split_flag = decode_split_cu_flag(tctx, x0,y0, ctDepth);
3610 } else {
3611 if (log2CbSize > sps->Log2MinCbSizeY) { split_flag=1; }
3612 else { split_flag=0; }
3613 }
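  /* Illustration of the rule above: with a 64x64 CTB at the right picture border
     and only 40 luma columns remaining, x0+64 exceeds pic_width_in_luma_samples,
     so no split flag is read and split_flag is forced to 1 until the sub-CUs
     either fit inside the picture or reach Log2MinCbSizeY. */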
3614
3615
3616 if (img->pps.cu_qp_delta_enabled_flag &&
3617 log2CbSize >= img->pps.Log2MinCuQpDeltaSize)
3618 {
3619 tctx->IsCuQpDeltaCoded = 0;
3620 tctx->CuQpDelta = 0;
3621 }
3622 else
3623 {
3624 // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ?
3625 }
3626
3627 if (split_flag) {
3628 int x1 = x0 + (1<<(log2CbSize-1));
3629 int y1 = y0 + (1<<(log2CbSize-1));
3630
3631 read_coding_quadtree(tctx,x0,y0, log2CbSize-1, ctDepth+1);
3632
3633 if (x1<sps->pic_width_in_luma_samples)
3634 read_coding_quadtree(tctx,x1,y0, log2CbSize-1, ctDepth+1);
3635
3636 if (y1<sps->pic_height_in_luma_samples)
3637 read_coding_quadtree(tctx,x0,y1, log2CbSize-1, ctDepth+1);
3638
3639 if (x1<sps->pic_width_in_luma_samples &&
3640 y1<sps->pic_height_in_luma_samples)
3641 read_coding_quadtree(tctx,x1,y1, log2CbSize-1, ctDepth+1);
3642 }
3643 else {
3644 // set ctDepth of this CU
3645
3646 img->set_ctDepth(x0,y0, log2CbSize, ctDepth);
3647
3648 read_coding_unit(tctx, x0,y0, log2CbSize, ctDepth);
3649 }
3650
3651 logtrace(LogSlice,"-\n");
3652 }
3653
3654
3655 // ---------------------------------------------------------------------------
3656
3657 enum DecodeResult {
3658 Decode_EndOfSliceSegment,
3659 Decode_EndOfSubstream,
3660 Decode_Error
3661 };
3662
3663 /* Decode CTBs until the end of sub-stream, the end-of-slice, or some error occurs.
3664 */
3665 enum DecodeResult decode_substream(thread_context* tctx,
3666 bool block_wpp, // block on WPP dependencies
3667 bool first_independent_substream)
3668 {
3669 const pic_parameter_set* pps = &tctx->img->pps;
3670 const seq_parameter_set* sps = &tctx->img->sps;
3671
3672 const int ctbW = sps->PicWidthInCtbsY;
3673
3674
3675 const int startCtbY = tctx->CtbY;
3676
3677 // in WPP mode: initialize CABAC model with stored model from row above
3678
3679 if ((!first_independent_substream || tctx->CtbY != startCtbY) &&
3680 pps->entropy_coding_sync_enabled_flag &&
3681 tctx->CtbY>=1 && tctx->CtbX==0)
3682 {
3683 if (sps->PicWidthInCtbsY>1) {
3684 // we have to wait until the context model data is there
3685 tctx->img->wait_for_progress(tctx->task, 1,tctx->CtbY-1,CTB_PROGRESS_PREFILTER);
3686
3687 // copy CABAC model from previous CTB row
3688 memcpy(tctx->ctx_model,
3689 &tctx->imgunit->ctx_models[(tctx->CtbY-1) * CONTEXT_MODEL_TABLE_LENGTH],
3690 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3691 }
3692 else {
3693 tctx->img->wait_for_progress(tctx->task, 0,tctx->CtbY-1,CTB_PROGRESS_PREFILTER);
3694 initialize_CABAC(tctx);
3695 }
3696 }
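  /* WPP context inheritance, in short: each CTB row (except the first) starts from
     the CABAC models that were saved after the second CTB (CtbX==1) of the row
     above (see the save further below), which in turn requires that CTB to have
     reached CTB_PROGRESS_PREFILTER. With a one-CTB-wide picture there is no second
     CTB, so the models are simply re-initialized. */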
3697
3698
3699 do {
3700 const int ctbx = tctx->CtbX;
3701 const int ctby = tctx->CtbY;
3702
3703 if (block_wpp && ctby>0 && ctbx < ctbW-1) {
3704 //printf("wait on %d/%d\n",ctbx+1,ctby-1);
3705
3706 tctx->img->wait_for_progress(tctx->task, ctbx+1,ctby-1, CTB_PROGRESS_PREFILTER);
3707 }
3708
3709 //printf("%p: decode %d;%d\n", tctx, tctx->CtbY,tctx->CtbX);
3710
3711
3712 // read and decode CTB
3713
3714 read_coding_tree_unit(tctx);
3715
3716
3717 // save CABAC-model for WPP (except in last CTB row)
3718
3719 if (pps->entropy_coding_sync_enabled_flag &&
3720 ctbx == 1 &&
3721 ctby < sps->PicHeightInCtbsY-1)
3722 {
3723 context_model* ctx_store = &tctx->imgunit->ctx_models[ctby * CONTEXT_MODEL_TABLE_LENGTH];
3724
3725 memcpy(ctx_store,
3726 &tctx->ctx_model,
3727 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3728 }
3729
3730
3731 // end of slice segment ?
3732
3733 int end_of_slice_segment_flag = decode_CABAC_term_bit(&tctx->cabac_decoder);
3734
3735 if (end_of_slice_segment_flag) {
3736 // at the end of the slice segment, we store the CABAC model if we need it
3737 // because a dependent slice may follow
3738
3739 if (pps->dependent_slice_segments_enabled_flag) {
3740 memcpy(tctx->shdr->ctx_model_storage,
3741 tctx->ctx_model,
3742 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3743 }
3744 }
3745
3746 tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER);
3747
3748 //printf("%p: decoded %d|%d\n",tctx, ctby,ctbx);
3749
3750
3751 logtrace(LogSlice,"read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag);
3752 //printf("read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag);
3753
3754 const int lastCtbY = tctx->CtbY;
3755
3756 bool endOfPicture = advanceCtbAddr(tctx); // true if we read past the end of the image
3757
3758 if (endOfPicture &&
3759 end_of_slice_segment_flag == false)
3760 {
3761 tctx->decctx->add_warning(DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA, false);
3762 tctx->img->integrity = INTEGRITY_DECODING_ERRORS;
3763 return Decode_Error;
3764 }
3765
3766
3767 if (end_of_slice_segment_flag) {
3768 return Decode_EndOfSliceSegment;
3769 }
3770
3771
3772 if (!end_of_slice_segment_flag) {
3773 bool end_of_sub_stream = false;
3774 end_of_sub_stream |= (pps->tiles_enabled_flag &&
3775 pps->TileId[tctx->CtbAddrInTS] != pps->TileId[tctx->CtbAddrInTS-1]);
3776 end_of_sub_stream |= (pps->entropy_coding_sync_enabled_flag &&
3777 lastCtbY != tctx->CtbY);
3778
3779 if (end_of_sub_stream) {
3780 int end_of_sub_stream_one_bit = decode_CABAC_term_bit(&tctx->cabac_decoder);
3781 if (!end_of_sub_stream_one_bit) {
3782 tctx->decctx->add_warning(DE265_WARNING_EOSS_BIT_NOT_SET, false);
3783 tctx->img->integrity = INTEGRITY_DECODING_ERRORS;
3784 return Decode_Error;
3785 }
3786
3787 init_CABAC_decoder_2(&tctx->cabac_decoder); // byte alignment
3788 return Decode_EndOfSubstream;
3789 }
3790 }
3791
3792 } while (true);
3793 }
3794
3795
3796
3797 void initialize_CABAC_at_slice_segment_start(thread_context* tctx)
3798 {
3799 de265_image* img = tctx->img;
3800 const pic_parameter_set* pps = &img->pps;
3801 const seq_parameter_set* sps = &img->sps;
3802 slice_segment_header* shdr = tctx->shdr;
3803
3804 if (shdr->dependent_slice_segment_flag) {
3805 int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[shdr->slice_segment_address] -1 ];
3806
3807 slice_segment_header* prevCtbHdr = img->slices[ img->get_SliceHeaderIndex_atIndex(prevCtb) ];
3808
3809 if (pps->is_tile_start_CTB(shdr->slice_segment_address % sps->PicWidthInCtbsY,
3810 shdr->slice_segment_address / sps->PicWidthInCtbsY
3811 )) {
3812 initialize_CABAC(tctx);
3813 }
3814 else {
3815 tctx->img->wait_for_progress(tctx->task, prevCtb, CTB_PROGRESS_PREFILTER);
3816
3817 memcpy(tctx->ctx_model,
3818 prevCtbHdr->ctx_model_storage,
3819 CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
3820 }
3821 }
3822 else {
3823 initialize_CABAC(tctx);
3824 }
3825 }
3826
3827
3828 void thread_task_slice_segment::work()
3829 {
3830 struct thread_task_slice_segment* data = this;
3831 thread_context* tctx = data->tctx;
3832 de265_image* img = tctx->img;
3833
3834 state = Running;
3835 img->thread_run();
3836
3837 setCtbAddrFromTS(tctx);
3838
3839 //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY);
3840
3841 if (data->firstSliceSubstream) {
3842 initialize_CABAC_at_slice_segment_start(tctx);
3843 }
3844 else {
3845 initialize_CABAC(tctx);
3846 }
3847
3848 init_CABAC_decoder_2(&tctx->cabac_decoder);
3849
3850 /*enum DecodeResult result =*/ decode_substream(tctx, false, data->firstSliceSubstream);
3851
3852 state = Finished;
3853 img->thread_finishes();
3854
3855 return; // DE265_OK;
3856 }
3857
3858
3859 void thread_task_ctb_row::work()
3860 {
3861 struct thread_task_ctb_row* data = this;
3862 thread_context* tctx = data->tctx;
3863 de265_image* img = tctx->img;
3864
3865 seq_parameter_set* sps = &img->sps;
3866 int ctbW = sps->PicWidthInCtbsY;
3867
3868 state = Running;
3869 img->thread_run();
3870
3871 setCtbAddrFromTS(tctx);
3872
3873 int ctby = tctx->CtbAddrInRS / ctbW;
3874 int myCtbRow = ctby;
3875
3876 // printf("start decoding at %d/%d\n", ctbx,ctby);
3877
3878 if (data->firstSliceSubstream) {
3879 initialize_CABAC_at_slice_segment_start(tctx);
3880 //initialize_CABAC(tctx);
3881 }
3882
3883 init_CABAC_decoder_2(&tctx->cabac_decoder);
3884
3885 bool firstIndependentSubstream =
3886 data->firstSliceSubstream && !tctx->shdr->dependent_slice_segment_flag;
3887
3888 /*enum DecodeResult result =*/
3889 decode_substream(tctx, true, firstIndependentSubstream);
3890
3891 // mark progress on remaining CTBs in row (in case of decoder error and early termination)
3892
3893 // TODO: what about slices that end properly in the middle of a CTB row?
3894
3895 #if 1
3896 if (tctx->CtbY == myCtbRow) {
3897 int lastCtbX = sps->PicWidthInCtbsY; // assume no tiles when WPP is on
3898 for (int x = tctx->CtbX; x<lastCtbX ; x++) {
3899 img->ctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER);
3900 }
3901 }
3902 #endif
3903
3904 state = Finished;
3905 img->thread_finishes();
3906 }
3907
3908
3909 de265_error read_slice_segment_data(thread_context* tctx)
3910 {
3911 setCtbAddrFromTS(tctx);
3912
3913 de265_image* img = tctx->img;
3914 const pic_parameter_set* pps = &img->pps;
3915 const seq_parameter_set* sps = &img->sps;
3916 slice_segment_header* shdr = tctx->shdr;
3917
3918 initialize_CABAC_at_slice_segment_start(tctx);
3919
3920 init_CABAC_decoder_2(&tctx->cabac_decoder);
3921
3922 // printf("-----\n");
3923
3924 bool first_slice_substream = !shdr->dependent_slice_segment_flag;
3925
3926 int substream=0;
3927
3928 enum DecodeResult result;
3929 do {
3930 int ctby = tctx->CtbY;
3931
3932
3933 // check whether entry_points[] are correct in the bitstream
3934
3935 if (substream>0) {
3936 if (substream-1 >= tctx->shdr->entry_point_offset.size() ||
3937 tctx->cabac_decoder.bitstream_curr - tctx->cabac_decoder.bitstream_start -2 /* -2 because of CABAC init */
3938 != tctx->shdr->entry_point_offset[substream-1]) {
3939 tctx->decctx->add_warning(DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET, true);
3940 }
3941 }
3942
3943 substream++;
3944
3945
3946 result = decode_substream(tctx, false, first_slice_substream);
3947
3948
3949 if (result == Decode_EndOfSliceSegment ||
3950 result == Decode_Error) {
3951 break;
3952 }
3953
3954 first_slice_substream = false;
3955
3956 if (pps->tiles_enabled_flag) {
3957 initialize_CABAC(tctx);
3958 }
3959 } while (true);
3960
3961 return DE265_OK;
3962 }
3963
3964
3965 /* TODO:
3966 When a task wants to block, but is the first in the list of pending tasks,
3967    do some error concealment instead of blocking, since it will never be unblocked.
3968 This will only happen in the case of input error.
3969 */
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * Authors: StrukturAG, Dirk Farin <farin@struktur.de>
5 * Min Chen <chenm003@163.com>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * Authors: struktur AG, Dirk Farin <farin@struktur.de>
5 * Min Chen <chenm003@163.com>
66 *
77 * This file is part of libde265.
88 *
2727 #include "libde265/de265.h"
2828 #include "libde265/util.h"
2929 #include "libde265/refpic.h"
30
31
32 #define MAX_CTB_ROWS 68 // enough for 4K @ 32 pixel CTBs, but TODO: make this dynamic
33 #define MAX_ENTRY_POINTS MAX_CTB_ROWS
34 #define MAX_THREAD_CONTEXTS MAX_CTB_ROWS
30 #include "libde265/threads.h"
31
32 #include <vector>
33
3534 #define MAX_NUM_REF_PICS 16
3635
3736 #define SLICE_TYPE_B 0
133132
134133
135134 typedef struct slice_segment_header {
135 slice_segment_header() { }
136
137 de265_error read(bitreader* br, struct decoder_context*, bool* continueDecoding);
138 void dump_slice_segment_header(const decoder_context*, int fd) const;
139
140
136141 int slice_index; // index through all slices in a picture
137 char inUse; // slice is used by a picture currently in the buffer
138142
139143 char first_slice_segment_in_pic_flag;
140144 char no_output_of_prior_pics_flag;
167171 char num_ref_idx_active_override_flag;
168172 int num_ref_idx_l0_active; // [1;16]
169173 int num_ref_idx_l1_active; // [1;16]
170
171 //ref_pic_lists_modification()
172174
173175 char ref_pic_list_modification_flag_l0;
174176 char ref_pic_list_modification_flag_l1;
209211
210212 int num_entry_point_offsets;
211213 int offset_len;
212 int entry_point_offset[MAX_ENTRY_POINTS];
214 std::vector<int> entry_point_offset;
213215
214216 int slice_segment_header_extension_length;
215217
216218
217219 // --- derived data ---
218220
219 int SliceAddrRS; // start if last independent slice
221 int SliceAddrRS; // start of last independent slice
220222 int SliceQPY;
221223
222224 int initType;
223225
224226 int MaxNumMergeCand;
225227 int CurrRpsIdx;
226 const ref_pic_set* CurrRps; // the active reference-picture set
228 ref_pic_set CurrRps; // the active reference-picture set
229 int NumPocTotalCurr;
227230
228231 int RefPicList[2][MAX_NUM_REF_PICS]; // contains indices into DPB
229232 int RefPicList_POC[2][MAX_NUM_REF_PICS];
233 int RefPicList_PicState[2][MAX_NUM_REF_PICS]; /* We have to save the PicState because the decoding
234 of an image may be delayed and the PicState can
235 change in the meantime (e.g. from ShortTerm to
236 LongTerm). PicState is used in motion.cc */
230237
231238 char LongTermRefPic[2][MAX_NUM_REF_PICS]; /* Flag whether the picture at this ref-pic-list
232239 is a long-term picture. */
233240
234 // context storage for dependent slices and WPP in single-thread mode
241 // context storage for dependent slices (stores CABAC model at end of slice segment)
235242 context_model ctx_model_storage[CONTEXT_MODEL_TABLE_LENGTH];
243
244 std::vector<int> RemoveReferencesList; // images that can be removed from the DPB before decoding this slice
236245
237246 } slice_segment_header;
238247
249258 } sao_info;
250259
251260
261
262
263 de265_error read_slice_segment_data(struct thread_context* tctx);
264
265 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable();
266 void free_significant_coeff_ctxIdx_lookupTable();
267
268
269 class thread_task_ctb_row : public thread_task
270 {
271 public:
272 bool firstSliceSubstream;
273 struct thread_context* tctx;
274
275 virtual void work();
276 };
277
278 class thread_task_slice_segment : public thread_task
279 {
280 public:
281 bool firstSliceSubstream;
282 struct thread_context* tctx;
283
284 virtual void work();
285 };
286
252287 #endif
+0
-42
libde265/slice_func.h
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_SLICE_FUNC_H
21 #define DE265_SLICE_FUNC_H
22
23 #include "libde265/slice.h"
24 #include "libde265/decctx.h"
25 #include "libde265/bitstream.h"
26 #include "libde265/threads.h"
27
28
29 de265_error read_slice_segment_header(bitreader* br, slice_segment_header* shdr, decoder_context*,
30 bool* continueDecoding);
31 void dump_slice_segment_header(const slice_segment_header* shdr, const decoder_context*, int fd);
32
33
34 de265_error read_slice_segment_data(decoder_context*, thread_context* tctx);
35
36 bool add_CTB_decode_task_syntax(struct thread_context* tctx, int ctbx,int ctby, int sx,int sy, thread_task* nextCTBTask);
37
38 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable();
39 void free_significant_coeff_ctxIdx_lookupTable();
40
41 #endif
+0
-794
libde265/sps.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sps.h"
21 #include "sps_func.h"
22 #include "util.h"
23 #include "scan.h"
24
25 #include <assert.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29
30 static int SubWidthC[] = { -1,2,2,1 };
31 static int SubHeightC[] = { -1,2,1,1 };
32
33
34 // TODO if (!check_high(ctx, vlc, 15)) return false;
35 // TODO if (!check_ulvc(ctx, vlc)) return false;
36
37
38 // TODO: should be in some header-file of refpic.c
39 extern bool read_short_term_ref_pic_set(decoder_context* ctx,
40 const seq_parameter_set* sps,
41 bitreader* br,
42 ref_pic_set* out_set,
43 int idxRps, // index of the set to be read
44 const ref_pic_set* sets,
45 bool sliceRefPicSet);
46
47 void init_sps(seq_parameter_set* sps)
48 {
49 memset(sps,0,sizeof(seq_parameter_set));
50 }
51
52
53 void free_sps(seq_parameter_set* sps)
54 {
55 free(sps->ref_pic_sets);
56 sps->ref_pic_sets = NULL;
57 }
58
59
60 de265_error read_sps(decoder_context* ctx, bitreader* br,
61 seq_parameter_set* sps) //, ref_pic_set** ref_pic_sets)
62 {
63 int vlc;
64
65 sps->video_parameter_set_id = get_bits(br,4);
66 sps->sps_max_sub_layers = get_bits(br,3) +1;
67 if (sps->sps_max_sub_layers>7) {
68 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
69 }
70
71 sps->sps_temporal_id_nesting_flag = get_bits(br,1);
72
73 read_profile_tier_level(br,&sps->profile_tier_level, sps->sps_max_sub_layers);
74
75 sps->seq_parameter_set_id = get_uvlc(br);
76
77
78 // --- decode chroma type ---
79
80 sps->chroma_format_idc = get_uvlc(br);
81
82 if (sps->chroma_format_idc == 3) {
83 sps->separate_colour_plane_flag = get_bits(br,1);
84 }
85 else {
86 sps->separate_colour_plane_flag = 0;
87 }
88
89 if (sps->separate_colour_plane_flag) {
90 sps->ChromaArrayType = 0;
91 }
92 else {
93 sps->ChromaArrayType = sps->chroma_format_idc;
94 }
95
96 if (sps->chroma_format_idc<0 ||
97 sps->chroma_format_idc>3) {
98 add_warning(ctx, DE265_WARNING_INVALID_CHROMA_FORMAT, false);
99 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
100 }
101
102 sps->SubWidthC = SubWidthC [sps->chroma_format_idc];
103 sps->SubHeightC = SubHeightC[sps->chroma_format_idc];
104
105
106 // --- picture size ---
107
108 sps->pic_width_in_luma_samples = get_uvlc(br);
109 sps->pic_height_in_luma_samples = get_uvlc(br);
110
111 sps->conformance_window_flag = get_bits(br,1);
112
113 if (sps->conformance_window_flag) {
114 sps->conf_win_left_offset = get_uvlc(br);
115 sps->conf_win_right_offset = get_uvlc(br);
116 sps->conf_win_top_offset = get_uvlc(br);
117 sps->conf_win_bottom_offset= get_uvlc(br);
118 }
119 else {
120 sps->conf_win_left_offset = 0;
121 sps->conf_win_right_offset = 0;
122 sps->conf_win_top_offset = 0;
123 sps->conf_win_bottom_offset= 0;
124 }
125
126 if (sps->ChromaArrayType==0) {
127 sps->WinUnitX = 1;
128 sps->WinUnitY = 1;
129 }
130 else {
131 sps->WinUnitX = SubWidthC[sps->chroma_format_idc];
132 sps->WinUnitY = SubHeightC[sps->chroma_format_idc];
133 }
134
135
136 sps->bit_depth_luma = get_uvlc(br) +8;
137 sps->bit_depth_chroma = get_uvlc(br) +8;
138
139 sps->log2_max_pic_order_cnt_lsb = get_uvlc(br) +4;
140 sps->MaxPicOrderCntLsb = 1<<(sps->log2_max_pic_order_cnt_lsb);
141
142
143 // --- sub_layer_ordering_info ---
144
145 sps->sps_sub_layer_ordering_info_present_flag = get_bits(br,1);
146
147 int firstLayer = (sps->sps_sub_layer_ordering_info_present_flag ?
148 0 : sps->sps_max_sub_layers-1 );
149
150 for (int i=firstLayer ; i <= sps->sps_max_sub_layers-1; i++ ) {
151
152 // sps_max_dec_pic_buffering[i]
153
154 vlc=get_uvlc(br);
155 if (vlc == UVLC_ERROR ||
156 vlc+1 > MAX_NUM_REF_PICS) {
157 add_warning(ctx, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
158 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
159 }
160
161 sps->sps_max_dec_pic_buffering[i] = vlc+1;
162
163 // sps_max_num_reorder_pics[i]
164
165 vlc = get_uvlc(br);
166 if (vlc == UVLC_ERROR) {
167 add_warning(ctx, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
168 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
169 }
170 sps->sps_max_num_reorder_pics[i] = vlc;
171
172
173 // sps_max_latency_increase[i]
174
175 vlc = get_uvlc(br);
176 if (vlc == UVLC_ERROR) {
177 add_warning(ctx, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
178 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
179 }
180
181 sps->sps_max_latency_increase_plus1[i] = vlc;
182
183 sps->SpsMaxLatencyPictures[i] = (sps->sps_max_num_reorder_pics[i] +
184 sps->sps_max_latency_increase_plus1[i]-1);
185 }
186
187 // copy info to all layers if only specified once
188
189 if (sps->sps_sub_layer_ordering_info_present_flag) {
190 int ref = sps->sps_max_sub_layers-1;
191 assert(ref<7);
192
193 for (int i=0 ; i < sps->sps_max_sub_layers-1; i++ ) {
194 sps->sps_max_dec_pic_buffering[i] = sps->sps_max_dec_pic_buffering[ref];
195 sps->sps_max_num_reorder_pics[i] = sps->sps_max_num_reorder_pics[ref];
196 sps->sps_max_latency_increase_plus1[i] = sps->sps_max_latency_increase_plus1[ref];
197 }
198 }
199
200
201 sps->log2_min_luma_coding_block_size = get_uvlc(br)+3;
202 sps->log2_diff_max_min_luma_coding_block_size = get_uvlc(br);
203 sps->log2_min_transform_block_size = get_uvlc(br)+2;
204 sps->log2_diff_max_min_transform_block_size = get_uvlc(br);
205 sps->max_transform_hierarchy_depth_inter = get_uvlc(br);
206 sps->max_transform_hierarchy_depth_intra = get_uvlc(br);
207 sps->scaling_list_enable_flag = get_bits(br,1);
208
209 if (sps->scaling_list_enable_flag) {
210
211 sps->sps_scaling_list_data_present_flag = get_bits(br,1);
212 if (sps->sps_scaling_list_data_present_flag) {
213
214 de265_error err;
215 if ((err=read_scaling_list(br,sps, &sps->scaling_list, false)) != DE265_OK) {
216 return err;
217 }
218 }
219 else {
220 set_default_scaling_lists(&sps->scaling_list);
221 }
222 }
223
224 sps->amp_enabled_flag = get_bits(br,1);
225 sps->sample_adaptive_offset_enabled_flag = get_bits(br,1);
226 sps->pcm_enabled_flag = get_bits(br,1);
227 if (sps->pcm_enabled_flag) {
228 sps->pcm_sample_bit_depth_luma = get_bits(br,4)+1;
229 sps->pcm_sample_bit_depth_chroma = get_bits(br,4)+1;
230 sps->log2_min_pcm_luma_coding_block_size = get_uvlc(br)+3;
231 sps->log2_diff_max_min_pcm_luma_coding_block_size = get_uvlc(br);
232 sps->pcm_loop_filter_disable_flag = get_bits(br,1);
233 }
234 else {
235 sps->pcm_sample_bit_depth_luma = 0;
236 sps->pcm_sample_bit_depth_chroma = 0;
237 sps->log2_min_pcm_luma_coding_block_size = 0;
238 sps->log2_diff_max_min_pcm_luma_coding_block_size = 0;
239 sps->pcm_loop_filter_disable_flag = 0;
240 }
241
242 sps->num_short_term_ref_pic_sets = get_uvlc(br);
243 if (sps->num_short_term_ref_pic_sets < 0 ||
244 sps->num_short_term_ref_pic_sets > 64) {
245 add_warning(ctx, DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false);
246 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
247 }
248
249 // --- allocate reference pic set ---
250
251 // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself
252 sps->ref_pic_sets = (ref_pic_set*)realloc(sps->ref_pic_sets,
253 sizeof(ref_pic_set)*sps->num_short_term_ref_pic_sets);
254
255 for (int i = 0; i < sps->num_short_term_ref_pic_sets; i++) {
256
257 //alloc_ref_pic_set(&(*ref_pic_sets)[i],
258 //sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1]);
259
260 bool success = read_short_term_ref_pic_set(ctx,sps,br,
261 &sps->ref_pic_sets[i], i,
262 sps->ref_pic_sets,
263 false);
264
265 if (!success) {
266 return DE265_WARNING_SPS_HEADER_INVALID;
267 }
268
269 // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh);
270 }
271
272 sps->long_term_ref_pics_present_flag = get_bits(br,1);
273
274 if (sps->long_term_ref_pics_present_flag) {
275
276 sps->num_long_term_ref_pics_sps = get_uvlc(br);
277 if (sps->num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) {
278 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
279 }
280
281 for (int i = 0; i < sps->num_long_term_ref_pics_sps; i++ ) {
282 sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(br, sps->log2_max_pic_order_cnt_lsb);
283 sps->used_by_curr_pic_lt_sps_flag[i] = get_bits(br,1);
284 }
285 }
286 else {
287 sps->num_long_term_ref_pics_sps = 0; // NOTE: missing definition in standard !
288 }
289
290 sps->sps_temporal_mvp_enabled_flag = get_bits(br,1);
291 sps->strong_intra_smoothing_enable_flag = get_bits(br,1);
292 sps->vui_parameters_present_flag = get_bits(br,1);
293
294 #if 0
295 if (sps->vui_parameters_present_flag) {
296 assert(false);
297 /*
298 vui_parameters()
299
300 sps_extension_flag
301 u(1)
302 if( sps_extension_flag )
303
304 while( more_rbsp_data() )
305
306 sps_extension_data_flag
307 u(1)
308 rbsp_trailing_bits()
309 */
310 }
311
312 sps->sps_extension_flag = get_bits(br,1);
313 if (sps->sps_extension_flag) {
314 assert(false);
315 }
316
317 check_rbsp_trailing_bits(br);
318 #endif
319
320 // --- compute derived values ---
321
322 sps->BitDepth_Y = sps->bit_depth_luma;
323 sps->QpBdOffset_Y = 6*(sps->bit_depth_luma-8);
324 sps->BitDepth_C = sps->bit_depth_chroma;
325 sps->QpBdOffset_C = 6*(sps->bit_depth_chroma-8);
326
327 sps->Log2MinCbSizeY = sps->log2_min_luma_coding_block_size;
328 sps->Log2CtbSizeY = sps->Log2MinCbSizeY + sps->log2_diff_max_min_luma_coding_block_size;
329 sps->MinCbSizeY = 1 << sps->Log2MinCbSizeY;
330 sps->CtbSizeY = 1 << sps->Log2CtbSizeY;
331 sps->PicWidthInMinCbsY = sps->pic_width_in_luma_samples / sps->MinCbSizeY;
332 sps->PicWidthInCtbsY = ceil_div(sps->pic_width_in_luma_samples, sps->CtbSizeY);
333 sps->PicHeightInMinCbsY = sps->pic_height_in_luma_samples / sps->MinCbSizeY;
334 sps->PicHeightInCtbsY = ceil_div(sps->pic_height_in_luma_samples,sps->CtbSizeY);
335 sps->PicSizeInMinCbsY = sps->PicWidthInMinCbsY * sps->PicHeightInMinCbsY;
336 sps->PicSizeInCtbsY = sps->PicWidthInCtbsY * sps->PicHeightInCtbsY;
337 sps->PicSizeInSamplesY = sps->pic_width_in_luma_samples * sps->pic_height_in_luma_samples;
338
339 if (sps->chroma_format_idc==0 || sps->separate_colour_plane_flag) {
340 sps->CtbWidthC = 0;
341 sps->CtbHeightC = 0;
342 }
343 else {
344 sps->CtbWidthC = sps->CtbSizeY / sps->SubWidthC;
345 sps->CtbHeightC = sps->CtbSizeY / sps->SubHeightC;
346 }
347
348 sps->Log2MinTrafoSize = sps->log2_min_transform_block_size;
349 sps->Log2MaxTrafoSize = sps->log2_min_transform_block_size + sps->log2_diff_max_min_transform_block_size;
350
351 sps->Log2MinPUSize = sps->Log2MinCbSizeY-1;
352 sps->PicWidthInMinPUs = sps->PicWidthInCtbsY << (sps->Log2CtbSizeY - sps->Log2MinPUSize);
353 sps->PicHeightInMinPUs = sps->PicHeightInCtbsY << (sps->Log2CtbSizeY - sps->Log2MinPUSize);
354
355 sps->Log2MinIpcmCbSizeY = sps->log2_min_pcm_luma_coding_block_size;
356 sps->Log2MaxIpcmCbSizeY = (sps->log2_min_pcm_luma_coding_block_size +
357 sps->log2_diff_max_min_pcm_luma_coding_block_size);
358
359 // the following are not in the standard
360 sps->PicWidthInTbsY = sps->PicWidthInCtbsY << (sps->Log2CtbSizeY - sps->Log2MinTrafoSize);
361 sps->PicHeightInTbsY = sps->PicHeightInCtbsY << (sps->Log2CtbSizeY - sps->Log2MinTrafoSize);
362 sps->PicSizeInTbsY = sps->PicWidthInTbsY * sps->PicHeightInTbsY;
363
364 sps->sps_read = true;
365
366 return DE265_OK;
367 }
368
369
370
371 void dump_sps(seq_parameter_set* sps, /*ref_pic_set* sets,*/ int fd)
372 {
373 //#if (_MSC_VER >= 1500)
374 //#define LOG0(t) loginfo(LogHeaders, t)
375 //#define LOG1(t,d) loginfo(LogHeaders, t,d)
376 //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2)
377 //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3)
378
379 FILE* fh;
380 if (fd==1) fh=stdout;
381 else if (fd==2) fh=stderr;
382 else { return; }
383
384 #define LOG0(t) log2fh(fh, t)
385 #define LOG1(t,d) log2fh(fh, t,d)
386 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
387 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
388
389
390 LOG0("----------------- SPS -----------------\n");
391 LOG1("video_parameter_set_id : %d\n", sps->video_parameter_set_id);
392 LOG1("sps_max_sub_layers : %d\n", sps->sps_max_sub_layers);
393 LOG1("sps_temporal_id_nesting_flag : %d\n", sps->sps_temporal_id_nesting_flag);
394
395 dump_profile_tier_level(&sps->profile_tier_level, sps->sps_max_sub_layers, fh);
396
397 LOG1("seq_parameter_set_id : %d\n", sps->seq_parameter_set_id);
398 LOG2("chroma_format_idc : %d (%s)\n", sps->chroma_format_idc,
399 sps->chroma_format_idc == 1 ? "4:2:0" :
400 sps->chroma_format_idc == 2 ? "4:2:2" :
401 sps->chroma_format_idc == 3 ? "4:4:4" : "unknown");
402
403 if (sps->chroma_format_idc == 3) {
404 LOG1("separate_colour_plane_flag : %d\n", sps->separate_colour_plane_flag);
405 }
406
407 LOG1("pic_width_in_luma_samples : %d\n", sps->pic_width_in_luma_samples);
408 LOG1("pic_height_in_luma_samples : %d\n", sps->pic_height_in_luma_samples);
409 LOG1("conformance_window_flag : %d\n", sps->conformance_window_flag);
410
411 if (sps->conformance_window_flag) {
412 LOG1("conf_win_left_offset : %d\n", sps->conf_win_left_offset);
413 LOG1("conf_win_right_offset : %d\n", sps->conf_win_right_offset);
414 LOG1("conf_win_top_offset : %d\n", sps->conf_win_top_offset);
415 LOG1("conf_win_bottom_offset: %d\n", sps->conf_win_bottom_offset);
416 }
417
418 LOG1("bit_depth_luma : %d\n", sps->bit_depth_luma);
419 LOG1("bit_depth_chroma : %d\n", sps->bit_depth_chroma);
420
421 LOG1("log2_max_pic_order_cnt_lsb : %d\n", sps->log2_max_pic_order_cnt_lsb);
422 LOG1("sps_sub_layer_ordering_info_present_flag : %d\n", sps->sps_sub_layer_ordering_info_present_flag);
423
424 int firstLayer = (sps->sps_sub_layer_ordering_info_present_flag ?
425 0 : sps->sps_max_sub_layers-1 );
426
427 for (int i=firstLayer ; i <= sps->sps_max_sub_layers-1; i++ ) {
428 LOG1("Layer %d\n",i);
429 LOG1(" sps_max_dec_pic_buffering : %d\n", sps->sps_max_dec_pic_buffering[i]);
430 LOG1(" sps_max_num_reorder_pics : %d\n", sps->sps_max_num_reorder_pics[i]);
431 LOG1(" sps_max_latency_increase_plus1 : %d\n", sps->sps_max_latency_increase_plus1[i]);
432 }
433
434 LOG1("log2_min_luma_coding_block_size : %d\n", sps->log2_min_luma_coding_block_size);
435 LOG1("log2_diff_max_min_luma_coding_block_size : %d\n",sps->log2_diff_max_min_luma_coding_block_size);
436 LOG1("log2_min_transform_block_size : %d\n", sps->log2_min_transform_block_size);
437 LOG1("log2_diff_max_min_transform_block_size : %d\n", sps->log2_diff_max_min_transform_block_size);
438 LOG1("max_transform_hierarchy_depth_inter : %d\n", sps->max_transform_hierarchy_depth_inter);
439 LOG1("max_transform_hierarchy_depth_intra : %d\n", sps->max_transform_hierarchy_depth_intra);
440 LOG1("scaling_list_enable_flag : %d\n", sps->scaling_list_enable_flag);
441
442 if (sps->scaling_list_enable_flag) {
443
444 LOG1("sps_scaling_list_data_present_flag : %d\n", sps->sps_scaling_list_data_present_flag);
445 if (sps->sps_scaling_list_data_present_flag) {
446
447 LOG0("scaling list logging output not implemented");
448 //assert(0);
449 //scaling_list_data()
450 }
451 }
452
453 LOG1("amp_enabled_flag : %d\n", sps->amp_enabled_flag);
454 LOG1("sample_adaptive_offset_enabled_flag : %d\n", sps->sample_adaptive_offset_enabled_flag);
455 LOG1("pcm_enabled_flag : %d\n", sps->pcm_enabled_flag);
456
457 if (sps->pcm_enabled_flag) {
458 LOG1("pcm_sample_bit_depth_luma : %d\n", sps->pcm_sample_bit_depth_luma);
459 LOG1("pcm_sample_bit_depth_chroma : %d\n", sps->pcm_sample_bit_depth_chroma);
460 LOG1("log2_min_pcm_luma_coding_block_size : %d\n", sps->log2_min_pcm_luma_coding_block_size);
461 LOG1("log2_diff_max_min_pcm_luma_coding_block_size : %d\n", sps->log2_diff_max_min_pcm_luma_coding_block_size);
462 LOG1("pcm_loop_filter_disable_flag : %d\n", sps->pcm_loop_filter_disable_flag);
463 }
464
465 LOG1("num_short_term_ref_pic_sets : %d\n", sps->num_short_term_ref_pic_sets);
466
467 for (int i = 0; i < sps->num_short_term_ref_pic_sets; i++) {
468 LOG1("ref_pic_set[ %2d ]: ",i);
469 dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[i], 16, fh);
470 }
471
472 LOG1("long_term_ref_pics_present_flag : %d\n", sps->long_term_ref_pics_present_flag);
473
474 if (sps->long_term_ref_pics_present_flag) {
475
476 LOG1("num_long_term_ref_pics_sps : %d\n", sps->num_long_term_ref_pics_sps);
477
478 for (int i = 0; i < sps->num_long_term_ref_pics_sps; i++ ) {
479 LOG3("lt_ref_pic_poc_lsb_sps[%d] : %d (used_by_curr_pic_lt_sps_flag=%d)\n",
480 i, sps->lt_ref_pic_poc_lsb_sps[i], sps->used_by_curr_pic_lt_sps_flag[i]);
481 }
482 }
483
484 LOG1("sps_temporal_mvp_enabled_flag : %d\n", sps->sps_temporal_mvp_enabled_flag);
485 LOG1("strong_intra_smoothing_enable_flag : %d\n", sps->strong_intra_smoothing_enable_flag);
486 LOG1("vui_parameters_present_flag : %d\n", sps->vui_parameters_present_flag);
487
488 LOG1("CtbSizeY : %d\n", sps->CtbSizeY);
489 LOG1("MinCbSizeY : %d\n", sps->MinCbSizeY);
490 LOG1("MaxCbSizeY : %d\n", 1<<(sps->log2_min_luma_coding_block_size + sps->log2_diff_max_min_luma_coding_block_size));
491 LOG1("MinTBSizeY : %d\n", 1<<sps->log2_min_transform_block_size);
492 LOG1("MaxTBSizeY : %d\n", 1<<(sps->log2_min_transform_block_size + sps->log2_diff_max_min_transform_block_size));
493
494 LOG1("SubWidthC : %d\n", sps->SubWidthC);
495 LOG1("SubHeightC : %d\n", sps->SubHeightC);
496
497 return;
498
499 if (sps->vui_parameters_present_flag) {
500 assert(false);
501 /*
502 vui_parameters()
503
504 sps_extension_flag
505 u(1)
506 if( sps_extension_flag )
507
508 while( more_rbsp_data() )
509
510 sps_extension_data_flag
511 u(1)
512 rbsp_trailing_bits()
513 */
514 }
515 #undef LOG0
516 #undef LOG1
517 #undef LOG2
518 #undef LOG3
519 //#endif
520 }
521
522
523 static uint8_t default_ScalingList_4x4[16] = {
524 16,16,16,16,16,16,16,16,
525 16,16,16,16,16,16,16,16
526 };
527
528 static uint8_t default_ScalingList_8x8_intra[64] = {
529 16,16,16,16,16,16,16,16,
530 16,16,17,16,17,16,17,18,
531 17,18,18,17,18,21,19,20,
532 21,20,19,21,24,22,22,24,
533 24,22,22,24,25,25,27,30,
534 27,25,25,29,31,35,35,31,
535 29,36,41,44,41,36,47,54,
536 54,47,65,70,65,88,88,115
537 };
538
539 static uint8_t default_ScalingList_8x8_inter[64] = {
540 16,16,16,16,16,16,16,16,
541 16,16,17,17,17,17,17,18,
542 18,18,18,18,18,20,20,20,
543 20,20,20,20,24,24,24,24,
544 24,24,24,24,25,25,25,25,
545 25,25,25,28,28,28,28,28,
546 28,33,33,33,33,33,41,41,
547 41,41,54,54,54,71,71,91
548 };
549
550
551 void fill_scaling_factor(uint8_t* scalingFactors, const uint8_t* sclist, int sizeId)
552 {
553 const position* scan;
554 int width;
555 int subWidth;
556
557 switch (sizeId) {
558 case 0:
559 width=4;
560 subWidth=1;
561 scan = get_scan_order(2, 0 /* diag */);
562
563 for (int i=0;i<4*4;i++) {
564 scalingFactors[scan[i].x + width*scan[i].y] = sclist[i];
565 }
566 break;
567
568 case 1:
569 width=8;
570 subWidth=1;
571 scan = get_scan_order(3, 0 /* diag */);
572
573 for (int i=0;i<8*8;i++) {
574 scalingFactors[scan[i].x + width*scan[i].y] = sclist[i];
575 }
576 break;
577
578 case 2:
579 width=8;
580 subWidth=2;
581 scan = get_scan_order(3, 0 /* diag */);
582
583 for (int i=0;i<8*8;i++) {
584 for (int dy=0;dy<2;dy++)
585 for (int dx=0;dx<2;dx++)
586 {
587 int x = 2*scan[i].x+dx;
588 int y = 2*scan[i].y+dy;
589 scalingFactors[x+width*subWidth*y] = sclist[i];
590 }
591 }
592 break;
593
594 case 3:
595 width=8;
596 subWidth=4;
597 scan = get_scan_order(3, 0 /* diag */);
598
599 for (int i=0;i<8*8;i++) {
600 for (int dy=0;dy<4;dy++)
601 for (int dx=0;dx<4;dx++)
602 {
603 int x = 4*scan[i].x+dx;
604 int y = 4*scan[i].y+dy;
605 scalingFactors[x+width*subWidth*y] = sclist[i];
606 }
607 }
608 break;
609
610 default:
611 assert(0);
612 break;
613 }
614
615
616 // --- dump matrix ---
617
618 #if 0
619 for (int y=0;y<width;y++) {
620 for (int x=0;x<width;x++)
621 printf("%d,",scalingFactors[x*subWidth + width*subWidth*subWidth*y]);
622
623 printf("\n");
624 }
625 #endif
626 }
627
628
629 de265_error read_scaling_list(bitreader* br, const seq_parameter_set* sps,
630 scaling_list_data* sclist, bool inPPS)
631 {
632 int dc_coeff[4][6];
633
634 for (int sizeId=0;sizeId<4;sizeId++) {
635 int n = ((sizeId==3) ? 2 : 6);
636 uint8_t scaling_list[6][32*32];
637
638 for (int matrixId=0;matrixId<n;matrixId++) {
639 uint8_t* curr_scaling_list = scaling_list[matrixId];
640 int scaling_list_dc_coef;
641
642 int canonicalMatrixId = matrixId;
643 if (sizeId==3 && matrixId==1) { canonicalMatrixId=3; }
644
645
646 //printf("----- matrix %d\n",matrixId);
647
648 char scaling_list_pred_mode_flag = get_bits(br,1);
649 if (!scaling_list_pred_mode_flag) {
650 int scaling_list_pred_matrix_id_delta = get_uvlc(br);
651 if (scaling_list_pred_matrix_id_delta < 0 ||
652 scaling_list_pred_matrix_id_delta > matrixId) {
653 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
654 }
655
656 //printf("scaling_list_pred_matrix_id_delta=%d\n", scaling_list_pred_matrix_id_delta);
657
658 dc_coeff[sizeId][matrixId] = 16;
659 scaling_list_dc_coef = 16;
660
661 if (scaling_list_pred_matrix_id_delta==0) {
662 if (sizeId==0) {
663 memcpy(curr_scaling_list, default_ScalingList_4x4, 16);
664 }
665 else {
666 if (canonicalMatrixId<3)
667 { memcpy(curr_scaling_list, default_ScalingList_8x8_intra,64); }
668 else
669 { memcpy(curr_scaling_list, default_ScalingList_8x8_inter,64); }
670 }
671 }
672 else {
673 // TODO: CHECK: for sizeID=3 and the second matrix, should we have delta=1 or delta=3 ?
674 if (sizeId==3) { assert(scaling_list_pred_matrix_id_delta==1); }
675
676 int mID = matrixId - scaling_list_pred_matrix_id_delta;
677
678 int len = (sizeId == 0 ? 16 : 64);
679 memcpy(curr_scaling_list, scaling_list[mID], len);
680
681 scaling_list_dc_coef = dc_coeff[sizeId][mID];
682 dc_coeff[sizeId][matrixId] = dc_coeff[sizeId][mID];
683 }
684 }
685 else {
686 int nextCoef=8;
687 int coefNum = (sizeId==0 ? 16 : 64);
688 if (sizeId>1) {
689 scaling_list_dc_coef = get_svlc(br);
690 if (scaling_list_dc_coef < -7 ||
691 scaling_list_dc_coef > 247) {
692 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
693 }
694
695 scaling_list_dc_coef += 8;
696 nextCoef=scaling_list_dc_coef;
697 dc_coeff[sizeId][matrixId] = scaling_list_dc_coef;
698 }
699 else {
700 scaling_list_dc_coef = 16;
701 }
702 //printf("DC = %d\n",scaling_list_dc_coef);
703
704 for (int i=0;i<coefNum;i++) {
705 int scaling_list_delta_coef = get_svlc(br);
706 if (scaling_list_delta_coef < -128 ||
707 scaling_list_delta_coef > 127) {
708 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
709 }
710
711 nextCoef = (nextCoef + scaling_list_delta_coef + 256) % 256;
712 if (nextCoef < 0 || nextCoef > 255) {
713 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
714 }
715
716 curr_scaling_list[i] = nextCoef;
717 //printf("curr %d = %d\n",i,nextCoef);
718 }
719 }
720
721
722 // --- generate ScalingFactor arrays ---
723
724 switch (sizeId) {
725 case 0:
726 fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], curr_scaling_list, 0);
727 break;
728
729 case 1:
730 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId][0][0], curr_scaling_list, 1);
731 break;
732
733 case 2:
734 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId][0][0], curr_scaling_list, 2);
735 sclist->ScalingFactor_Size2[matrixId][0][0] = scaling_list_dc_coef;
736 //printf("DC coeff: %d\n", scaling_list_dc_coef);
737 break;
738
739 case 3:
740 fill_scaling_factor(&sclist->ScalingFactor_Size3[matrixId][0][0], curr_scaling_list, 3);
741 sclist->ScalingFactor_Size3[matrixId][0][0] = scaling_list_dc_coef;
742 //printf("DC coeff: %d\n", scaling_list_dc_coef);
743 break;
744 }
745 }
746 }
747
748 return DE265_OK;
749 }
750
751
752 void set_default_scaling_lists(scaling_list_data* sclist)
753 {
754 // 4x4
755
756 for (int matrixId=0;matrixId<6;matrixId++) {
757 fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0],
758 default_ScalingList_4x4, 0);
759 }
760
761 // 8x8
762
763 for (int matrixId=0;matrixId<3;matrixId++) {
764 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+0][0][0],
765 default_ScalingList_8x8_intra, 1);
766 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+3][0][0],
767 default_ScalingList_8x8_inter, 1);
768 }
769
770 // 16x16
771
772 for (int matrixId=0;matrixId<3;matrixId++) {
773 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+0][0][0],
774 default_ScalingList_8x8_intra, 2);
775 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+3][0][0],
776 default_ScalingList_8x8_inter, 2);
777 }
778
779 // 32x32
780
781 fill_scaling_factor(&sclist->ScalingFactor_Size3[0][0][0],
782 default_ScalingList_8x8_intra, 3);
783 fill_scaling_factor(&sclist->ScalingFactor_Size3[1][0][0],
784 default_ScalingList_8x8_inter, 3);
785 }
786
787
788 void move_sps(seq_parameter_set* dest,
789 seq_parameter_set* src)
790 {
791 memcpy(dest, src, sizeof(seq_parameter_set));
792 memset(src, 0, sizeof(seq_parameter_set));
793 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "sps.h"
21 #include "util.h"
22 #include "scan.h"
23 #include "decctx.h"
24
25 #include <assert.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #define READ_VLC_OFFSET(variable, vlctype, offset) \
30 if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) { \
31 ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \
32 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \
33 } \
34 variable = vlc + offset;
35
36 #define READ_VLC(variable, vlctype) READ_VLC_OFFSET(variable,vlctype,0)
37
38
39 static int SubWidthC_tab[] = { -1,2,2,1 };
40 static int SubHeightC_tab[] = { -1,2,1,1 };
41
42
43 // TODO if (!check_high(ctx, vlc, 15)) return false;
44 // TODO if (!check_ulvc(ctx, vlc)) return false;
45
46
47 // TODO: should be in some header-file of refpic.c
48 extern bool read_short_term_ref_pic_set(decoder_context* ctx,
49 const seq_parameter_set* sps,
50 bitreader* br,
51 ref_pic_set* out_set,
52 int idxRps, // index of the set to be read
53 const std::vector<ref_pic_set>& sets,
54 bool sliceRefPicSet);
55
56 seq_parameter_set::seq_parameter_set()
57 {
58 // TODO: this is dangerous
59 //memset(this,0,sizeof(seq_parameter_set));
60
61 sps_read = false;
62 //ref_pic_sets = NULL;
63 }
64
65
66 seq_parameter_set::~seq_parameter_set()
67 {
68 //free(ref_pic_sets);
69 }
70
71
72 de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
73 {
74 int vlc;
75
76 video_parameter_set_id = get_bits(br,4);
77 sps_max_sub_layers = get_bits(br,3) +1;
78 if (sps_max_sub_layers>7) {
79 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
80 }
81
82 sps_temporal_id_nesting_flag = get_bits(br,1);
83
84 read_profile_tier_level(br,&profile_tier_level, sps_max_sub_layers);
85
86 READ_VLC(seq_parameter_set_id, uvlc);
87
88
89 // --- decode chroma type ---
90
91 READ_VLC(chroma_format_idc, uvlc);
92
93 if (chroma_format_idc == 3) {
94 separate_colour_plane_flag = get_bits(br,1);
95 }
96 else {
97 separate_colour_plane_flag = 0;
98 }
99
100 if (separate_colour_plane_flag) {
101 ChromaArrayType = 0;
102 }
103 else {
104 ChromaArrayType = chroma_format_idc;
105 }
106
107 if (chroma_format_idc<0 ||
108 chroma_format_idc>3) {
109 ctx->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false);
110 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
111 }
112
113 SubWidthC = SubWidthC_tab [chroma_format_idc];
114 SubHeightC = SubHeightC_tab[chroma_format_idc];
115
116
117 // --- picture size ---
118
119 READ_VLC(pic_width_in_luma_samples, uvlc);
120 READ_VLC(pic_height_in_luma_samples, uvlc);
121
122 conformance_window_flag = get_bits(br,1);
123
124 if (conformance_window_flag) {
125 READ_VLC(conf_win_left_offset, uvlc);
126 READ_VLC(conf_win_right_offset, uvlc);
127 READ_VLC(conf_win_top_offset, uvlc);
128 READ_VLC(conf_win_bottom_offset,uvlc);
129 }
130 else {
131 conf_win_left_offset = 0;
132 conf_win_right_offset = 0;
133 conf_win_top_offset = 0;
134 conf_win_bottom_offset= 0;
135 }
136
137 if (ChromaArrayType==0) {
138 WinUnitX = 1;
139 WinUnitY = 1;
140 }
141 else {
142 WinUnitX = SubWidthC_tab [chroma_format_idc];
143 WinUnitY = SubHeightC_tab[chroma_format_idc];
144 }
145
146
147 READ_VLC_OFFSET(bit_depth_luma, uvlc, 8);
148 READ_VLC_OFFSET(bit_depth_chroma,uvlc, 8);
149
150 READ_VLC_OFFSET(log2_max_pic_order_cnt_lsb, uvlc, 4);
151 MaxPicOrderCntLsb = 1<<(log2_max_pic_order_cnt_lsb);
152
153
154 // --- sub_layer_ordering_info ---
155
156 sps_sub_layer_ordering_info_present_flag = get_bits(br,1);
157
158 int firstLayer = (sps_sub_layer_ordering_info_present_flag ?
159 0 : sps_max_sub_layers-1 );
160
161 for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) {
162
163 // sps_max_dec_pic_buffering[i]
164
165 vlc=get_uvlc(br);
166 if (vlc == UVLC_ERROR ||
167 vlc+1 > MAX_NUM_REF_PICS) {
168 ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
169 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
170 }
171
172 sps_max_dec_pic_buffering[i] = vlc+1;
173
174 // sps_max_num_reorder_pics[i]
175
176 READ_VLC(sps_max_num_reorder_pics[i], uvlc);
177
178
179 // sps_max_latency_increase[i]
180
181 READ_VLC(sps_max_latency_increase_plus1[i], uvlc);
182
183 SpsMaxLatencyPictures[i] = (sps_max_num_reorder_pics[i] +
184 sps_max_latency_increase_plus1[i]-1);
185 }
186
187 // copy info to all layers if only specified once
188
189 if (!sps_sub_layer_ordering_info_present_flag) {
190 int ref = sps_max_sub_layers-1;
191 assert(ref<7);
192
193 for (int i=0 ; i < sps_max_sub_layers-1; i++ ) {
194 sps_max_dec_pic_buffering[i] = sps_max_dec_pic_buffering[ref];
195 sps_max_num_reorder_pics[i] = sps_max_num_reorder_pics[ref];
196 sps_max_latency_increase_plus1[i] = sps_max_latency_increase_plus1[ref];
197 }
198 }
199
200
201 READ_VLC_OFFSET(log2_min_luma_coding_block_size, uvlc, 3);
202 READ_VLC (log2_diff_max_min_luma_coding_block_size, uvlc);
203 READ_VLC_OFFSET(log2_min_transform_block_size, uvlc, 2);
204 READ_VLC(log2_diff_max_min_transform_block_size, uvlc);
205 READ_VLC(max_transform_hierarchy_depth_inter, uvlc);
206 READ_VLC(max_transform_hierarchy_depth_intra, uvlc);
207 scaling_list_enable_flag = get_bits(br,1);
208
209 if (scaling_list_enable_flag) {
210
211 sps_scaling_list_data_present_flag = get_bits(br,1);
212 if (sps_scaling_list_data_present_flag) {
213
214 de265_error err;
215 if ((err=read_scaling_list(br,this, &scaling_list, false)) != DE265_OK) {
216 return err;
217 }
218 }
219 else {
220 set_default_scaling_lists(&scaling_list);
221 }
222 }
223
224 amp_enabled_flag = get_bits(br,1);
225 sample_adaptive_offset_enabled_flag = get_bits(br,1);
226 pcm_enabled_flag = get_bits(br,1);
227 if (pcm_enabled_flag) {
228 pcm_sample_bit_depth_luma = get_bits(br,4)+1;
229 pcm_sample_bit_depth_chroma = get_bits(br,4)+1;
230 READ_VLC_OFFSET(log2_min_pcm_luma_coding_block_size, uvlc, 3);
231 READ_VLC(log2_diff_max_min_pcm_luma_coding_block_size, uvlc);
232 pcm_loop_filter_disable_flag = get_bits(br,1);
233 }
234 else {
235 pcm_sample_bit_depth_luma = 0;
236 pcm_sample_bit_depth_chroma = 0;
237 log2_min_pcm_luma_coding_block_size = 0;
238 log2_diff_max_min_pcm_luma_coding_block_size = 0;
239 pcm_loop_filter_disable_flag = 0;
240 }
241
242 READ_VLC(num_short_term_ref_pic_sets, uvlc);
243 if (num_short_term_ref_pic_sets < 0 ||
244 num_short_term_ref_pic_sets > 64) {
245 ctx->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false);
246 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
247 }
248
249 // --- allocate reference pic set ---
250
251 // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself
252
253 ref_pic_sets.resize(num_short_term_ref_pic_sets);
254
255 for (int i = 0; i < num_short_term_ref_pic_sets; i++) {
256
257 bool success = read_short_term_ref_pic_set(ctx,this,br,
258 &ref_pic_sets[i], i,
259 ref_pic_sets,
260 false);
261
262 if (!success) {
263 return DE265_WARNING_SPS_HEADER_INVALID;
264 }
265
266 // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh);
267 }
268
269 long_term_ref_pics_present_flag = get_bits(br,1);
270
271 if (long_term_ref_pics_present_flag) {
272
273 READ_VLC(num_long_term_ref_pics_sps, uvlc);
274 if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) {
275 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
276 }
277
278 for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) {
279 lt_ref_pic_poc_lsb_sps[i] = get_bits(br, log2_max_pic_order_cnt_lsb);
280 used_by_curr_pic_lt_sps_flag[i] = get_bits(br,1);
281 }
282 }
283 else {
284 num_long_term_ref_pics_sps = 0; // NOTE: missing definition in standard !
285 }
286
287 sps_temporal_mvp_enabled_flag = get_bits(br,1);
288 strong_intra_smoothing_enable_flag = get_bits(br,1);
289 vui_parameters_present_flag = get_bits(br,1);
290
291 #if 0
292 if (vui_parameters_present_flag) {
293 assert(false);
294 /*
295 vui_parameters()
296
297 sps_extension_flag
298 u(1)
299 if( sps_extension_flag )
300
301 while( more_rbsp_data() )
302
303 sps_extension_data_flag
304 u(1)
305 rbsp_trailing_bits()
306 */
307 }
308
309 sps_extension_flag = get_bits(br,1);
310 if (sps_extension_flag) {
311 assert(false);
312 }
313
314 check_rbsp_trailing_bits(br);
315 #endif
316
317 // --- compute derived values ---
318
319 BitDepth_Y = bit_depth_luma;
320 QpBdOffset_Y = 6*(bit_depth_luma-8);
321 BitDepth_C = bit_depth_chroma;
322 QpBdOffset_C = 6*(bit_depth_chroma-8);
323
324 Log2MinCbSizeY = log2_min_luma_coding_block_size;
325 Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size;
326 MinCbSizeY = 1 << Log2MinCbSizeY;
327 CtbSizeY = 1 << Log2CtbSizeY;
328 PicWidthInMinCbsY = pic_width_in_luma_samples / MinCbSizeY;
329 PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY);
330 PicHeightInMinCbsY = pic_height_in_luma_samples / MinCbSizeY;
331 PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY);
332 PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY;
333 PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY;
334 PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples;
335
336 if (chroma_format_idc==0 || separate_colour_plane_flag) {
337 CtbWidthC = 0;
338 CtbHeightC = 0;
339 }
340 else {
341 CtbWidthC = CtbSizeY / SubWidthC;
342 CtbHeightC = CtbSizeY / SubHeightC;
343 }
344
345 Log2MinTrafoSize = log2_min_transform_block_size;
346 Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size;
347
348 Log2MinPUSize = Log2MinCbSizeY-1;
349 PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize);
350 PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize);
351
352 Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size;
353 Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size +
354 log2_diff_max_min_pcm_luma_coding_block_size);
355
356 // the following are not in the standard
357 PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize);
358 PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize);
359 PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY;
360
361 sps_read = true;
362
363 return DE265_OK;
364 }
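365
366 // Illustration (not part of the library): the derived-size arithmetic above,
367 // worked through for assumed example values (1920x1080 picture, 64x64 CTBs).
void derived_size_example()
{
  int pic_width_in_luma_samples  = 1920;  // assumed example value
  int pic_height_in_luma_samples = 1080;  // assumed example value
  int log2_min_luma_coding_block_size          = 3;  // MinCbSizeY = 8
  int log2_diff_max_min_luma_coding_block_size = 3;  // Log2CtbSizeY = 6

  int Log2CtbSizeY = log2_min_luma_coding_block_size
                   + log2_diff_max_min_luma_coding_block_size;
  int CtbSizeY = 1 << Log2CtbSizeY;                                            // 64

  // same rounding as ceil_div() used above
  int PicWidthInCtbsY  = (pic_width_in_luma_samples  + CtbSizeY-1) / CtbSizeY; // ceil(1920/64) = 30
  int PicHeightInCtbsY = (pic_height_in_luma_samples + CtbSizeY-1) / CtbSizeY; // ceil(1080/64) = 17
  int PicSizeInCtbsY   = PicWidthInCtbsY * PicHeightInCtbsY;                   // 510 CTBs
  (void)PicSizeInCtbsY;
}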
365
366
367
368 void seq_parameter_set::dump_sps(int fd) const
369 {
370 //#if (_MSC_VER >= 1500)
371 //#define LOG0(t) loginfo(LogHeaders, t)
372 //#define LOG1(t,d) loginfo(LogHeaders, t,d)
373 //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2)
374 //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3)
375
376 FILE* fh;
377 if (fd==1) fh=stdout;
378 else if (fd==2) fh=stderr;
379 else { return; }
380
381 #define LOG0(t) log2fh(fh, t)
382 #define LOG1(t,d) log2fh(fh, t,d)
383 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
384 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
385
386
387 LOG0("----------------- SPS -----------------\n");
388 LOG1("video_parameter_set_id : %d\n", video_parameter_set_id);
389 LOG1("sps_max_sub_layers : %d\n", sps_max_sub_layers);
390 LOG1("sps_temporal_id_nesting_flag : %d\n", sps_temporal_id_nesting_flag);
391
392 dump_profile_tier_level(&profile_tier_level, sps_max_sub_layers, fh);
393
394 LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id);
395 LOG2("chroma_format_idc : %d (%s)\n", chroma_format_idc,
396 chroma_format_idc == 1 ? "4:2:0" :
397 chroma_format_idc == 2 ? "4:2:2" :
398 chroma_format_idc == 3 ? "4:4:4" : "unknown");
399
400 if (chroma_format_idc == 3) {
401 LOG1("separate_colour_plane_flag : %d\n", separate_colour_plane_flag);
402 }
403
404 LOG1("pic_width_in_luma_samples : %d\n", pic_width_in_luma_samples);
405 LOG1("pic_height_in_luma_samples : %d\n", pic_height_in_luma_samples);
406 LOG1("conformance_window_flag : %d\n", conformance_window_flag);
407
408 if (conformance_window_flag) {
409 LOG1("conf_win_left_offset : %d\n", conf_win_left_offset);
410 LOG1("conf_win_right_offset : %d\n", conf_win_right_offset);
411 LOG1("conf_win_top_offset : %d\n", conf_win_top_offset);
412 LOG1("conf_win_bottom_offset: %d\n", conf_win_bottom_offset);
413 }
414
415 LOG1("bit_depth_luma : %d\n", bit_depth_luma);
416 LOG1("bit_depth_chroma : %d\n", bit_depth_chroma);
417
418 LOG1("log2_max_pic_order_cnt_lsb : %d\n", log2_max_pic_order_cnt_lsb);
419 LOG1("sps_sub_layer_ordering_info_present_flag : %d\n", sps_sub_layer_ordering_info_present_flag);
420
421 int firstLayer = (sps_sub_layer_ordering_info_present_flag ?
422 0 : sps_max_sub_layers-1 );
423
424 for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) {
425 LOG1("Layer %d\n",i);
426 LOG1(" sps_max_dec_pic_buffering : %d\n", sps_max_dec_pic_buffering[i]);
427 LOG1(" sps_max_num_reorder_pics : %d\n", sps_max_num_reorder_pics[i]);
428 LOG1(" sps_max_latency_increase_plus1 : %d\n", sps_max_latency_increase_plus1[i]);
429 }
430
431 LOG1("log2_min_luma_coding_block_size : %d\n", log2_min_luma_coding_block_size);
432 LOG1("log2_diff_max_min_luma_coding_block_size : %d\n",log2_diff_max_min_luma_coding_block_size);
433 LOG1("log2_min_transform_block_size : %d\n", log2_min_transform_block_size);
434 LOG1("log2_diff_max_min_transform_block_size : %d\n", log2_diff_max_min_transform_block_size);
435 LOG1("max_transform_hierarchy_depth_inter : %d\n", max_transform_hierarchy_depth_inter);
436 LOG1("max_transform_hierarchy_depth_intra : %d\n", max_transform_hierarchy_depth_intra);
437 LOG1("scaling_list_enable_flag : %d\n", scaling_list_enable_flag);
438
439 if (scaling_list_enable_flag) {
440
441 LOG1("sps_scaling_list_data_present_flag : %d\n", sps_scaling_list_data_present_flag);
442 if (sps_scaling_list_data_present_flag) {
443
444 LOG0("scaling list logging output not implemented");
445 //assert(0);
446 //scaling_list_data()
447 }
448 }
449
450 LOG1("amp_enabled_flag : %d\n", amp_enabled_flag);
451 LOG1("sample_adaptive_offset_enabled_flag : %d\n", sample_adaptive_offset_enabled_flag);
452 LOG1("pcm_enabled_flag : %d\n", pcm_enabled_flag);
453
454 if (pcm_enabled_flag) {
455 LOG1("pcm_sample_bit_depth_luma : %d\n", pcm_sample_bit_depth_luma);
456 LOG1("pcm_sample_bit_depth_chroma : %d\n", pcm_sample_bit_depth_chroma);
457 LOG1("log2_min_pcm_luma_coding_block_size : %d\n", log2_min_pcm_luma_coding_block_size);
458 LOG1("log2_diff_max_min_pcm_luma_coding_block_size : %d\n", log2_diff_max_min_pcm_luma_coding_block_size);
459 LOG1("pcm_loop_filter_disable_flag : %d\n", pcm_loop_filter_disable_flag);
460 }
461
462 LOG1("num_short_term_ref_pic_sets : %d\n", num_short_term_ref_pic_sets);
463
464 for (int i = 0; i < num_short_term_ref_pic_sets; i++) {
465 LOG1("ref_pic_set[ %2d ]: ",i);
466 dump_compact_short_term_ref_pic_set(&ref_pic_sets[i], 16, fh);
467 }
468
469 LOG1("long_term_ref_pics_present_flag : %d\n", long_term_ref_pics_present_flag);
470
471 if (long_term_ref_pics_present_flag) {
472
473 LOG1("num_long_term_ref_pics_sps : %d\n", num_long_term_ref_pics_sps);
474
475 for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) {
476 LOG3("lt_ref_pic_poc_lsb_sps[%d] : %d (used_by_curr_pic_lt_sps_flag=%d)\n",
477 i, lt_ref_pic_poc_lsb_sps[i], used_by_curr_pic_lt_sps_flag[i]);
478 }
479 }
480
481 LOG1("sps_temporal_mvp_enabled_flag : %d\n", sps_temporal_mvp_enabled_flag);
482 LOG1("strong_intra_smoothing_enable_flag : %d\n", strong_intra_smoothing_enable_flag);
483 LOG1("vui_parameters_present_flag : %d\n", vui_parameters_present_flag);
484
485 LOG1("CtbSizeY : %d\n", CtbSizeY);
486 LOG1("MinCbSizeY : %d\n", MinCbSizeY);
487 LOG1("MaxCbSizeY : %d\n", 1<<(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size));
488 LOG1("MinTBSizeY : %d\n", 1<<log2_min_transform_block_size);
489 LOG1("MaxTBSizeY : %d\n", 1<<(log2_min_transform_block_size + log2_diff_max_min_transform_block_size));
490
491 LOG1("SubWidthC : %d\n", SubWidthC);
492 LOG1("SubHeightC : %d\n", SubHeightC);
493
494 return;
495
496 if (vui_parameters_present_flag) {
497 assert(false);
498 /*
499 vui_parameters()
500
501 sps_extension_flag
502 u(1)
503 if( sps_extension_flag )
504
505 while( more_rbsp_data() )
506
507 sps_extension_data_flag
508 u(1)
509 rbsp_trailing_bits()
510 */
511 }
512 #undef LOG0
513 #undef LOG1
514 #undef LOG2
515 #undef LOG3
516 //#endif
517 }
518
519
520 static uint8_t default_ScalingList_4x4[16] = {
521 16,16,16,16,16,16,16,16,
522 16,16,16,16,16,16,16,16
523 };
524
525 static uint8_t default_ScalingList_8x8_intra[64] = {
526 16,16,16,16,16,16,16,16,
527 16,16,17,16,17,16,17,18,
528 17,18,18,17,18,21,19,20,
529 21,20,19,21,24,22,22,24,
530 24,22,22,24,25,25,27,30,
531 27,25,25,29,31,35,35,31,
532 29,36,41,44,41,36,47,54,
533 54,47,65,70,65,88,88,115
534 };
535
536 static uint8_t default_ScalingList_8x8_inter[64] = {
537 16,16,16,16,16,16,16,16,
538 16,16,17,17,17,17,17,18,
539 18,18,18,18,18,20,20,20,
540 20,20,20,20,24,24,24,24,
541 24,24,24,24,25,25,25,25,
542 25,25,25,28,28,28,28,28,
543 28,33,33,33,33,33,41,41,
544 41,41,54,54,54,71,71,91
545 };
546
547
548 void fill_scaling_factor(uint8_t* scalingFactors, const uint8_t* sclist, int sizeId)
549 {
550 const position* scan;
551 int width;
552 int subWidth;
553
554 switch (sizeId) {
555 case 0:
556 width=4;
557 subWidth=1;
558 scan = get_scan_order(2, 0 /* diag */);
559
560 for (int i=0;i<4*4;i++) {
561 scalingFactors[scan[i].x + width*scan[i].y] = sclist[i];
562 }
563 break;
564
565 case 1:
566 width=8;
567 subWidth=1;
568 scan = get_scan_order(3, 0 /* diag */);
569
570 for (int i=0;i<8*8;i++) {
571 scalingFactors[scan[i].x + width*scan[i].y] = sclist[i];
572 }
573 break;
574
575 case 2:
576 width=8;
577 subWidth=2;
578 scan = get_scan_order(3, 0 /* diag */);
579
580 for (int i=0;i<8*8;i++) {
581 for (int dy=0;dy<2;dy++)
582 for (int dx=0;dx<2;dx++)
583 {
584 int x = 2*scan[i].x+dx;
585 int y = 2*scan[i].y+dy;
586 scalingFactors[x+width*subWidth*y] = sclist[i];
587 }
588 }
589 break;
590
591 case 3:
592 width=8;
593 subWidth=4;
594 scan = get_scan_order(3, 0 /* diag */);
595
596 for (int i=0;i<8*8;i++) {
597 for (int dy=0;dy<4;dy++)
598 for (int dx=0;dx<4;dx++)
599 {
600 int x = 4*scan[i].x+dx;
601 int y = 4*scan[i].y+dy;
602 scalingFactors[x+width*subWidth*y] = sclist[i];
603 }
604 }
605 break;
606
607 default:
608 assert(0);
609 break;
610 }
611
612
613 // --- dump matrix ---
614
615 #if 0
616 for (int y=0;y<width;y++) {
617 for (int x=0;x<width;x++)
618 printf("%d,",scalingFactors[x*subWidth + width*subWidth*subWidth*y]);
619
620 printf("\n");
621 }
622 #endif
623 }
624
625
626 de265_error read_scaling_list(bitreader* br, const seq_parameter_set* sps,
627 scaling_list_data* sclist, bool inPPS)
628 {
629 int dc_coeff[4][6];
630
631 for (int sizeId=0;sizeId<4;sizeId++) {
632 int n = ((sizeId==3) ? 2 : 6);
633 uint8_t scaling_list[6][32*32];
634
635 for (int matrixId=0;matrixId<n;matrixId++) {
636 uint8_t* curr_scaling_list = scaling_list[matrixId];
637 int scaling_list_dc_coef;
638
639 int canonicalMatrixId = matrixId;
640 if (sizeId==3 && matrixId==1) { canonicalMatrixId=3; }
641
642
643 //printf("----- matrix %d\n",matrixId);
644
645 char scaling_list_pred_mode_flag = get_bits(br,1);
646 if (!scaling_list_pred_mode_flag) {
647 int scaling_list_pred_matrix_id_delta = get_uvlc(br);
648 if (scaling_list_pred_matrix_id_delta < 0 ||
649 scaling_list_pred_matrix_id_delta > matrixId) {
650 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
651 }
652
653 //printf("scaling_list_pred_matrix_id_delta=%d\n", scaling_list_pred_matrix_id_delta);
654
655 dc_coeff[sizeId][matrixId] = 16;
656 scaling_list_dc_coef = 16;
657
658 if (scaling_list_pred_matrix_id_delta==0) {
659 if (sizeId==0) {
660 memcpy(curr_scaling_list, default_ScalingList_4x4, 16);
661 }
662 else {
663 if (canonicalMatrixId<3)
664 { memcpy(curr_scaling_list, default_ScalingList_8x8_intra,64); }
665 else
666 { memcpy(curr_scaling_list, default_ScalingList_8x8_inter,64); }
667 }
668 }
669 else {
670 // TODO: CHECK: for sizeID=3 and the second matrix, should we have delta=1 or delta=3 ?
671 if (sizeId==3) { assert(scaling_list_pred_matrix_id_delta==1); }
672
673 int mID = matrixId - scaling_list_pred_matrix_id_delta;
674
675 int len = (sizeId == 0 ? 16 : 64);
676 memcpy(curr_scaling_list, scaling_list[mID], len);
677
678 scaling_list_dc_coef = dc_coeff[sizeId][mID];
679 dc_coeff[sizeId][matrixId] = dc_coeff[sizeId][mID];
680 }
681 }
682 else {
683 int nextCoef=8;
684 int coefNum = (sizeId==0 ? 16 : 64);
685 if (sizeId>1) {
686 scaling_list_dc_coef = get_svlc(br);
687 if (scaling_list_dc_coef < -7 ||
688 scaling_list_dc_coef > 247) {
689 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
690 }
691
692 scaling_list_dc_coef += 8;
693 nextCoef=scaling_list_dc_coef;
694 dc_coeff[sizeId][matrixId] = scaling_list_dc_coef;
695 }
696 else {
697 scaling_list_dc_coef = 16;
698 }
699 //printf("DC = %d\n",scaling_list_dc_coef);
700
701 for (int i=0;i<coefNum;i++) {
702 int scaling_list_delta_coef = get_svlc(br);
703 if (scaling_list_delta_coef < -128 ||
704 scaling_list_delta_coef > 127) {
705 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
706 }
707
708 nextCoef = (nextCoef + scaling_list_delta_coef + 256) % 256;
709 if (nextCoef < 0) {
710 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
711 }
712
713 curr_scaling_list[i] = nextCoef;
714 //printf("curr %d = %d\n",i,nextCoef);
715 }
716 }
717
718
719 // --- generate ScalingFactor arrays ---
720
721 switch (sizeId) {
722 case 0:
723 fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], curr_scaling_list, 0);
724 break;
725
726 case 1:
727 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId][0][0], curr_scaling_list, 1);
728 break;
729
730 case 2:
731 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId][0][0], curr_scaling_list, 2);
732 sclist->ScalingFactor_Size2[matrixId][0][0] = scaling_list_dc_coef;
733 //printf("DC coeff: %d\n", scaling_list_dc_coef);
734 break;
735
736 case 3:
737 fill_scaling_factor(&sclist->ScalingFactor_Size3[matrixId][0][0], curr_scaling_list, 3);
738 sclist->ScalingFactor_Size3[matrixId][0][0] = scaling_list_dc_coef;
739 //printf("DC coeff: %d\n", scaling_list_dc_coef);
740 break;
741 }
742 }
743 }
744
745 return DE265_OK;
746 }
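// Illustration (not part of the library): the delta coding decoded above.
// nextCoef starts at 8 (or at scaling_list_dc_coef+8 for the 16x16/32x32 lists)
// and each scaling_list_delta_coef is accumulated modulo 256.
// The delta values below are made up for the example.
void scaling_list_delta_example()
{
  int deltas[4] = { 8, 0, -2, 5 };   // assumed scaling_list_delta_coef values
  int nextCoef = 8;
  for (int i=0;i<4;i++) {
    nextCoef = (nextCoef + deltas[i] + 256) % 256;
    // i=0 -> 16, i=1 -> 16, i=2 -> 14, i=3 -> 19
  }
}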
747
748
749 void set_default_scaling_lists(scaling_list_data* sclist)
750 {
751 // 4x4
752
753 for (int matrixId=0;matrixId<6;matrixId++) {
754 fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0],
755 default_ScalingList_4x4, 0);
756 }
757
758 // 8x8
759
760 for (int matrixId=0;matrixId<3;matrixId++) {
761 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+0][0][0],
762 default_ScalingList_8x8_intra, 1);
763 fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+3][0][0],
764 default_ScalingList_8x8_inter, 1);
765 }
766
767 // 16x16
768
769 for (int matrixId=0;matrixId<3;matrixId++) {
770 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+0][0][0],
771 default_ScalingList_8x8_intra, 2);
772 fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+3][0][0],
773 default_ScalingList_8x8_inter, 2);
774 }
775
776 // 32x32
777
778 fill_scaling_factor(&sclist->ScalingFactor_Size3[0][0][0],
779 default_ScalingList_8x8_intra, 3);
780 fill_scaling_factor(&sclist->ScalingFactor_Size3[1][0][0],
781 default_ScalingList_8x8_inter, 3);
782 }
783
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2525 #include "libde265/refpic.h"
2626 #include "libde265/de265.h"
2727
28 #include <vector>
29
2830 #define MAX_REF_PIC_SETS 64 // maximum according to standard
2931 #define MAX_NUM_LT_REF_PICS_SPS 32
3032
3941
4042
4143 typedef struct scaling_list_data {
44 // structure size: approx. 4 kB
45
4246 uint8_t ScalingFactor_Size0[6][4][4];
4347 uint8_t ScalingFactor_Size1[6][8][8];
4448 uint8_t ScalingFactor_Size2[6][16][16];
4650 } scaling_list_data;
4751
4852
49 typedef struct {
53 struct seq_parameter_set {
54 seq_parameter_set();
55 ~seq_parameter_set();
56
57 de265_error read(struct decoder_context*, bitreader*);
58
59 void dump_sps(int fd) const;
60
5061 bool sps_read; // whether the sps has been read from the bitstream
5162
5263
103114 char pcm_loop_filter_disable_flag;
104115
105116 int num_short_term_ref_pic_sets;
106 ref_pic_set* ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) )
117 std::vector<ref_pic_set> ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) )
107118
108119 /*
109120 for( i = 0; i < num_short_term_ref_pic_sets; i++)
176187 int Log2MaxIpcmCbSizeY;
177188
178189 int SpsMaxLatencyPictures[7]; // [temporal layer]
190 };
179191
180 } seq_parameter_set;
181
192 de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS);
193 void set_default_scaling_lists(scaling_list_data*);
182194
183195 #endif
184
+0
-38
libde265/sps_func.h
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_SPS_FUNC_H
21 #define DE265_SPS_FUNC_H
22
23 #include "libde265/sps.h"
24 #include "libde265/decctx.h"
25
26 void init_sps(seq_parameter_set*);
27 de265_error read_sps(decoder_context*, bitreader*, seq_parameter_set*); //, ref_pic_set**);
28 void dump_sps(seq_parameter_set*, /*ref_pic_set* sets,*/ int fd);
29 void free_sps(seq_parameter_set*);
30 void move_sps(seq_parameter_set* dest, seq_parameter_set* src);
31
32 de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS);
33 void set_default_scaling_lists(scaling_list_data*);
34
35 void free_ref_pic_sets(ref_pic_set**);
36
37 #endif
+0
-304
libde265/threads.c
0
1 #include "threads.h"
2 #include <assert.h>
3 #include <string.h>
4
5 #if defined(_MSC_VER) || defined(__MINGW32__)
6 # include <malloc.h>
7 #else
8 # include <alloca.h>
9 #endif
10
11
12 #ifndef _WIN32
13 // #include <intrin.h>
14
15 #define THREAD_RESULT void*
16 #define THREAD_PARAM void*
17
18 int de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg) { return pthread_create(t,NULL,start_routine,arg); }
19 void de265_thread_join(de265_thread t) { pthread_join(t,NULL); }
20 void de265_thread_destroy(de265_thread* t) { }
21 void de265_mutex_init(de265_mutex* m) { pthread_mutex_init(m,NULL); }
22 void de265_mutex_destroy(de265_mutex* m) { pthread_mutex_destroy(m); }
23 void de265_mutex_lock(de265_mutex* m) { pthread_mutex_lock(m); }
24 void de265_mutex_unlock(de265_mutex* m) { pthread_mutex_unlock(m); }
25 void de265_cond_init(de265_cond* c) { pthread_cond_init(c,NULL); }
26 void de265_cond_destroy(de265_cond* c) { pthread_cond_destroy(c); }
27 void de265_cond_broadcast(de265_cond* c,de265_mutex* m) { pthread_cond_broadcast(c); }
28 void de265_cond_wait(de265_cond* c,de265_mutex* m) { pthread_cond_wait(c,m); }
29 void de265_cond_signal(de265_cond* c) { pthread_cond_signal(c); }
30 #else // _WIN32
31
32 #define THREAD_RESULT DWORD WINAPI
33 #define THREAD_PARAM LPVOID
34
35 int de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg) {
36 HANDLE handle = CreateThread(NULL, 0, start_routine, arg, 0, NULL);
37 if (handle == NULL) {
38 return -1;
39 }
40 *t = handle;
41 return 0;
42 }
43 void de265_thread_join(de265_thread t) { WaitForSingleObject(t, INFINITE); }
44 void de265_thread_destroy(de265_thread* t) { CloseHandle(*t); *t = NULL; }
45 void de265_mutex_init(de265_mutex* m) { *m = CreateMutex(NULL, FALSE, NULL); }
46 void de265_mutex_destroy(de265_mutex* m) { CloseHandle(*m); }
47 void de265_mutex_lock(de265_mutex* m) { WaitForSingleObject(*m, INFINITE); }
48 void de265_mutex_unlock(de265_mutex* m) { ReleaseMutex(*m); }
49 void de265_cond_init(de265_cond* c) { win32_cond_init(c); }
50 void de265_cond_destroy(de265_cond* c) { win32_cond_destroy(c); }
51 void de265_cond_broadcast(de265_cond* c,de265_mutex* m)
52 {
53 de265_mutex_lock(m);
54 win32_cond_broadcast(c);
55 de265_mutex_unlock(m);
56 }
57 void de265_cond_wait(de265_cond* c,de265_mutex* m) { win32_cond_wait(c,m); }
58 void de265_cond_signal(de265_cond* c) { win32_cond_signal(c); }
59 #endif // _WIN32
60
61
62
63
64 void de265_progress_lock_init(de265_progress_lock* lock)
65 {
66 lock->progress = 0;
67
68 de265_mutex_init(&lock->mutex);
69 de265_cond_init(&lock->cond);
70 }
71
72 void de265_progress_lock_destroy(de265_progress_lock* lock)
73 {
74 de265_mutex_destroy(&lock->mutex);
75 de265_cond_destroy(&lock->cond);
76 }
77
78 int de265_wait_for_progress(de265_progress_lock* lock, int progress)
79 {
80 if (lock->progress >= progress) {
81 return lock->progress;
82 }
83
84 de265_mutex_lock(&lock->mutex);
85 while (lock->progress < progress) {
86 de265_cond_wait(&lock->cond, &lock->mutex);
87 }
88 de265_mutex_unlock(&lock->mutex);
89
90 return lock->progress;
91 }
92
93 void de265_announce_progress(de265_progress_lock* lock, int progress)
94 {
95 de265_mutex_lock(&lock->mutex);
96 lock->progress = progress;
97
98 de265_cond_broadcast(&lock->cond, &lock->mutex);
99 de265_mutex_unlock(&lock->mutex);
100 }
101
102
103
104
105 #include "libde265/decctx.h"
106
107 const char* line="--------------------------------------------------";
108 void printblks(const thread_pool* pool)
109 {
110 int w = pool->tasks[0].data.task_ctb.ctx->current_sps->PicWidthInCtbsY;
111 int h = pool->tasks[0].data.task_ctb.ctx->current_sps->PicHeightInCtbsY;
112
113 printf("active threads: %d queue len: %d\n",pool->num_threads_working,pool->num_tasks);
114
115 char *const p = (char *)alloca(w * h * sizeof(char));
116 assert(p != NULL);
117 memset(p,' ',w*h);
118
119 for (int i=0;i<pool->num_tasks;i++) {
120 int b = 0; //pool->tasks[i].num_blockers;
121 int x = pool->tasks[i].data.task_ctb.ctb_x;
122 int y = pool->tasks[i].data.task_ctb.ctb_y;
123 p[y*w+x] = b+'0';
124 }
125
126 for (int i=0;i<pool->num_threads_working;i++) {
127 int x = pool->ctbx[i];
128 int y = pool->ctby[i];
129 p[y*w+x] = '*';
130 }
131
132 printf("+%s+\n",line+50-w);
133 for (int y=0;y<h;y++)
134 {
135 printf("|");
136 for (int x=0;x<w;x++)
137 {
138 printf("%c",p[x+y*w]);
139 }
140 printf("|\n");
141 }
142 printf("+%s+\n",line+50-w);
143 }
144
145
146 static THREAD_RESULT worker_thread(THREAD_PARAM pool_ptr)
147 {
148 thread_pool* pool = (thread_pool*)pool_ptr;
149
150
151 de265_mutex_lock(&pool->mutex);
152
153 while(true) {
154
155 // wait until we can pick a task or until the pool has been stopped
156
157 for (;;) {
158 // end waiting if thread-pool has been stopped or we have a task to execute
159
160 if (pool->stopped || pool->num_tasks>0) {
161 break;
162 }
163
164 //printf("going idle\n");
165 de265_cond_wait(&pool->cond_var, &pool->mutex);
166 }
167
168 // if the pool was shut down, end the execution
169
170 if (pool->stopped) {
171 de265_mutex_unlock(&pool->mutex);
172 return NULL;
173 }
174
175 if (0)
176 {
177 printf("%03d [%d]: ",pool->num_tasks,pool->num_threads_working);
178
179 for (int i=0;i<pool->num_tasks;i++) {
180 printf("%d%c%d ",
181 pool->tasks[i].data.task_ctb.ctb_x,
182 i==0 ? 'X' :
183 '*',
184 pool->tasks[i].data.task_ctb.ctb_y);
185 }
186
187 printf("\n");
188 }
189
190
191 // get a task
192
193 thread_task task = pool->tasks[0];
194 pool->num_tasks--;
195
196 if (pool->num_tasks>0) {
197 if (1) {
198 memmove(&pool->tasks[0],
199 &pool->tasks[1],
200 pool->num_tasks*sizeof(thread_task));
201 }
202 else {
203 pool->tasks[0] = pool->tasks[pool->num_tasks];
204 }
205 }
206
207 pool->num_threads_working++;
208
209 //printblks(pool);
210
211 de265_mutex_unlock(&pool->mutex);
212
213
214 // execute the task
215
216 if (task.work_routine != NULL) {
217 task.work_routine( &task.data );
218 }
219
220
221 // end processing and check if this was the last task to be processed
222
223 de265_mutex_lock(&pool->mutex);
224
225 pool->num_threads_working--;
226 }
227 de265_mutex_unlock(&pool->mutex);
228
229 return NULL;
230 }
231
232
233 de265_error start_thread_pool(thread_pool* pool, int num_threads)
234 {
235 de265_error err = DE265_OK;
236
237 // limit number of threads to maximum
238
239 if (num_threads > MAX_THREADS) {
240 num_threads = MAX_THREADS;
241 err = DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM;
242 }
243
244 pool->num_tasks = 0;
245 pool->num_threads = 0; // will be increased below
246 pool->num_threads_working = 0;
247 pool->stopped = false;
248
249 de265_mutex_init(&pool->mutex);
250 de265_cond_init(&pool->cond_var);
251
252 // start worker threads
253
254 for (int i=0; i<num_threads; i++) {
255 int ret = de265_thread_create(&pool->thread[i], worker_thread, pool);
256 if (ret != 0) {
257 // cerr << "pthread_create() failed: " << ret << endl;
258 return DE265_ERROR_CANNOT_START_THREADPOOL;
259 }
260
261 pool->num_threads++;
262 }
263
264 return err;
265 }
266
267
268 void stop_thread_pool(thread_pool* pool)
269 {
270 de265_mutex_lock(&pool->mutex);
271 pool->stopped = true;
272 de265_mutex_unlock(&pool->mutex);
273
274 de265_cond_broadcast(&pool->cond_var, &pool->mutex);
275
276 for (int i=0;i<pool->num_threads;i++) {
277 de265_thread_join(pool->thread[i]);
278 de265_thread_destroy(&pool->thread[i]);
279 }
280
281 de265_mutex_destroy(&pool->mutex);
282 de265_cond_destroy(&pool->cond_var);
283 }
284
285
286 void add_task(thread_pool* pool, const thread_task* task)
287 {
288 de265_mutex_lock(&pool->mutex);
289 if (!pool->stopped) {
290
291 assert(pool->num_tasks < MAX_THREAD_TASKS);
292 pool->tasks[pool->num_tasks] = *task;
293 pool->num_tasks++;
294
295 // wake up one thread
296
297 de265_cond_signal(&pool->cond_var);
298 }
299 de265_mutex_unlock(&pool->mutex);
300 }
301
302 extern inline int de265_sync_sub_and_fetch(de265_sync_int* cnt, int n);
303 extern inline int de265_sync_add_and_fetch(de265_sync_int* cnt, int n);
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "threads.h"
21 #include <assert.h>
22 #include <string.h>
23
24 #if defined(_MSC_VER) || defined(__MINGW32__)
25 # include <malloc.h>
26 #else
27 # include <alloca.h>
28 #endif
29
30
31 #ifndef _WIN32
32 // #include <intrin.h>
33
34 #define THREAD_RESULT void*
35 #define THREAD_PARAM void*
36
37 int de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg) { return pthread_create(t,NULL,start_routine,arg); }
38 void de265_thread_join(de265_thread t) { pthread_join(t,NULL); }
39 void de265_thread_destroy(de265_thread* t) { }
40 void de265_mutex_init(de265_mutex* m) { pthread_mutex_init(m,NULL); }
41 void de265_mutex_destroy(de265_mutex* m) { pthread_mutex_destroy(m); }
42 void de265_mutex_lock(de265_mutex* m) { pthread_mutex_lock(m); }
43 void de265_mutex_unlock(de265_mutex* m) { pthread_mutex_unlock(m); }
44 void de265_cond_init(de265_cond* c) { pthread_cond_init(c,NULL); }
45 void de265_cond_destroy(de265_cond* c) { pthread_cond_destroy(c); }
46 void de265_cond_broadcast(de265_cond* c,de265_mutex* m) { pthread_cond_broadcast(c); }
47 void de265_cond_wait(de265_cond* c,de265_mutex* m) { pthread_cond_wait(c,m); }
48 void de265_cond_signal(de265_cond* c) { pthread_cond_signal(c); }
49 #else // _WIN32
50
51 #define THREAD_RESULT DWORD WINAPI
52 #define THREAD_PARAM LPVOID
53
54 int de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg) {
55 HANDLE handle = CreateThread(NULL, 0, start_routine, arg, 0, NULL);
56 if (handle == NULL) {
57 return -1;
58 }
59 *t = handle;
60 return 0;
61 }
62 void de265_thread_join(de265_thread t) { WaitForSingleObject(t, INFINITE); }
63 void de265_thread_destroy(de265_thread* t) { CloseHandle(*t); *t = NULL; }
64 void de265_mutex_init(de265_mutex* m) { *m = CreateMutex(NULL, FALSE, NULL); }
65 void de265_mutex_destroy(de265_mutex* m) { CloseHandle(*m); }
66 void de265_mutex_lock(de265_mutex* m) { WaitForSingleObject(*m, INFINITE); }
67 void de265_mutex_unlock(de265_mutex* m) { ReleaseMutex(*m); }
68 void de265_cond_init(de265_cond* c) { win32_cond_init(c); }
69 void de265_cond_destroy(de265_cond* c) { win32_cond_destroy(c); }
70 void de265_cond_broadcast(de265_cond* c,de265_mutex* m)
71 {
72 de265_mutex_lock(m);
73 win32_cond_broadcast(c);
74 de265_mutex_unlock(m);
75 }
76 void de265_cond_wait(de265_cond* c,de265_mutex* m) { win32_cond_wait(c,m); }
77 void de265_cond_signal(de265_cond* c) { win32_cond_signal(c); }
78 #endif // _WIN32
79
80
81
82
83 de265_progress_lock::de265_progress_lock()
84 {
85 mProgress = 0;
86
87 de265_mutex_init(&mutex);
88 de265_cond_init(&cond);
89 }
90
91 de265_progress_lock::~de265_progress_lock()
92 {
93 de265_mutex_destroy(&mutex);
94 de265_cond_destroy(&cond);
95 }
96
97 void de265_progress_lock::wait_for_progress(int progress)
98 {
99 if (mProgress >= progress) {
100 return;
101 }
102
103 de265_mutex_lock(&mutex);
104 while (mProgress < progress) {
105 de265_cond_wait(&cond, &mutex);
106 }
107 de265_mutex_unlock(&mutex);
108 }
109
110 void de265_progress_lock::set_progress(int progress)
111 {
112 de265_mutex_lock(&mutex);
113
114 if (progress>mProgress) {
115 mProgress = progress;
116
117 de265_cond_broadcast(&cond, &mutex);
118 }
119
120 de265_mutex_unlock(&mutex);
121 }
122
123 int de265_progress_lock::get_progress() const
124 {
125 return mProgress;
126 }
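// Illustration (not part of the library): typical use of de265_progress_lock.
// One thread blocks until enough progress has been announced, another thread
// advances the counter. The row-based semantics here are only an assumed example.
#include "libde265/threads.h"

static de265_progress_lock rowProgress;

void consumer_example()
{
  rowProgress.wait_for_progress(16);   // returns once set_progress(>=16) was called
  // ... data up to "progress 16" may now be read safely ...
}

void producer_example()
{
  for (int row=1; row<=32; row++) {
    // ... produce one unit of work ...
    rowProgress.set_progress(row);     // broadcasts; waiters with target <= row return
  }
}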
127
128
129
130
131 #include "libde265/decctx.h"
132
133 #if 0
134 const char* line="--------------------------------------------------";
135 void printblks(const thread_pool* pool)
136 {
137 int w = pool->tasks[0].data.task_ctb.ctx->current_sps->PicWidthInCtbsY;
138 int h = pool->tasks[0].data.task_ctb.ctx->current_sps->PicHeightInCtbsY;
139
140 printf("active threads: %d queue len: %d\n",pool->num_threads_working,pool->num_tasks);
141
142 char *const p = (char *)alloca(w * h * sizeof(char));
143 assert(p != NULL);
144 memset(p,' ',w*h);
145
146 for (int i=0;i<pool->num_tasks;i++) {
147 int b = 0; //pool->tasks[i].num_blockers;
148 int x = pool->tasks[i].data.task_ctb.ctb_x;
149 int y = pool->tasks[i].data.task_ctb.ctb_y;
150 p[y*w+x] = b+'0';
151 }
152
153 for (int i=0;i<pool->num_threads_working;i++) {
154 int x = pool->ctbx[i];
155 int y = pool->ctby[i];
156 p[y*w+x] = '*';
157 }
158
159 printf("+%s+\n",line+50-w);
160 for (int y=0;y<h;y++)
161 {
162 printf("|");
163 for (int x=0;x<w;x++)
164 {
165 printf("%c",p[x+y*w]);
166 }
167 printf("|\n");
168 }
169 printf("+%s+\n",line+50-w);
170 }
171 #endif
172
173
174 static THREAD_RESULT worker_thread(THREAD_PARAM pool_ptr)
175 {
176 thread_pool* pool = (thread_pool*)pool_ptr;
177
178
179 de265_mutex_lock(&pool->mutex);
180
181 while(true) {
182
183 // wait until we can pick a task or until the pool has been stopped
184
185 for (;;) {
186 // end waiting if thread-pool has been stopped or we have a task to execute
187
188 if (pool->stopped || pool->tasks.size()>0) {
189 break;
190 }
191
192 //printf("going idle\n");
193 de265_cond_wait(&pool->cond_var, &pool->mutex);
194 }
195
196 // if the pool was shut down, end the execution
197
198 if (pool->stopped) {
199 de265_mutex_unlock(&pool->mutex);
200 return NULL;
201 }
202
203
204 // get a task
205
206 thread_task* task = pool->tasks.front();
207 pool->tasks.pop_front();
208
209 pool->num_threads_working++;
210
211 //printblks(pool);
212
213 de265_mutex_unlock(&pool->mutex);
214
215
216 // execute the task
217
218 task->work();
219
220 // end processing and check if this was the last task to be processed
221
222 de265_mutex_lock(&pool->mutex);
223
224 pool->num_threads_working--;
225 }
226 de265_mutex_unlock(&pool->mutex);
227
228 return NULL;
229 }
230
231
232 de265_error start_thread_pool(thread_pool* pool, int num_threads)
233 {
234 de265_error err = DE265_OK;
235
236 // limit number of threads to maximum
237
238 if (num_threads > MAX_THREADS) {
239 num_threads = MAX_THREADS;
240 err = DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM;
241 }
242
243 pool->num_threads = 0; // will be increased below
244 pool->num_threads_working = 0;
245 pool->stopped = false;
246
247 de265_mutex_init(&pool->mutex);
248 de265_cond_init(&pool->cond_var);
249
250 // start worker threads
251
252 for (int i=0; i<num_threads; i++) {
253 int ret = de265_thread_create(&pool->thread[i], worker_thread, pool);
254 if (ret != 0) {
255 // cerr << "pthread_create() failed: " << ret << endl;
256 return DE265_ERROR_CANNOT_START_THREADPOOL;
257 }
258
259 pool->num_threads++;
260 }
261
262 return err;
263 }
264
265
266 void stop_thread_pool(thread_pool* pool)
267 {
268 de265_mutex_lock(&pool->mutex);
269 pool->stopped = true;
270 de265_mutex_unlock(&pool->mutex);
271
272 de265_cond_broadcast(&pool->cond_var, &pool->mutex);
273
274 for (int i=0;i<pool->num_threads;i++) {
275 de265_thread_join(pool->thread[i]);
276 de265_thread_destroy(&pool->thread[i]);
277 }
278
279 de265_mutex_destroy(&pool->mutex);
280 de265_cond_destroy(&pool->cond_var);
281 }
282
283
284 void add_task(thread_pool* pool, thread_task* task)
285 {
286 de265_mutex_lock(&pool->mutex);
287 if (!pool->stopped) {
288
289 pool->tasks.push_back(task);
290
291 // wake up one thread
292
293 de265_cond_signal(&pool->cond_var);
294 }
295 de265_mutex_unlock(&pool->mutex);
296 }
297
298 extern inline int de265_sync_sub_and_fetch(de265_sync_int* cnt, int n);
299 extern inline int de265_sync_add_and_fetch(de265_sync_int* cnt, int n);
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2929 #ifdef HAVE_STDBOOL_H
3030 #include <stdbool.h>
3131 #endif
32
33 #include <deque>
3234
3335 #ifndef _WIN32
3436 #include <pthread.h>
8991 }
9092
9193
92 typedef struct de265_progress_lock
94 class de265_progress_lock
9395 {
94 int progress;
96 public:
97 de265_progress_lock();
98 ~de265_progress_lock();
99
100 void wait_for_progress(int progress);
101 void set_progress(int progress);
102 int get_progress() const;
103 void reset(int value=0) { mProgress=value; }
104
105 private:
106 int mProgress;
95107
96108 // private data
97109
98110 de265_mutex mutex;
99111 de265_cond cond;
100 } de265_progress_lock;
101
102 void de265_progress_lock_init(de265_progress_lock* lock);
103 void de265_progress_lock_destroy(de265_progress_lock* lock);
104 int de265_wait_for_progress(de265_progress_lock* lock, int progress);
105 void de265_announce_progress(de265_progress_lock* lock, int progress);
112 };
106113
107114
108115
109 enum thread_task_ctb_init_type { INIT_RESET, INIT_COPY, INIT_NONE };
116 class thread_task
117 {
118 public:
119 thread_task() : state(Queued) { }
120 virtual ~thread_task() { }
110121
111 struct thread_task_ctb
112 {
113 int ctb_x, ctb_y;
114 struct decoder_context* ctx;
115 struct thread_context* tctx;
116 struct slice_segment_header* shdr;
122 enum { Queued, Running, Blocked, Finished } state;
117123
118 enum thread_task_ctb_init_type CABAC_init;
124 virtual void work() = 0;
119125 };
120126
121 struct thread_task_ctb_row
122 {
123 int thread_context_id;
124 bool initCABAC;
125 struct decoder_context* ctx;
126 };
127127
128 struct thread_task_deblock
129 {
130 struct decoder_context* ctx;
131 int first; // stripe row
132 int last;
133 int ctb_x,ctb_y;
134 bool vertical;
135 };
136
137 enum thread_task_id {
138 THREAD_TASK_SYNTAX_DECODE_CTB,
139 THREAD_TASK_DEBLOCK,
140 THREAD_TASK_DECODE_CTB_ROW,
141 THREAD_TASK_DECODE_SLICE_SEGMENT,
142 //THREAD_TASK_PIXEL_DECODE_CTB,
143 //THREAD_TASK_POSTPROC_CTB
144 };
145
146 typedef struct
147 {
148 int task_id;
149 enum thread_task_id task_cmd;
150
151 void (*work_routine)(void* data);
152
153 union {
154 struct thread_task_ctb task_ctb;
155 struct thread_task_ctb_row task_ctb_row;
156 struct thread_task_deblock task_deblock;
157 } data;
158 } thread_task;
159
160
161 #define MAX_THREAD_TASKS 1024
162128 #define MAX_THREADS 32
163129
164 typedef struct thread_pool
130 /* TODO NOTE: When unblocking a task, we have to check first
131 if there are threads waiting because of the run-count limit.
132 If there are higher-priority tasks, those should be run instead
133 of the just unblocked task.
134 */
135
136 class thread_pool
165137 {
138 public:
166139 bool stopped;
167140
168 thread_task tasks[MAX_THREAD_TASKS];
169 int num_tasks;
141 std::deque<thread_task*> tasks; // we are not the owner
170142
171143 de265_thread thread[MAX_THREADS];
172144 int num_threads;
173145
174146 int num_threads_working;
175 //long tasks_pending;
176147
177148 int ctbx[MAX_THREADS]; // the CTB the thread is working on
178149 int ctby[MAX_THREADS];
179150
180151 de265_mutex mutex;
181152 de265_cond cond_var;
182 } thread_pool;
153 };
183154
184155
185156 de265_error start_thread_pool(thread_pool* pool, int num_threads);
186157 void stop_thread_pool(thread_pool* pool); // do not process remaining tasks
187158
188 void add_task(thread_pool* pool, const thread_task* task);
159 void add_task(thread_pool* pool, thread_task* task); // TODO: can make thread_task const
189160
190161 #endif
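// Illustration (not part of the library): minimal use of the new C++ thread-pool
// API declared above. The task class and its work are made up; the pool only
// stores a pointer and does not take ownership of the task.
#include "libde265/threads.h"

class example_task : public thread_task
{
public:
  explicit example_task(int n) : n_(n) { }
  virtual void work() { /* ... do the actual work for item n_ ... */ }
private:
  int n_;
};

void thread_pool_example()
{
  thread_pool pool;
  start_thread_pool(&pool, 4);     // capped at MAX_THREADS internally

  example_task task(0);
  add_task(&pool, &task);          // wakes one worker via cond_signal

  stop_thread_pool(&pool);         // joins the workers; remaining queued tasks are not run
}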
+0
-455
libde265/transform.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "transform.h"
21 #include "util.h"
22 #include "pps_func.h"
23
24 #include <assert.h>
25
26
27 static int nDCT_4x4, nDCT_8x8, nDCT_16x16, nDCT_32x32, nDST_4x4;
28 static int nSkip_4x4;
29 static int nCoeff4x4[16+1], nCoeff8x8[64+1], nCoeff16x16[16*16+1], nCoeff32x32[32*32+1];
30
31 LIBDE265_API void showTransformProfile()
32 {
33 const int nDCT_sum = nDST_4x4 + nDCT_4x4 + nDCT_8x8 + nDCT_16x16 + nDCT_32x32 + nSkip_4x4;
34 fprintf(stderr,"transform usage:\n");
35 fprintf(stderr," IDST 4x4: %8d %4.1f%%\n",nDST_4x4,(float)(nDST_4x4 * 100) / nDCT_sum);
36 fprintf(stderr," IDCT 4x4: %8d %4.1f%%\n",nDCT_4x4,(float)(nDCT_4x4 * 100) / nDCT_sum);
37 fprintf(stderr," IDCT 8x8: %8d %4.1f%%\n",nDCT_8x8,(float)(nDCT_8x8 * 100) / nDCT_sum);
38 fprintf(stderr," IDCT 16x16: %8d %4.1f%%\n",nDCT_16x16,(float)(nDCT_16x16 * 100) / nDCT_sum);
39 fprintf(stderr," IDCT 32x32: %8d %4.1f%%\n",nDCT_32x32,(float)(nDCT_32x32 * 100) / nDCT_sum);
40 fprintf(stderr," Skip 4x4: %8d %4.1f%%\n",nSkip_4x4,(float)(nSkip_4x4 * 100) / nDCT_sum);
41
42 fprintf(stderr,"nCoeff DCT 4x4: ");
43 for (int i=1;i<=16;i++)
44 fprintf(stderr,"%d ",nCoeff4x4[i]);
45 fprintf(stderr,"\n");
46
47 fprintf(stderr,"nCoeff DCT 8x8: ");
48 for (int i=1;i<=8*8;i++)
49 fprintf(stderr,"%d ",nCoeff8x8[i]);
50 fprintf(stderr,"\n");
51
52 fprintf(stderr,"nCoeff DCT 16x16: ");
53 for (int i=1;i<=16*16;i++)
54 fprintf(stderr,"%d ",nCoeff16x16[i]);
55 fprintf(stderr,"\n");
56
57 fprintf(stderr,"nCoeff DCT 32x32: ");
58 for (int i=1;i<=32*32;i++)
59 fprintf(stderr,"%d ",nCoeff32x32[i]);
60 fprintf(stderr,"\n");
61 }
62
63
64 static const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ };
65
66 int table8_22(int qPi)
67 {
68 if (qPi<30) return qPi;
69 if (qPi>=43) return qPi-6;
70 return tab8_22[qPi-30];
71 }
72
73 #include <sys/types.h>
74 #include <signal.h>
75
76 // (8.6.1)
77 void decode_quantization_parameters(decoder_context* ctx,
78 thread_context* tctx, int xC,int yC,
79 int xCUBase, int yCUBase)
80 {
81 logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC);
82
83 if (/*ctx->img->PicOrderCntVal==3 &&*/ xC==168 && yC==128) {
84 //raise(SIGINT);
85 }
86
87 pic_parameter_set* pps = ctx->current_pps;
88 seq_parameter_set* sps = ctx->current_sps;
89 slice_segment_header* shdr = tctx->shdr;
90
91 // top left pixel position of current quantization group
92 int xQG = xCUBase - (xCUBase & ((1<<pps->Log2MinCuQpDeltaSize)-1));
93 int yQG = yCUBase - (yCUBase & ((1<<pps->Log2MinCuQpDeltaSize)-1));
94
95 logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG);
96
97
98 // we only have to set QP in the first call in a quantization-group
99
100 /* TODO: check why this does not work with HoneyBee stream
101
102 if (xQG == tctx->currentQG_x &&
103 yQG == tctx->currentQG_y)
104 {
105 return;
106 }
107 */
108
109 // if first QG in CU, remember last QPY of last CU previous QG
110
111 if (xQG != tctx->currentQG_x ||
112 yQG != tctx->currentQG_y)
113 {
114 tctx->lastQPYinPreviousQG = tctx->currentQPY;
115 tctx->currentQG_x = xQG;
116 tctx->currentQG_y = yQG;
117 }
118
119 int qPY_PRED;
120 bool firstQGInSlice;
121 bool firstQGInTile = false;
122 bool firstInCTBRow = (xQG==0); // TODO: Tiles
123
124 int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS; // slice_segment_address;
125
126 int SliceStartX = (first_ctb_in_slice_RS % sps->PicWidthInCtbsY) * sps->CtbSizeY;
127 int SliceStartY = (first_ctb_in_slice_RS / sps->PicWidthInCtbsY) * sps->CtbSizeY;
128
129 firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG);
130
131 if (pps->tiles_enabled_flag) {
132 if ((xQG & ((1 << sps->Log2CtbSizeY)-1)) == 0 &&
133 (yQG & ((1 << sps->Log2CtbSizeY)-1)) == 0)
134 {
135 int ctbX = xQG >> sps->Log2CtbSizeY;
136 int ctbY = yQG >> sps->Log2CtbSizeY;
137
138 firstQGInTile = is_tile_start_CTB(pps,ctbX,ctbY);
139 }
140 }
141
142 if (firstQGInSlice || firstQGInTile ||
143 (firstInCTBRow && pps->entropy_coding_sync_enabled_flag)) {
144 qPY_PRED = tctx->shdr->SliceQPY;
145 }
146 else {
147 qPY_PRED = tctx->lastQPYinPreviousQG;
148 }
149
150
151 int qPYA,qPYB;
152
153 if (available_zscan(ctx->img,xQG,yQG, xQG-1,yQG)) {
154 int xTmp = (xQG-1) >> sps->Log2MinTrafoSize;
155 int yTmp = (yQG ) >> sps->Log2MinTrafoSize;
156 int minTbAddrA = pps->MinTbAddrZS[xTmp + yTmp*sps->PicWidthInTbsY];
157 int ctbAddrA = minTbAddrA >> (2 * (sps->Log2CtbSizeY-sps->Log2MinTrafoSize));
158 if (ctbAddrA == tctx->CtbAddrInTS) {
159 qPYA = get_QPY(ctx->img,sps,xQG-1,yQG);
160 }
161 else {
162 qPYA = qPY_PRED;
163 }
164 }
165 else {
166 qPYA = qPY_PRED;
167 }
168
169 if (available_zscan(ctx->img,xQG,yQG, xQG,yQG-1)) {
170 int xTmp = (xQG ) >> sps->Log2MinTrafoSize;
171 int yTmp = (yQG-1) >> sps->Log2MinTrafoSize;
172 int minTbAddrB = pps->MinTbAddrZS[xTmp + yTmp*sps->PicWidthInTbsY];
173 int ctbAddrB = minTbAddrB >> (2 * (sps->Log2CtbSizeY-sps->Log2MinTrafoSize));
174 if (ctbAddrB == tctx->CtbAddrInTS) {
175 qPYB = get_QPY(ctx->img,sps,xQG,yQG-1);
176 }
177 else {
178 qPYB = qPY_PRED;
179 }
180 }
181 else {
182 qPYB = qPY_PRED;
183 }
184
185 qPY_PRED = (qPYA + qPYB + 1)>>1;
186
187 logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB);
188
189 int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps->QpBdOffset_Y) %
190 (52 + sps->QpBdOffset_Y)) - sps->QpBdOffset_Y;
191
192 tctx->qPYPrime = QPY + sps->QpBdOffset_Y;
193
194 int qPiCb = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset);
195 int qPiCr = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset);
196
197 logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
198 qPiCb, pps->pic_cb_qp_offset, shdr->slice_cb_qp_offset,
199 qPiCr, pps->pic_cr_qp_offset, shdr->slice_cr_qp_offset);
200
201 int qPCb = table8_22(qPiCb);
202 int qPCr = table8_22(qPiCr);
203
204 tctx->qPCbPrime = qPCb + sps->QpBdOffset_C;
205 tctx->qPCrPrime = qPCr + sps->QpBdOffset_C;
206
207 int log2CbSize = get_log2CbSize (ctx->img, sps, xCUBase, yCUBase);
208 set_QPY(ctx->img, sps, pps,xCUBase, yCUBase, log2CbSize, QPY);
209 tctx->currentQPY = QPY;
210
211 /*
212 printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase,
213 xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY);
214 */
215
216 logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n",
217 xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime);
218 }
219
220
221
222 void transform_coefficients(decoder_context* ctx, slice_segment_header* shdr,
223 int16_t* coeff, int coeffStride, int nT, int trType, int postShift,
224 uint8_t* dst, int dstStride)
225 {
226 logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
227
228 if (trType==1) {
229
230 ctx->acceleration.transform_4x4_luma_add_8(dst, coeff, dstStride);
231 nDST_4x4++;
232
233 } else {
234
235 /**/ if (nT==4) { ctx->acceleration.transform_4x4_add_8(dst,coeff,dstStride); nDCT_4x4++; }
236 else if (nT==8) { ctx->acceleration.transform_8x8_add_8(dst,coeff,dstStride); nDCT_8x8++; }
237 else if (nT==16) { ctx->acceleration.transform_16x16_add_8(dst,coeff,dstStride); nDCT_16x16++; }
238 else { ctx->acceleration.transform_32x32_add_8(dst,coeff,dstStride); nDCT_32x32++; }
239 }
240 }
241
242
243 static const int levelScale[] = { 40,45,51,57,64,72 };
244
245 // (8.6.2) and (8.6.3)
246 void scale_coefficients(decoder_context* ctx, thread_context* tctx,
247 int xT,int yT, // position of TU in frame (chroma adapted)
248 int x0,int y0, // position of CU in frame (chroma adapted)
249 int nT, int cIdx,
250 bool transform_skip_flag, bool intra)
251 {
252 seq_parameter_set* sps = ctx->current_sps;
253 pic_parameter_set* pps = ctx->current_pps;
254 slice_segment_header* shdr = tctx->shdr;
255
256 int qP;
257 switch (cIdx) {
258 case 0: qP = tctx->qPYPrime; break;
259 case 1: qP = tctx->qPCbPrime; break;
260 case 2: qP = tctx->qPCrPrime; break;
261 default: qP = 0; assert(0); break; // should never happen
262 }
263
264 logtrace(LogTransform,"qP: %d\n",qP);
265
266 //printf("residual %d;%d cIdx=%d qp=%d\n",xT * (cIdx?2:1),yT * (cIdx?2:1),cIdx,qP);
267
268
269 int16_t* coeff;
270 int coeffStride;
271
272 coeff = tctx->coeffBuf;
273 coeffStride = nT;
274
275
276
277
278
279 uint8_t* pred;
280 int stride;
281 get_image_plane(ctx->img,cIdx,&pred,&stride);
282 pred += xT + yT*stride;
283
284 //fprintf(stderr,"POC=%d pred: %p (%d;%d stride=%d)\n",ctx->img->PicOrderCntVal,pred,xT,yT,stride);
285
286 /*
287 int x,y;
288 for (y=0;y<nT;y++)
289 {
290 printf("P: ");
291
292 for (x=0;x<nT;x++)
293 {
294 printf("%02x ",pred[x+y*stride]);
295 }
296
297 printf("\n");
298 }
299 */
300
301 if (tctx->cu_transquant_bypass_flag) {
302 //assert(false); // TODO
303
304 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
305 int32_t currCoeff = tctx->coeffList[cIdx][i];
306 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
307 }
308
309 ctx->acceleration.transform_bypass_8(pred, coeff, nT, stride);
310 }
311 else {
312 // (8.6.3)
313
314 int bdShift = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C) + Log2(nT) - 5;
315
316 logtrace(LogTransform,"bdShift=%d\n",bdShift);
317
318 logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
319
320 if (sps->scaling_list_enable_flag==0) {
321
322 //const int m_x_y = 16;
323 const int m_x_y = 1;
324 bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
325
326 const int offset = (1<<(bdShift-1));
327 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
328
329 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
330
331          // usually, this needs to be 64bit, but because we modify the shift above, we can use 32 bit
332 int32_t currCoeff = tctx->coeffList[cIdx][i];
333
334 currCoeff = Clip3(-32768,32767,
335 ( (currCoeff * fact + offset ) >> bdShift));
336
337 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
338 }
339 }
340 else {
341 const int offset = (1<<(bdShift-1));
342
343 uint8_t* sclist;
344 int matrixID = cIdx;
345 if (!intra) {
346 if (nT<32) { matrixID += 3; }
347 else { matrixID++; }
348 }
349
350 switch (nT) {
351 case 4: sclist = &pps->scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
352 case 8: sclist = &pps->scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
353 case 16: sclist = &pps->scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
354 case 32: sclist = &pps->scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
355 default: assert(0);
356 }
357
358 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
359 int pos = tctx->coeffPos[cIdx][i];
360 int x = pos%nT;
361 int y = pos/nT;
362
363 const int m_x_y = sclist[x+y*nT];
364 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
365
366 int64_t currCoeff = tctx->coeffList[cIdx][i];
367
368 currCoeff = Clip3(-32768,32767,
369 ( (currCoeff * fact + offset ) >> bdShift));
370
371 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
372 }
373 }
374
375 logtrace(LogTransform,"coefficients OUT:\n");
376 for (int y=0;y<nT;y++) {
377 logtrace(LogTransform," ");
378 for (int x=0;x<nT;x++) {
379 logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
380 }
381 logtrace(LogTransform,"*\n");
382 }
383
384 int bdShift2 = (cIdx==0) ? 20-sps->BitDepth_Y : 20-sps->BitDepth_C;
385
386 logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
387
388 logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
389 transform_skip_flag);
390
391 if (transform_skip_flag) {
392
393 ctx->acceleration.transform_skip_8(pred, coeff, stride);
394
395 nSkip_4x4++;
396 }
397 else {
398 int trType;
399
400 if (nT==4 && cIdx==0 && get_pred_mode(ctx->img,ctx->current_sps,xT,yT)==MODE_INTRA) {
401 trType=1;
402 }
403 else {
404 trType=0;
405 }
406
407 transform_coefficients(ctx,shdr, coeff, coeffStride, nT, trType, bdShift2,
408 pred, stride);
409 }
410 }
411
412
413 logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
414
415 for (int y=0;y<nT;y++) {
416 logtrace(LogTransform,"RECO-%d-%d-%d ",xT,yT+y,cIdx);
417
418 for (int x=0;x<nT;x++) {
419 logtrace(LogTransform,"*%02x ", pred[x+y*stride]);
420 }
421
422 logtrace(LogTransform,"*\n");
423 }
424
425 /*
426 for (y=0;y<nT;y++)
427 {
428 printf("C: ");
429
430 for (x=0;x<nT;x++)
431 {
432 printf("%4d ",coeff[x+y*nT]);
433 }
434
435 printf("\n");
436 }
437
438 for (y=0;y<nT;y++)
439 {
440 for (x=0;x<nT;x++)
441 {
442 printf("%02x ",pred[x+y*stride]);
443 }
444
445 printf("\n");
446 }
447 */
448
449 // zero out scrap coefficient buffer again
450
451 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
452 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
453 }
454 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "transform.h"
21 #include "util.h"
22
23 #include <assert.h>
24
25
26 const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ };
27
28
29 // (8.6.1)
30 void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
31 int xCUBase, int yCUBase)
32 {
33 logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC);
34
35 pic_parameter_set* pps = &tctx->img->pps;
36 seq_parameter_set* sps = &tctx->img->sps;
37 slice_segment_header* shdr = tctx->shdr;
38
39 // top left pixel position of current quantization group
40 int xQG = xCUBase - (xCUBase & ((1<<pps->Log2MinCuQpDeltaSize)-1));
41 int yQG = yCUBase - (yCUBase & ((1<<pps->Log2MinCuQpDeltaSize)-1));
42
43 logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG);
44
45
46 // we only have to set QP in the first call in a quantization-group
47
48 /* TODO: check why this does not work with HoneyBee stream
49
50 if (xQG == tctx->currentQG_x &&
51 yQG == tctx->currentQG_y)
52 {
53 return;
54 }
55 */
56
57   // when entering a new quantization group, remember the last QPY of the previous QG
58
59 if (xQG != tctx->currentQG_x ||
60 yQG != tctx->currentQG_y)
61 {
62 tctx->lastQPYinPreviousQG = tctx->currentQPY;
63 tctx->currentQG_x = xQG;
64 tctx->currentQG_y = yQG;
65 }
66
67 int qPY_PRED;
68
69 // first QG in CTB row ?
70
71 int ctbLSBMask = ((1<<sps->Log2CtbSizeY)-1);
72 bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0));
73
74 // first QG in slice ? TODO: a "firstQG" flag in the thread context would be faster
75
76 int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS;
77
78 int SliceStartX = (first_ctb_in_slice_RS % sps->PicWidthInCtbsY) * sps->CtbSizeY;
79 int SliceStartY = (first_ctb_in_slice_RS / sps->PicWidthInCtbsY) * sps->CtbSizeY;
80
81 bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG);
82
83 // first QG in tile ?
84
85 bool firstQGInTile = false;
86 if (pps->tiles_enabled_flag) {
87 if ((xQG & ((1 << sps->Log2CtbSizeY)-1)) == 0 &&
88 (yQG & ((1 << sps->Log2CtbSizeY)-1)) == 0)
89 {
90 int ctbX = xQG >> sps->Log2CtbSizeY;
91 int ctbY = yQG >> sps->Log2CtbSizeY;
92
93 firstQGInTile = pps->is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow
94 }
95 }
96
97
98 if (firstQGInSlice || firstQGInTile ||
99 (firstInCTBRow && pps->entropy_coding_sync_enabled_flag)) {
100 qPY_PRED = tctx->shdr->SliceQPY;
101 }
102 else {
103 qPY_PRED = tctx->lastQPYinPreviousQG;
104 }
105
106
107 int qPYA,qPYB;
108
109 if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) {
110 int xTmp = (xQG-1) >> sps->Log2MinTrafoSize;
111 int yTmp = (yQG ) >> sps->Log2MinTrafoSize;
112 int minTbAddrA = pps->MinTbAddrZS[xTmp + yTmp*sps->PicWidthInTbsY];
113 int ctbAddrA = minTbAddrA >> (2 * (sps->Log2CtbSizeY-sps->Log2MinTrafoSize));
114 if (ctbAddrA == tctx->CtbAddrInTS) {
115 qPYA = tctx->img->get_QPY(xQG-1,yQG);
116 }
117 else {
118 qPYA = qPY_PRED;
119 }
120 }
121 else {
122 qPYA = qPY_PRED;
123 }
124
125 if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) {
126 int xTmp = (xQG ) >> sps->Log2MinTrafoSize;
127 int yTmp = (yQG-1) >> sps->Log2MinTrafoSize;
128 int minTbAddrB = pps->MinTbAddrZS[xTmp + yTmp*sps->PicWidthInTbsY];
129 int ctbAddrB = minTbAddrB >> (2 * (sps->Log2CtbSizeY-sps->Log2MinTrafoSize));
130 if (ctbAddrB == tctx->CtbAddrInTS) {
131 qPYB = tctx->img->get_QPY(xQG,yQG-1);
132 }
133 else {
134 qPYB = qPY_PRED;
135 }
136 }
137 else {
138 qPYB = qPY_PRED;
139 }
140
141 qPY_PRED = (qPYA + qPYB + 1)>>1;
142
143 logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB);
144
145 int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps->QpBdOffset_Y) %
146 (52 + sps->QpBdOffset_Y)) - sps->QpBdOffset_Y;
147
148 tctx->qPYPrime = QPY + sps->QpBdOffset_Y;
149
150 int qPiCb = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset);
151 int qPiCr = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset);
152
153 logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
154 qPiCb, pps->pic_cb_qp_offset, shdr->slice_cb_qp_offset,
155 qPiCr, pps->pic_cr_qp_offset, shdr->slice_cr_qp_offset);
156
157 int qPCb = table8_22(qPiCb);
158 int qPCr = table8_22(qPiCr);
159
160 tctx->qPCbPrime = qPCb + sps->QpBdOffset_C;
161 tctx->qPCrPrime = qPCr + sps->QpBdOffset_C;
162
163 int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase);
164 tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY);
165 tctx->currentQPY = QPY;
166
167 /*
168 printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase,
169 xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY);
170 */
171
172 logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n",
173 xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime);
174 }
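Note: the luma QP derivation above (H.265 section 8.6.1) averages the QPs of the left and above neighbouring quantization groups and then folds the coded cu_qp_delta into the legal range. A minimal illustrative sketch of just that arithmetic, with a hypothetical helper name that is not part of libde265:

    // illustrative sketch only; QpBdOffsetY = 6*(bit_depth_luma - 8)
    static int derive_QPY(int qPYA, int qPYB, int cuQpDelta, int QpBdOffsetY)
    {
      int qPY_PRED = (qPYA + qPYB + 1) >> 1;               // average of the two predictors
      return ((qPY_PRED + cuQpDelta + 52 + 2*QpBdOffsetY)  // wrap into [-QpBdOffsetY, 51]
              % (52 + QpBdOffsetY)) - QpBdOffsetY;
    }
    // e.g. qPYA = qPYB = 26, cuQpDelta = +3, 8-bit video (QpBdOffsetY = 0)  ->  QPY = 29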
175
176
177
178 void transform_coefficients(decoder_context* ctx,
179 int16_t* coeff, int coeffStride, int nT, int trType, int postShift,
180 uint8_t* dst, int dstStride)
181 {
182 logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
183
184 if (trType==1) {
185
186 ctx->acceleration.transform_4x4_luma_add_8(dst, coeff, dstStride);
187
188 } else {
189
190 /**/ if (nT==4) { ctx->acceleration.transform_4x4_add_8(dst,coeff,dstStride); }
191 else if (nT==8) { ctx->acceleration.transform_8x8_add_8(dst,coeff,dstStride); }
192 else if (nT==16) { ctx->acceleration.transform_16x16_add_8(dst,coeff,dstStride); }
193 else { ctx->acceleration.transform_32x32_add_8(dst,coeff,dstStride); }
194 }
195 }
196
197
198 static const int levelScale[] = { 40,45,51,57,64,72 };
199
200 // (8.6.2) and (8.6.3)
201 void scale_coefficients(thread_context* tctx,
202 int xT,int yT, // position of TU in frame (chroma adapted)
203 int x0,int y0, // position of CU in frame (chroma adapted)
204 int nT, int cIdx,
205 bool transform_skip_flag, bool intra)
206 {
207 seq_parameter_set* sps = &tctx->img->sps;
208 pic_parameter_set* pps = &tctx->img->pps;
209
210 int qP;
211 switch (cIdx) {
212 case 0: qP = tctx->qPYPrime; break;
213 case 1: qP = tctx->qPCbPrime; break;
214 case 2: qP = tctx->qPCrPrime; break;
215 default: qP = 0; assert(0); break; // should never happen
216 }
217
218 logtrace(LogTransform,"qP: %d\n",qP);
219
220 //printf("residual %d;%d cIdx=%d qp=%d\n",xT * (cIdx?2:1),yT * (cIdx?2:1),cIdx,qP);
221
222
223 int16_t* coeff;
224 int coeffStride;
225
226 coeff = tctx->coeffBuf;
227 coeffStride = nT;
228
229
230
231
232
233 uint8_t* pred;
234 int stride;
235 pred = tctx->img->get_image_plane_at_pos(cIdx, xT,yT);
236 stride = tctx->img->get_image_stride(cIdx);
237
238 //fprintf(stderr,"POC=%d pred: %p (%d;%d stride=%d)\n",ctx->img->PicOrderCntVal,pred,xT,yT,stride);
239
240 /*
241 int x,y;
242 for (y=0;y<nT;y++)
243 {
244 printf("P: ");
245
246 for (x=0;x<nT;x++)
247 {
248 printf("%02x ",pred[x+y*stride]);
249 }
250
251 printf("\n");
252 }
253 */
254
255 if (tctx->cu_transquant_bypass_flag) {
256 //assert(false); // TODO
257
258 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
259 int32_t currCoeff = tctx->coeffList[cIdx][i];
260 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
261 }
262
263 tctx->decctx->acceleration.transform_bypass_8(pred, coeff, nT, stride);
264 }
265 else {
266 // (8.6.3)
267
268 int bdShift = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C) + Log2(nT) - 5;
269
270 logtrace(LogTransform,"bdShift=%d\n",bdShift);
271
272 logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
273
274 if (sps->scaling_list_enable_flag==0) {
275
276 //const int m_x_y = 16;
277 const int m_x_y = 1;
278 bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
279
280 const int offset = (1<<(bdShift-1));
281 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
282
283 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
284
285        // usually, this needs to be 64bit, but because we modify the shift above, we can use 32 bit
286 int32_t currCoeff = tctx->coeffList[cIdx][i];
287
288 currCoeff = Clip3(-32768,32767,
289 ( (currCoeff * fact + offset ) >> bdShift));
290
291 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
292 }
293 }
294 else {
295 const int offset = (1<<(bdShift-1));
296
297 uint8_t* sclist;
298 int matrixID = cIdx;
299 if (!intra) {
300 if (nT<32) { matrixID += 3; }
301 else { matrixID++; }
302 }
303
304 switch (nT) {
305 case 4: sclist = &pps->scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
306 case 8: sclist = &pps->scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
307 case 16: sclist = &pps->scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
308 case 32: sclist = &pps->scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
309 default: assert(0);
310 }
311
312 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
313 int pos = tctx->coeffPos[cIdx][i];
314 int x = pos%nT;
315 int y = pos/nT;
316
317 const int m_x_y = sclist[x+y*nT];
318 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
319
320 int64_t currCoeff = tctx->coeffList[cIdx][i];
321
322 currCoeff = Clip3(-32768,32767,
323 ( (currCoeff * fact + offset ) >> bdShift));
324
325 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
326 }
327 }
328
329 logtrace(LogTransform,"coefficients OUT:\n");
330 for (int y=0;y<nT;y++) {
331 logtrace(LogTransform," ");
332 for (int x=0;x<nT;x++) {
333 logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
334 }
335 logtrace(LogTransform,"*\n");
336 }
337
338 int bdShift2 = (cIdx==0) ? 20-sps->BitDepth_Y : 20-sps->BitDepth_C;
339
340 logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
341
342 logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
343 transform_skip_flag);
344
345 if (transform_skip_flag) {
346
347 tctx->decctx->acceleration.transform_skip_8(pred, coeff, stride);
348 }
349 else {
350 int trType;
351
352 if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
353 trType=1;
354 }
355 else {
356 trType=0;
357 }
358
359 transform_coefficients(tctx->decctx, coeff, coeffStride, nT, trType, bdShift2,
360 pred, stride);
361 }
362 }
363
364
365 logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
366
367 for (int y=0;y<nT;y++) {
368 logtrace(LogTransform,"RECO-%d-%d-%d ",xT,yT+y,cIdx);
369
370 for (int x=0;x<nT;x++) {
371 logtrace(LogTransform,"*%02x ", pred[x+y*stride]);
372 }
373
374 logtrace(LogTransform,"*\n");
375 }
376
377 /*
378 for (y=0;y<nT;y++)
379 {
380 printf("C: ");
381
382 for (x=0;x<nT;x++)
383 {
384 printf("%4d ",coeff[x+y*nT]);
385 }
386
387 printf("\n");
388 }
389
390 for (y=0;y<nT;y++)
391 {
392 for (x=0;x<nT;x++)
393 {
394 printf("%02x ",pred[x+y*stride]);
395 }
396
397 printf("\n");
398 }
399 */
400
401 // zero out scrap coefficient buffer again
402
403 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
404 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
405 }
406 }
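Note: in the flat-scaling-list branch of scale_coefficients above, the dequantization (8.6.3) of each coefficient reduces to one multiply-add-shift followed by a clip to 16 bit. An illustrative sketch under that assumption (dequant_coeff is a hypothetical name, not a libde265 function):

    // illustrative sketch only; the "-4" absorbs the implicit m_x_y = 16, as in the code above
    static int16_t dequant_coeff(int16_t level, int qP, int bitDepth, int log2nT)
    {
      static const int levelScale[6] = { 40,45,51,57,64,72 };
      int bdShift = bitDepth + log2nT - 5 - 4;
      int fact    = levelScale[qP%6] << (qP/6);
      int offset  = 1 << (bdShift-1);
      int v = (level*fact + offset) >> bdShift;
      if (v < -32768) v = -32768;
      if (v >  32767) v =  32767;
      return (int16_t)v;
    }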
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2020 #ifndef DE265_TRANSFORM_H
2121 #define DE265_TRANSFORM_H
2222
23 #include "libde265/de265.h"
2324 #include "libde265/decctx.h"
2425
25 int table8_22(int qPi);
26 extern const int tab8_22[];
27
28 LIBDE265_INLINE static int table8_22(int qPi)
29 {
30 if (qPi<30) return qPi;
31 if (qPi>=43) return qPi-6;
32 return tab8_22[qPi-30];
33 }
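Note: the inline above implements the chroma QP mapping: identity below 30, a table lookup from 30 to 42, and qPi-6 from 43 upwards. A few illustrative spot checks, assuming the table8_22() defined above:

    #include <assert.h>
    static inline void check_table8_22_examples()
    {
      assert(table8_22(20) == 20);   // identity below 30
      assert(table8_22(35) == 33);   // tab8_22[35-30]
      assert(table8_22(45) == 39);   // qPi - 6 at and above 43
    }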
2634
2735 // (8.6.1)
28 void decode_quantization_parameters(decoder_context* ctx,
29 thread_context* shdr, int xC,int yC,
36 void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
3037 int xCUBase, int yCUBase);
3138
3239 // (8.6.2)
33 void scale_coefficients(decoder_context* ctx, thread_context* shdr,
40 void scale_coefficients(thread_context* tctx,
3441 int xT,int yT, // position of TU in frame (chroma adapted)
3542 int x0,int y0, // position of CU in frame (chroma adapted)
3643 int nT, int cIdx,
+0
-170
libde265/util.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "util.h"
21 #include "de265.h"
22
23 #include <stdarg.h>
24 #include <stdio.h>
25
26
27 int ceil_div(int num,int denom)
28 {
29 num += denom-1;
30 return num/denom;
31 }
32
33
34 int ceil_log2(int val)
35 {
36 int n=0;
37 while (val > (1<<n)) {
38 n++;
39 }
40
41 return n;
42 }
43
44
45 int Log2(int v)
46 {
47 int n=0;
48 while (v>1) {
49 n++;
50 v>>=1;
51 }
52
53 return n;
54 }
55
56
57 #ifdef DE265_LOGGING
58 static int current_poc=0;
59 static int log_poc_start=-9999; // frame-numbers can be negative
60 static int enable_log = 1;
61 void log_set_current_POC(int poc) { current_poc=poc; }
62 #endif
63
64
65 static int disable_logging=0;
66 static int verbosity = 0;
67
68 LIBDE265_API void de265_disable_logging() // DEPRECATED
69 {
70 disable_logging=1;
71 }
72
73 LIBDE265_API void de265_set_verbosity(int level)
74 {
75 verbosity = level;
76 }
77
78 #if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE)
79 void enablelog() { enable_log=1; }
80 #endif
81
82 #ifdef DE265_LOG_ERROR
83 void logerror(enum LogModule module, const char* string, ...)
84 {
85 if (disable_logging) return;
86 if (current_poc < log_poc_start) { return; }
87 if (!enable_log) return;
88
89 va_list va;
90
91 int noPrefix = (string[0]=='*');
92 if (!noPrefix) fprintf(stdout, "ERR: ");
93 va_start(va, string);
94 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
95 va_end(va);
96 fflush(stdout);
97 }
98 #endif
99
100 #ifdef DE265_LOG_INFO
101 void loginfo (enum LogModule module, const char* string, ...)
102 {
103 if (verbosity<1) return;
104 if (disable_logging) return;
105 if (current_poc < log_poc_start) { return; }
106 if (!enable_log) return;
107
108 va_list va;
109
110 int noPrefix = (string[0]=='*');
111 if (!noPrefix) fprintf(stdout, "INFO: ");
112 va_start(va, string);
113 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
114 va_end(va);
115 fflush(stdout);
116 }
117 #endif
118
119 #ifdef DE265_LOG_DEBUG
120 void logdebug(enum LogModule module, const char* string, ...)
121 {
122 if (verbosity<2) return;
123 if (disable_logging) return;
124 if (current_poc < log_poc_start) { return; }
125 if (!enable_log) return;
126
127 va_list va;
128
129 int noPrefix = (string[0]=='*');
130 if (!noPrefix) fprintf(stdout, "DEBUG: ");
131 va_start(va, string);
132 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
133 va_end(va);
134 fflush(stdout);
135 }
136 #endif
137
138 #ifdef DE265_LOG_TRACE
139 void logtrace(enum LogModule module, const char* string, ...)
140 {
141 if (verbosity<3) return;
142 if (disable_logging) return;
143 if (current_poc < log_poc_start) { return; }
144 if (!enable_log) return;
145
146 //if (module != LogCABAC) return;
147
148 va_list va;
149
150 int noPrefix = (string[0]=='*');
151 if (!noPrefix) { } // fprintf(stdout, "ERR: ");
152 va_start(va, string);
153 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
154 va_end(va);
155 fflush(stdout);
156 }
157 #endif
158
159 void log2fh(FILE* fh, const char* string, ...)
160 {
161 va_list va;
162
163 int noPrefix = (string[0]=='*');
164 if (!noPrefix) fprintf(stdout, "INFO: ");
165 va_start(va, string);
166 vfprintf(fh, string + (noPrefix ? 1 : 0), va);
167 va_end(va);
168 fflush(stdout);
169 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "util.h"
21 #include "de265.h"
22
23 #include <stdarg.h>
24 #include <stdio.h>
25
26
27 #ifdef DE265_LOGGING
28 static int current_poc=0;
29 static int log_poc_start=-9999; // frame-numbers can be negative
30 static int enable_log = 1;
31 void log_set_current_POC(int poc) { current_poc=poc; }
32 #endif
33
34
35 static int disable_logging=0;
36 static int verbosity = 0;
37
38 LIBDE265_API void de265_disable_logging() // DEPRECATED
39 {
40 disable_logging=1;
41 }
42
43 LIBDE265_API void de265_set_verbosity(int level)
44 {
45 verbosity = level;
46 }
47
48 #if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE)
49 void enablelog() { enable_log=1; }
50 #endif
51
52 #ifdef DE265_LOG_ERROR
53 void logerror(enum LogModule module, const char* string, ...)
54 {
55 if (disable_logging) return;
56 if (current_poc < log_poc_start) { return; }
57 if (!enable_log) return;
58
59 va_list va;
60
61 int noPrefix = (string[0]=='*');
62 if (!noPrefix) fprintf(stdout, "ERR: ");
63 va_start(va, string);
64 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
65 va_end(va);
66 fflush(stdout);
67 }
68 #endif
69
70 #ifdef DE265_LOG_INFO
71 void loginfo (enum LogModule module, const char* string, ...)
72 {
73 if (verbosity<1) return;
74 if (disable_logging) return;
75 if (current_poc < log_poc_start) { return; }
76 if (!enable_log) return;
77
78 va_list va;
79
80 int noPrefix = (string[0]=='*');
81 if (!noPrefix) fprintf(stdout, "INFO: ");
82 va_start(va, string);
83 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
84 va_end(va);
85 fflush(stdout);
86 }
87 #endif
88
89 #ifdef DE265_LOG_DEBUG
90 void logdebug(enum LogModule module, const char* string, ...)
91 {
92 if (verbosity<2) return;
93 if (disable_logging) return;
94 if (current_poc < log_poc_start) { return; }
95 if (!enable_log) return;
96
97 va_list va;
98
99 int noPrefix = (string[0]=='*');
100 if (!noPrefix) fprintf(stdout, "DEBUG: ");
101 va_start(va, string);
102 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
103 va_end(va);
104 fflush(stdout);
105 }
106 #endif
107
108 #ifdef DE265_LOG_TRACE
109 void logtrace(enum LogModule module, const char* string, ...)
110 {
111 if (verbosity<3) return;
112 if (disable_logging) return;
113 if (current_poc < log_poc_start) { return; }
114 if (!enable_log) return;
115
116 //if (module != LogCABAC) return;
117
118 va_list va;
119
120 int noPrefix = (string[0]=='*');
121 if (!noPrefix) { } // fprintf(stdout, "ERR: ");
122 va_start(va, string);
123 vfprintf(stdout, string + (noPrefix ? 1 : 0), va);
124 va_end(va);
125 fflush(stdout);
126 }
127 #endif
128
129 void log2fh(FILE* fh, const char* string, ...)
130 {
131 va_list va;
132
133 int noPrefix = (string[0]=='*');
134 if (!noPrefix) fprintf(stdout, "INFO: ");
135 va_start(va, string);
136 vfprintf(fh, string + (noPrefix ? 1 : 0), va);
137 va_end(va);
138 fflush(stdout);
139 }
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
2626
2727 #include <stdio.h>
2828
29 #include "libde265/de265.h"
30
2931
3032 #ifdef _MSC_VER
3133 #define LIBDE265_DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
4749 //inline uint8_t Clip1_8bit(int16_t value) { if (value<=0) return 0; else if (value>=255) return 255; else return value; }
4850 #define Clip1_8bit(value) ((value)<0 ? 0 : (value)>255 ? 255 : (value))
4951 #define Clip3(low,high,value) ((value)<(low) ? (low) : (value)>(high) ? (high) : (value))
50 #define Sign(value) (((value)>0) ? 1 : ((value)<0) ? -1 : 0)
52 #define Sign(value) (((value)<0) ? -1 : ((value)>0) ? 1 : 0)
5153 #define abs_value(a) (((a)<0) ? -(a) : (a))
5254 #define libde265_min(a,b) (((a)<(b)) ? (a) : (b))
5355 #define libde265_max(a,b) (((a)>(b)) ? (a) : (b))
5456
55 int ceil_div(int num,int denom);
56 int ceil_log2(int val);
57 int Log2(int v);
57 LIBDE265_INLINE static int ceil_div(int num,int denom)
58 {
59 num += denom-1;
60 return num/denom;
61 }
62 LIBDE265_INLINE static int ceil_log2(int val)
63 {
64 int n=0;
65 while (val > (1<<n)) {
66 n++;
67 }
5868
69 return n;
70 }
71 LIBDE265_INLINE static int Log2(int v)
72 {
73 int n=0;
74 while (v>1) {
75 n++;
76 v>>=1;
77 }
78
79 return n;
80 }
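Note: these inlines are small integer helpers: ceil_div rounds the quotient up, ceil_log2 returns the smallest n with (1<<n) >= val, and Log2 is the floor of the base-2 logarithm. A few illustrative spot checks, assuming the definitions above:

    #include <assert.h>
    static inline void check_util_helpers()
    {
      assert(ceil_div(7,4) == 2 && ceil_div(8,4) == 2);  // quotient rounded up
      assert(ceil_log2(5) == 3 && ceil_log2(4) == 2);    // smallest n with (1<<n) >= val
      assert(Log2(5) == 2 && Log2(4) == 2);              // floor(log2(v))
    }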
5981
6082
6183 // === logging ===
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "visualize.h"
21 #include "decctx.h"
22
23 #include <math.h>
24
25 #if 0
26 void writeFrame_Y(de265_image* img,const char* filename)
27 {
28 int w = ctx->img->get_width();
29 int h = ctx->img->get_height();
30 //int c_idx=0;
31 int ctb_size = 64; // HACK
32
33 int stride = ctx->img->get_luma_stride();
34
35 for (int ctbY=0;ctbY<ctx->current_sps->PicHeightInCtbsY;ctbY++)
36 for (int ctbX=0;ctbX<ctx->current_sps->PicWidthInCtbsY;ctbX++)
37 {
38 int x0 = ctbX*ctb_size;
39 int y0 = ctbY*ctb_size;
40
41
42 uint8_t *src = ctx->img->get_image_plane_at_pos(0,x0,y0);
43
44 printf("%s %d %d\n",filename,x0,y0);
45 int dx,dy;
46 for (dy=0;dy<ctb_size;dy++)
47 if (y0+dy < h)
48 {
49 printf("%s %d %d ",filename,y0+dy,x0);
50
51 for (dx=0;dx<ctb_size;dx++)
52 if (x0+dx < w)
53 {
54 printf("%02x ",*(src+dx+dy*stride));
55 }
56
57 printf("\n");
58 }
59 }
60 }
61 #endif
62
63
64 void write_picture_to_file(const de265_image* img, const char* filename)
65 {
66 FILE* fh = fopen(filename, "wb");
67
68 for (int c=0;c<3;c++)
69 for (int y=0;y<de265_get_image_height(img,c);y++)
70 fwrite(img->get_image_plane_at_pos(c, 0,y), de265_get_image_width(img,c), 1, fh);
71
72 fflush(fh);
73 fclose(fh);
74 }
75
76
77 void set_pixel(uint8_t* img, int x,int y, int stride, uint32_t color, int pixelSize)
78 {
79 for (int i=0;i<pixelSize;i++) {
80 uint8_t col = (color>>(i*8)) & 0xFF;
81 img[y*stride + x*pixelSize + i] = col;
82 }
83 }
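Note: set_pixel writes the colour value least-significant byte first into pixelSize consecutive bytes, so the stride is expected in bytes. A small hedged usage sketch (buffer name and sizes are made up for illustration):

    // illustrative only: an 8x8 buffer with 3 bytes per pixel
    uint8_t buf[8*8*3] = {0};
    set_pixel(buf, /*x=*/2, /*y=*/1, /*stride=*/8*3, /*color=*/0x00FF00, /*pixelSize=*/3);
    // buf[1*24 + 2*3 + 0..2] now holds { 0x00, 0xFF, 0x00 }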
84
85
86 void draw_block_boundary(const de265_image* srcimg,
87 uint8_t* img,int stride,
88 int x,int y,int hBlkSize, int vBlkSize, uint32_t color, int pixelSize)
89 {
90 for (int i=0;i<vBlkSize;i++)
91 {
92 int yi = y + i;
93
94 if (yi < srcimg->sps.pic_height_in_luma_samples) {
95 set_pixel(img,x,yi,stride,color,pixelSize);
96 }
97 }
98
99 for (int i=0;i<hBlkSize;i++)
100 {
101 int xi = x + i;
102
103 if (xi < srcimg->sps.pic_width_in_luma_samples) {
104 set_pixel(img,xi,y,stride,color,pixelSize);
105 }
106 }
107 }
108
109
110 #include "intrapred.h"
111
112 void draw_intra_pred_mode(const de265_image* srcimg,
113 uint8_t* img,int stride,
114 int x0,int y0,int log2BlkSize,
115 enum IntraPredMode mode, uint32_t color,int pixelSize)
116 {
117 int w = 1<<log2BlkSize;
118
119 if (mode==0) {
120 // Planar -> draw square
121
122 for (int i=-w*1/4;i<=w*1/4;i++)
123 {
124 set_pixel(img, x0+w*1/4, y0+w/2+i,stride, color, pixelSize);
125 set_pixel(img, x0+w*3/4, y0+w/2+i,stride, color, pixelSize);
126 set_pixel(img, x0+w/2+i, y0+w*1/4,stride, color, pixelSize);
127 set_pixel(img, x0+w/2+i, y0+w*3/4,stride, color, pixelSize);
128 }
129 }
130 else if (mode==1) {
131 // DC -> draw circle
132
133 for (int i=-w/4;i<w/4;i++)
134 {
135 int k = (sqrt((double)(w*w - i*i*16))+2)/4;
136
137 set_pixel(img, x0+w/2+i, y0+w/2+k, stride, color, pixelSize);
138 set_pixel(img, x0+w/2+i, y0+w/2-k, stride, color, pixelSize);
139 set_pixel(img, x0+w/2+k, y0+w/2+i, stride, color, pixelSize);
140 set_pixel(img, x0+w/2-k, y0+w/2+i, stride, color, pixelSize);
141 }
142 }
143 else {
144 // angular -> draw line in prediction direction
145
146 int slope = intraPredAngle_table[mode];
147 bool horiz = (mode<18);
148
149 if (horiz) {
150 for (int i=-w/2;i<w/2;i++)
151 {
152 int dy = (slope*i+Sign(slope*i)*16)/32;
153 int y = y0+w/2-dy;
154 if (y>=0 && y<srcimg->sps.pic_height_in_luma_samples) {
155 set_pixel(img, x0+i+w/2, y, stride, color, pixelSize);
156 }
157 }
158 }
159 else {
160 for (int i=-w/2;i<w/2;i++)
161 {
162 int dx = (slope*i+Sign(slope*i)*16)/32;
163 int x = x0+w/2-dx;
164 if (x>=0 && x<srcimg->sps.pic_width_in_luma_samples) {
165 set_pixel(img, x, y0+i+w/2, stride, color, pixelSize);
166 }
167 }
168 }
169 }
170 }
171
172
173 void drawTBgrid(const de265_image* srcimg, uint8_t* img, int stride,
174 int x0,int y0, uint32_t color, int pixelSize, int log2CbSize, int trafoDepth)
175 {
176 int split_transform_flag = srcimg->get_split_transform_flag(x0,y0,trafoDepth);
177 if (split_transform_flag) {
178 int x1 = x0 + ((1<<(log2CbSize-trafoDepth))>>1);
179 int y1 = y0 + ((1<<(log2CbSize-trafoDepth))>>1);
180 drawTBgrid(srcimg,img,stride,x0,y0,color,pixelSize,log2CbSize,trafoDepth+1);
181 drawTBgrid(srcimg,img,stride,x1,y0,color,pixelSize,log2CbSize,trafoDepth+1);
182 drawTBgrid(srcimg,img,stride,x0,y1,color,pixelSize,log2CbSize,trafoDepth+1);
183 drawTBgrid(srcimg,img,stride,x1,y1,color,pixelSize,log2CbSize,trafoDepth+1);
184 }
185 else {
186 draw_block_boundary(srcimg,img,stride,x0,y0,1<<(log2CbSize-trafoDepth),1<<(log2CbSize-trafoDepth), color, pixelSize);
187 }
188 }
189
190
191 enum DrawMode {
192 Partitioning_CB,
193 Partitioning_TB,
194 Partitioning_PB,
195 IntraPredMode,
196 PBPredMode,
197 PBMotionVectors,
198 QuantP_Y
199 };
200
201
202 void tint_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize)
203 {
204 for (int y=0;y<h;y++)
205 for (int x=0;x<w;x++)
206 {
207 int xp = x0+x;
208 int yp = y0+y;
209
210 for (int i=0;i<pixelSize;i++) {
211 uint8_t col = (color>>(i*8)) & 0xFF;
212 img[yp*stride+xp*pixelSize + i] = (img[yp*stride+xp*pixelSize + i] + col)/2;
213 }
214 }
215 }
216
217 void fill_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize)
218 {
219 for (int y=0;y<h;y++)
220 for (int x=0;x<w;x++)
221 {
222 int xp = x0+x;
223 int yp = y0+y;
224
225 for (int i=0;i<pixelSize;i++) {
226 uint8_t col = (color>>(i*8)) & 0xFF;
227 img[yp*stride+xp*pixelSize + i] = col;
228 }
229 }
230 }
231
232
233 void draw_QuantPY_block(const de265_image* srcimg,uint8_t* img,int stride,
234 int x0,int y0, int w,int h, int pixelSize)
235 {
236 int q = srcimg->get_QPY(x0,y0);
237
238 const int MIN_DRAW_Q = 20;
239 const int MAX_DRAW_Q = 40;
240
241 if (q<MIN_DRAW_Q) q=MIN_DRAW_Q;
242 if (q>MAX_DRAW_Q) q=MAX_DRAW_Q;
243
244 float f = ((float)q-MIN_DRAW_Q)/(MAX_DRAW_Q-MIN_DRAW_Q);
245 uint32_t col = 0xFF * f;
246 col = col | (col<<8) | (col<<16);
247
248 fill_rect(img,stride, x0,y0,w,h, col, pixelSize);
249 }
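Note: draw_QuantPY_block maps the block QP linearly from the [20,40] window onto a grey level and fills the block with it. Illustrative arithmetic only:

    // q = 30  ->  f = (30-20)/(40-20) = 0.5  ->  col = 0x7F  ->  fill colour 0x7F7F7F
    // q <= 20 ->  black (0x000000);  q >= 40  ->  white (0xFFFFFF)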
250
251
252 void draw_line(uint8_t* img,int stride,uint32_t color,int pixelSize,
253 int width,int height,
254 int x0,int y0,int x1,int y1)
255 {
256 if (x1==x0 && y1==y0) {
257 set_pixel(img,x0,y0,stride,color,pixelSize);
258 }
259 else if (abs(x1-x0) < abs(y1-y0)) {
260 for (int y=y0;y<=y1;y += Sign(y1-y0))
261 {
262 int x = (y-y0)*(x1-x0)/(y1-y0) + x0;
263
264 if (x>=0 && x<width && y>=0 && y<height)
265 set_pixel(img,x,y,stride,color,pixelSize);
266 }
267 }
268 else {
269 for (int x=x0;x<=x1;x += Sign(x1-x0))
270 {
271 int y = (x-x0)*(y1-y0)/(x1-x0) + y0;
272
273 if (x>=0 && x<width && y>=0 && y<height)
274 set_pixel(img,x,y,stride,color,pixelSize);
275 }
276 }
277 }
278
279
280 void draw_PB_block(const de265_image* srcimg,uint8_t* img,int stride,
281 int x0,int y0, int w,int h, enum DrawMode what, uint32_t color, int pixelSize)
282 {
283 if (what == Partitioning_PB) {
284 draw_block_boundary(srcimg,img,stride,x0,y0,w,h, color,pixelSize);
285 }
286 else if (what == PBPredMode) {
287 enum PredMode predMode = srcimg->get_pred_mode(x0,y0);
288
289 uint32_t cols[3] = { 0xff0000, 0x0000ff, 0x00ff00 };
290
291 tint_rect(img,stride, x0,y0,w,h, cols[predMode], pixelSize);
292 }
293 else if (what == PBMotionVectors) {
294 const PredVectorInfo* mvi = srcimg->get_mv_info(x0,y0);
295 int x = x0+w/2;
296 int y = y0+h/2;
297 if (mvi->predFlag[0]) {
298 draw_line(img,stride,0xFF0000,pixelSize,
299 srcimg->get_width(),
300 srcimg->get_height(),
301 x,y,x+mvi->mv[0].x,y+mvi->mv[0].y);
302 }
303 if (mvi->predFlag[1]) {
304 draw_line(img,stride,0x00FF00,pixelSize,
305 srcimg->get_width(),
306 srcimg->get_height(),
307 x,y,x+mvi->mv[1].x,y+mvi->mv[1].y);
308 }
309 }
310 }
311
312
313 void draw_tree_grid(const de265_image* srcimg, uint8_t* img, int stride,
314 uint32_t color, int pixelSize, enum DrawMode what)
315 {
316 const seq_parameter_set* sps = &srcimg->sps;
317 int minCbSize = sps->MinCbSizeY;
318
319 for (int y0=0;y0<sps->PicHeightInMinCbsY;y0++)
320 for (int x0=0;x0<sps->PicWidthInMinCbsY;x0++)
321 {
322 int log2CbSize = srcimg->get_log2CbSize_cbUnits(x0,y0);
323 if (log2CbSize==0) {
324 continue;
325 }
326
327 int xb = x0*minCbSize;
328 int yb = y0*minCbSize;
329
330 int CbSize = 1<<log2CbSize;
331
332 if (what == Partitioning_TB) {
333 drawTBgrid(srcimg,img,stride,x0*minCbSize,y0*minCbSize, color,pixelSize, log2CbSize, 0);
334 }
335 else if (what == Partitioning_CB) {
336 draw_block_boundary(srcimg,img,stride,xb,yb, 1<<log2CbSize,1<<log2CbSize, color,pixelSize);
337 }
338 else if (what == PBPredMode) {
339 draw_PB_block(srcimg,img,stride,xb,yb,CbSize,CbSize, what,color,pixelSize);
340 }
341 else if (what == QuantP_Y) {
342 draw_QuantPY_block(srcimg,img,stride,xb,yb,CbSize,CbSize,pixelSize);
343 }
344 else if (what == Partitioning_PB ||
345 what == PBMotionVectors) {
346 enum PartMode partMode = srcimg->get_PartMode(xb,yb);
347
348 int HalfCbSize = (1<<(log2CbSize-1));
349
350 switch (partMode) {
351 case PART_2Nx2N:
352 draw_PB_block(srcimg,img,stride,xb,yb,CbSize,CbSize, what,color,pixelSize);
353 break;
354 case PART_NxN:
355 draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize/2, what,color,pixelSize);
356 draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize/2, what,color,pixelSize);
357 draw_PB_block(srcimg,img,stride,xb ,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize);
358 draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize);
359 break;
360 case PART_2NxN:
361 draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/2, what,color,pixelSize);
362 draw_PB_block(srcimg,img,stride,xb, yb+HalfCbSize,CbSize ,CbSize/2, what,color,pixelSize);
363 break;
364 case PART_Nx2N:
365 draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize, what,color,pixelSize);
366 draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize, what,color,pixelSize);
367 break;
368 case PART_2NxnU:
369 draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/4, what,color,pixelSize);
370 draw_PB_block(srcimg,img,stride,xb, yb+CbSize/4 ,CbSize ,CbSize*3/4, what,color,pixelSize);
371 break;
372 case PART_2NxnD:
373 draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize*3/4, what,color,pixelSize);
374 draw_PB_block(srcimg,img,stride,xb, yb+CbSize*3/4,CbSize ,CbSize/4, what,color,pixelSize);
375 break;
376 case PART_nLx2N:
377 draw_PB_block(srcimg,img,stride,xb, yb, CbSize/4 ,CbSize, what,color,pixelSize);
378 draw_PB_block(srcimg,img,stride,xb+CbSize/4 ,yb, CbSize*3/4,CbSize, what,color,pixelSize);
379 break;
380 case PART_nRx2N:
381 draw_PB_block(srcimg,img,stride,xb, yb, CbSize*3/4,CbSize, what,color,pixelSize);
382 draw_PB_block(srcimg,img,stride,xb+CbSize*3/4,yb, CbSize/4 ,CbSize, what,color,pixelSize);
383 break;
384 default:
385 assert(false);
386 break;
387 }
388 }
389 else if (what==IntraPredMode) {
390 enum PredMode predMode = srcimg->get_pred_mode(xb,yb);
391 if (predMode == MODE_INTRA) {
392 enum PartMode partMode = srcimg->get_PartMode(xb,yb);
393
394 int HalfCbSize = (1<<(log2CbSize-1));
395
396 switch (partMode) {
397 case PART_2Nx2N:
398 draw_intra_pred_mode(srcimg,img,stride,xb,yb,log2CbSize,
399 srcimg->get_IntraPredMode(xb,yb), color,pixelSize);
400 break;
401 case PART_NxN:
402 draw_intra_pred_mode(srcimg,img,stride,xb, yb, log2CbSize-1,
403 srcimg->get_IntraPredMode(xb,yb), color,pixelSize);
404 draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb, log2CbSize-1,
405 srcimg->get_IntraPredMode(xb+HalfCbSize,yb), color,pixelSize);
406 draw_intra_pred_mode(srcimg,img,stride,xb ,yb+HalfCbSize,log2CbSize-1,
407 srcimg->get_IntraPredMode(xb,yb+HalfCbSize), color,pixelSize);
408 draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,log2CbSize-1,
409 srcimg->get_IntraPredMode(xb+HalfCbSize,yb+HalfCbSize), color,pixelSize);
410 break;
411 default:
412 assert(false);
413 break;
414 }
415 }
416 }
417 }
418 }
419
420
421 void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize)
422 {
423 draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_CB);
424 }
425
426 void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize)
427 {
428 draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_TB);
429 }
430
431 void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize)
432 {
433 draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_PB);
434 }
435
436 void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize)
437 {
438 draw_tree_grid(img,dst,stride,color,pixelSize, IntraPredMode);
439 }
440
441 void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize)
442 {
443 draw_tree_grid(img,dst,stride,0,pixelSize, PBPredMode);
444 }
445
446 void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize)
447 {
448 draw_tree_grid(img,dst,stride,0,pixelSize, QuantP_Y);
449 }
450
451 void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize)
452 {
453 draw_tree_grid(img,dst,stride,0,pixelSize, PBMotionVectors);
454 }
455
456 void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize)
457 {
458 // --- mark first CTB in slice (red - independent / green - dependent) ---
459
460 for (int ctby=0;ctby<img->sps.PicHeightInCtbsY;ctby++)
461 for (int ctbx=0;ctbx<img->sps.PicWidthInCtbsY;ctbx++)
462 {
463 const int blkw = img->sps.Log2CtbSizeY;
464
465 int ctbAddrRS = ctby*img->sps.PicWidthInCtbsY + ctbx;
466 int prevCtbRS = -1;
467 if (ctbx>0 || ctby>0) { prevCtbRS = img->pps.CtbAddrTStoRS[ img->pps.CtbAddrRStoTS[ctbAddrRS] -1 ]; }
468
469 if (prevCtbRS<0 ||
470 img->get_SliceHeaderIndex_atIndex(ctbAddrRS) !=
471 img->get_SliceHeaderIndex_atIndex(prevCtbRS)) {
472 int step=2;
473 int fillcolor = 0xFF0000;
474
475 if (img->get_SliceHeaderCtb(ctbx,ctby)->dependent_slice_segment_flag) {
476 step=2;
477 fillcolor = 0x00FF00;
478 }
479
480 for (int x=0;x<1<<blkw;x+=step)
481 for (int y=0;y<1<<blkw;y+=step) {
482 int x1 = x + (ctbx<<blkw);
483 int y1 = y + (ctby<<blkw);
484
485 if (x1<img->sps.pic_width_in_luma_samples &&
486 y1<img->sps.pic_height_in_luma_samples)
487 {
488 set_pixel(dst,x1,y1,stride,fillcolor,pixelSize);
489 }
490 }
491 }
492 }
493
494
495
496 // --- draw slice boundaries ---
497
498 const uint32_t color = 0xff0000;
499
500 for (int ctby=0;ctby<img->sps.PicHeightInCtbsY;ctby++)
501 for (int ctbx=0;ctbx<img->sps.PicWidthInCtbsY;ctbx++) {
502 if (ctbx>0 && (img->get_SliceHeaderIndexCtb(ctbx ,ctby) !=
503 img->get_SliceHeaderIndexCtb(ctbx-1,ctby))) {
504 int x = ctbx << img->sps.Log2CtbSizeY;
505 int y0 = ctby << img->sps.Log2CtbSizeY;
506
507 for (int y=y0;
508 (y<y0+(1<<img->sps.Log2CtbSizeY) &&
509 y<img->sps.pic_height_in_luma_samples) ;
510 y++) {
511 set_pixel(dst,x,y,stride,color,pixelSize);
512 }
513 }
514 }
515
516
517 for (int ctby=0;ctby<img->sps.PicHeightInCtbsY;ctby++)
518 for (int ctbx=0;ctbx<img->sps.PicWidthInCtbsY;ctbx++) {
519 if (ctby>0 && (img->get_SliceHeaderIndexCtb(ctbx,ctby ) !=
520 img->get_SliceHeaderIndexCtb(ctbx,ctby-1))) {
521 int x0 = ctbx << img->sps.Log2CtbSizeY;
522 int y = ctby << img->sps.Log2CtbSizeY;
523
524 for (int x=x0 ;
525 (x<x0+(1<<img->sps.Log2CtbSizeY) &&
526 x<img->sps.pic_width_in_luma_samples) ;
527 x++) {
528 set_pixel(dst,x,y,stride,color,pixelSize);
529 }
530 }
531 }
532
533
534 }
535
536 void draw_Tiles(const de265_image* img, uint8_t* dst, int stride, int pixelSize)
537 {
538 const uint32_t color = 0xffff00;
539
540 for (int tx=1;tx<img->pps.num_tile_columns;tx++) {
541 int x = img->pps.colBd[tx] << img->sps.Log2CtbSizeY;
542
543 for (int y=0;y<img->sps.pic_height_in_luma_samples;y++) {
544 set_pixel(dst,x,y,stride,color,pixelSize);
545 }
546 }
547
548 for (int ty=1;ty<img->pps.num_tile_rows;ty++) {
549 int y = img->pps.rowBd[ty] << img->sps.Log2CtbSizeY;
550
551 for (int x=0;x<img->sps.pic_width_in_luma_samples;x++) {
552 set_pixel(dst,x,y,stride,color,pixelSize);
553 }
554 }
555 }
556
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef DE265_VISUALIZE_H
21 #define DE265_VISUALIZE_H
22
23 #include "libde265/image.h"
24
25
26 void write_picture_to_file(const de265_image* img, const char* filename);
27
28 void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize);
29 void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize);
30 void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize);
31 void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize);
32 void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize);
33 void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize);
34 void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize);
35 void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize);
36 void draw_Tiles(const de265_image* img, uint8_t* dst, int stride, int pixelSize);
37
38 #endif
+0
-372
libde265/vps.c
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "vps.h"
21 #include "util.h"
22 #include "decctx.h"
23
24 #include <assert.h>
25
26
27 de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps)
28 {
29 int vlc;
30
31 vps->video_parameter_set_id = vlc = get_bits(reader, 4);
32 if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
33
34 skip_bits(reader, 2);
35 vps->vps_max_layers = vlc = get_bits(reader,6) +1;
36 if (vlc != 1) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // TODO: out of specification
37
38 vps->vps_max_sub_layers = vlc = get_bits(reader,3) +1;
39 if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
40
41 vps->vps_temporal_id_nesting_flag = get_bits(reader,1);
42 skip_bits(reader, 16);
43
44 read_profile_tier_level(reader, &vps->profile_tier_level,
45 vps->vps_max_sub_layers);
46
47 /*
48 read_bit_rate_pic_rate_info(reader, &vps->bit_rate_pic_rate_info,
49 0, vps->vps_max_sub_layers-1);
50 */
51
52 vps->vps_sub_layer_ordering_info_present_flag = get_bits(reader,1);
53 //assert(vps->vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS);
54
55 int firstLayerRead = vps->vps_sub_layer_ordering_info_present_flag ? 0 : (vps->vps_max_sub_layers-1);
56
57 for (int i=firstLayerRead;i<vps->vps_max_sub_layers;i++) {
58 vps->layer[i].vps_max_dec_pic_buffering = get_uvlc(reader);
59 vps->layer[i].vps_max_num_reorder_pics = get_uvlc(reader);
60 vps->layer[i].vps_max_latency_increase = get_uvlc(reader);
61 }
62
63 if (!vps->vps_sub_layer_ordering_info_present_flag) {
64 assert(firstLayerRead < MAX_TEMPORAL_SUBLAYERS);
65
66 for (int i=0;i<firstLayerRead;i++) {
67 vps->layer[i].vps_max_dec_pic_buffering = vps->layer[firstLayerRead].vps_max_dec_pic_buffering;
68 vps->layer[i].vps_max_num_reorder_pics = vps->layer[firstLayerRead].vps_max_num_reorder_pics;
69 vps->layer[i].vps_max_latency_increase = vps->layer[firstLayerRead].vps_max_latency_increase;
70 }
71 }
72
73
74 vps->vps_max_layer_id = get_bits(reader,6);
75 vps->vps_num_layer_sets = get_uvlc(reader)+1;
76
77 if (vps->vps_num_layer_sets<0 ||
78 vps->vps_num_layer_sets>=1024) {
79 add_warning(ctx, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
80 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
81 }
82
83 for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
84 for (int j=0; j <= vps->vps_max_layer_id; j++)
85 {
86 vps->layer_id_included_flag[i][j] = get_bits(reader,1);
87 }
88
89 vps->vps_timing_info_present_flag = get_bits(reader,1);
90
91 if (vps->vps_timing_info_present_flag) {
92 vps->vps_num_units_in_tick = get_bits(reader,32);
93 vps->vps_time_scale = get_bits(reader,32);
94 vps->vps_poc_proportional_to_timing_flag = get_bits(reader,1);
95
96 if (vps->vps_poc_proportional_to_timing_flag) {
97 vps->vps_num_ticks_poc_diff_one = get_uvlc(reader)+1;
98 vps->vps_num_hrd_parameters = get_uvlc(reader);
99
100 if (vps->vps_num_hrd_parameters >= 1024) {
101 assert(false); // TODO: return bitstream error
102 }
103
104 for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
105 vps->hrd_layer_set_idx[i] = get_uvlc(reader);
106
107 if (i > 0) {
108 vps->cprms_present_flag[i] = get_bits(reader,1);
109 }
110
111 //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
112
113 return DE265_OK; // TODO: decode hrd_parameters()
114 }
115 }
116 }
117
118 vps->vps_extension_flag = get_bits(reader,1);
119
120 if (vps->vps_extension_flag) {
121 /*
122 while( more_rbsp_data() )
123 vps_extension_data_flag u(1)
124 rbsp_trailing_bits()
125 */
126 }
127
128 return DE265_OK;
129 }
130
131
132 void read_profile_tier_level(bitreader* reader,
133 struct profile_tier_level* hdr,
134 int max_sub_layers)
135 {
136 hdr->general_profile_space = get_bits(reader,2);
137 hdr->general_tier_flag = get_bits(reader,1);
138 hdr->general_profile_idc = get_bits(reader,5);
139
140 for (int i=0; i<32; i++) {
141 hdr->general_profile_compatibility_flag[i] = get_bits(reader,1);
142 }
143
144 hdr->general_progressive_source_flag = get_bits(reader,1);
145 hdr->general_interlaced_source_flag = get_bits(reader,1);
146 hdr->general_non_packed_constraint_flag = get_bits(reader,1);
147 hdr->general_frame_only_constraint_flag = get_bits(reader,1);
148 skip_bits(reader,44);
149
150 hdr->general_level_idc = get_bits(reader,8);
151
152
153 for (int i=0; i<max_sub_layers-1; i++)
154 {
155 hdr->profile[i].sub_layer_profile_present_flag = get_bits(reader,1);
156 hdr->profile[i].sub_layer_level_present_flag = get_bits(reader,1);
157 }
158
159 if (max_sub_layers > 1)
160 {
161 for (int i=max_sub_layers-1; i<8; i++)
162 {
163 skip_bits(reader,2);
164 }
165 }
166
167 for (int i=0; i<max_sub_layers-1; i++)
168 {
169 if (hdr->profile[i].sub_layer_profile_present_flag)
170 {
171 hdr->profile[i].sub_layer_profile_space = get_bits(reader,2);
172 hdr->profile[i].sub_layer_tier_flag = get_bits(reader,1);
173 hdr->profile[i].sub_layer_profile_idc = get_bits(reader,5);
174
175 for (int j=0; j<32; j++)
176 {
177 hdr->profile[i].sub_layer_profile_compatibility_flag[j] = get_bits(reader,1);
178 }
179
180 hdr->profile[i].sub_layer_progressive_source_flag = get_bits(reader,1);
181 hdr->profile[i].sub_layer_interlaced_source_flag = get_bits(reader,1);
182 hdr->profile[i].sub_layer_non_packed_constraint_flag = get_bits(reader,1);
183 hdr->profile[i].sub_layer_frame_only_constraint_flag = get_bits(reader,1);
184 skip_bits(reader,44);
185 }
186
187 if (hdr->profile[i].sub_layer_level_present_flag)
188 {
189 hdr->profile[i].sub_layer_level_idc = get_bits(reader,8);
190 }
191 }
192 }
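Note: the general_* part of read_profile_tier_level above consumes a fixed bit budget; counting the reads in the code gives 2+1+5 bits for profile_space/tier_flag/profile_idc, 32 compatibility flags, 4 source/constraint flags, 44 reserved bits and 8 bits of level_idc:

    // 2 + 1 + 5 + 32 + (1+1+1+1) + 44 + 8 == 96 bits == 12 bytes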
193
194
195 /*
196 void read_bit_rate_pic_rate_info(bitreader* reader,
197 struct bit_rate_pic_rate_info* hdr,
198 int TempLevelLow,
199 int TempLevelHigh)
200 {
201 for (int i=TempLevelLow; i<=TempLevelHigh; i++) {
202
203 hdr->bit_rate_info_present_flag[i] = get_bits(reader,1);
204 hdr->pic_rate_info_present_flag[i] = get_bits(reader,1);
205
206 if (hdr->bit_rate_info_present_flag[i]) {
207 hdr->avg_bit_rate[i] = get_bits(reader,16);
208 hdr->max_bit_rate[i] = get_bits(reader,16);
209 }
210
211 if (hdr->pic_rate_info_present_flag[i]) {
212 hdr->constant_pic_rate_idc[i] = get_bits(reader,2);
213 hdr->avg_pic_rate[i] = get_bits(reader,16);
214 }
215 }
216 }
217 */
218
219
220
221 #define LOG0(t) log2fh(fh, t)
222 #define LOG1(t,d) log2fh(fh, t,d)
223 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
224 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
225
226 void dump_vps(video_parameter_set* vps, int fd)
227 {
228 FILE* fh;
229 if (fd==1) fh=stdout;
230 else if (fd==2) fh=stderr;
231 else { return; }
232
233 LOG0("----------------- VPS -----------------\n");
234 LOG1("video_parameter_set_id : %d\n", vps->video_parameter_set_id);
235 LOG1("vps_max_layers : %d\n", vps->vps_max_layers);
236 LOG1("vps_max_sub_layers : %d\n", vps->vps_max_sub_layers);
237 LOG1("vps_temporal_id_nesting_flag : %d\n", vps->vps_temporal_id_nesting_flag);
238
239 dump_profile_tier_level(&vps->profile_tier_level, vps->vps_max_sub_layers, fh);
240 //dump_bit_rate_pic_rate_info(&vps->bit_rate_pic_rate_info, 0, vps->vps_max_sub_layers-1);
241
242 LOG1("vps_sub_layer_ordering_info_present_flag : %d\n",
243 vps->vps_sub_layer_ordering_info_present_flag);
244
245 if (vps->vps_sub_layer_ordering_info_present_flag) {
246 for (int i=0;i<vps->vps_max_sub_layers;i++) {
247 LOG2("layer %d: vps_max_dec_pic_buffering = %d\n",i,vps->layer[i].vps_max_dec_pic_buffering);
248 LOG1(" vps_max_num_reorder_pics = %d\n",vps->layer[i].vps_max_num_reorder_pics);
249 LOG1(" vps_max_latency_increase = %d\n",vps->layer[i].vps_max_latency_increase);
250 }
251 }
252 else {
253 LOG1("layer (all): vps_max_dec_pic_buffering = %d\n",vps->layer[0].vps_max_dec_pic_buffering);
254 LOG1(" vps_max_num_reorder_pics = %d\n",vps->layer[0].vps_max_num_reorder_pics);
255 LOG1(" vps_max_latency_increase = %d\n",vps->layer[0].vps_max_latency_increase);
256 }
257
258
259 LOG1("vps_max_layer_id = %d\n", vps->vps_max_layer_id);
260 LOG1("vps_num_layer_sets = %d\n", vps->vps_num_layer_sets);
261
262 for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
263 for (int j=0; j <= vps->vps_max_layer_id; j++)
264 {
265 LOG3("layer_id_included_flag[%d][%d] = %d\n",i,j,
266 vps->layer_id_included_flag[i][j]);
267 }
268
269 LOG1("vps_timing_info_present_flag = %d\n",
270 vps->vps_timing_info_present_flag);
271
272 if (vps->vps_timing_info_present_flag) {
273 LOG1("vps_num_units_in_tick = %d\n", vps->vps_num_units_in_tick);
274 LOG1("vps_time_scale = %d\n", vps->vps_time_scale);
275 LOG1("vps_poc_proportional_to_timing_flag = %d\n", vps->vps_poc_proportional_to_timing_flag);
276
277 if (vps->vps_poc_proportional_to_timing_flag) {
278 LOG1("vps_num_ticks_poc_diff_one = %d\n", vps->vps_num_ticks_poc_diff_one);
279 LOG1("vps_num_hrd_parameters = %d\n", vps->vps_num_hrd_parameters);
280
281 for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
282 LOG2("hrd_layer_set_idx[%d] = %d\n", i, vps->hrd_layer_set_idx[i]);
283
284 if (i > 0) {
285 LOG2("cprms_present_flag[%d] = %d\n", i, vps->cprms_present_flag[i]);
286 }
287
288 //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
289
290 return; // TODO: decode hrd_parameters()
291 }
292 }
293 }
294
295 LOG1("vps_extension_flag = %d\n", vps->vps_extension_flag);
296 }
297
298
299 void dump_profile_tier_level(struct profile_tier_level* hdr,
300 int max_sub_layers, FILE* fh)
301 {
302 LOG1(" general_profile_space : %d\n", hdr->general_profile_space);
303 LOG1(" general_tier_flag : %d\n", hdr->general_tier_flag);
304 LOG1(" general_profile_idc : %d\n", hdr->general_profile_idc);
305
306 LOG0(" general_profile_compatibility_flags: ");
307 for (int i=0; i<32; i++) {
308 if (i) LOG0("*,");
309 LOG1("*%d",hdr->general_profile_compatibility_flag[i]);
310 }
311 LOG0("*\n");
312
313 LOG1(" general_level_idc : %d\n", hdr->general_level_idc);
314
315 for (int i=0; i<max_sub_layers-1; i++)
316 {
317 LOG1(" Profile/Tier/Level [Layer %d]\n",i);
318
319 if (hdr->profile[i].sub_layer_profile_present_flag) {
320
321 LOG1(" sub_layer_profile_space : %d\n",hdr->profile[i].sub_layer_profile_space);
322 LOG1(" sub_layer_tier_flag : %d\n",hdr->profile[i].sub_layer_tier_flag);
323 LOG1(" sub_layer_profile_idc : %d\n",hdr->profile[i].sub_layer_profile_idc);
324
325 LOG0(" sub_layer_profile_compatibility_flags: ");
326 for (int j=0; j<32; j++) {
327 if (j) LOG0(",");
328 LOG1("%d",hdr->profile[i].sub_layer_profile_compatibility_flag[j]);
329 }
330 LOG0("\n");
331
332 LOG1(" sub_layer_progressive_source_flag : %d\n",hdr->profile[i].sub_layer_progressive_source_flag);
333 LOG1(" sub_layer_interlaced_source_flag : %d\n",hdr->profile[i].sub_layer_interlaced_source_flag);
334 LOG1(" sub_layer_non_packed_constraint_flag : %d\n",hdr->profile[i].sub_layer_non_packed_constraint_flag);
335 LOG1(" sub_layer_frame_only_constraint_flag : %d\n",hdr->profile[i].sub_layer_frame_only_constraint_flag);
336 }
337
338
339 if (hdr->profile[i].sub_layer_level_present_flag) {
340 LOG1(" sub_layer_level_idc : %d\n", hdr->profile[i].sub_layer_level_idc);
341 }
342 }
343 }
344
345 #undef LOG0
346 #undef LOG1
347 #undef LOG2
348 #undef LOG3
349
350
351 /*
352 void dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr,
353 int TempLevelLow,
354 int TempLevelHigh)
355 {
356 for (int i=TempLevelLow; i<=TempLevelHigh; i++) {
357
358 LOG(" Bitrate [Layer %d]\n", i);
359
360 if (hdr->bit_rate_info_present_flag[i]) {
361 LOG(" avg_bit_rate : %d\n", hdr->avg_bit_rate[i]);
362 LOG(" max_bit_rate : %d\n", hdr->max_bit_rate[i]);
363 }
364
365 if (hdr->pic_rate_info_present_flag[i]) {
366 LOG(" constant_pic_rate_idc : %d\n", hdr->constant_pic_rate_idc[i]);
367 LOG(" avg_pic_rate[i] : %d\n", hdr->avg_pic_rate[i]);
368 }
369 }
370 }
371 */
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "vps.h"
21 #include "util.h"
22 #include "decctx.h"
23
24 #include <assert.h>
25
26
27 de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps)
28 {
29 int vlc;
30
31 vps->video_parameter_set_id = vlc = get_bits(reader, 4);
32 if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
33
34 skip_bits(reader, 2);
35 vps->vps_max_layers = vlc = get_bits(reader,6) +1;
36 if (vlc != 1) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // TODO: out of specification
37
38 vps->vps_max_sub_layers = vlc = get_bits(reader,3) +1;
39 if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
40
41 vps->vps_temporal_id_nesting_flag = get_bits(reader,1);
42 skip_bits(reader, 16);
43
44 read_profile_tier_level(reader, &vps->profile_tier_level,
45 vps->vps_max_sub_layers);
46
47 /*
48 read_bit_rate_pic_rate_info(reader, &vps->bit_rate_pic_rate_info,
49 0, vps->vps_max_sub_layers-1);
50 */
51
52 vps->vps_sub_layer_ordering_info_present_flag = get_bits(reader,1);
53 //assert(vps->vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS);
54
55 int firstLayerRead = vps->vps_sub_layer_ordering_info_present_flag ? 0 : (vps->vps_max_sub_layers-1);
56
57 for (int i=firstLayerRead;i<vps->vps_max_sub_layers;i++) {
58 vps->layer[i].vps_max_dec_pic_buffering = get_uvlc(reader);
59 vps->layer[i].vps_max_num_reorder_pics = get_uvlc(reader);
60 vps->layer[i].vps_max_latency_increase = get_uvlc(reader);
61 }
62
63 if (!vps->vps_sub_layer_ordering_info_present_flag) {
64 assert(firstLayerRead < MAX_TEMPORAL_SUBLAYERS);
65
66 for (int i=0;i<firstLayerRead;i++) {
67 vps->layer[i].vps_max_dec_pic_buffering = vps->layer[firstLayerRead].vps_max_dec_pic_buffering;
68 vps->layer[i].vps_max_num_reorder_pics = vps->layer[firstLayerRead].vps_max_num_reorder_pics;
69 vps->layer[i].vps_max_latency_increase = vps->layer[firstLayerRead].vps_max_latency_increase;
70 }
71 }
72
73
74 vps->vps_max_layer_id = get_bits(reader,6);
75 vps->vps_num_layer_sets = get_uvlc(reader)+1;
76
77 if (vps->vps_num_layer_sets<0 ||
78 vps->vps_num_layer_sets>=1024) {
79 ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
80 return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
81 }
82
83 for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
84 for (int j=0; j <= vps->vps_max_layer_id; j++)
85 {
86 vps->layer_id_included_flag[i][j] = get_bits(reader,1);
87 }
88
89 vps->vps_timing_info_present_flag = get_bits(reader,1);
90
91 if (vps->vps_timing_info_present_flag) {
92 vps->vps_num_units_in_tick = get_bits(reader,32);
93 vps->vps_time_scale = get_bits(reader,32);
94 vps->vps_poc_proportional_to_timing_flag = get_bits(reader,1);
95
96 if (vps->vps_poc_proportional_to_timing_flag) {
97 vps->vps_num_ticks_poc_diff_one = get_uvlc(reader)+1;
98 vps->vps_num_hrd_parameters = get_uvlc(reader);
99
100 if (vps->vps_num_hrd_parameters >= 1024) {
101 assert(false); // TODO: return bitstream error
102 }
103
104 for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
105 vps->hrd_layer_set_idx[i] = get_uvlc(reader);
106
107 if (i > 0) {
108 vps->cprms_present_flag[i] = get_bits(reader,1);
109 }
110
111 //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
112
113 return DE265_OK; // TODO: decode hrd_parameters()
114 }
115 }
116 }
117
118 vps->vps_extension_flag = get_bits(reader,1);
119
120 if (vps->vps_extension_flag) {
121 /*
122 while( more_rbsp_data() )
123 vps_extension_data_flag u(1)
124 rbsp_trailing_bits()
125 */
126 }
127
128 return DE265_OK;
129 }
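Editor's note, a minimal sketch only: read_vps() above pulls its ue(v) syntax elements through get_uvlc(). The real implementation lives in bitstream.cc and is not part of this diff, but it is assumed to behave roughly like the following unsigned Exp-Golomb reader, built on the same get_bits() primitive used above.

// Hedged sketch, not the upstream code: a ue(v) value is read by counting
// leading zero bits and then reading that many suffix bits:
//   value = (1 << leadingZeros) - 1 + suffix
static int read_ue_golomb_sketch(bitreader* reader)
{
  int leadingZeros = 0;
  while (get_bits(reader, 1) == 0) {   // scan up to the terminating '1' bit
    leadingZeros++;
  }

  int suffix = (leadingZeros > 0) ? get_bits(reader, leadingZeros) : 0;
  return (1 << leadingZeros) - 1 + suffix;
}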
130
131
132 void read_profile_tier_level(bitreader* reader,
133 struct profile_tier_level* hdr,
134 int max_sub_layers)
135 {
136 hdr->general_profile_space = get_bits(reader,2);
137 hdr->general_tier_flag = get_bits(reader,1);
138 hdr->general_profile_idc = get_bits(reader,5);
139
140 for (int i=0; i<32; i++) {
141 hdr->general_profile_compatibility_flag[i] = get_bits(reader,1);
142 }
143
144 hdr->general_progressive_source_flag = get_bits(reader,1);
145 hdr->general_interlaced_source_flag = get_bits(reader,1);
146 hdr->general_non_packed_constraint_flag = get_bits(reader,1);
147 hdr->general_frame_only_constraint_flag = get_bits(reader,1);
148 skip_bits(reader,44);
149
150 hdr->general_level_idc = get_bits(reader,8);
151
152
153 for (int i=0; i<max_sub_layers-1; i++)
154 {
155 hdr->profile[i].sub_layer_profile_present_flag = get_bits(reader,1);
156 hdr->profile[i].sub_layer_level_present_flag = get_bits(reader,1);
157 }
158
159 if (max_sub_layers > 1)
160 {
161 for (int i=max_sub_layers-1; i<8; i++)
162 {
163 skip_bits(reader,2);
164 }
165 }
166
167 for (int i=0; i<max_sub_layers-1; i++)
168 {
169 if (hdr->profile[i].sub_layer_profile_present_flag)
170 {
171 hdr->profile[i].sub_layer_profile_space = get_bits(reader,2);
172 hdr->profile[i].sub_layer_tier_flag = get_bits(reader,1);
173 hdr->profile[i].sub_layer_profile_idc = get_bits(reader,5);
174
175 for (int j=0; j<32; j++)
176 {
177 hdr->profile[i].sub_layer_profile_compatibility_flag[j] = get_bits(reader,1);
178 }
179
180 hdr->profile[i].sub_layer_progressive_source_flag = get_bits(reader,1);
181 hdr->profile[i].sub_layer_interlaced_source_flag = get_bits(reader,1);
182 hdr->profile[i].sub_layer_non_packed_constraint_flag = get_bits(reader,1);
183 hdr->profile[i].sub_layer_frame_only_constraint_flag = get_bits(reader,1);
184 skip_bits(reader,44);
185 }
186
187 if (hdr->profile[i].sub_layer_level_present_flag)
188 {
189 hdr->profile[i].sub_layer_level_idc = get_bits(reader,8);
190 }
191 }
192 }
193
194
195 /*
196 void read_bit_rate_pic_rate_info(bitreader* reader,
197 struct bit_rate_pic_rate_info* hdr,
198 int TempLevelLow,
199 int TempLevelHigh)
200 {
201 for (int i=TempLevelLow; i<=TempLevelHigh; i++) {
202
203 hdr->bit_rate_info_present_flag[i] = get_bits(reader,1);
204 hdr->pic_rate_info_present_flag[i] = get_bits(reader,1);
205
206 if (hdr->bit_rate_info_present_flag[i]) {
207 hdr->avg_bit_rate[i] = get_bits(reader,16);
208 hdr->max_bit_rate[i] = get_bits(reader,16);
209 }
210
211 if (hdr->pic_rate_info_present_flag[i]) {
212 hdr->constant_pic_rate_idc[i] = get_bits(reader,2);
213 hdr->avg_pic_rate[i] = get_bits(reader,16);
214 }
215 }
216 }
217 */
218
219
220
221 #define LOG0(t) log2fh(fh, t)
222 #define LOG1(t,d) log2fh(fh, t,d)
223 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
224 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
225
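Editor's note: the LOG0..LOG3 macros above forward to log2fh(), which is declared in util.h and whose implementation is not shown in this diff. A hedged sketch of the kind of printf-style wrapper they assume:

#include <stdarg.h>
#include <stdio.h>

// Sketch only; the real log2fh() is elsewhere in the library. The macros simply
// pass the target FILE* plus a format string and up to three values through to it.
void log2fh_sketch(FILE* fh, const char* fmt, ...)
{
  va_list args;
  va_start(args, fmt);
  vfprintf(fh, fmt, args);   // write the formatted message to stdout/stderr
  va_end(args);
}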
226 void dump_vps(video_parameter_set* vps, int fd)
227 {
228 FILE* fh;
229 if (fd==1) fh=stdout;
230 else if (fd==2) fh=stderr;
231 else { return; }
232
233 LOG0("----------------- VPS -----------------\n");
234 LOG1("video_parameter_set_id : %d\n", vps->video_parameter_set_id);
235 LOG1("vps_max_layers : %d\n", vps->vps_max_layers);
236 LOG1("vps_max_sub_layers : %d\n", vps->vps_max_sub_layers);
237 LOG1("vps_temporal_id_nesting_flag : %d\n", vps->vps_temporal_id_nesting_flag);
238
239 dump_profile_tier_level(&vps->profile_tier_level, vps->vps_max_sub_layers, fh);
240 //dump_bit_rate_pic_rate_info(&vps->bit_rate_pic_rate_info, 0, vps->vps_max_sub_layers-1);
241
242 LOG1("vps_sub_layer_ordering_info_present_flag : %d\n",
243 vps->vps_sub_layer_ordering_info_present_flag);
244
245 if (vps->vps_sub_layer_ordering_info_present_flag) {
246 for (int i=0;i<vps->vps_max_sub_layers;i++) {
247 LOG2("layer %d: vps_max_dec_pic_buffering = %d\n",i,vps->layer[i].vps_max_dec_pic_buffering);
248 LOG1(" vps_max_num_reorder_pics = %d\n",vps->layer[i].vps_max_num_reorder_pics);
249 LOG1(" vps_max_latency_increase = %d\n",vps->layer[i].vps_max_latency_increase);
250 }
251 }
252 else {
253 LOG1("layer (all): vps_max_dec_pic_buffering = %d\n",vps->layer[0].vps_max_dec_pic_buffering);
254 LOG1(" vps_max_num_reorder_pics = %d\n",vps->layer[0].vps_max_num_reorder_pics);
255 LOG1(" vps_max_latency_increase = %d\n",vps->layer[0].vps_max_latency_increase);
256 }
257
258
259 LOG1("vps_max_layer_id = %d\n", vps->vps_max_layer_id);
260 LOG1("vps_num_layer_sets = %d\n", vps->vps_num_layer_sets);
261
262 for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
263 for (int j=0; j <= vps->vps_max_layer_id; j++)
264 {
265 LOG3("layer_id_included_flag[%d][%d] = %d\n",i,j,
266 vps->layer_id_included_flag[i][j]);
267 }
268
269 LOG1("vps_timing_info_present_flag = %d\n",
270 vps->vps_timing_info_present_flag);
271
272 if (vps->vps_timing_info_present_flag) {
273 LOG1("vps_num_units_in_tick = %d\n", vps->vps_num_units_in_tick);
274 LOG1("vps_time_scale = %d\n", vps->vps_time_scale);
275 LOG1("vps_poc_proportional_to_timing_flag = %d\n", vps->vps_poc_proportional_to_timing_flag);
276
277 if (vps->vps_poc_proportional_to_timing_flag) {
278 LOG1("vps_num_ticks_poc_diff_one = %d\n", vps->vps_num_ticks_poc_diff_one);
279 LOG1("vps_num_hrd_parameters = %d\n", vps->vps_num_hrd_parameters);
280
281 for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
282 LOG2("hrd_layer_set_idx[%d] = %d\n", i, vps->hrd_layer_set_idx[i]);
283
284 if (i > 0) {
285 LOG2("cprms_present_flag[%d] = %d\n", i, vps->cprms_present_flag[i]);
286 }
287
288 //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
289
290 return; // TODO: decode hrd_parameters()
291 }
292 }
293 }
294
295 LOG1("vps_extension_flag = %d\n", vps->vps_extension_flag);
296 }
297
298
299 void dump_profile_tier_level(const struct profile_tier_level* hdr,
300 int max_sub_layers, FILE* fh)
301 {
302 LOG1(" general_profile_space : %d\n", hdr->general_profile_space);
303 LOG1(" general_tier_flag : %d\n", hdr->general_tier_flag);
304 LOG1(" general_profile_idc : %d\n", hdr->general_profile_idc);
305
306 LOG0(" general_profile_compatibility_flags: ");
307 for (int i=0; i<32; i++) {
308 if (i) LOG0("*,");
309 LOG1("*%d",hdr->general_profile_compatibility_flag[i]);
310 }
311 LOG0("*\n");
312
313 LOG1(" general_level_idc : %d\n", hdr->general_level_idc);
314
315 for (int i=0; i<max_sub_layers-1; i++)
316 {
317 LOG1(" Profile/Tier/Level [Layer %d]\n",i);
318
319 if (hdr->profile[i].sub_layer_profile_present_flag) {
320
321 LOG1(" sub_layer_profile_space : %d\n",hdr->profile[i].sub_layer_profile_space);
322 LOG1(" sub_layer_tier_flag : %d\n",hdr->profile[i].sub_layer_tier_flag);
323 LOG1(" sub_layer_profile_idc : %d\n",hdr->profile[i].sub_layer_profile_idc);
324
325 LOG0(" sub_layer_profile_compatibility_flags: ");
326 for (int j=0; j<32; j++) {
327 if (j) LOG0(",");
328 LOG1("%d",hdr->profile[i].sub_layer_profile_compatibility_flag[j]);
329 }
330 LOG0("\n");
331
332 LOG1(" sub_layer_progressive_source_flag : %d\n",hdr->profile[i].sub_layer_progressive_source_flag);
333 LOG1(" sub_layer_interlaced_source_flag : %d\n",hdr->profile[i].sub_layer_interlaced_source_flag);
334 LOG1(" sub_layer_non_packed_constraint_flag : %d\n",hdr->profile[i].sub_layer_non_packed_constraint_flag);
335 LOG1(" sub_layer_frame_only_constraint_flag : %d\n",hdr->profile[i].sub_layer_frame_only_constraint_flag);
336 }
337
338
339 if (hdr->profile[i].sub_layer_level_present_flag) {
340 LOG1(" sub_layer_level_idc : %d\n", hdr->profile[i].sub_layer_level_idc);
341 }
342 }
343 }
344
345 #undef LOG0
346 #undef LOG1
347 #undef LOG2
348 #undef LOG3
349
350
351 /*
352 void dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr,
353 int TempLevelLow,
354 int TempLevelHigh)
355 {
356 for (int i=TempLevelLow; i<=TempLevelHigh; i++) {
357
358 LOG(" Bitrate [Layer %d]\n", i);
359
360 if (hdr->bit_rate_info_present_flag[i]) {
361 LOG(" avg_bit_rate : %d\n", hdr->avg_bit_rate[i]);
362 LOG(" max_bit_rate : %d\n", hdr->max_bit_rate[i]);
363 }
364
365 if (hdr->pic_rate_info_present_flag[i]) {
366 LOG(" constant_pic_rate_idc : %d\n", hdr->constant_pic_rate_idc[i]);
367 LOG(" avg_pic_rate[i] : %d\n", hdr->avg_pic_rate[i]);
368 }
369 }
370 }
371 */
00 /*
11 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
44 * This file is part of libde265.
55 *
8080 struct profile_tier_level* hdr,
8181 int max_sub_layers);
8282
83 void dump_profile_tier_level(struct profile_tier_level* hdr,
83 void dump_profile_tier_level(const struct profile_tier_level* hdr,
8484 int max_sub_layers, FILE* fh);
8585
8686
148148 } video_parameter_set;
149149
150150
151 de265_error read_vps(struct decoder_context* ctx, bitreader* reader, video_parameter_set* vps);
151152 void dump_vps(video_parameter_set*, int fd);
152153
153154 #endif
00 noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la
11
2 libde265_x86_la_CFLAGS = -I..
3 libde265_x86_la_SOURCES = sse.c sse.h
2 libde265_x86_la_CXXFLAGS = -I..
3 libde265_x86_la_SOURCES = sse.cc sse.h
44 libde265_x86_la_LIBADD = libde265_x86_sse.la
55
66
77 # SSE4 specific functions
88
9 libde265_x86_sse_la_CFLAGS = -msse4.1 -I..
10 libde265_x86_sse_la_SOURCES = sse-motion.c sse-motion.h sse-dct.h sse-dct.c
9 libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I..
10 libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
1111
1212
0 # Makefile.in generated by automake 1.13.3 from Makefile.am.
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
11 # @configure_input@
22
33 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
101101 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
102102 am__v_lt_0 = --silent
103103 am__v_lt_1 =
104 libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
105 $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
106 $(libde265_x86_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
107 -o $@
104 libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
105 $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
106 $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
107 $(LDFLAGS) -o $@
108108 libde265_x86_sse_la_LIBADD =
109109 am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \
110110 libde265_x86_sse_la-sse-dct.lo
111111 libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS)
112 libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
113 $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
114 $(libde265_x86_sse_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
112 libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
113 $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
114 $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
115115 $(LDFLAGS) -o $@
116116 AM_V_P = $(am__v_P_@AM_V@)
117117 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
129129 depcomp = $(SHELL) $(top_srcdir)/depcomp
130130 am__depfiles_maybe = depfiles
131131 am__mv = mv -f
132 CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
133 $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
134 LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
135 $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \
136 $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
137 $(AM_CXXFLAGS) $(CXXFLAGS)
138 AM_V_CXX = $(am__v_CXX_@AM_V@)
139 am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@)
140 am__v_CXX_0 = @echo " CXX " $@;
141 am__v_CXX_1 =
142 CXXLD = $(CXX)
143 CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
144 $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
145 $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
146 AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
147 am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
148 am__v_CXXLD_0 = @echo " CXXLD " $@;
149 am__v_CXXLD_1 =
132150 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
133151 $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
134152 LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
251269 SET_MAKE = @SET_MAKE@
252270 SHELL = @SHELL@
253271 STRIP = @STRIP@
272 SWSCALE_CFLAGS = @SWSCALE_CFLAGS@
273 SWSCALE_LIBS = @SWSCALE_LIBS@
254274 VERSION = @VERSION@
255275 VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@
256276 VIDEOGFX_LIBS = @VIDEOGFX_LIBS@
312332 top_builddir = @top_builddir@
313333 top_srcdir = @top_srcdir@
314334 noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la
315 libde265_x86_la_CFLAGS = -I..
316 libde265_x86_la_SOURCES = sse.c sse.h
335 libde265_x86_la_CXXFLAGS = -I..
336 libde265_x86_la_SOURCES = sse.cc sse.h
317337 libde265_x86_la_LIBADD = libde265_x86_sse.la
318338
319339 # SSE4 specific functions
320 libde265_x86_sse_la_CFLAGS = -msse4.1 -I..
321 libde265_x86_sse_la_SOURCES = sse-motion.c sse-motion.h sse-dct.h sse-dct.c
340 libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I..
341 libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
322342 all: all-am
323343
324344 .SUFFIXES:
325 .SUFFIXES: .c .lo .o .obj
345 .SUFFIXES: .cc .lo .o .obj
326346 $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
327347 @for dep in $?; do \
328348 case '$(am__configure_deps)' in \
366386 }
367387
368388 libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES)
369 $(AM_V_CCLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS)
389 $(AM_V_CXXLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS)
370390
371391 libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES)
372 $(AM_V_CCLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS)
392 $(AM_V_CXXLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS)
373393
374394 mostlyclean-compile:
375395 -rm -f *.$(OBJEXT)
381401 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo@am__quote@
382402 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo@am__quote@
383403
384 .c.o:
385 @am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
386 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
387 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
388 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
389 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c $<
390
391 .c.obj:
392 @am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
393 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
394 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
395 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
396 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c `$(CYGPATH_W) '$<'`
397
398 .c.lo:
399 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
400 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
401 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
402 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
403 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
404
405 libde265_x86_la-sse.lo: sse.c
406 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CFLAGS) $(CFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.c' || echo '$(srcdir)/'`sse.c
407 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo
408 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sse.c' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@
409 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
410 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CFLAGS) $(CFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.c' || echo '$(srcdir)/'`sse.c
411
412 libde265_x86_sse_la-sse-motion.lo: sse-motion.c
413 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CFLAGS) $(CFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.c' || echo '$(srcdir)/'`sse-motion.c
414 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo
415 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sse-motion.c' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@
416 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
417 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CFLAGS) $(CFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.c' || echo '$(srcdir)/'`sse-motion.c
418
419 libde265_x86_sse_la-sse-dct.lo: sse-dct.c
420 @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CFLAGS) $(CFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.c' || echo '$(srcdir)/'`sse-dct.c
421 @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo
422 @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sse-dct.c' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@
423 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
424 @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CFLAGS) $(CFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.c' || echo '$(srcdir)/'`sse-dct.c
404 .cc.o:
405 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
406 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
407 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
408 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
409 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
410
411 .cc.obj:
412 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
413 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
414 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
415 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
416 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
417
418 .cc.lo:
419 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
420 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
421 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
422 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
423 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
424
425 libde265_x86_la-sse.lo: sse.cc
426 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
427 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo
428 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@
429 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
430 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
431
432 libde265_x86_sse_la-sse-motion.lo: sse-motion.cc
433 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
434 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo
435 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@
436 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
437 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
438
439 libde265_x86_sse_la-sse-dct.lo: sse-dct.cc
440 @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
441 @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo
442 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@
443 @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
444 @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
425445
426446 mostlyclean-libtool:
427447 -rm -f *.lo
+0 -7077
libde265/x86/sse-dct.c
0 /*
1 Code was taken over from openHEVC and slightly modified.
2 */
3
4 #include "x86/sse-dct.h"
5 #include "libde265/util.h"
6
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10
11 #include <emmintrin.h> // SSE2
12 #include <tmmintrin.h> // SSSE3
13
14 #if HAVE_SSE4_1
15 #include <smmintrin.h> // SSE4.1
16 #endif
17
18
19 ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
20 {
21 { 29, +84, 29, +84, 29, +84, 29, +84 },
22 { +74, +55, +74, +55, +74, +55, +74, +55 },
23 { 55, -29, 55, -29, 55, -29, 55, -29 },
24 { +74, -84, +74, -84, +74, -84, +74, -84 },
25 { 74, -74, 74, -74, 74, -74, 74, -74 },
26 { 0, +74, 0, +74, 0, +74, 0, +74 },
27 { 84, +55, 84, +55, 84, +55, 84, +55 },
28 { -74, -29, -74, -29, -74, -29, -74, -29 }
29 };
30
31 ALIGNED_16(static const int16_t) transform4x4[4][8] = {
32 { 64, 64, 64, 64, 64, 64, 64, 64 },
33 { 64, -64, 64, -64, 64, -64, 64, -64 },
34 { 83, 36, 83, 36, 83, 36, 83, 36 },
35 { 36, -83, 36, -83, 36, -83, 36, -83 }
36 };
37
38 ALIGNED_16(static const int16_t) transform8x8[12][8] =
39 {
40 { 89, 75, 89, 75, 89, 75, 89, 75 },
41 { 50, 18, 50, 18, 50, 18, 50, 18 },
42 { 75, -18, 75, -18, 75, -18, 75, -18 },
43 { -89, -50, -89, -50,-89, -50,-89, -50 },
44 { 50, -89, 50, -89, 50, -89, 50, -89 },
45 { 18, 75, 18, 75, 18, 75, 18, 75 },
46 { 18, -50, 18, -50, 18, -50, 18, -50 },
47 { 75, -89, 75, -89, 75, -89, 75, -89 },
48 { 64, 64, 64, 64, 64, 64, 64, 64 },
49 { 64, -64, 64, -64, 64, -64, 64, -64 },
50 { 83, 36, 83, 36, 83, 36, 83, 36 },
51 { 36, -83, 36, -83, 36, -83, 36, -83 }
52 };
53
54 ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
55 {
56 {/*1-3*/ /*2-6*/
57 { 90, 87, 90, 87, 90, 87, 90, 87 },
58 { 87, 57, 87, 57, 87, 57, 87, 57 },
59 { 80, 9, 80, 9, 80, 9, 80, 9 },
60 { 70, -43, 70, -43, 70, -43, 70, -43 },
61 { 57, -80, 57, -80, 57, -80, 57, -80 },
62 { 43, -90, 43, -90, 43, -90, 43, -90 },
63 { 25, -70, 25, -70, 25, -70, 25, -70 },
64 { 9, -25, 9, -25, 9, -25, 9, -25 },
65 },{ /*5-7*/ /*10-14*/
66 { 80, 70, 80, 70, 80, 70, 80, 70 },
67 { 9, -43, 9, -43, 9, -43, 9, -43 },
68 { -70, -87, -70, -87, -70, -87, -70, -87 },
69 { -87, 9, -87, 9, -87, 9, -87, 9 },
70 { -25, 90, -25, 90, -25, 90, -25, 90 },
71 { 57, 25, 57, 25, 57, 25, 57, 25 },
72 { 90, -80, 90, -80, 90, -80, 90, -80 },
73 { 43, -57, 43, -57, 43, -57, 43, -57 },
74 },{ /*9-11*/ /*18-22*/
75 { 57, 43, 57, 43, 57, 43, 57, 43 },
76 { -80, -90, -80, -90, -80, -90, -80, -90 },
77 { -25, 57, -25, 57, -25, 57, -25, 57 },
78 { 90, 25, 90, 25, 90, 25, 90, 25 },
79 { -9, -87, -9, -87, -9, -87, -9, -87 },
80 { -87, 70, -87, 70, -87, 70, -87, 70 },
81 { 43, 9, 43, 9, 43, 9, 43, 9 },
82 { 70, -80, 70, -80, 70, -80, 70, -80 },
83 },{/*13-15*/ /* 26-30 */
84 { 25, 9, 25, 9, 25, 9, 25, 9 },
85 { -70, -25, -70, -25, -70, -25, -70, -25 },
86 { 90, 43, 90, 43, 90, 43, 90, 43 },
87 { -80, -57, -80, -57, -80, -57, -80, -57 },
88 { 43, 70, 43, 70, 43, 70, 43, 70 },
89 { 9, -80, 9, -80, 9, -80, 9, -80 },
90 { -57, 87, -57, 87, -57, 87, -57, 87 },
91 { 87, -90, 87, -90, 87, -90, 87, -90 },
92 }
93 };
94
95 ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
96 {
97 { /*2-6*/ /*4-12*/
98 { 89, 75, 89, 75, 89, 75, 89, 75 },
99 { 75, -18, 75, -18, 75, -18, 75, -18 },
100 { 50, -89, 50, -89, 50, -89, 50, -89 },
101 { 18, -50, 18, -50, 18, -50, 18, -50 },
102 },{ /*10-14*/ /*20-28*/
103 { 50, 18, 50, 18, 50, 18, 50, 18 },
104 { -89, -50, -89, -50, -89, -50, -89, -50 },
105 { 18, 75, 18, 75, 18, 75, 18, 75 },
106 { 75, -89, 75, -89, 75, -89, 75, -89 },
107 }
108 };
109
110 ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
111 {
112 {/*4-12*/ /*8-24*/
113 { 83, 36, 83, 36, 83, 36, 83, 36 },
114 { 36, -83, 36, -83, 36, -83, 36, -83 },
115 },{ /*0-8*/ /*0-16*/
116 { 64, 64, 64, 64, 64, 64, 64, 64 },
117 { 64, -64, 64, -64, 64, -64, 64, -64 },
118 }
119 };
120
121
122 ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
123 {
124 { /* 1-3 */
125 { 90, 90, 90, 90, 90, 90, 90, 90 },
126 { 90, 82, 90, 82, 90, 82, 90, 82 },
127 { 88, 67, 88, 67, 88, 67, 88, 67 },
128 { 85, 46, 85, 46, 85, 46, 85, 46 },
129 { 82, 22, 82, 22, 82, 22, 82, 22 },
130 { 78, -4, 78, -4, 78, -4, 78, -4 },
131 { 73, -31, 73, -31, 73, -31, 73, -31 },
132 { 67, -54, 67, -54, 67, -54, 67, -54 },
133 { 61, -73, 61, -73, 61, -73, 61, -73 },
134 { 54, -85, 54, -85, 54, -85, 54, -85 },
135 { 46, -90, 46, -90, 46, -90, 46, -90 },
136 { 38, -88, 38, -88, 38, -88, 38, -88 },
137 { 31, -78, 31, -78, 31, -78, 31, -78 },
138 { 22, -61, 22, -61, 22, -61, 22, -61 },
139 { 13, -38, 13, -38, 13, -38, 13, -38 },
140 { 4, -13, 4, -13, 4, -13, 4, -13 },
141 },{/* 5-7 */
142 { 88, 85, 88, 85, 88, 85, 88, 85 },
143 { 67, 46, 67, 46, 67, 46, 67, 46 },
144 { 31, -13, 31, -13, 31, -13, 31, -13 },
145 { -13, -67, -13, -67, -13, -67, -13, -67 },
146 { -54, -90, -54, -90, -54, -90, -54, -90 },
147 { -82, -73, -82, -73, -82, -73, -82, -73 },
148 { -90, -22, -90, -22, -90, -22, -90, -22 },
149 { -78, 38, -78, 38, -78, 38, -78, 38 },
150 { -46, 82, -46, 82, -46, 82, -46, 82 },
151 { -4, 88, -4, 88, -4, 88, -4, 88 },
152 { 38, 54, 38, 54, 38, 54, 38, 54 },
153 { 73, -4, 73, -4, 73, -4, 73, -4 },
154 { 90, -61, 90, -61, 90, -61, 90, -61 },
155 { 85, -90, 85, -90, 85, -90, 85, -90 },
156 { 61, -78, 61, -78, 61, -78, 61, -78 },
157 { 22, -31, 22, -31, 22, -31, 22, -31 },
158 },{/* 9-11 */
159 { 82, 78, 82, 78, 82, 78, 82, 78 },
160 { 22, -4, 22, -4, 22, -4, 22, -4 },
161 { -54, -82, -54, -82, -54, -82, -54, -82 },
162 { -90, -73, -90, -73, -90, -73, -90, -73 },
163 { -61, 13, -61, 13, -61, 13, -61, 13 },
164 { 13, 85, 13, 85, 13, 85, 13, 85 },
165 { 78, 67, 78, 67, 78, 67, 78, 67 },
166 { 85, -22, 85, -22, 85, -22, 85, -22 },
167 { 31, -88, 31, -88, 31, -88, 31, -88 },
168 { -46, -61, -46, -61, -46, -61, -46, -61 },
169 { -90, 31, -90, 31, -90, 31, -90, 31 },
170 { -67, 90, -67, 90, -67, 90, -67, 90 },
171 { 4, 54, 4, 54, 4, 54, 4, 54 },
172 { 73, -38, 73, -38, 73, -38, 73, -38 },
173 { 88, -90, 88, -90, 88, -90, 88, -90 },
174 { 38, -46, 38, -46, 38, -46, 38, -46 },
175 },{/* 13-15 */
176 { 73, 67, 73, 67, 73, 67, 73, 67 },
177 { -31, -54, -31, -54, -31, -54, -31, -54 },
178 { -90, -78, -90, -78, -90, -78, -90, -78 },
179 { -22, 38, -22, 38, -22, 38, -22, 38 },
180 { 78, 85, 78, 85, 78, 85, 78, 85 },
181 { 67, -22, 67, -22, 67, -22, 67, -22 },
182 { -38, -90, -38, -90, -38, -90, -38, -90 },
183 { -90, 4, -90, 4, -90, 4, -90, 4 },
184 { -13, 90, -13, 90, -13, 90, -13, 90 },
185 { 82, 13, 82, 13, 82, 13, 82, 13 },
186 { 61, -88, 61, -88, 61, -88, 61, -88 },
187 { -46, -31, -46, -31, -46, -31, -46, -31 },
188 { -88, 82, -88, 82, -88, 82, -88, 82 },
189 { -4, 46, -4, 46, -4, 46, -4, 46 },
190 { 85, -73, 85, -73, 85, -73, 85, -73 },
191 { 54, -61, 54, -61, 54, -61, 54, -61 },
192 },{/* 17-19 */
193 { 61, 54, 61, 54, 61, 54, 61, 54 },
194 { -73, -85, -73, -85, -73, -85, -73, -85 },
195 { -46, -4, -46, -4, -46, -4, -46, -4 },
196 { 82, 88, 82, 88, 82, 88, 82, 88 },
197 { 31, -46, 31, -46, 31, -46, 31, -46 },
198 { -88, -61, -88, -61, -88, -61, -88, -61 },
199 { -13, 82, -13, 82, -13, 82, -13, 82 },
200 { 90, 13, 90, 13, 90, 13, 90, 13 },
201 { -4, -90, -4, -90, -4, -90, -4, -90 },
202 { -90, 38, -90, 38, -90, 38, -90, 38 },
203 { 22, 67, 22, 67, 22, 67, 22, 67 },
204 { 85, -78, 85, -78, 85, -78, 85, -78 },
205 { -38, -22, -38, -22, -38, -22, -38, -22 },
206 { -78, 90, -78, 90, -78, 90, -78, 90 },
207 { 54, -31, 54, -31, 54, -31, 54, -31 },
208 { 67, -73, 67, -73, 67, -73, 67, -73 },
209 },{ /* 21-23 */
210 { 46, 38, 46, 38, 46, 38, 46, 38 },
211 { -90, -88, -90, -88, -90, -88, -90, -88 },
212 { 38, 73, 38, 73, 38, 73, 38, 73 },
213 { 54, -4, 54, -4, 54, -4, 54, -4 },
214 { -90, -67, -90, -67, -90, -67, -90, -67 },
215 { 31, 90, 31, 90, 31, 90, 31, 90 },
216 { 61, -46, 61, -46, 61, -46, 61, -46 },
217 { -88, -31, -88, -31, -88, -31, -88, -31 },
218 { 22, 85, 22, 85, 22, 85, 22, 85 },
219 { 67, -78, 67, -78, 67, -78, 67, -78 },
220 { -85, 13, -85, 13, -85, 13, -85, 13 },
221 { 13, 61, 13, 61, 13, 61, 13, 61 },
222 { 73, -90, 73, -90, 73, -90, 73, -90 },
223 { -82, 54, -82, 54, -82, 54, -82, 54 },
224 { 4, 22, 4, 22, 4, 22, 4, 22 },
225 { 78, -82, 78, -82, 78, -82, 78, -82 },
226 },{ /* 25-27 */
227 { 31, 22, 31, 22, 31, 22, 31, 22 },
228 { -78, -61, -78, -61, -78, -61, -78, -61 },
229 { 90, 85, 90, 85, 90, 85, 90, 85 },
230 { -61, -90, -61, -90, -61, -90, -61, -90 },
231 { 4, 73, 4, 73, 4, 73, 4, 73 },
232 { 54, -38, 54, -38, 54, -38, 54, -38 },
233 { -88, -4, -88, -4, -88, -4, -88, -4 },
234 { 82, 46, 82, 46, 82, 46, 82, 46 },
235 { -38, -78, -38, -78, -38, -78, -38, -78 },
236 { -22, 90, -22, 90, -22, 90, -22, 90 },
237 { 73, -82, 73, -82, 73, -82, 73, -82 },
238 { -90, 54, -90, 54, -90, 54, -90, 54 },
239 { 67, -13, 67, -13, 67, -13, 67, -13 },
240 { -13, -31, -13, -31, -13, -31, -13, -31 },
241 { -46, 67, -46, 67, -46, 67, -46, 67 },
242 { 85, -88, 85, -88, 85, -88, 85, -88 },
243 },{/* 29-31 */
244 { 13, 4, 13, 4, 13, 4, 13, 4 },
245 { -38, -13, -38, -13, -38, -13, -38, -13 },
246 { 61, 22, 61, 22, 61, 22, 61, 22 },
247 { -78, -31, -78, -31, -78, -31, -78, -31 },
248 { 88, 38, 88, 38, 88, 38, 88, 38 },
249 { -90, -46, -90, -46, -90, -46, -90, -46 },
250 { 85, 54, 85, 54, 85, 54, 85, 54 },
251 { -73, -61, -73, -61, -73, -61, -73, -61 },
252 { 54, 67, 54, 67, 54, 67, 54, 67 },
253 { -31, -73, -31, -73, -31, -73, -31, -73 },
254 { 4, 78, 4, 78, 4, 78, 4, 78 },
255 { 22, -82, 22, -82, 22, -82, 22, -82 },
256 { -46, 85, -46, 85, -46, 85, -46, 85 },
257 { 67, -88, 67, -88, 67, -88, 67, -88 },
258 { -82, 90, -82, 90, -82, 90, -82, 90 },
259 { 90, -90, 90, -90, 90, -90, 90, -90 },
260 }
261 };
262
263 #define shift_1st 7
264 #define add_1st (1 << (shift_1st - 1))
265
266
267 void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
268 {
269 uint8_t *dst = (uint8_t*)_dst;
270 ptrdiff_t stride = _stride;
271 int shift = 5;
272 int offset = 16;
273 __m128i r0,r1,r2,r3,r4,r5,r6,r9;
274
275 r9= _mm_setzero_si128();
276 //r8= _mm_set_epi32(0,0,0,-1);
277 r2= _mm_set1_epi16(offset);
278
279 r0= _mm_load_si128((__m128i*)(coeffs));
280 r1= _mm_load_si128((__m128i*)(coeffs+8));
281
282
283 r0= _mm_adds_epi16(r0,r2);
284 r1= _mm_adds_epi16(r1,r2);
285
286 r0= _mm_srai_epi16(r0,shift);
287 r1= _mm_srai_epi16(r1,shift);
288
289 r3= _mm_loadl_epi64((__m128i*)(dst));
290 r4= _mm_loadl_epi64((__m128i*)(dst + stride));
291 r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
292 r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));
293
294 r3= _mm_unpacklo_epi8(r3,r9);
295 r4= _mm_unpacklo_epi8(r4,r9);
296 r5= _mm_unpacklo_epi8(r5,r9);
297 r6= _mm_unpacklo_epi8(r6,r9);
298 r3= _mm_unpacklo_epi64(r3,r4);
299 r4= _mm_unpacklo_epi64(r5,r6);
300
301
302 r3= _mm_adds_epi16(r3,r0);
303 r4= _mm_adds_epi16(r4,r1);
304
305 r3= _mm_packus_epi16(r3,r4);
306 //r8= _mm_set_epi32(0,0,0,-1);
307
308 //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
309 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);
310
311 r3= _mm_srli_si128(r3,4);
312 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
313 *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);
314
315 r3= _mm_srli_si128(r3,4);
316 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
317 *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);
318
319 r3= _mm_srli_si128(r3,4);
320 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
321 *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
322 }
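Editor's note, for reference while reading the intrinsics above: a hedged scalar equivalent of what ff_hevc_transform_skip_8_sse() computes for a 4x4 block at 8-bit depth; the function name below is illustrative, not from the upstream sources.

#include <stdint.h>
#include <stddef.h>

// Sketch only: add ((coeff + 16) >> 5) to each predicted sample and clip to 8 bit,
// which is what the saturating SSE adds and _mm_packus_epi16 achieve above.
static void transform_skip_4x4_scalar(uint8_t* dst, const int16_t* coeffs, ptrdiff_t stride)
{
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      int v = dst[y*stride + x] + ((coeffs[y*4 + x] + 16) >> 5);
      dst[y*stride + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}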
323
324
325
326 #if HAVE_SSE4_1
327 void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
328 ptrdiff_t _stride) {
329
330 uint8_t shift_2nd = 12; // 20 - Bit depth
331 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
332
333 uint8_t *dst = (uint8_t*) _dst;
334 ptrdiff_t stride = _stride;
335 int16_t *src = coeffs;
336 __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
337 m128iD;
338 m128iAdd = _mm_set1_epi32(64);
339
340 S0 = _mm_load_si128((__m128i *) (src));
341 S8 = _mm_load_si128((__m128i *) (src + 8));
342
343 m128iAC = _mm_unpacklo_epi16(S0, S8);
344 m128iBD = _mm_unpackhi_epi16(S0, S8);
345
346 m128iTmp1 = _mm_madd_epi16(m128iAC,
347 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
348 m128iTmp2 = _mm_madd_epi16(m128iBD,
349 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
350 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
351 S0 = _mm_add_epi32(S0, m128iAdd);
352 S0 = _mm_srai_epi32(S0, shift_1st);
353
354 m128iTmp1 = _mm_madd_epi16(m128iAC,
355 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
356 m128iTmp2 = _mm_madd_epi16(m128iBD,
357 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
358 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
359 S8 = _mm_add_epi32(S8, m128iAdd);
360 S8 = _mm_srai_epi32(S8, shift_1st);
361
362 m128iA = _mm_packs_epi32(S0, S8);
363
364 m128iTmp1 = _mm_madd_epi16(m128iAC,
365 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
366 m128iTmp2 = _mm_madd_epi16(m128iBD,
367 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
368 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369 S0 = _mm_add_epi32(S0, m128iAdd);
370 S0 = _mm_srai_epi32(S0, shift_1st);
371
372 m128iTmp1 = _mm_madd_epi16(m128iAC,
373 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
374 m128iTmp2 = _mm_madd_epi16(m128iBD,
375 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
376 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377 S8 = _mm_add_epi32(S8, m128iAdd);
378 S8 = _mm_srai_epi32(S8, shift_1st);
379
380 m128iD = _mm_packs_epi32(S0, S8);
381
382 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
383 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
384
385 m128iA = _mm_unpacklo_epi16(S0, S8);
386 m128iD = _mm_unpackhi_epi16(S0, S8);
387
388 /* ################### */
389 m128iAdd = _mm_set1_epi32(add_2nd);
390
391 m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
392 m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
393
394 m128iTmp1 = _mm_madd_epi16(m128iAC,
395 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
396 m128iTmp2 = _mm_madd_epi16(m128iBD,
397 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
398 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
399 S0 = _mm_add_epi32(S0, m128iAdd);
400 S0 = _mm_srai_epi32(S0, shift_2nd);
401
402 m128iTmp1 = _mm_madd_epi16(m128iAC,
403 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
404 m128iTmp2 = _mm_madd_epi16(m128iBD,
405 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
406 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
407 S8 = _mm_add_epi32(S8, m128iAdd);
408 S8 = _mm_srai_epi32(S8, shift_2nd);
409
410 m128iA = _mm_packs_epi32(S0, S8);
411
412 m128iTmp1 = _mm_madd_epi16(m128iAC,
413 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
414 m128iTmp2 = _mm_madd_epi16(m128iBD,
415 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
416 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417 S0 = _mm_add_epi32(S0, m128iAdd);
418 S0 = _mm_srai_epi32(S0, shift_2nd);
419
420 m128iTmp1 = _mm_madd_epi16(m128iAC,
421 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
422 m128iTmp2 = _mm_madd_epi16(m128iBD,
423 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
424 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425 S8 = _mm_add_epi32(S8, m128iAdd);
426 S8 = _mm_srai_epi32(S8, shift_2nd);
427
428 m128iD = _mm_packs_epi32(S0, S8);
429
430 // _mm_storeu_si128((__m128i *) (src), m128iA);
431 // _mm_storeu_si128((__m128i *) (src + 8), m128iD);
432
433 S0 = _mm_move_epi64(m128iA); //contains row 0
434 S8 = _mm_move_epi64(m128iD); //row 2
435 m128iA = _mm_srli_si128(m128iA, 8); // row 1
436 m128iD = _mm_srli_si128(m128iD, 8); // row 3
437 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
438 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
439 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
440 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
441
442 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
443
444 m128iA = _mm_loadl_epi64((__m128i *) dst);
445 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
446 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
447 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
448 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
449 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
450
451 dst += stride;
452
453 m128iA = _mm_loadl_epi64((__m128i *) dst);
454 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
455 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
456 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
457 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
458 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
459
460 dst += stride;
461
462 m128iA = _mm_loadl_epi64((__m128i *) dst);
463 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
465 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468
469 dst += stride;
470
471 m128iA = _mm_loadl_epi64((__m128i *) dst);
472 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
474 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477 }
478 #endif // SSE4.1
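Editor's note: the 4x4 luma routine above is a vectorized form of the HEVC 4-point inverse DST-VII, applied once per dimension with the transform4x4_luma constants. A hedged scalar sketch of a single pass (shift/add are 7/64 for the first pass and 12/2048 for the second pass at 8-bit depth); the name is illustrative only.

// Sketch only: s[0..3] is one line of transform coefficients, out[0..3] one line of results.
static void idst4_pass_scalar(const int16_t s[4], int16_t out[4], int shift, int add)
{
  out[0] = (int16_t)((29*s[0] + 74*s[1] + 84*s[2] + 55*s[3] + add) >> shift);
  out[1] = (int16_t)((55*s[0] + 74*s[1] - 29*s[2] - 84*s[3] + add) >> shift);
  out[2] = (int16_t)((74*s[0]           - 74*s[2] + 74*s[3] + add) >> shift);
  out[3] = (int16_t)((84*s[0] - 74*s[1] + 55*s[2] - 29*s[3] + add) >> shift);
}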
479
480 #if 0
481 void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
482 ptrdiff_t _stride) {
483 int i,j;
484 uint8_t shift_2nd = 10; // 20 - Bit depth
485 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
486
487 uint16_t *dst = (uint16_t*) _dst;
488 ptrdiff_t stride = _stride/(sizeof(uint16_t));
489 int16_t *src = coeffs;
490 __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
491 m128iD;
492
493 m128iAdd = _mm_set1_epi32(64);
494
495 S0 = _mm_loadu_si128((__m128i *) (src));
496 S8 = _mm_loadu_si128((__m128i *) (src + 8));
497
498 m128iAC = _mm_unpacklo_epi16(S0, S8);
499 m128iBD = _mm_unpackhi_epi16(S0, S8);
500
501 m128iTmp1 = _mm_madd_epi16(m128iAC,
502 _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
503 m128iTmp2 = _mm_madd_epi16(m128iBD,
504 _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
505 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
506 S0 = _mm_add_epi32(S0, m128iAdd);
507 S0 = _mm_srai_epi32(S0, shift_1st);
508
509 m128iTmp1 = _mm_madd_epi16(m128iAC,
510 _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
511 m128iTmp2 = _mm_madd_epi16(m128iBD,
512 _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
513 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
514 S8 = _mm_add_epi32(S8, m128iAdd);
515 S8 = _mm_srai_epi32(S8, shift_1st);
516
517 m128iA = _mm_packs_epi32(S0, S8);
518
519 m128iTmp1 = _mm_madd_epi16(m128iAC,
520 _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
521 m128iTmp2 = _mm_madd_epi16(m128iBD,
522 _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
523 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
524 S0 = _mm_add_epi32(S0, m128iAdd);
525 S0 = _mm_srai_epi32(S0, shift_1st);
526
527 m128iTmp1 = _mm_madd_epi16(m128iAC,
528 _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
529 m128iTmp2 = _mm_madd_epi16(m128iBD,
530 _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
531 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
532 S8 = _mm_add_epi32(S8, m128iAdd);
533 S8 = _mm_srai_epi32(S8, shift_1st);
534
535 m128iD = _mm_packs_epi32(S0, S8);
536
537 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
538 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
539
540 m128iA = _mm_unpacklo_epi16(S0, S8);
541 m128iD = _mm_unpackhi_epi16(S0, S8);
542
543 /* ################### */
544 m128iAdd = _mm_set1_epi32(add_2nd);
545
546 m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
547 m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
548
549 m128iTmp1 = _mm_madd_epi16(m128iAC,
550 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
551 m128iTmp2 = _mm_madd_epi16(m128iBD,
552 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
553 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
554 S0 = _mm_add_epi32(S0, m128iAdd);
555 S0 = _mm_srai_epi32(S0, shift_2nd);
556
557 m128iTmp1 = _mm_madd_epi16(m128iAC,
558 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
559 m128iTmp2 = _mm_madd_epi16(m128iBD,
560 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
561 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
562 S8 = _mm_add_epi32(S8, m128iAdd);
563 S8 = _mm_srai_epi32(S8, shift_2nd);
564
565 m128iA = _mm_packs_epi32(S0, S8);
566
567 m128iTmp1 = _mm_madd_epi16(m128iAC,
568 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
569 m128iTmp2 = _mm_madd_epi16(m128iBD,
570 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
571 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
572 S0 = _mm_add_epi32(S0, m128iAdd);
573 S0 = _mm_srai_epi32(S0, shift_2nd);
574
575 m128iTmp1 = _mm_madd_epi16(m128iAC,
576 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
577 m128iTmp2 = _mm_madd_epi16(m128iBD,
578 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
579 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
580 S8 = _mm_add_epi32(S8, m128iAdd);
581 S8 = _mm_srai_epi32(S8, shift_2nd);
582
583 m128iD = _mm_packs_epi32(S0, S8);
584
585 _mm_storeu_si128((__m128i *) (src), m128iA);
586 _mm_storeu_si128((__m128i *) (src + 8), m128iD);
587 j = 0;
588 for (i = 0; i < 2; i++) {
589 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
590 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
591 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
592 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
593 j += 1;
594 dst += stride;
595 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
596 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
597 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
598 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
599 j += 1;
600 dst += stride;
601 }
602
603 }
604 #endif
605
606
607 #if HAVE_SSE4_1
608 void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
609 ptrdiff_t _stride) {
610 uint8_t shift_2nd = 12; // 20 - Bit depth
611 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
612
613 uint8_t *dst = (uint8_t*) _dst;
614 ptrdiff_t stride = _stride;
615 int16_t *src = coeffs;
616
617 __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
618 S0 = _mm_load_si128((__m128i *) (src));
619 S8 = _mm_load_si128((__m128i *) (src + 8));
620 m128iAdd = _mm_set1_epi32(add_1st);
621
622 m128Tmp = _mm_unpacklo_epi16(S0, S8);
623 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
624 E1 = _mm_add_epi32(E1, m128iAdd);
625
626 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
627 E2 = _mm_add_epi32(E2, m128iAdd);
628
629 m128Tmp = _mm_unpackhi_epi16(S0, S8);
630 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
631 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
632
633 m128iA = _mm_add_epi32(E1, O1);
634 m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
635 m128Tmp = _mm_add_epi32(E2, O2);
636 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
637 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
638
639 m128iD = _mm_sub_epi32(E2, O2);
640 m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum
641
642 m128Tmp = _mm_sub_epi32(E1, O1);
643 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
644
645 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
646
647 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
648 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
649
650 m128iA = _mm_unpacklo_epi16(S0, S8);
651 m128iD = _mm_unpackhi_epi16(S0, S8);
652
653 /* ########################## */
654
655 m128iAdd = _mm_set1_epi32(add_2nd);
656 m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
657 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
658 E1 = _mm_add_epi32(E1, m128iAdd);
659
660 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
661 E2 = _mm_add_epi32(E2, m128iAdd);
662
663 m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
664 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
665 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
666
667 m128iA = _mm_add_epi32(E1, O1);
668 m128iA = _mm_srai_epi32(m128iA, shift_2nd);
669 m128Tmp = _mm_add_epi32(E2, O2);
670 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
671 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
672
673 m128iD = _mm_sub_epi32(E2, O2);
674 m128iD = _mm_srai_epi32(m128iD, shift_2nd);
675
676 m128Tmp = _mm_sub_epi32(E1, O1);
677 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
678
679 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
680
681 S0 = _mm_move_epi64(m128iA); //contains row 0
682 S8 = _mm_move_epi64(m128iD); //row 2
683 m128iA = _mm_srli_si128(m128iA, 8); // row 1
684 m128iD = _mm_srli_si128(m128iD, 8); // row 3
685 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
686 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
687 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
688 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
689
690 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
691
692 m128iA = _mm_loadl_epi64((__m128i *) dst);
693 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
694 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
695 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
696 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
697 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
698
699 dst += stride;
700
701 m128iA = _mm_loadl_epi64((__m128i *) dst);
702 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
703 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
704 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
705 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
706 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
707
708 dst += stride;
709
710 m128iA = _mm_loadl_epi64((__m128i *) dst);
711 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
713 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716
717 dst += stride;
718
719 m128iA = _mm_loadl_epi64((__m128i *) dst);
720 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
722 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725 }
726 #endif
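Editor's note: ff_hevc_transform_4x4_add_8_sse4() above vectorizes the standard HEVC 4-point inverse-DCT partial butterfly using the transform4x4 constants. A hedged scalar sketch of one pass (shift/add are 7/64 for the first pass and 12/2048 for the second pass at 8-bit depth); the name is illustrative only.

// Sketch only: s[0..3] holds one line of coefficients, out[0..3] one line of results.
static void idct4_pass_scalar(const int16_t s[4], int16_t out[4], int shift, int add)
{
  int e0 = 64*s[0] + 64*s[2];   // even part
  int e1 = 64*s[0] - 64*s[2];
  int o0 = 83*s[1] + 36*s[3];   // odd part
  int o1 = 36*s[1] - 83*s[3];

  out[0] = (int16_t)((e0 + o0 + add) >> shift);
  out[1] = (int16_t)((e1 + o1 + add) >> shift);
  out[2] = (int16_t)((e1 - o1 + add) >> shift);
  out[3] = (int16_t)((e0 - o0 + add) >> shift);
}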
727
728 #if 0
729 void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
730 ptrdiff_t _stride) {
731 int i;
732 uint8_t shift_2nd = 10; // 20 - Bit depth
733 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
734
735 uint16_t *dst = (uint16_t*) _dst;
736 ptrdiff_t stride = _stride/2;
737 int16_t *src = coeffs;
738
739 int j;
740 __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
741 S0 = _mm_load_si128((__m128i *) (src));
742 S8 = _mm_load_si128((__m128i *) (src + 8));
743 m128iAdd = _mm_set1_epi32(add_1st);
744
745 m128Tmp = _mm_unpacklo_epi16(S0, S8);
746 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
747 E1 = _mm_add_epi32(E1, m128iAdd);
748
749 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
750 E2 = _mm_add_epi32(E2, m128iAdd);
751
752 m128Tmp = _mm_unpackhi_epi16(S0, S8);
753 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
754 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
755
756 m128iA = _mm_add_epi32(E1, O1);
757 m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
758 m128Tmp = _mm_add_epi32(E2, O2);
759 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
760 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
761
762 m128iD = _mm_sub_epi32(E2, O2);
763 m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum
764
765 m128Tmp = _mm_sub_epi32(E1, O1);
766 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
767
768 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
769
770 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
771 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
772
773 m128iA = _mm_unpacklo_epi16(S0, S8);
774 m128iD = _mm_unpackhi_epi16(S0, S8);
775
776 /* ########################## */
777
778 m128iAdd = _mm_set1_epi32(add_2nd);
779 m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
780 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
781 E1 = _mm_add_epi32(E1, m128iAdd);
782
783 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
784 E2 = _mm_add_epi32(E2, m128iAdd);
785
786 m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
787 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
788 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
789
790 m128iA = _mm_add_epi32(E1, O1);
791 m128iA = _mm_srai_epi32(m128iA, shift_2nd);
792 m128Tmp = _mm_add_epi32(E2, O2);
793 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
794 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
795
796 m128iD = _mm_sub_epi32(E2, O2);
797 m128iD = _mm_srai_epi32(m128iD, shift_2nd);
798
799 m128Tmp = _mm_sub_epi32(E1, O1);
800 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
801
802 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
803 _mm_storeu_si128((__m128i *) (src), m128iA);
804 _mm_storeu_si128((__m128i *) (src + 8), m128iD);
805 j = 0;
806 for (i = 0; i < 2; i++) {
807 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
808 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
809 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
810 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
811 j += 1;
812 dst += stride;
813 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
814 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
815 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
816 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
817 j += 1;
818 dst += stride;
819 }
820 }
821 #endif
822
823 #if HAVE_SSE4_1
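/* 8x8 inverse transform + add for 8-bit output.
   Two passes of the even/odd (partial butterfly) decomposition using the
   coefficients in transform8x8[]: the first pass rounds with add_1st and
   shifts by shift_1st, the intermediate 16-bit values are transposed, the
   second pass rounds with add_2nd and shifts by shift_2nd, and the result
   is added to the prediction in dst with unsigned saturation. */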
824 void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
825 ptrdiff_t _stride) {
826 uint8_t shift_2nd = 12; // 20 - Bit depth
827 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
828
829 uint8_t *dst = (uint8_t*) _dst;
830 ptrdiff_t stride = _stride / sizeof(uint8_t);
831 int16_t *src = coeffs;
832 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
833 m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
834 			E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
836 			O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
837 T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
838 T0= _mm_load_si128((__m128i *) (transform8x8[0]));
839 T1= _mm_load_si128((__m128i *) (transform8x8[1]));
840 T2= _mm_load_si128((__m128i *) (transform8x8[2]));
841 T3= _mm_load_si128((__m128i *) (transform8x8[3]));
842 T4= _mm_load_si128((__m128i *) (transform8x8[4]));
843 T5= _mm_load_si128((__m128i *) (transform8x8[5]));
844 T6= _mm_load_si128((__m128i *) (transform8x8[6]));
845 T7= _mm_load_si128((__m128i *) (transform8x8[7]));
846 T8= _mm_load_si128((__m128i *) (transform8x8[8]));
847 T9= _mm_load_si128((__m128i *) (transform8x8[9]));
848 T10= _mm_load_si128((__m128i *) (transform8x8[10]));
849 T11= _mm_load_si128((__m128i *) (transform8x8[11]));
850
851 m128iAdd = _mm_set1_epi32(add_1st);
852
853 m128iS1 = _mm_load_si128((__m128i *) (src + 8));
854 m128iS3 = _mm_load_si128((__m128i *) (src + 24));
855 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
856 E1l = _mm_madd_epi16(m128Tmp0, T0);
857 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
858 E1h = _mm_madd_epi16(m128Tmp1, T0);
859 m128iS5 = _mm_load_si128((__m128i *) (src + 40));
860 m128iS7 = _mm_load_si128((__m128i *) (src + 56));
861 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
862 E2l = _mm_madd_epi16(m128Tmp2, T1);
863 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
864 E2h = _mm_madd_epi16(m128Tmp3, T1);
865 O0l = _mm_add_epi32(E1l, E2l);
866 O0h = _mm_add_epi32(E1h, E2h);
867
868 E1l = _mm_madd_epi16(m128Tmp0, T2);
869 E1h = _mm_madd_epi16(m128Tmp1, T2);
870 E2l = _mm_madd_epi16(m128Tmp2, T3);
871 E2h = _mm_madd_epi16(m128Tmp3, T3);
872
873 O1l = _mm_add_epi32(E1l, E2l);
874 O1h = _mm_add_epi32(E1h, E2h);
875
876 E1l = _mm_madd_epi16(m128Tmp0, T4);
877 E1h = _mm_madd_epi16(m128Tmp1, T4);
878 E2l = _mm_madd_epi16(m128Tmp2, T5);
879 E2h = _mm_madd_epi16(m128Tmp3, T5);
880 O2l = _mm_add_epi32(E1l, E2l);
881 O2h = _mm_add_epi32(E1h, E2h);
882
883 E1l = _mm_madd_epi16(m128Tmp0, T6);
884 E1h = _mm_madd_epi16(m128Tmp1, T6);
885 E2l = _mm_madd_epi16(m128Tmp2, T7);
886 E2h = _mm_madd_epi16(m128Tmp3, T7);
887 O3h = _mm_add_epi32(E1h, E2h);
888 O3l = _mm_add_epi32(E1l, E2l);
889
890 /* ------- */
891
892 m128iS0 = _mm_load_si128((__m128i *) (src + 0));
893 m128iS4 = _mm_load_si128((__m128i *) (src + 32));
894 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
895 EE0l = _mm_madd_epi16(m128Tmp0, T8);
896 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
897 EE0h = _mm_madd_epi16(m128Tmp1, T8);
898
899 EE1l = _mm_madd_epi16(m128Tmp0, T9);
900 EE1h = _mm_madd_epi16(m128Tmp1, T9);
901
902 /* ------- */
903
904 m128iS2 = _mm_load_si128((__m128i *) (src + 16));
905 m128iS6 = _mm_load_si128((__m128i *) (src + 48));
906 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
907 E00l = _mm_madd_epi16(m128Tmp0, T10);
908 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
909 E00h = _mm_madd_epi16(m128Tmp1, T10);
910 E01l = _mm_madd_epi16(m128Tmp0, T11);
911 E01h = _mm_madd_epi16(m128Tmp1, T11);
912 E0l = _mm_add_epi32(EE0l, E00l);
913 E0l = _mm_add_epi32(E0l, m128iAdd);
914 E0h = _mm_add_epi32(EE0h, E00h);
915 E0h = _mm_add_epi32(E0h, m128iAdd);
916 E3l = _mm_sub_epi32(EE0l, E00l);
917 E3l = _mm_add_epi32(E3l, m128iAdd);
918 E3h = _mm_sub_epi32(EE0h, E00h);
919 E3h = _mm_add_epi32(E3h, m128iAdd);
920
921 E1l = _mm_add_epi32(EE1l, E01l);
922 E1l = _mm_add_epi32(E1l, m128iAdd);
923 E1h = _mm_add_epi32(EE1h, E01h);
924 E1h = _mm_add_epi32(E1h, m128iAdd);
925 E2l = _mm_sub_epi32(EE1l, E01l);
926 E2l = _mm_add_epi32(E2l, m128iAdd);
927 E2h = _mm_sub_epi32(EE1h, E01h);
928 E2h = _mm_add_epi32(E2h, m128iAdd);
929 m128iS0 = _mm_packs_epi32(
930 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
931 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
932 m128iS1 = _mm_packs_epi32(
933 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
934 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
935 m128iS2 = _mm_packs_epi32(
936 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
937 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
938 m128iS3 = _mm_packs_epi32(
939 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
940 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
941 m128iS4 = _mm_packs_epi32(
942 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
943 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
944 m128iS5 = _mm_packs_epi32(
945 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
946 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
947 m128iS6 = _mm_packs_epi32(
948 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
949 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
950 m128iS7 = _mm_packs_epi32(
951 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
952 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
953 	/* Inverse matrix: transpose the intermediate results before the second pass */
954
955 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
956 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
957 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
958 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
959 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
960 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
961 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
962 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
963 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
964 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
965 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
966 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
967 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
968 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
969 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
970 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
971 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
972 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
973 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
974 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
975 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
976 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
977 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
978 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
979
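	/* second pass: same butterfly on the transposed data, now with the
	   second-stage rounding (add_2nd) and shift (shift_2nd) */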
980 m128iAdd = _mm_set1_epi32(add_2nd);
981
982 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
983 E1l = _mm_madd_epi16(m128Tmp0, T0);
984 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
985 E1h = _mm_madd_epi16(m128Tmp1, T0);
986 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
987 E2l = _mm_madd_epi16(m128Tmp2, T1);
988 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
989 E2h = _mm_madd_epi16(m128Tmp3, T1);
990 O0l = _mm_add_epi32(E1l, E2l);
991 O0h = _mm_add_epi32(E1h, E2h);
992 E1l = _mm_madd_epi16(m128Tmp0, T2);
993 E1h = _mm_madd_epi16(m128Tmp1, T2);
994 E2l = _mm_madd_epi16(m128Tmp2, T3);
995 E2h = _mm_madd_epi16(m128Tmp3, T3);
996 O1l = _mm_add_epi32(E1l, E2l);
997 O1h = _mm_add_epi32(E1h, E2h);
998 E1l = _mm_madd_epi16(m128Tmp0, T4);
999 E1h = _mm_madd_epi16(m128Tmp1, T4);
1000 E2l = _mm_madd_epi16(m128Tmp2, T5);
1001 E2h = _mm_madd_epi16(m128Tmp3, T5);
1002 O2l = _mm_add_epi32(E1l, E2l);
1003 O2h = _mm_add_epi32(E1h, E2h);
1004 E1l = _mm_madd_epi16(m128Tmp0, T6);
1005 E1h = _mm_madd_epi16(m128Tmp1, T6);
1006 E2l = _mm_madd_epi16(m128Tmp2, T7);
1007 E2h = _mm_madd_epi16(m128Tmp3, T7);
1008 O3h = _mm_add_epi32(E1h, E2h);
1009 O3l = _mm_add_epi32(E1l, E2l);
1010
1011 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1012 EE0l = _mm_madd_epi16(m128Tmp0, T8);
1013 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1014 EE0h = _mm_madd_epi16(m128Tmp1, T8);
1015 EE1l = _mm_madd_epi16(m128Tmp0, T9);
1016 EE1h = _mm_madd_epi16(m128Tmp1, T9);
1017
1018 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1019 E00l = _mm_madd_epi16(m128Tmp0, T10);
1020 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1021 E00h = _mm_madd_epi16(m128Tmp1, T10);
1022 E01l = _mm_madd_epi16(m128Tmp0, T11);
1023 E01h = _mm_madd_epi16(m128Tmp1, T11);
1024 E0l = _mm_add_epi32(EE0l, E00l);
1025 E0l = _mm_add_epi32(E0l, m128iAdd);
1026 E0h = _mm_add_epi32(EE0h, E00h);
1027 E0h = _mm_add_epi32(E0h, m128iAdd);
1028 E3l = _mm_sub_epi32(EE0l, E00l);
1029 E3l = _mm_add_epi32(E3l, m128iAdd);
1030 E3h = _mm_sub_epi32(EE0h, E00h);
1031 E3h = _mm_add_epi32(E3h, m128iAdd);
1032 E1l = _mm_add_epi32(EE1l, E01l);
1033 E1l = _mm_add_epi32(E1l, m128iAdd);
1034 E1h = _mm_add_epi32(EE1h, E01h);
1035 E1h = _mm_add_epi32(E1h, m128iAdd);
1036 E2l = _mm_sub_epi32(EE1l, E01l);
1037 E2l = _mm_add_epi32(E2l, m128iAdd);
1038 E2h = _mm_sub_epi32(EE1h, E01h);
1039 E2h = _mm_add_epi32(E2h, m128iAdd);
1040
1041 m128iS0 = _mm_packs_epi32(
1042 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1043 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1044 m128iS1 = _mm_packs_epi32(
1045 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1046 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1047 m128iS2 = _mm_packs_epi32(
1048 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1049 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1050 m128iS3 = _mm_packs_epi32(
1051 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1052 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1053 m128iS4 = _mm_packs_epi32(
1054 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1055 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1056 m128iS5 = _mm_packs_epi32(
1057 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1058 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1059 m128iS6 = _mm_packs_epi32(
1060 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1061 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1062 m128iS7 = _mm_packs_epi32(
1063 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1064 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1065
1066 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1067 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1068 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1069 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1070 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1071 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1072 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1073 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1074 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1075 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1076 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1077 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1078 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1079 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1080 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1081 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1082 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1083 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1084 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1085 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1086 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1087 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1088 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1089 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1090
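	/* add the residual to the prediction row by row: widen 8 prediction
	   bytes to 16 bit, saturating add, pack back with unsigned saturation */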
1091 E0l = _mm_loadl_epi64((__m128i *) dst);
1092 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1093
1094 E0l = _mm_adds_epi16(E0l, m128iS0);
1095 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1096 _mm_storel_epi64((__m128i *) dst, E0l);
1097 dst += stride;
1098
1099 E0l = _mm_loadl_epi64((__m128i *) dst);
1100 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1101
1102 E0l = _mm_adds_epi16(E0l, m128iS1);
1103 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1104 _mm_storel_epi64((__m128i *) dst, E0l);
1105 dst += stride;
1106
1107 E0l = _mm_loadl_epi64((__m128i *) dst);
1108 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1109
1110 E0l = _mm_adds_epi16(E0l, m128iS2);
1111 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1112 _mm_storel_epi64((__m128i *) dst, E0l);
1113 dst += stride;
1114
1115 E0l = _mm_loadl_epi64((__m128i *) dst);
1116 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1117
1118 E0l = _mm_adds_epi16(E0l, m128iS3);
1119 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1120 _mm_storel_epi64((__m128i *) dst, E0l);
1121 dst += stride;
1122
1123 E0l = _mm_loadl_epi64((__m128i *) dst);
1124 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1125
1126 E0l = _mm_adds_epi16(E0l, m128iS4);
1127 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1128 _mm_storel_epi64((__m128i *) dst, E0l);
1129 dst += stride;
1130
1131 E0l = _mm_loadl_epi64((__m128i *) dst);
1132 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1133
1134 E0l = _mm_adds_epi16(E0l, m128iS5);
1135 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1136 _mm_storel_epi64((__m128i *) dst, E0l);
1137 dst += stride;
1138
1139 E0l = _mm_loadl_epi64((__m128i *) dst);
1140 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1141
1142 E0l = _mm_adds_epi16(E0l, m128iS6);
1143 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1144 _mm_storel_epi64((__m128i *) dst, E0l);
1145 dst += stride;
1146
1147 E0l = _mm_loadl_epi64((__m128i *) dst);
1148 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1149
1150 E0l = _mm_adds_epi16(E0l, m128iS7);
1151 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1152 _mm_storel_epi64((__m128i *) dst, E0l);
1153 dst += stride;
1154
1155 }
1156 #endif
1157
1158 #if 0
1159 void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
1160 ptrdiff_t _stride) {
1161 int i;
1162 uint16_t *dst = (uint16_t*) _dst;
1163 ptrdiff_t stride = _stride / sizeof(uint16_t);
1164 int16_t *src = coeffs;
1165 uint8_t shift_2nd = 10; // 20 - Bit depth
1166 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1167
1168 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1169 m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1170 E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1171 O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1172 int j;
1173 m128iAdd = _mm_set1_epi32(add_1st);
1174
1175 m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1176 m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1177 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1178 E1l = _mm_madd_epi16(m128Tmp0,
1179 _mm_load_si128((__m128i *) (transform8x8[0])));
1180 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1181 E1h = _mm_madd_epi16(m128Tmp1,
1182 _mm_load_si128((__m128i *) (transform8x8[0])));
1183 m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1184 m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1185 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1186 E2l = _mm_madd_epi16(m128Tmp2,
1187 _mm_load_si128((__m128i *) (transform8x8[1])));
1188 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1189 E2h = _mm_madd_epi16(m128Tmp3,
1190 _mm_load_si128((__m128i *) (transform8x8[1])));
1191 O0l = _mm_add_epi32(E1l, E2l);
1192 O0h = _mm_add_epi32(E1h, E2h);
1193
1194 E1l = _mm_madd_epi16(m128Tmp0,
1195 _mm_load_si128((__m128i *) (transform8x8[2])));
1196 E1h = _mm_madd_epi16(m128Tmp1,
1197 _mm_load_si128((__m128i *) (transform8x8[2])));
1198 E2l = _mm_madd_epi16(m128Tmp2,
1199 _mm_load_si128((__m128i *) (transform8x8[3])));
1200 E2h = _mm_madd_epi16(m128Tmp3,
1201 _mm_load_si128((__m128i *) (transform8x8[3])));
1202
1203 O1l = _mm_add_epi32(E1l, E2l);
1204 O1h = _mm_add_epi32(E1h, E2h);
1205
1206 E1l = _mm_madd_epi16(m128Tmp0,
1207 _mm_load_si128((__m128i *) (transform8x8[4])));
1208 E1h = _mm_madd_epi16(m128Tmp1,
1209 _mm_load_si128((__m128i *) (transform8x8[4])));
1210 E2l = _mm_madd_epi16(m128Tmp2,
1211 _mm_load_si128((__m128i *) (transform8x8[5])));
1212 E2h = _mm_madd_epi16(m128Tmp3,
1213 _mm_load_si128((__m128i *) (transform8x8[5])));
1214 O2l = _mm_add_epi32(E1l, E2l);
1215 O2h = _mm_add_epi32(E1h, E2h);
1216
1217 E1l = _mm_madd_epi16(m128Tmp0,
1218 _mm_load_si128((__m128i *) (transform8x8[6])));
1219 E1h = _mm_madd_epi16(m128Tmp1,
1220 _mm_load_si128((__m128i *) (transform8x8[6])));
1221 E2l = _mm_madd_epi16(m128Tmp2,
1222 _mm_load_si128((__m128i *) (transform8x8[7])));
1223 E2h = _mm_madd_epi16(m128Tmp3,
1224 _mm_load_si128((__m128i *) (transform8x8[7])));
1225 O3h = _mm_add_epi32(E1h, E2h);
1226 O3l = _mm_add_epi32(E1l, E2l);
1227
1228 /* ------- */
1229
1230 m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1231 m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1232 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1233 EE0l = _mm_madd_epi16(m128Tmp0,
1234 _mm_load_si128((__m128i *) (transform8x8[8])));
1235 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1236 EE0h = _mm_madd_epi16(m128Tmp1,
1237 _mm_load_si128((__m128i *) (transform8x8[8])));
1238
1239 EE1l = _mm_madd_epi16(m128Tmp0,
1240 _mm_load_si128((__m128i *) (transform8x8[9])));
1241 EE1h = _mm_madd_epi16(m128Tmp1,
1242 _mm_load_si128((__m128i *) (transform8x8[9])));
1243
1244 /* ------- */
1245
1246 m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1247 m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1248 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1249 E00l = _mm_madd_epi16(m128Tmp0,
1250 _mm_load_si128((__m128i *) (transform8x8[10])));
1251 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1252 E00h = _mm_madd_epi16(m128Tmp1,
1253 _mm_load_si128((__m128i *) (transform8x8[10])));
1254 E01l = _mm_madd_epi16(m128Tmp0,
1255 _mm_load_si128((__m128i *) (transform8x8[11])));
1256 E01h = _mm_madd_epi16(m128Tmp1,
1257 _mm_load_si128((__m128i *) (transform8x8[11])));
1258 E0l = _mm_add_epi32(EE0l, E00l);
1259 E0l = _mm_add_epi32(E0l, m128iAdd);
1260 E0h = _mm_add_epi32(EE0h, E00h);
1261 E0h = _mm_add_epi32(E0h, m128iAdd);
1262 E3l = _mm_sub_epi32(EE0l, E00l);
1263 E3l = _mm_add_epi32(E3l, m128iAdd);
1264 E3h = _mm_sub_epi32(EE0h, E00h);
1265 E3h = _mm_add_epi32(E3h, m128iAdd);
1266
1267 E1l = _mm_add_epi32(EE1l, E01l);
1268 E1l = _mm_add_epi32(E1l, m128iAdd);
1269 E1h = _mm_add_epi32(EE1h, E01h);
1270 E1h = _mm_add_epi32(E1h, m128iAdd);
1271 E2l = _mm_sub_epi32(EE1l, E01l);
1272 E2l = _mm_add_epi32(E2l, m128iAdd);
1273 E2h = _mm_sub_epi32(EE1h, E01h);
1274 E2h = _mm_add_epi32(E2h, m128iAdd);
1275 m128iS0 = _mm_packs_epi32(
1276 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1277 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1278 m128iS1 = _mm_packs_epi32(
1279 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1280 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1281 m128iS2 = _mm_packs_epi32(
1282 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1283 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1284 m128iS3 = _mm_packs_epi32(
1285 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1286 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1287 m128iS4 = _mm_packs_epi32(
1288 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1289 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1290 m128iS5 = _mm_packs_epi32(
1291 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1292 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1293 m128iS6 = _mm_packs_epi32(
1294 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1295 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1296 m128iS7 = _mm_packs_epi32(
1297 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1298 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1299 	/* Inverse matrix: transpose the intermediate results before the second pass */
1300
1301 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1302 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1303 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1304 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1305 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1306 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1307 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1308 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1309 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1310 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1311 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1312 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1313 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1314 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1315 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1316 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1317 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1318 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1319 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1320 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1321 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1322 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1323 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1324 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1325
1326 m128iAdd = _mm_set1_epi32(add_2nd);
1327
1328 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1329 E1l = _mm_madd_epi16(m128Tmp0,
1330 _mm_load_si128((__m128i *) (transform8x8[0])));
1331 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1332 E1h = _mm_madd_epi16(m128Tmp1,
1333 _mm_load_si128((__m128i *) (transform8x8[0])));
1334 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1335 E2l = _mm_madd_epi16(m128Tmp2,
1336 _mm_load_si128((__m128i *) (transform8x8[1])));
1337 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1338 E2h = _mm_madd_epi16(m128Tmp3,
1339 _mm_load_si128((__m128i *) (transform8x8[1])));
1340 O0l = _mm_add_epi32(E1l, E2l);
1341 O0h = _mm_add_epi32(E1h, E2h);
1342 E1l = _mm_madd_epi16(m128Tmp0,
1343 _mm_load_si128((__m128i *) (transform8x8[2])));
1344 E1h = _mm_madd_epi16(m128Tmp1,
1345 _mm_load_si128((__m128i *) (transform8x8[2])));
1346 E2l = _mm_madd_epi16(m128Tmp2,
1347 _mm_load_si128((__m128i *) (transform8x8[3])));
1348 E2h = _mm_madd_epi16(m128Tmp3,
1349 _mm_load_si128((__m128i *) (transform8x8[3])));
1350 O1l = _mm_add_epi32(E1l, E2l);
1351 O1h = _mm_add_epi32(E1h, E2h);
1352 E1l = _mm_madd_epi16(m128Tmp0,
1353 _mm_load_si128((__m128i *) (transform8x8[4])));
1354 E1h = _mm_madd_epi16(m128Tmp1,
1355 _mm_load_si128((__m128i *) (transform8x8[4])));
1356 E2l = _mm_madd_epi16(m128Tmp2,
1357 _mm_load_si128((__m128i *) (transform8x8[5])));
1358 E2h = _mm_madd_epi16(m128Tmp3,
1359 _mm_load_si128((__m128i *) (transform8x8[5])));
1360 O2l = _mm_add_epi32(E1l, E2l);
1361 O2h = _mm_add_epi32(E1h, E2h);
1362 E1l = _mm_madd_epi16(m128Tmp0,
1363 _mm_load_si128((__m128i *) (transform8x8[6])));
1364 E1h = _mm_madd_epi16(m128Tmp1,
1365 _mm_load_si128((__m128i *) (transform8x8[6])));
1366 E2l = _mm_madd_epi16(m128Tmp2,
1367 _mm_load_si128((__m128i *) (transform8x8[7])));
1368 E2h = _mm_madd_epi16(m128Tmp3,
1369 _mm_load_si128((__m128i *) (transform8x8[7])));
1370 O3h = _mm_add_epi32(E1h, E2h);
1371 O3l = _mm_add_epi32(E1l, E2l);
1372
1373 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1374 EE0l = _mm_madd_epi16(m128Tmp0,
1375 _mm_load_si128((__m128i *) (transform8x8[8])));
1376 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1377 EE0h = _mm_madd_epi16(m128Tmp1,
1378 _mm_load_si128((__m128i *) (transform8x8[8])));
1379 EE1l = _mm_madd_epi16(m128Tmp0,
1380 _mm_load_si128((__m128i *) (transform8x8[9])));
1381 EE1h = _mm_madd_epi16(m128Tmp1,
1382 _mm_load_si128((__m128i *) (transform8x8[9])));
1383
1384 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1385 E00l = _mm_madd_epi16(m128Tmp0,
1386 _mm_load_si128((__m128i *) (transform8x8[10])));
1387 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1388 E00h = _mm_madd_epi16(m128Tmp1,
1389 _mm_load_si128((__m128i *) (transform8x8[10])));
1390 E01l = _mm_madd_epi16(m128Tmp0,
1391 _mm_load_si128((__m128i *) (transform8x8[11])));
1392 E01h = _mm_madd_epi16(m128Tmp1,
1393 _mm_load_si128((__m128i *) (transform8x8[11])));
1394 E0l = _mm_add_epi32(EE0l, E00l);
1395 E0l = _mm_add_epi32(E0l, m128iAdd);
1396 E0h = _mm_add_epi32(EE0h, E00h);
1397 E0h = _mm_add_epi32(E0h, m128iAdd);
1398 E3l = _mm_sub_epi32(EE0l, E00l);
1399 E3l = _mm_add_epi32(E3l, m128iAdd);
1400 E3h = _mm_sub_epi32(EE0h, E00h);
1401 E3h = _mm_add_epi32(E3h, m128iAdd);
1402 E1l = _mm_add_epi32(EE1l, E01l);
1403 E1l = _mm_add_epi32(E1l, m128iAdd);
1404 E1h = _mm_add_epi32(EE1h, E01h);
1405 E1h = _mm_add_epi32(E1h, m128iAdd);
1406 E2l = _mm_sub_epi32(EE1l, E01l);
1407 E2l = _mm_add_epi32(E2l, m128iAdd);
1408 E2h = _mm_sub_epi32(EE1h, E01h);
1409 E2h = _mm_add_epi32(E2h, m128iAdd);
1410
1411 m128iS0 = _mm_packs_epi32(
1412 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1413 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1414 m128iS1 = _mm_packs_epi32(
1415 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1416 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1417 m128iS2 = _mm_packs_epi32(
1418 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1419 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1420 m128iS3 = _mm_packs_epi32(
1421 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1422 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1423 m128iS4 = _mm_packs_epi32(
1424 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1425 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1426 m128iS5 = _mm_packs_epi32(
1427 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1428 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1429 m128iS6 = _mm_packs_epi32(
1430 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1431 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1432 m128iS7 = _mm_packs_epi32(
1433 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1434 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1435
1436 _mm_store_si128((__m128i *) (src), m128iS0);
1437 _mm_store_si128((__m128i *) (src + 8), m128iS1);
1438 _mm_store_si128((__m128i *) (src + 16), m128iS2);
1439 _mm_store_si128((__m128i *) (src + 24), m128iS3);
1440 _mm_store_si128((__m128i *) (src + 32), m128iS4);
1441 _mm_store_si128((__m128i *) (src + 40), m128iS5);
1442 _mm_store_si128((__m128i *) (src + 48), m128iS6);
1443 _mm_store_si128((__m128i *) (src + 56), m128iS7);
1444
1445 j = 0;
1446 for (i = 0; i < 4; i++) {
1447 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1448 dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1449 dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1450 dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1451 dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1452 dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1453 dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1454 dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1455 j += 1;
1456 dst += stride;
1457 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1458 dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1459 dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1460 dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1461 dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1462 dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1463 dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1464 dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1465 j += 1;
1466 dst += stride;
1467 }
1468
1469 }
1470 #endif
1471
1472
1473 #if HAVE_SSE4_1
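/* 16x16 inverse transform + add for 8-bit output.
   The block is processed in two passes (the j loop), 8 columns at a time
   (the i loop). Odd input rows feed O0..O7 via transform16x16_1; even rows
   are decomposed further through transform16x16_2 and transform16x16_3 into
   the EE/E terms of the butterfly. Between passes the intermediate values
   are transposed and parked in r0..r31; after the second pass the residual
   is added to the prediction with unsigned saturation. */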
1474 void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
1475 ptrdiff_t _stride) {
1476 uint8_t shift_2nd = 12; // 20 - Bit depth
1477 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1478 int i;
1479 uint8_t *dst = (uint8_t*) _dst;
1480 ptrdiff_t stride = _stride / sizeof(uint8_t);
1481 int16_t *src = coeffs;
1482 int32_t shift;
1483 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1484 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1485 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1486 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1487 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1488 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1489 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1490 __m128i E4l, E5l, E6l, E7l;
1491 __m128i E4h, E5h, E6h, E7h;
1492 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1493 __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1494
1495
1496 /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1497 __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1498 __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1499 __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1500
1501 __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1502
1503 __m128i V00,V01, V10, V11;*/
1504
1505
1506 const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1507 const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1508 const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1509 const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1510 const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1511 const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1512 const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1513 const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1514 const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1515 const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1516 const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1517 const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1518 const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1519 const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1520 const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1521 const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1522 const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1523 const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1524 const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1525 const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1526 const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1527 const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1528 const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1529 const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1530 const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1531 const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1532 const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1533 const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1534 const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1535 const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1536 const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1537 const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1538
1539 const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1540 const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1541 const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1542 const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1543 const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1544 const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1545 const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1546 const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1547
1548 const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1549 const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1550 const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1551 const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1552
1553
1554
1555 int j;
1556 m128iS0 = _mm_load_si128((__m128i *) (src));
1557 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1558 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1559 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1560 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1561 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1562 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1563 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1564 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1565 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1566 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1567 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1568 m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1569 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1570 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1571 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1572 shift = shift_1st;
1573 m128iAdd = _mm_set1_epi32(add_1st);
1574
1575 for (j = 0; j < 2; j++) {
1576 for (i = 0; i < 16; i += 8) {
1577
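			/* Compute O0 */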
1578 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1579 E0l = _mm_madd_epi16(m128Tmp0,T00);
1580 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1581 E0h = _mm_madd_epi16(m128Tmp1,T00);
1582
1583 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1584 E1l = _mm_madd_epi16(m128Tmp2,T10);
1585 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1586 E1h = _mm_madd_epi16(m128Tmp3,T10);
1587
1588 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1589 E2l = _mm_madd_epi16(m128Tmp4,T20);
1590 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1591 E2h = _mm_madd_epi16(m128Tmp5,T20);
1592
1593 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1594 E3l = _mm_madd_epi16(m128Tmp6,T30);
1595 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1596 E3h = _mm_madd_epi16(m128Tmp7,T30);
1597
1598 O0l = _mm_add_epi32(E0l, E1l);
1599 O0l = _mm_add_epi32(O0l, E2l);
1600 O0l = _mm_add_epi32(O0l, E3l);
1601
1602 O0h = _mm_add_epi32(E0h, E1h);
1603 O0h = _mm_add_epi32(O0h, E2h);
1604 O0h = _mm_add_epi32(O0h, E3h);
1605
1606 /* Compute O1*/
1607 E0l = _mm_madd_epi16(m128Tmp0,T01);
1608 E0h = _mm_madd_epi16(m128Tmp1,T01);
1609 E1l = _mm_madd_epi16(m128Tmp2,T11);
1610 E1h = _mm_madd_epi16(m128Tmp3,T11);
1611 E2l = _mm_madd_epi16(m128Tmp4,T21);
1612 E2h = _mm_madd_epi16(m128Tmp5,T21);
1613 E3l = _mm_madd_epi16(m128Tmp6,T31);
1614 E3h = _mm_madd_epi16(m128Tmp7,T31);
1615 O1l = _mm_add_epi32(E0l, E1l);
1616 O1l = _mm_add_epi32(O1l, E2l);
1617 O1l = _mm_add_epi32(O1l, E3l);
1618 O1h = _mm_add_epi32(E0h, E1h);
1619 O1h = _mm_add_epi32(O1h, E2h);
1620 O1h = _mm_add_epi32(O1h, E3h);
1621
1622 /* Compute O2*/
1623 E0l = _mm_madd_epi16(m128Tmp0,T02);
1624 E0h = _mm_madd_epi16(m128Tmp1,T02);
1625 E1l = _mm_madd_epi16(m128Tmp2,T12);
1626 E1h = _mm_madd_epi16(m128Tmp3,T12);
1627 E2l = _mm_madd_epi16(m128Tmp4,T22);
1628 E2h = _mm_madd_epi16(m128Tmp5,T22);
1629 E3l = _mm_madd_epi16(m128Tmp6,T32);
1630 E3h = _mm_madd_epi16(m128Tmp7,T32);
1631 O2l = _mm_add_epi32(E0l, E1l);
1632 O2l = _mm_add_epi32(O2l, E2l);
1633 O2l = _mm_add_epi32(O2l, E3l);
1634
1635 O2h = _mm_add_epi32(E0h, E1h);
1636 O2h = _mm_add_epi32(O2h, E2h);
1637 O2h = _mm_add_epi32(O2h, E3h);
1638
1639 /* Compute O3*/
1640 E0l = _mm_madd_epi16(m128Tmp0,T03);
1641 E0h = _mm_madd_epi16(m128Tmp1,T03);
1642 E1l = _mm_madd_epi16(m128Tmp2,T13);
1643 E1h = _mm_madd_epi16(m128Tmp3,T13);
1644 E2l = _mm_madd_epi16(m128Tmp4,T23);
1645 E2h = _mm_madd_epi16(m128Tmp5,T23);
1646 E3l = _mm_madd_epi16(m128Tmp6,T33);
1647 E3h = _mm_madd_epi16(m128Tmp7,T33);
1648
1649 O3l = _mm_add_epi32(E0l, E1l);
1650 O3l = _mm_add_epi32(O3l, E2l);
1651 O3l = _mm_add_epi32(O3l, E3l);
1652
1653 O3h = _mm_add_epi32(E0h, E1h);
1654 O3h = _mm_add_epi32(O3h, E2h);
1655 O3h = _mm_add_epi32(O3h, E3h);
1656
1657 /* Compute O4*/
1658
1659 E0l = _mm_madd_epi16(m128Tmp0,T04);
1660 E0h = _mm_madd_epi16(m128Tmp1,T04);
1661 E1l = _mm_madd_epi16(m128Tmp2,T14);
1662 E1h = _mm_madd_epi16(m128Tmp3,T14);
1663 E2l = _mm_madd_epi16(m128Tmp4,T24);
1664 E2h = _mm_madd_epi16(m128Tmp5,T24);
1665 E3l = _mm_madd_epi16(m128Tmp6,T34);
1666 E3h = _mm_madd_epi16(m128Tmp7,T34);
1667
1668 O4l = _mm_add_epi32(E0l, E1l);
1669 O4l = _mm_add_epi32(O4l, E2l);
1670 O4l = _mm_add_epi32(O4l, E3l);
1671
1672 O4h = _mm_add_epi32(E0h, E1h);
1673 O4h = _mm_add_epi32(O4h, E2h);
1674 O4h = _mm_add_epi32(O4h, E3h);
1675
1676 /* Compute O5*/
1677 E0l = _mm_madd_epi16(m128Tmp0,T05);
1678 E0h = _mm_madd_epi16(m128Tmp1,T05);
1679 E1l = _mm_madd_epi16(m128Tmp2,T15);
1680 E1h = _mm_madd_epi16(m128Tmp3,T15);
1681 E2l = _mm_madd_epi16(m128Tmp4,T25);
1682 E2h = _mm_madd_epi16(m128Tmp5,T25);
1683 E3l = _mm_madd_epi16(m128Tmp6,T35);
1684 E3h = _mm_madd_epi16(m128Tmp7,T35);
1685
1686 O5l = _mm_add_epi32(E0l, E1l);
1687 O5l = _mm_add_epi32(O5l, E2l);
1688 O5l = _mm_add_epi32(O5l, E3l);
1689
1690 O5h = _mm_add_epi32(E0h, E1h);
1691 O5h = _mm_add_epi32(O5h, E2h);
1692 O5h = _mm_add_epi32(O5h, E3h);
1693
1694 /* Compute O6*/
1695
1696 E0l = _mm_madd_epi16(m128Tmp0,T06);
1697 E0h = _mm_madd_epi16(m128Tmp1,T06);
1698 E1l = _mm_madd_epi16(m128Tmp2,T16);
1699 E1h = _mm_madd_epi16(m128Tmp3,T16);
1700 E2l = _mm_madd_epi16(m128Tmp4,T26);
1701 E2h = _mm_madd_epi16(m128Tmp5,T26);
1702 E3l = _mm_madd_epi16(m128Tmp6,T36);
1703 E3h = _mm_madd_epi16(m128Tmp7,T36);
1704
1705 O6l = _mm_add_epi32(E0l, E1l);
1706 O6l = _mm_add_epi32(O6l, E2l);
1707 O6l = _mm_add_epi32(O6l, E3l);
1708
1709 O6h = _mm_add_epi32(E0h, E1h);
1710 O6h = _mm_add_epi32(O6h, E2h);
1711 O6h = _mm_add_epi32(O6h, E3h);
1712
1713 /* Compute O7*/
1714
1715 E0l = _mm_madd_epi16(m128Tmp0,T07);
1716 E0h = _mm_madd_epi16(m128Tmp1,T07);
1717 E1l = _mm_madd_epi16(m128Tmp2,T17);
1718 E1h = _mm_madd_epi16(m128Tmp3,T17);
1719 E2l = _mm_madd_epi16(m128Tmp4,T27);
1720 E2h = _mm_madd_epi16(m128Tmp5,T27);
1721 E3l = _mm_madd_epi16(m128Tmp6,T37);
1722 E3h = _mm_madd_epi16(m128Tmp7,T37);
1723
1724 O7l = _mm_add_epi32(E0l, E1l);
1725 O7l = _mm_add_epi32(O7l, E2l);
1726 O7l = _mm_add_epi32(O7l, E3l);
1727
1728 O7h = _mm_add_epi32(E0h, E1h);
1729 O7h = _mm_add_epi32(O7h, E2h);
1730 O7h = _mm_add_epi32(O7h, E3h);
1731
1732 /* Compute E0 */
1733
1734
1735
1736 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1737 E0l = _mm_madd_epi16(m128Tmp0,U00);
1738 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1739 E0h = _mm_madd_epi16(m128Tmp1,U00);
1740
1741 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1742 E0l = _mm_add_epi32(E0l,
1743 _mm_madd_epi16(m128Tmp2,U10));
1744 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1745 E0h = _mm_add_epi32(E0h,
1746 _mm_madd_epi16(m128Tmp3,U10));
1747
1748 /* Compute E1 */
1749 E1l = _mm_madd_epi16(m128Tmp0,U01);
1750 E1h = _mm_madd_epi16(m128Tmp1,U01);
1751 E1l = _mm_add_epi32(E1l,
1752 _mm_madd_epi16(m128Tmp2,U11));
1753 E1h = _mm_add_epi32(E1h,
1754 _mm_madd_epi16(m128Tmp3,U11));
1755
1756 /* Compute E2 */
1757 E2l = _mm_madd_epi16(m128Tmp0,U02);
1758 E2h = _mm_madd_epi16(m128Tmp1,U02);
1759 E2l = _mm_add_epi32(E2l,
1760 _mm_madd_epi16(m128Tmp2,U12));
1761 E2h = _mm_add_epi32(E2h,
1762 _mm_madd_epi16(m128Tmp3,U12));
1763 /* Compute E3 */
1764 E3l = _mm_madd_epi16(m128Tmp0,U03);
1765 E3h = _mm_madd_epi16(m128Tmp1,U03);
1766 E3l = _mm_add_epi32(E3l,
1767 _mm_madd_epi16(m128Tmp2,U13));
1768 E3h = _mm_add_epi32(E3h,
1769 _mm_madd_epi16(m128Tmp3,U13));
1770
1771 /* Compute EE0 and EEE */
1772
1773 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1774 E00l = _mm_madd_epi16(m128Tmp0,V00);
1775 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1776 E00h = _mm_madd_epi16(m128Tmp1,V00);
1777
1778 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1779 EE0l = _mm_madd_epi16(m128Tmp2,V10);
1780 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1781 EE0h = _mm_madd_epi16(m128Tmp3,V10);
1782
1783 E01l = _mm_madd_epi16(m128Tmp0,V01);
1784 E01h = _mm_madd_epi16(m128Tmp1,V01);
1785
1786 EE1l = _mm_madd_epi16(m128Tmp2,V11);
1787 EE1h = _mm_madd_epi16(m128Tmp3,V11);
1788
1789 /* Compute EE */
1790 EE2l = _mm_sub_epi32(EE1l, E01l);
1791 EE3l = _mm_sub_epi32(EE0l, E00l);
1792 EE2h = _mm_sub_epi32(EE1h, E01h);
1793 EE3h = _mm_sub_epi32(EE0h, E00h);
1794
1795 EE0l = _mm_add_epi32(EE0l, E00l);
1796 EE1l = _mm_add_epi32(EE1l, E01l);
1797 EE0h = _mm_add_epi32(EE0h, E00h);
1798 EE1h = _mm_add_epi32(EE1h, E01h);
1799
1800 /* Compute E */
1801
1802 E4l = _mm_sub_epi32(EE3l, E3l);
1803 E4l = _mm_add_epi32(E4l, m128iAdd);
1804
1805 E5l = _mm_sub_epi32(EE2l, E2l);
1806 E5l = _mm_add_epi32(E5l, m128iAdd);
1807
1808 E6l = _mm_sub_epi32(EE1l, E1l);
1809 E6l = _mm_add_epi32(E6l, m128iAdd);
1810
1811 E7l = _mm_sub_epi32(EE0l, E0l);
1812 E7l = _mm_add_epi32(E7l, m128iAdd);
1813
1814 E4h = _mm_sub_epi32(EE3h, E3h);
1815 E4h = _mm_add_epi32(E4h, m128iAdd);
1816
1817 E5h = _mm_sub_epi32(EE2h, E2h);
1818 E5h = _mm_add_epi32(E5h, m128iAdd);
1819
1820 E6h = _mm_sub_epi32(EE1h, E1h);
1821 E6h = _mm_add_epi32(E6h, m128iAdd);
1822
1823 E7h = _mm_sub_epi32(EE0h, E0h);
1824 E7h = _mm_add_epi32(E7h, m128iAdd);
1825
1826 E0l = _mm_add_epi32(EE0l, E0l);
1827 E0l = _mm_add_epi32(E0l, m128iAdd);
1828
1829 E1l = _mm_add_epi32(EE1l, E1l);
1830 E1l = _mm_add_epi32(E1l, m128iAdd);
1831
1832 E2l = _mm_add_epi32(EE2l, E2l);
1833 E2l = _mm_add_epi32(E2l, m128iAdd);
1834
1835 E3l = _mm_add_epi32(EE3l, E3l);
1836 E3l = _mm_add_epi32(E3l, m128iAdd);
1837
1838 E0h = _mm_add_epi32(EE0h, E0h);
1839 E0h = _mm_add_epi32(E0h, m128iAdd);
1840
1841 E1h = _mm_add_epi32(EE1h, E1h);
1842 E1h = _mm_add_epi32(E1h, m128iAdd);
1843
1844 E2h = _mm_add_epi32(EE2h, E2h);
1845 E2h = _mm_add_epi32(E2h, m128iAdd);
1846
1847 E3h = _mm_add_epi32(EE3h, E3h);
1848 E3h = _mm_add_epi32(E3h, m128iAdd);
1849
1850 m128iS0 = _mm_packs_epi32(
1851 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1852 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1853 m128iS1 = _mm_packs_epi32(
1854 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1855 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1856 m128iS2 = _mm_packs_epi32(
1857 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1858 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1859 m128iS3 = _mm_packs_epi32(
1860 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1861 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1862
1863 m128iS4 = _mm_packs_epi32(
1864 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1865 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1866 m128iS5 = _mm_packs_epi32(
1867 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1868 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1869 m128iS6 = _mm_packs_epi32(
1870 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1871 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1872 m128iS7 = _mm_packs_epi32(
1873 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1874 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1875
1876 m128iS15 = _mm_packs_epi32(
1877 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1878 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1879 m128iS14 = _mm_packs_epi32(
1880 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1881 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1882 m128iS13 = _mm_packs_epi32(
1883 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1884 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1885 m128iS12 = _mm_packs_epi32(
1886 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1887 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1888
1889 m128iS11 = _mm_packs_epi32(
1890 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1891 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1892 m128iS10 = _mm_packs_epi32(
1893 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1894 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1895 m128iS9 = _mm_packs_epi32(
1896 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1897 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1898 m128iS8 = _mm_packs_epi32(
1899 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1900 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1901
1902
1903
1904 if (!j) { //first pass
1905
1906 /* Inverse the matrix */
1907 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1908 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1909 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1910 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1911 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1912 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1913 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1914 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1915
1916 E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1917 E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1918 E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1919 E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1920 E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1921 E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1922 E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1923 E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1924
1925 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1926 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1927 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1928 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1929
1930 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1931 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1932 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1933 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1934
1935 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1936 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1937 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1938 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1939
1940 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1941 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1942 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1943 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1944
1945 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1946 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1947 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1948 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1949
1950 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1951 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1952 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1953 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1954
1955 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1956 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1957 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1958 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1959
1960 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1961 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1962 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1963 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1964
1965 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1966 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1967 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1968 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1969
1970 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1971 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1972 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1973 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1974
1975 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1976 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1977 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1978 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1979
1980 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1981 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1982 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1983 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1984
1985 if (!i) {
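					// left half of this pass done: stash the transposed
					// results and load the right half of the coefficient rows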
1986
1987 r0= m128iS0; //0
1988 r1= m128iS1; //16
1989 r2= m128iS2; //32
1990 r3= m128iS3; //48
1991 r4= m128iS4; //64
1992 r5= m128iS5; //80
1993 r6= m128iS6; //96
1994 r7= m128iS7; //112
1995 r8= m128iS8; //128
1996 r9= m128iS9; //144
1997 r10= m128iS10; //160
1998 r11= m128iS11; //176
1999 r12= m128iS12; //192
2000 r13= m128iS13; //208
2001 r14= m128iS14; //224
2002 r15= m128iS15; //240
2003
2004
2005
2006 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2007 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2008 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2009 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2010 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2011 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2012 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2013 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2014 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2015 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2016 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2017 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2018 m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2019 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2020 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2021 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2022 } else {
2023
2024 r16= m128iS0; //8
2025 r17= m128iS1; //24
2026 r18= m128iS2; //40
2027 r19= m128iS3; //56
2028 r20= m128iS4; //72
2029 r21= m128iS5; //88
2030 r22= m128iS6; //104
2031 r23= m128iS7; //120
2032 r24= m128iS8; //136
2033 r25= m128iS9; //152
2034 r26= m128iS10; //168
2035 r27= m128iS11; //184
2036 r28= m128iS12; //200
2037 r29= m128iS13; //216
2038 r30= m128iS14; //232
2039 r31= m128iS15; //248
2040
2041 					//prepare the next iteration:
2042
2043 m128iS0= r0;
2044 m128iS1= r2;
2045 m128iS2= r4;
2046 m128iS3= r6;
2047 m128iS4= r8;
2048 m128iS5= r10;
2049 m128iS6= r12;
2050 m128iS7= r14;
2051 m128iS8= r16;
2052 m128iS9= r18;
2053 m128iS10=r20;
2054 m128iS11=r22;
2055 m128iS12=r24;
2056 m128iS13=r26;
2057 m128iS14=r28;
2058 m128iS15=r30;
2059
2060 shift = shift_2nd;
2061 m128iAdd = _mm_set1_epi32(add_2nd);
2062 }
2063
2064 } else {
2065
2066 				//transpose the half matrix:
2067 				//instead of one register holding a half-column,
2068 				//one register now holds a half-row.
2069 E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2070 E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2071 E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2072 E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2073 E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2074 E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2075 E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2076 E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2077
2078 O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2079 O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2080 O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2081 O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2082 O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2083 O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2084 O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2085 O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2086
2087
2088 m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2089 m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2090
2091 m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2092 m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2093
2094 r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row
2095 r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row
2096
2097
2098 r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row
2099 				r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row
2100
2101 m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2102 m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2103 m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2104 m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2105
2106
2107 r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2108 r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2109
2110 r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2111 r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2112
2113 m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2114 m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2115 m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2116 m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2117
2118 r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2119 r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2120
2121
2122 r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2123 r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2124
2125 m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2126 m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2127 m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2128 m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2129
2130 r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2131 r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2132
2133
2134 r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2135 r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2136
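				// second pass done for these 8 output rows: add the residual
				// to the 8-bit prediction and store 16 bytes per row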
2137 dst = (uint8_t*) (_dst + (i*stride));
2138 m128Tmp0= _mm_setzero_si128();
2139 m128Tmp1= _mm_load_si128((__m128i*)dst);
2140 m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2141 m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2142 m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2143 m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2144 m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2145 m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2146 E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2147
2148
2149 r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2150 r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2151 r0= _mm_packus_epi16(r0,r2);
2152
2153
2154
2155
2156 r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2157 r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2158 r4= _mm_packus_epi16(r4,r6);
2159
2160
2161 r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2162 r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2163 r8= _mm_packus_epi16(r8,r10);
2164
2165
2166 r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2167 r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2168 r12= _mm_packus_epi16(r12,r14);
2169
2170
2171 r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2172 r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2173 r16= _mm_packus_epi16(r16,r18);
2174
2175
2176 r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2177 r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2178 r20= _mm_packus_epi16(r20,r22);
2179
2180
2181 r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2182 r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2183 r24= _mm_packus_epi16(r24,r26);
2184
2185
2186
2187 r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2188 r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2189 r28= _mm_packus_epi16(r28,r30);
2190
2191 _mm_store_si128((__m128i*)dst,r0);
2192 _mm_store_si128((__m128i*)(dst+stride),r4);
2193 _mm_store_si128((__m128i*)(dst+2*stride),r8);
2194 _mm_store_si128((__m128i*)(dst+3*stride),r12);
2195 _mm_store_si128((__m128i*)(dst+4*stride),r16);
2196 _mm_store_si128((__m128i*)(dst+5*stride),r20);
2197 _mm_store_si128((__m128i*)(dst+6*stride),r24);
2198 _mm_store_si128((__m128i*)(dst+7*stride),r28);
2199
2200
2201
2202 if (!i) {
2203                     //first half done; move the remaining transposed rows into the source registers for the second half
2204
2205
2206 m128iS0= r1;
2207 m128iS1= r3;
2208 m128iS2= r5;
2209 m128iS3= r7;
2210 m128iS4= r9;
2211 m128iS5= r11;
2212 m128iS6= r13;
2213 m128iS7= r15;
2214 m128iS8= r17;
2215 m128iS9= r19;
2216 m128iS10=r21;
2217 m128iS11=r23;
2218 m128iS12=r25;
2219 m128iS13=r27;
2220 m128iS14=r29;
2221 m128iS15=r31;
2222 }
2223 }
2224 }
2225 }
2226 }
2227 #endif
2228
2229
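/* NOTE: the 10-bit 16x16 variant below is wrapped in "#if 0" in this import,
   so it is present in the source but not compiled. */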
2230 #if 0
2231 void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
2232 ptrdiff_t _stride) {
2233 int i;
2234 uint16_t *dst = (uint16_t*) _dst;
2235 ptrdiff_t stride = _stride / 2;
2236 int16_t *src = coeffs;
2237 int32_t shift;
2238 uint8_t shift_2nd = 10; //20 - bit depth
2239     uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
2240 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2241 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2242 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2243 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2244 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2245 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2246 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2247 __m128i E4l, E5l, E6l, E7l;
2248 __m128i E4h, E5h, E6h, E7h;
2249 int j;
2250 m128iS0 = _mm_load_si128((__m128i *) (src));
2251 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2252 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2253 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2254 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2255 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2256 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2257 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2258 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2259 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2260 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2261 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2262 m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2263 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2264 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2265 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2266 shift = shift_1st;
2267 m128iAdd = _mm_set1_epi32(add_1st);
2268
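    /* Two passes over the 16x16 block: the first pass (j == 0) applies the
       1-D inverse transform to the columns and transposes the result back
       into the coefficient buffer; the second pass (j == 1) transforms the
       transposed data and adds the clipped result to the 10-bit destination. */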
2269 for (j = 0; j < 2; j++) {
2270 for (i = 0; i < 16; i += 8) {
2271
2272 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2273 E0l = _mm_madd_epi16(m128Tmp0,
2274 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2275 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2276 E0h = _mm_madd_epi16(m128Tmp1,
2277 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2278
2279 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2280 E1l = _mm_madd_epi16(m128Tmp2,
2281 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2282 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2283 E1h = _mm_madd_epi16(m128Tmp3,
2284 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2285
2286 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2287 E2l = _mm_madd_epi16(m128Tmp4,
2288 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2289 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2290 E2h = _mm_madd_epi16(m128Tmp5,
2291 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2292
2293 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2294 E3l = _mm_madd_epi16(m128Tmp6,
2295 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2296 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2297 E3h = _mm_madd_epi16(m128Tmp7,
2298 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2299
2300 O0l = _mm_add_epi32(E0l, E1l);
2301 O0l = _mm_add_epi32(O0l, E2l);
2302 O0l = _mm_add_epi32(O0l, E3l);
2303
2304 O0h = _mm_add_epi32(E0h, E1h);
2305 O0h = _mm_add_epi32(O0h, E2h);
2306 O0h = _mm_add_epi32(O0h, E3h);
2307
2308 /* Compute O1*/
2309 E0l = _mm_madd_epi16(m128Tmp0,
2310 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2311 E0h = _mm_madd_epi16(m128Tmp1,
2312 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2313 E1l = _mm_madd_epi16(m128Tmp2,
2314 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2315 E1h = _mm_madd_epi16(m128Tmp3,
2316 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2317 E2l = _mm_madd_epi16(m128Tmp4,
2318 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2319 E2h = _mm_madd_epi16(m128Tmp5,
2320 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2321 E3l = _mm_madd_epi16(m128Tmp6,
2322 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2323 E3h = _mm_madd_epi16(m128Tmp7,
2324 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2325 O1l = _mm_add_epi32(E0l, E1l);
2326 O1l = _mm_add_epi32(O1l, E2l);
2327 O1l = _mm_add_epi32(O1l, E3l);
2328 O1h = _mm_add_epi32(E0h, E1h);
2329 O1h = _mm_add_epi32(O1h, E2h);
2330 O1h = _mm_add_epi32(O1h, E3h);
2331
2332 /* Compute O2*/
2333 E0l = _mm_madd_epi16(m128Tmp0,
2334 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2335 E0h = _mm_madd_epi16(m128Tmp1,
2336 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2337 E1l = _mm_madd_epi16(m128Tmp2,
2338 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2339 E1h = _mm_madd_epi16(m128Tmp3,
2340 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2341 E2l = _mm_madd_epi16(m128Tmp4,
2342 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2343 E2h = _mm_madd_epi16(m128Tmp5,
2344 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2345 E3l = _mm_madd_epi16(m128Tmp6,
2346 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2347 E3h = _mm_madd_epi16(m128Tmp7,
2348 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2349 O2l = _mm_add_epi32(E0l, E1l);
2350 O2l = _mm_add_epi32(O2l, E2l);
2351 O2l = _mm_add_epi32(O2l, E3l);
2352
2353 O2h = _mm_add_epi32(E0h, E1h);
2354 O2h = _mm_add_epi32(O2h, E2h);
2355 O2h = _mm_add_epi32(O2h, E3h);
2356
2357 /* Compute O3*/
2358 E0l = _mm_madd_epi16(m128Tmp0,
2359 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2360 E0h = _mm_madd_epi16(m128Tmp1,
2361 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2362 E1l = _mm_madd_epi16(m128Tmp2,
2363 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2364 E1h = _mm_madd_epi16(m128Tmp3,
2365 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2366 E2l = _mm_madd_epi16(m128Tmp4,
2367 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2368 E2h = _mm_madd_epi16(m128Tmp5,
2369 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2370 E3l = _mm_madd_epi16(m128Tmp6,
2371 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2372 E3h = _mm_madd_epi16(m128Tmp7,
2373 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2374
2375 O3l = _mm_add_epi32(E0l, E1l);
2376 O3l = _mm_add_epi32(O3l, E2l);
2377 O3l = _mm_add_epi32(O3l, E3l);
2378
2379 O3h = _mm_add_epi32(E0h, E1h);
2380 O3h = _mm_add_epi32(O3h, E2h);
2381 O3h = _mm_add_epi32(O3h, E3h);
2382
2383 /* Compute O4*/
2384
2385 E0l = _mm_madd_epi16(m128Tmp0,
2386 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2387 E0h = _mm_madd_epi16(m128Tmp1,
2388 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2389 E1l = _mm_madd_epi16(m128Tmp2,
2390 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2391 E1h = _mm_madd_epi16(m128Tmp3,
2392 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2393 E2l = _mm_madd_epi16(m128Tmp4,
2394 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2395 E2h = _mm_madd_epi16(m128Tmp5,
2396 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2397 E3l = _mm_madd_epi16(m128Tmp6,
2398 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2399 E3h = _mm_madd_epi16(m128Tmp7,
2400 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2401
2402 O4l = _mm_add_epi32(E0l, E1l);
2403 O4l = _mm_add_epi32(O4l, E2l);
2404 O4l = _mm_add_epi32(O4l, E3l);
2405
2406 O4h = _mm_add_epi32(E0h, E1h);
2407 O4h = _mm_add_epi32(O4h, E2h);
2408 O4h = _mm_add_epi32(O4h, E3h);
2409
2410 /* Compute O5*/
2411 E0l = _mm_madd_epi16(m128Tmp0,
2412 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2413 E0h = _mm_madd_epi16(m128Tmp1,
2414 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2415 E1l = _mm_madd_epi16(m128Tmp2,
2416 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2417 E1h = _mm_madd_epi16(m128Tmp3,
2418 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2419 E2l = _mm_madd_epi16(m128Tmp4,
2420 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2421 E2h = _mm_madd_epi16(m128Tmp5,
2422 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2423 E3l = _mm_madd_epi16(m128Tmp6,
2424 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2425 E3h = _mm_madd_epi16(m128Tmp7,
2426 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2427
2428 O5l = _mm_add_epi32(E0l, E1l);
2429 O5l = _mm_add_epi32(O5l, E2l);
2430 O5l = _mm_add_epi32(O5l, E3l);
2431
2432 O5h = _mm_add_epi32(E0h, E1h);
2433 O5h = _mm_add_epi32(O5h, E2h);
2434 O5h = _mm_add_epi32(O5h, E3h);
2435
2436 /* Compute O6*/
2437
2438 E0l = _mm_madd_epi16(m128Tmp0,
2439 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2440 E0h = _mm_madd_epi16(m128Tmp1,
2441 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2442 E1l = _mm_madd_epi16(m128Tmp2,
2443 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2444 E1h = _mm_madd_epi16(m128Tmp3,
2445 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2446 E2l = _mm_madd_epi16(m128Tmp4,
2447 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2448 E2h = _mm_madd_epi16(m128Tmp5,
2449 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2450 E3l = _mm_madd_epi16(m128Tmp6,
2451 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2452 E3h = _mm_madd_epi16(m128Tmp7,
2453 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2454
2455 O6l = _mm_add_epi32(E0l, E1l);
2456 O6l = _mm_add_epi32(O6l, E2l);
2457 O6l = _mm_add_epi32(O6l, E3l);
2458
2459 O6h = _mm_add_epi32(E0h, E1h);
2460 O6h = _mm_add_epi32(O6h, E2h);
2461 O6h = _mm_add_epi32(O6h, E3h);
2462
2463 /* Compute O7*/
2464
2465 E0l = _mm_madd_epi16(m128Tmp0,
2466 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2467 E0h = _mm_madd_epi16(m128Tmp1,
2468 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2469 E1l = _mm_madd_epi16(m128Tmp2,
2470 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2471 E1h = _mm_madd_epi16(m128Tmp3,
2472 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2473 E2l = _mm_madd_epi16(m128Tmp4,
2474 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2475 E2h = _mm_madd_epi16(m128Tmp5,
2476 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2477 E3l = _mm_madd_epi16(m128Tmp6,
2478 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2479 E3h = _mm_madd_epi16(m128Tmp7,
2480 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2481
2482 O7l = _mm_add_epi32(E0l, E1l);
2483 O7l = _mm_add_epi32(O7l, E2l);
2484 O7l = _mm_add_epi32(O7l, E3l);
2485
2486 O7h = _mm_add_epi32(E0h, E1h);
2487 O7h = _mm_add_epi32(O7h, E2h);
2488 O7h = _mm_add_epi32(O7h, E3h);
2489
2490 /* Compute E0 */
2491
2492 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2493 E0l = _mm_madd_epi16(m128Tmp0,
2494 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2495 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2496 E0h = _mm_madd_epi16(m128Tmp1,
2497 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2498
2499 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2500 E0l = _mm_add_epi32(E0l,
2501 _mm_madd_epi16(m128Tmp2,
2502 _mm_load_si128(
2503 (__m128i *) (transform16x16_2[1][0]))));
2504 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2505 E0h = _mm_add_epi32(E0h,
2506 _mm_madd_epi16(m128Tmp3,
2507 _mm_load_si128(
2508 (__m128i *) (transform16x16_2[1][0]))));
2509
2510 /* Compute E1 */
2511 E1l = _mm_madd_epi16(m128Tmp0,
2512 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2513 E1h = _mm_madd_epi16(m128Tmp1,
2514 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2515 E1l = _mm_add_epi32(E1l,
2516 _mm_madd_epi16(m128Tmp2,
2517 _mm_load_si128(
2518 (__m128i *) (transform16x16_2[1][1]))));
2519 E1h = _mm_add_epi32(E1h,
2520 _mm_madd_epi16(m128Tmp3,
2521 _mm_load_si128(
2522 (__m128i *) (transform16x16_2[1][1]))));
2523
2524 /* Compute E2 */
2525 E2l = _mm_madd_epi16(m128Tmp0,
2526 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2527 E2h = _mm_madd_epi16(m128Tmp1,
2528 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2529 E2l = _mm_add_epi32(E2l,
2530 _mm_madd_epi16(m128Tmp2,
2531 _mm_load_si128(
2532 (__m128i *) (transform16x16_2[1][2]))));
2533 E2h = _mm_add_epi32(E2h,
2534 _mm_madd_epi16(m128Tmp3,
2535 _mm_load_si128(
2536 (__m128i *) (transform16x16_2[1][2]))));
2537 /* Compute E3 */
2538 E3l = _mm_madd_epi16(m128Tmp0,
2539 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2540 E3h = _mm_madd_epi16(m128Tmp1,
2541 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2542 E3l = _mm_add_epi32(E3l,
2543 _mm_madd_epi16(m128Tmp2,
2544 _mm_load_si128(
2545 (__m128i *) (transform16x16_2[1][3]))));
2546 E3h = _mm_add_epi32(E3h,
2547 _mm_madd_epi16(m128Tmp3,
2548 _mm_load_si128(
2549 (__m128i *) (transform16x16_2[1][3]))));
2550
2551 /* Compute EE0 and EEE */
2552
2553 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2554 E00l = _mm_madd_epi16(m128Tmp0,
2555 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2556 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2557 E00h = _mm_madd_epi16(m128Tmp1,
2558 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2559
2560 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2561 EE0l = _mm_madd_epi16(m128Tmp2,
2562 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2563 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2564 EE0h = _mm_madd_epi16(m128Tmp3,
2565 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2566
2567 E01l = _mm_madd_epi16(m128Tmp0,
2568 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2569 E01h = _mm_madd_epi16(m128Tmp1,
2570 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2571
2572 EE1l = _mm_madd_epi16(m128Tmp2,
2573 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2574 EE1h = _mm_madd_epi16(m128Tmp3,
2575 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2576
2577 /* Compute EE */
2578 EE2l = _mm_sub_epi32(EE1l, E01l);
2579 EE3l = _mm_sub_epi32(EE0l, E00l);
2580 EE2h = _mm_sub_epi32(EE1h, E01h);
2581 EE3h = _mm_sub_epi32(EE0h, E00h);
2582
2583 EE0l = _mm_add_epi32(EE0l, E00l);
2584 EE1l = _mm_add_epi32(EE1l, E01l);
2585 EE0h = _mm_add_epi32(EE0h, E00h);
2586 EE1h = _mm_add_epi32(EE1h, E01h);
2587
2588 /* Compute E */
2589
2590 E4l = _mm_sub_epi32(EE3l, E3l);
2591 E4l = _mm_add_epi32(E4l, m128iAdd);
2592
2593 E5l = _mm_sub_epi32(EE2l, E2l);
2594 E5l = _mm_add_epi32(E5l, m128iAdd);
2595
2596 E6l = _mm_sub_epi32(EE1l, E1l);
2597 E6l = _mm_add_epi32(E6l, m128iAdd);
2598
2599 E7l = _mm_sub_epi32(EE0l, E0l);
2600 E7l = _mm_add_epi32(E7l, m128iAdd);
2601
2602 E4h = _mm_sub_epi32(EE3h, E3h);
2603 E4h = _mm_add_epi32(E4h, m128iAdd);
2604
2605 E5h = _mm_sub_epi32(EE2h, E2h);
2606 E5h = _mm_add_epi32(E5h, m128iAdd);
2607
2608 E6h = _mm_sub_epi32(EE1h, E1h);
2609 E6h = _mm_add_epi32(E6h, m128iAdd);
2610
2611 E7h = _mm_sub_epi32(EE0h, E0h);
2612 E7h = _mm_add_epi32(E7h, m128iAdd);
2613
2614 E0l = _mm_add_epi32(EE0l, E0l);
2615 E0l = _mm_add_epi32(E0l, m128iAdd);
2616
2617 E1l = _mm_add_epi32(EE1l, E1l);
2618 E1l = _mm_add_epi32(E1l, m128iAdd);
2619
2620 E2l = _mm_add_epi32(EE2l, E2l);
2621 E2l = _mm_add_epi32(E2l, m128iAdd);
2622
2623 E3l = _mm_add_epi32(EE3l, E3l);
2624 E3l = _mm_add_epi32(E3l, m128iAdd);
2625
2626 E0h = _mm_add_epi32(EE0h, E0h);
2627 E0h = _mm_add_epi32(E0h, m128iAdd);
2628
2629 E1h = _mm_add_epi32(EE1h, E1h);
2630 E1h = _mm_add_epi32(E1h, m128iAdd);
2631
2632 E2h = _mm_add_epi32(EE2h, E2h);
2633 E2h = _mm_add_epi32(E2h, m128iAdd);
2634
2635 E3h = _mm_add_epi32(EE3h, E3h);
2636 E3h = _mm_add_epi32(E3h, m128iAdd);
2637
2638 m128iS0 = _mm_packs_epi32(
2639 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2640 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2641 m128iS1 = _mm_packs_epi32(
2642 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2643 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2644 m128iS2 = _mm_packs_epi32(
2645 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2646 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2647 m128iS3 = _mm_packs_epi32(
2648 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2649 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2650
2651 m128iS4 = _mm_packs_epi32(
2652 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2653 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2654 m128iS5 = _mm_packs_epi32(
2655 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2656 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2657 m128iS6 = _mm_packs_epi32(
2658 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2659 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2660 m128iS7 = _mm_packs_epi32(
2661 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2662 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2663
2664 m128iS15 = _mm_packs_epi32(
2665 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2666 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2667 m128iS14 = _mm_packs_epi32(
2668 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2669 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2670 m128iS13 = _mm_packs_epi32(
2671 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2672 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2673 m128iS12 = _mm_packs_epi32(
2674 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2675 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2676
2677 m128iS11 = _mm_packs_epi32(
2678 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2679 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2680 m128iS10 = _mm_packs_epi32(
2681 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2682 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2683 m128iS9 = _mm_packs_epi32(
2684 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2685 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2686 m128iS8 = _mm_packs_epi32(
2687 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2688 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2689
2690 if (!j) {
2691                 /* Transpose the matrix */
2692 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2693 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2694 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2695 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2696 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2697 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2698 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2699 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2700
2701 O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2702 O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2703 O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2704 O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2705 O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2706 O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2707 O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2708 O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2709
2710 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2711 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2712 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2713 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2714
2715 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2716 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2717 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2718 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2719
2720 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2721 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2722 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2723 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2724
2725 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2726 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2727 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2728 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2729
2730 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2731 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2732 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2733 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2734
2735 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2736 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2737 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2738 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2739
2740 m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2741 m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2742 m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2743 m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2744
2745 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2746 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2747 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2748 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2749
2750 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2751 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2752 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2753 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2754
2755 m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2756 m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2757 m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2758 m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2759
2760 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2761 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2762 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2763 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2764
2765 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2766 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2767 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2768 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2769
2770                 /* store the transposed block back into the coefficient buffer */
2771 _mm_store_si128((__m128i *) (src + i), m128iS0);
2772 _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2773 _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2774 _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2775 _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2776 _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2777 _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2778 _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2779 _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2780 _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2781 _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2782 _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2783 _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2784 _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2785 _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2786 _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2787
2788 if (!i) {
2789 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2790 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2791 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2792 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2793 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2794 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2795 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2796 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2797 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2798 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2799 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2800 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2801 m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2802 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2803 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2804 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2805 } else {
2806 m128iS0 = _mm_load_si128((__m128i *) (src));
2807 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2808 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2809 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2810 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2811 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2812 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2813 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2814 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2815 m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2816 m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2817 m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2818 m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2819 m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2820 m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2821 m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2822 shift = shift_2nd;
2823 m128iAdd = _mm_set1_epi32(add_2nd);
2824 }
2825
2826 } else {
2827 int k, m = 0;
2828 _mm_storeu_si128((__m128i *) (src), m128iS0);
2829 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2830 _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2831 _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2832 _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2833 _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2834 _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2835 _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2836 _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2837 _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2838 _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2839 _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2840 _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2841 _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2842 _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2843 _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2844 dst = (uint16_t*) _dst + (i * stride);
2845
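                /* Scalar tail of the second pass: add one 8x16 slice of the
                   residual to the destination and clip every sample to the
                   10-bit range with av_clip_uintp2(). */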
2846 for (k = 0; k < 8; k++) {
2847 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2848 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2849 dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2850 dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2851 dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2852 dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2853 dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2854 dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2855
2856 dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2857 dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2858 dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2859 dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2860 dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2861 dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2862 dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2863 dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2864 m += 1;
2865 dst += stride;
2866 }
2867 if (!i) {
2868 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2869 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2870 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2871 m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2872 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2873 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2874 m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2875 m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2876 m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2877 m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2878 m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2879 m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2880 m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2881 m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2882 m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2883 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2884 }
2885 }
2886 }
2887 }
2888
2889 }
2890 #endif
2891
2892
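/* The 32x32 kernel below follows the same even/odd butterfly scheme as the
   16x16 kernels above; per output pair this is, roughly,
       out[k]       = (E[k] + O[k] + round) >> shift
       out[N-1-k]   = (E[k] - O[k] + round) >> shift
   where the O[] terms are _mm_madd_epi16 dot products of the odd input rows,
   the E[] terms are built from the EE/EEE partial sums of the even rows, and
   the rounding offset (m128iAdd) is folded into E[] before the shift. */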
2893 #if HAVE_SSE4_1
2894 void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
2895 ptrdiff_t _stride) {
2896 uint8_t shift_2nd = 12; // 20 - Bit depth
2897 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2898 int i, j;
2899 uint8_t *dst = (uint8_t*) _dst;
2900 ptrdiff_t stride = _stride / sizeof(uint8_t);
2901 int shift;
2902 int16_t *src = coeffs;
2903
2904 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2905 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2906 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2907 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2908 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2909 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2910 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2911 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2912 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2913 EEE0l, EEE1l, EEE0h, EEE1h;
2914 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2915 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2916 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2917 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2918 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2919 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2920 EE4l, EE7h, EE6h, EE5h, EE4h;
2921
2922 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2923 __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2924 __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2925 __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2926
2927
2928 m128iS0 = _mm_load_si128((__m128i *) (src));
2929 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2930 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2931 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2932 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2933 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2934 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2935 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2936 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2937 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2938 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2939 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2940 m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2941 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2942 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2943 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2944 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2945 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2946 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2947 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2948 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2949 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2950 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2951 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2952 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2953 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2954 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2955 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2956 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2957 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2958 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2959 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2960
2961 shift = shift_1st;
2962 m128iAdd = _mm_set1_epi32(add_1st);
2963
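        /* Same two-pass layout as the 16x16 kernels: the first pass (j == 0)
           transforms the columns and transposes into the coefficient buffer,
           the second pass (j == 1) transforms the rows and writes the final
           8-bit reconstruction to the destination. */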
2964 for (j = 0; j < 2; j++) {
2965 for (i = 0; i < 32; i += 8) {
2966 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2967 E0l = _mm_madd_epi16(m128Tmp0,
2968 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2969 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2970 E0h = _mm_madd_epi16(m128Tmp1,
2971 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2972
2973 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2974 E1l = _mm_madd_epi16(m128Tmp2,
2975 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2976 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2977 E1h = _mm_madd_epi16(m128Tmp3,
2978 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2979
2980 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2981 E2l = _mm_madd_epi16(m128Tmp4,
2982 _mm_load_si128((__m128i *) (transform32x32[2][0])));
2983 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2984 E2h = _mm_madd_epi16(m128Tmp5,
2985 _mm_load_si128((__m128i *) (transform32x32[2][0])));
2986
2987 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2988 E3l = _mm_madd_epi16(m128Tmp6,
2989 _mm_load_si128((__m128i *) (transform32x32[3][0])));
2990 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2991 E3h = _mm_madd_epi16(m128Tmp7,
2992 _mm_load_si128((__m128i *) (transform32x32[3][0])));
2993
2994 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
2995 E4l = _mm_madd_epi16(m128Tmp8,
2996 _mm_load_si128((__m128i *) (transform32x32[4][0])));
2997 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
2998 E4h = _mm_madd_epi16(m128Tmp9,
2999 _mm_load_si128((__m128i *) (transform32x32[4][0])));
3000
3001 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3002 E5l = _mm_madd_epi16(m128Tmp10,
3003 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3004 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3005 E5h = _mm_madd_epi16(m128Tmp11,
3006 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3007
3008 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3009 E6l = _mm_madd_epi16(m128Tmp12,
3010 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3011 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3012 E6h = _mm_madd_epi16(m128Tmp13,
3013 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3014
3015 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3016 E7l = _mm_madd_epi16(m128Tmp14,
3017 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3018 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3019 E7h = _mm_madd_epi16(m128Tmp15,
3020 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3021
3022 O0l = _mm_add_epi32(E0l, E1l);
3023 O0l = _mm_add_epi32(O0l, E2l);
3024 O0l = _mm_add_epi32(O0l, E3l);
3025 O0l = _mm_add_epi32(O0l, E4l);
3026 O0l = _mm_add_epi32(O0l, E5l);
3027 O0l = _mm_add_epi32(O0l, E6l);
3028 O0l = _mm_add_epi32(O0l, E7l);
3029
3030 O0h = _mm_add_epi32(E0h, E1h);
3031 O0h = _mm_add_epi32(O0h, E2h);
3032 O0h = _mm_add_epi32(O0h, E3h);
3033 O0h = _mm_add_epi32(O0h, E4h);
3034 O0h = _mm_add_epi32(O0h, E5h);
3035 O0h = _mm_add_epi32(O0h, E6h);
3036 O0h = _mm_add_epi32(O0h, E7h);
3037
3038 /* Compute O1*/
3039 E0l = _mm_madd_epi16(m128Tmp0,
3040 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3041 E0h = _mm_madd_epi16(m128Tmp1,
3042 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3043 E1l = _mm_madd_epi16(m128Tmp2,
3044 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3045 E1h = _mm_madd_epi16(m128Tmp3,
3046 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3047 E2l = _mm_madd_epi16(m128Tmp4,
3048 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3049 E2h = _mm_madd_epi16(m128Tmp5,
3050 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3051 E3l = _mm_madd_epi16(m128Tmp6,
3052 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3053 E3h = _mm_madd_epi16(m128Tmp7,
3054 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3055
3056 E4l = _mm_madd_epi16(m128Tmp8,
3057 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3058 E4h = _mm_madd_epi16(m128Tmp9,
3059 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3060 E5l = _mm_madd_epi16(m128Tmp10,
3061 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3062 E5h = _mm_madd_epi16(m128Tmp11,
3063 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3064 E6l = _mm_madd_epi16(m128Tmp12,
3065 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3066 E6h = _mm_madd_epi16(m128Tmp13,
3067 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3068 E7l = _mm_madd_epi16(m128Tmp14,
3069 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3070 E7h = _mm_madd_epi16(m128Tmp15,
3071 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3072
3073 O1l = _mm_add_epi32(E0l, E1l);
3074 O1l = _mm_add_epi32(O1l, E2l);
3075 O1l = _mm_add_epi32(O1l, E3l);
3076 O1l = _mm_add_epi32(O1l, E4l);
3077 O1l = _mm_add_epi32(O1l, E5l);
3078 O1l = _mm_add_epi32(O1l, E6l);
3079 O1l = _mm_add_epi32(O1l, E7l);
3080
3081 O1h = _mm_add_epi32(E0h, E1h);
3082 O1h = _mm_add_epi32(O1h, E2h);
3083 O1h = _mm_add_epi32(O1h, E3h);
3084 O1h = _mm_add_epi32(O1h, E4h);
3085 O1h = _mm_add_epi32(O1h, E5h);
3086 O1h = _mm_add_epi32(O1h, E6h);
3087 O1h = _mm_add_epi32(O1h, E7h);
3088 /* Compute O2*/
3089 E0l = _mm_madd_epi16(m128Tmp0,
3090 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3091 E0h = _mm_madd_epi16(m128Tmp1,
3092 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3093 E1l = _mm_madd_epi16(m128Tmp2,
3094 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3095 E1h = _mm_madd_epi16(m128Tmp3,
3096 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3097 E2l = _mm_madd_epi16(m128Tmp4,
3098 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3099 E2h = _mm_madd_epi16(m128Tmp5,
3100 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3101 E3l = _mm_madd_epi16(m128Tmp6,
3102 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3103 E3h = _mm_madd_epi16(m128Tmp7,
3104 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3105
3106 E4l = _mm_madd_epi16(m128Tmp8,
3107 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3108 E4h = _mm_madd_epi16(m128Tmp9,
3109 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3110 E5l = _mm_madd_epi16(m128Tmp10,
3111 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3112 E5h = _mm_madd_epi16(m128Tmp11,
3113 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3114 E6l = _mm_madd_epi16(m128Tmp12,
3115 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3116 E6h = _mm_madd_epi16(m128Tmp13,
3117 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3118 E7l = _mm_madd_epi16(m128Tmp14,
3119 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3120 E7h = _mm_madd_epi16(m128Tmp15,
3121 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3122
3123 O2l = _mm_add_epi32(E0l, E1l);
3124 O2l = _mm_add_epi32(O2l, E2l);
3125 O2l = _mm_add_epi32(O2l, E3l);
3126 O2l = _mm_add_epi32(O2l, E4l);
3127 O2l = _mm_add_epi32(O2l, E5l);
3128 O2l = _mm_add_epi32(O2l, E6l);
3129 O2l = _mm_add_epi32(O2l, E7l);
3130
3131 O2h = _mm_add_epi32(E0h, E1h);
3132 O2h = _mm_add_epi32(O2h, E2h);
3133 O2h = _mm_add_epi32(O2h, E3h);
3134 O2h = _mm_add_epi32(O2h, E4h);
3135 O2h = _mm_add_epi32(O2h, E5h);
3136 O2h = _mm_add_epi32(O2h, E6h);
3137 O2h = _mm_add_epi32(O2h, E7h);
3138 /* Compute O3*/
3139 E0l = _mm_madd_epi16(m128Tmp0,
3140 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3141 E0h = _mm_madd_epi16(m128Tmp1,
3142 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3143 E1l = _mm_madd_epi16(m128Tmp2,
3144 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3145 E1h = _mm_madd_epi16(m128Tmp3,
3146 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3147 E2l = _mm_madd_epi16(m128Tmp4,
3148 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3149 E2h = _mm_madd_epi16(m128Tmp5,
3150 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3151 E3l = _mm_madd_epi16(m128Tmp6,
3152 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3153 E3h = _mm_madd_epi16(m128Tmp7,
3154 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3155
3156 E4l = _mm_madd_epi16(m128Tmp8,
3157 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3158 E4h = _mm_madd_epi16(m128Tmp9,
3159 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3160 E5l = _mm_madd_epi16(m128Tmp10,
3161 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3162 E5h = _mm_madd_epi16(m128Tmp11,
3163 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3164 E6l = _mm_madd_epi16(m128Tmp12,
3165 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3166 E6h = _mm_madd_epi16(m128Tmp13,
3167 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3168 E7l = _mm_madd_epi16(m128Tmp14,
3169 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3170 E7h = _mm_madd_epi16(m128Tmp15,
3171 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3172
3173 O3l = _mm_add_epi32(E0l, E1l);
3174 O3l = _mm_add_epi32(O3l, E2l);
3175 O3l = _mm_add_epi32(O3l, E3l);
3176 O3l = _mm_add_epi32(O3l, E4l);
3177 O3l = _mm_add_epi32(O3l, E5l);
3178 O3l = _mm_add_epi32(O3l, E6l);
3179 O3l = _mm_add_epi32(O3l, E7l);
3180
3181 O3h = _mm_add_epi32(E0h, E1h);
3182 O3h = _mm_add_epi32(O3h, E2h);
3183 O3h = _mm_add_epi32(O3h, E3h);
3184 O3h = _mm_add_epi32(O3h, E4h);
3185 O3h = _mm_add_epi32(O3h, E5h);
3186 O3h = _mm_add_epi32(O3h, E6h);
3187 O3h = _mm_add_epi32(O3h, E7h);
3188 /* Compute O4*/
3189
3190 E0l = _mm_madd_epi16(m128Tmp0,
3191 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3192 E0h = _mm_madd_epi16(m128Tmp1,
3193 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3194 E1l = _mm_madd_epi16(m128Tmp2,
3195 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3196 E1h = _mm_madd_epi16(m128Tmp3,
3197 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3198 E2l = _mm_madd_epi16(m128Tmp4,
3199 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3200 E2h = _mm_madd_epi16(m128Tmp5,
3201 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3202 E3l = _mm_madd_epi16(m128Tmp6,
3203 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3204 E3h = _mm_madd_epi16(m128Tmp7,
3205 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3206
3207 E4l = _mm_madd_epi16(m128Tmp8,
3208 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3209 E4h = _mm_madd_epi16(m128Tmp9,
3210 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3211 E5l = _mm_madd_epi16(m128Tmp10,
3212 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3213 E5h = _mm_madd_epi16(m128Tmp11,
3214 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3215 E6l = _mm_madd_epi16(m128Tmp12,
3216 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3217 E6h = _mm_madd_epi16(m128Tmp13,
3218 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3219 E7l = _mm_madd_epi16(m128Tmp14,
3220 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3221 E7h = _mm_madd_epi16(m128Tmp15,
3222 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3223
3224 O4l = _mm_add_epi32(E0l, E1l);
3225 O4l = _mm_add_epi32(O4l, E2l);
3226 O4l = _mm_add_epi32(O4l, E3l);
3227 O4l = _mm_add_epi32(O4l, E4l);
3228 O4l = _mm_add_epi32(O4l, E5l);
3229 O4l = _mm_add_epi32(O4l, E6l);
3230 O4l = _mm_add_epi32(O4l, E7l);
3231
3232 O4h = _mm_add_epi32(E0h, E1h);
3233 O4h = _mm_add_epi32(O4h, E2h);
3234 O4h = _mm_add_epi32(O4h, E3h);
3235 O4h = _mm_add_epi32(O4h, E4h);
3236 O4h = _mm_add_epi32(O4h, E5h);
3237 O4h = _mm_add_epi32(O4h, E6h);
3238 O4h = _mm_add_epi32(O4h, E7h);
3239
3240 /* Compute O5*/
3241 E0l = _mm_madd_epi16(m128Tmp0,
3242 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3243 E0h = _mm_madd_epi16(m128Tmp1,
3244 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3245 E1l = _mm_madd_epi16(m128Tmp2,
3246 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3247 E1h = _mm_madd_epi16(m128Tmp3,
3248 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3249 E2l = _mm_madd_epi16(m128Tmp4,
3250 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3251 E2h = _mm_madd_epi16(m128Tmp5,
3252 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3253 E3l = _mm_madd_epi16(m128Tmp6,
3254 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3255 E3h = _mm_madd_epi16(m128Tmp7,
3256 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3257
3258 E4l = _mm_madd_epi16(m128Tmp8,
3259 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3260 E4h = _mm_madd_epi16(m128Tmp9,
3261 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3262 E5l = _mm_madd_epi16(m128Tmp10,
3263 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3264 E5h = _mm_madd_epi16(m128Tmp11,
3265 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3266 E6l = _mm_madd_epi16(m128Tmp12,
3267 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3268 E6h = _mm_madd_epi16(m128Tmp13,
3269 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3270 E7l = _mm_madd_epi16(m128Tmp14,
3271 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3272 E7h = _mm_madd_epi16(m128Tmp15,
3273 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3274
3275 O5l = _mm_add_epi32(E0l, E1l);
3276 O5l = _mm_add_epi32(O5l, E2l);
3277 O5l = _mm_add_epi32(O5l, E3l);
3278 O5l = _mm_add_epi32(O5l, E4l);
3279 O5l = _mm_add_epi32(O5l, E5l);
3280 O5l = _mm_add_epi32(O5l, E6l);
3281 O5l = _mm_add_epi32(O5l, E7l);
3282
3283 O5h = _mm_add_epi32(E0h, E1h);
3284 O5h = _mm_add_epi32(O5h, E2h);
3285 O5h = _mm_add_epi32(O5h, E3h);
3286 O5h = _mm_add_epi32(O5h, E4h);
3287 O5h = _mm_add_epi32(O5h, E5h);
3288 O5h = _mm_add_epi32(O5h, E6h);
3289 O5h = _mm_add_epi32(O5h, E7h);
3290
3291 /* Compute O6*/
3292
3293 E0l = _mm_madd_epi16(m128Tmp0,
3294 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3295 E0h = _mm_madd_epi16(m128Tmp1,
3296 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3297 E1l = _mm_madd_epi16(m128Tmp2,
3298 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3299 E1h = _mm_madd_epi16(m128Tmp3,
3300 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3301 E2l = _mm_madd_epi16(m128Tmp4,
3302 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3303 E2h = _mm_madd_epi16(m128Tmp5,
3304 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3305 E3l = _mm_madd_epi16(m128Tmp6,
3306 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3307 E3h = _mm_madd_epi16(m128Tmp7,
3308 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3309
3310 E4l = _mm_madd_epi16(m128Tmp8,
3311 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3312 E4h = _mm_madd_epi16(m128Tmp9,
3313 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3314 E5l = _mm_madd_epi16(m128Tmp10,
3315 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3316 E5h = _mm_madd_epi16(m128Tmp11,
3317 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3318 E6l = _mm_madd_epi16(m128Tmp12,
3319 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3320 E6h = _mm_madd_epi16(m128Tmp13,
3321 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3322 E7l = _mm_madd_epi16(m128Tmp14,
3323 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3324 E7h = _mm_madd_epi16(m128Tmp15,
3325 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3326
3327 O6l = _mm_add_epi32(E0l, E1l);
3328 O6l = _mm_add_epi32(O6l, E2l);
3329 O6l = _mm_add_epi32(O6l, E3l);
3330 O6l = _mm_add_epi32(O6l, E4l);
3331 O6l = _mm_add_epi32(O6l, E5l);
3332 O6l = _mm_add_epi32(O6l, E6l);
3333 O6l = _mm_add_epi32(O6l, E7l);
3334
3335 O6h = _mm_add_epi32(E0h, E1h);
3336 O6h = _mm_add_epi32(O6h, E2h);
3337 O6h = _mm_add_epi32(O6h, E3h);
3338 O6h = _mm_add_epi32(O6h, E4h);
3339 O6h = _mm_add_epi32(O6h, E5h);
3340 O6h = _mm_add_epi32(O6h, E6h);
3341 O6h = _mm_add_epi32(O6h, E7h);
3342
3343 /* Compute O7*/
3344
3345 E0l = _mm_madd_epi16(m128Tmp0,
3346 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3347 E0h = _mm_madd_epi16(m128Tmp1,
3348 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3349 E1l = _mm_madd_epi16(m128Tmp2,
3350 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3351 E1h = _mm_madd_epi16(m128Tmp3,
3352 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3353 E2l = _mm_madd_epi16(m128Tmp4,
3354 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3355 E2h = _mm_madd_epi16(m128Tmp5,
3356 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3357 E3l = _mm_madd_epi16(m128Tmp6,
3358 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3359 E3h = _mm_madd_epi16(m128Tmp7,
3360 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3361
3362 E4l = _mm_madd_epi16(m128Tmp8,
3363 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3364 E4h = _mm_madd_epi16(m128Tmp9,
3365 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3366 E5l = _mm_madd_epi16(m128Tmp10,
3367 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3368 E5h = _mm_madd_epi16(m128Tmp11,
3369 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3370 E6l = _mm_madd_epi16(m128Tmp12,
3371 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3372 E6h = _mm_madd_epi16(m128Tmp13,
3373 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3374 E7l = _mm_madd_epi16(m128Tmp14,
3375 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3376 E7h = _mm_madd_epi16(m128Tmp15,
3377 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3378
3379 O7l = _mm_add_epi32(E0l, E1l);
3380 O7l = _mm_add_epi32(O7l, E2l);
3381 O7l = _mm_add_epi32(O7l, E3l);
3382 O7l = _mm_add_epi32(O7l, E4l);
3383 O7l = _mm_add_epi32(O7l, E5l);
3384 O7l = _mm_add_epi32(O7l, E6l);
3385 O7l = _mm_add_epi32(O7l, E7l);
3386
3387 O7h = _mm_add_epi32(E0h, E1h);
3388 O7h = _mm_add_epi32(O7h, E2h);
3389 O7h = _mm_add_epi32(O7h, E3h);
3390 O7h = _mm_add_epi32(O7h, E4h);
3391 O7h = _mm_add_epi32(O7h, E5h);
3392 O7h = _mm_add_epi32(O7h, E6h);
3393 O7h = _mm_add_epi32(O7h, E7h);
3394
3395 /* Compute O8*/
3396
3397 E0l = _mm_madd_epi16(m128Tmp0,
3398 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3399 E0h = _mm_madd_epi16(m128Tmp1,
3400 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3401 E1l = _mm_madd_epi16(m128Tmp2,
3402 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3403 E1h = _mm_madd_epi16(m128Tmp3,
3404 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3405 E2l = _mm_madd_epi16(m128Tmp4,
3406 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3407 E2h = _mm_madd_epi16(m128Tmp5,
3408 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3409 E3l = _mm_madd_epi16(m128Tmp6,
3410 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3411 E3h = _mm_madd_epi16(m128Tmp7,
3412 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3413
3414 E4l = _mm_madd_epi16(m128Tmp8,
3415 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3416 E4h = _mm_madd_epi16(m128Tmp9,
3417 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3418 E5l = _mm_madd_epi16(m128Tmp10,
3419 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3420 E5h = _mm_madd_epi16(m128Tmp11,
3421 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3422 E6l = _mm_madd_epi16(m128Tmp12,
3423 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3424 E6h = _mm_madd_epi16(m128Tmp13,
3425 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3426 E7l = _mm_madd_epi16(m128Tmp14,
3427 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3428 E7h = _mm_madd_epi16(m128Tmp15,
3429 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3430
3431 O8l = _mm_add_epi32(E0l, E1l);
3432 O8l = _mm_add_epi32(O8l, E2l);
3433 O8l = _mm_add_epi32(O8l, E3l);
3434 O8l = _mm_add_epi32(O8l, E4l);
3435 O8l = _mm_add_epi32(O8l, E5l);
3436 O8l = _mm_add_epi32(O8l, E6l);
3437 O8l = _mm_add_epi32(O8l, E7l);
3438
3439 O8h = _mm_add_epi32(E0h, E1h);
3440 O8h = _mm_add_epi32(O8h, E2h);
3441 O8h = _mm_add_epi32(O8h, E3h);
3442 O8h = _mm_add_epi32(O8h, E4h);
3443 O8h = _mm_add_epi32(O8h, E5h);
3444 O8h = _mm_add_epi32(O8h, E6h);
3445 O8h = _mm_add_epi32(O8h, E7h);
3446
3447 /* Compute O9*/
3448
3449 E0l = _mm_madd_epi16(m128Tmp0,
3450 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3451 E0h = _mm_madd_epi16(m128Tmp1,
3452 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3453 E1l = _mm_madd_epi16(m128Tmp2,
3454 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3455 E1h = _mm_madd_epi16(m128Tmp3,
3456 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3457 E2l = _mm_madd_epi16(m128Tmp4,
3458 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3459 E2h = _mm_madd_epi16(m128Tmp5,
3460 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3461 E3l = _mm_madd_epi16(m128Tmp6,
3462 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3463 E3h = _mm_madd_epi16(m128Tmp7,
3464 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3465
3466 E4l = _mm_madd_epi16(m128Tmp8,
3467 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3468 E4h = _mm_madd_epi16(m128Tmp9,
3469 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3470 E5l = _mm_madd_epi16(m128Tmp10,
3471 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3472 E5h = _mm_madd_epi16(m128Tmp11,
3473 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3474 E6l = _mm_madd_epi16(m128Tmp12,
3475 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3476 E6h = _mm_madd_epi16(m128Tmp13,
3477 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3478 E7l = _mm_madd_epi16(m128Tmp14,
3479 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3480 E7h = _mm_madd_epi16(m128Tmp15,
3481 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3482
3483 O9l = _mm_add_epi32(E0l, E1l);
3484 O9l = _mm_add_epi32(O9l, E2l);
3485 O9l = _mm_add_epi32(O9l, E3l);
3486 O9l = _mm_add_epi32(O9l, E4l);
3487 O9l = _mm_add_epi32(O9l, E5l);
3488 O9l = _mm_add_epi32(O9l, E6l);
3489 O9l = _mm_add_epi32(O9l, E7l);
3490
3491 O9h = _mm_add_epi32(E0h, E1h);
3492 O9h = _mm_add_epi32(O9h, E2h);
3493 O9h = _mm_add_epi32(O9h, E3h);
3494 O9h = _mm_add_epi32(O9h, E4h);
3495 O9h = _mm_add_epi32(O9h, E5h);
3496 O9h = _mm_add_epi32(O9h, E6h);
3497 O9h = _mm_add_epi32(O9h, E7h);
3498
3499             /* Compute O10*/
3500
3501 E0l = _mm_madd_epi16(m128Tmp0,
3502 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3503 E0h = _mm_madd_epi16(m128Tmp1,
3504 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3505 E1l = _mm_madd_epi16(m128Tmp2,
3506 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3507 E1h = _mm_madd_epi16(m128Tmp3,
3508 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3509 E2l = _mm_madd_epi16(m128Tmp4,
3510 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3511 E2h = _mm_madd_epi16(m128Tmp5,
3512 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3513 E3l = _mm_madd_epi16(m128Tmp6,
3514 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3515 E3h = _mm_madd_epi16(m128Tmp7,
3516 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3517
3518 E4l = _mm_madd_epi16(m128Tmp8,
3519 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3520 E4h = _mm_madd_epi16(m128Tmp9,
3521 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3522 E5l = _mm_madd_epi16(m128Tmp10,
3523 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3524 E5h = _mm_madd_epi16(m128Tmp11,
3525 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3526 E6l = _mm_madd_epi16(m128Tmp12,
3527 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3528 E6h = _mm_madd_epi16(m128Tmp13,
3529 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3530 E7l = _mm_madd_epi16(m128Tmp14,
3531 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3532 E7h = _mm_madd_epi16(m128Tmp15,
3533 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3534
3535 O10l = _mm_add_epi32(E0l, E1l);
3536 O10l = _mm_add_epi32(O10l, E2l);
3537 O10l = _mm_add_epi32(O10l, E3l);
3538 O10l = _mm_add_epi32(O10l, E4l);
3539 O10l = _mm_add_epi32(O10l, E5l);
3540 O10l = _mm_add_epi32(O10l, E6l);
3541 O10l = _mm_add_epi32(O10l, E7l);
3542
3543 O10h = _mm_add_epi32(E0h, E1h);
3544 O10h = _mm_add_epi32(O10h, E2h);
3545 O10h = _mm_add_epi32(O10h, E3h);
3546 O10h = _mm_add_epi32(O10h, E4h);
3547 O10h = _mm_add_epi32(O10h, E5h);
3548 O10h = _mm_add_epi32(O10h, E6h);
3549 O10h = _mm_add_epi32(O10h, E7h);
3550
3551             /* Compute O11*/
3552
3553 E0l = _mm_madd_epi16(m128Tmp0,
3554 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3555 E0h = _mm_madd_epi16(m128Tmp1,
3556 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3557 E1l = _mm_madd_epi16(m128Tmp2,
3558 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3559 E1h = _mm_madd_epi16(m128Tmp3,
3560 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3561 E2l = _mm_madd_epi16(m128Tmp4,
3562 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3563 E2h = _mm_madd_epi16(m128Tmp5,
3564 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3565 E3l = _mm_madd_epi16(m128Tmp6,
3566 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3567 E3h = _mm_madd_epi16(m128Tmp7,
3568 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3569
3570 E4l = _mm_madd_epi16(m128Tmp8,
3571 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3572 E4h = _mm_madd_epi16(m128Tmp9,
3573 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3574 E5l = _mm_madd_epi16(m128Tmp10,
3575 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3576 E5h = _mm_madd_epi16(m128Tmp11,
3577 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3578 E6l = _mm_madd_epi16(m128Tmp12,
3579 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3580 E6h = _mm_madd_epi16(m128Tmp13,
3581 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3582 E7l = _mm_madd_epi16(m128Tmp14,
3583 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3584 E7h = _mm_madd_epi16(m128Tmp15,
3585 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3586
3587 O11l = _mm_add_epi32(E0l, E1l);
3588 O11l = _mm_add_epi32(O11l, E2l);
3589 O11l = _mm_add_epi32(O11l, E3l);
3590 O11l = _mm_add_epi32(O11l, E4l);
3591 O11l = _mm_add_epi32(O11l, E5l);
3592 O11l = _mm_add_epi32(O11l, E6l);
3593 O11l = _mm_add_epi32(O11l, E7l);
3594
3595 O11h = _mm_add_epi32(E0h, E1h);
3596 O11h = _mm_add_epi32(O11h, E2h);
3597 O11h = _mm_add_epi32(O11h, E3h);
3598 O11h = _mm_add_epi32(O11h, E4h);
3599 O11h = _mm_add_epi32(O11h, E5h);
3600 O11h = _mm_add_epi32(O11h, E6h);
3601 O11h = _mm_add_epi32(O11h, E7h);
3602
3603         /* Compute O12 */
3604
3605 E0l = _mm_madd_epi16(m128Tmp0,
3606 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3607 E0h = _mm_madd_epi16(m128Tmp1,
3608 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3609 E1l = _mm_madd_epi16(m128Tmp2,
3610 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3611 E1h = _mm_madd_epi16(m128Tmp3,
3612 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3613 E2l = _mm_madd_epi16(m128Tmp4,
3614 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3615 E2h = _mm_madd_epi16(m128Tmp5,
3616 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3617 E3l = _mm_madd_epi16(m128Tmp6,
3618 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3619 E3h = _mm_madd_epi16(m128Tmp7,
3620 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3621
3622 E4l = _mm_madd_epi16(m128Tmp8,
3623 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3624 E4h = _mm_madd_epi16(m128Tmp9,
3625 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3626 E5l = _mm_madd_epi16(m128Tmp10,
3627 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3628 E5h = _mm_madd_epi16(m128Tmp11,
3629 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3630 E6l = _mm_madd_epi16(m128Tmp12,
3631 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3632 E6h = _mm_madd_epi16(m128Tmp13,
3633 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3634 E7l = _mm_madd_epi16(m128Tmp14,
3635 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3636 E7h = _mm_madd_epi16(m128Tmp15,
3637 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3638
3639 O12l = _mm_add_epi32(E0l, E1l);
3640 O12l = _mm_add_epi32(O12l, E2l);
3641 O12l = _mm_add_epi32(O12l, E3l);
3642 O12l = _mm_add_epi32(O12l, E4l);
3643 O12l = _mm_add_epi32(O12l, E5l);
3644 O12l = _mm_add_epi32(O12l, E6l);
3645 O12l = _mm_add_epi32(O12l, E7l);
3646
3647 O12h = _mm_add_epi32(E0h, E1h);
3648 O12h = _mm_add_epi32(O12h, E2h);
3649 O12h = _mm_add_epi32(O12h, E3h);
3650 O12h = _mm_add_epi32(O12h, E4h);
3651 O12h = _mm_add_epi32(O12h, E5h);
3652 O12h = _mm_add_epi32(O12h, E6h);
3653 O12h = _mm_add_epi32(O12h, E7h);
3654
3655         /* Compute O13 */
3656
3657 E0l = _mm_madd_epi16(m128Tmp0,
3658 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3659 E0h = _mm_madd_epi16(m128Tmp1,
3660 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3661 E1l = _mm_madd_epi16(m128Tmp2,
3662 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3663 E1h = _mm_madd_epi16(m128Tmp3,
3664 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3665 E2l = _mm_madd_epi16(m128Tmp4,
3666 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3667 E2h = _mm_madd_epi16(m128Tmp5,
3668 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3669 E3l = _mm_madd_epi16(m128Tmp6,
3670 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3671 E3h = _mm_madd_epi16(m128Tmp7,
3672 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3673
3674 E4l = _mm_madd_epi16(m128Tmp8,
3675 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3676 E4h = _mm_madd_epi16(m128Tmp9,
3677 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3678 E5l = _mm_madd_epi16(m128Tmp10,
3679 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3680 E5h = _mm_madd_epi16(m128Tmp11,
3681 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3682 E6l = _mm_madd_epi16(m128Tmp12,
3683 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3684 E6h = _mm_madd_epi16(m128Tmp13,
3685 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3686 E7l = _mm_madd_epi16(m128Tmp14,
3687 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3688 E7h = _mm_madd_epi16(m128Tmp15,
3689 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3690
3691 O13l = _mm_add_epi32(E0l, E1l);
3692 O13l = _mm_add_epi32(O13l, E2l);
3693 O13l = _mm_add_epi32(O13l, E3l);
3694 O13l = _mm_add_epi32(O13l, E4l);
3695 O13l = _mm_add_epi32(O13l, E5l);
3696 O13l = _mm_add_epi32(O13l, E6l);
3697 O13l = _mm_add_epi32(O13l, E7l);
3698
3699 O13h = _mm_add_epi32(E0h, E1h);
3700 O13h = _mm_add_epi32(O13h, E2h);
3701 O13h = _mm_add_epi32(O13h, E3h);
3702 O13h = _mm_add_epi32(O13h, E4h);
3703 O13h = _mm_add_epi32(O13h, E5h);
3704 O13h = _mm_add_epi32(O13h, E6h);
3705 O13h = _mm_add_epi32(O13h, E7h);
3706
3707 /* Compute O14 */
3708
3709 E0l = _mm_madd_epi16(m128Tmp0,
3710 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3711 E0h = _mm_madd_epi16(m128Tmp1,
3712 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3713 E1l = _mm_madd_epi16(m128Tmp2,
3714 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3715 E1h = _mm_madd_epi16(m128Tmp3,
3716 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3717 E2l = _mm_madd_epi16(m128Tmp4,
3718 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3719 E2h = _mm_madd_epi16(m128Tmp5,
3720 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3721 E3l = _mm_madd_epi16(m128Tmp6,
3722 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3723 E3h = _mm_madd_epi16(m128Tmp7,
3724 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3725
3726 E4l = _mm_madd_epi16(m128Tmp8,
3727 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3728 E4h = _mm_madd_epi16(m128Tmp9,
3729 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3730 E5l = _mm_madd_epi16(m128Tmp10,
3731 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3732 E5h = _mm_madd_epi16(m128Tmp11,
3733 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3734 E6l = _mm_madd_epi16(m128Tmp12,
3735 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3736 E6h = _mm_madd_epi16(m128Tmp13,
3737 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3738 E7l = _mm_madd_epi16(m128Tmp14,
3739 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3740 E7h = _mm_madd_epi16(m128Tmp15,
3741 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3742
3743 O14l = _mm_add_epi32(E0l, E1l);
3744 O14l = _mm_add_epi32(O14l, E2l);
3745 O14l = _mm_add_epi32(O14l, E3l);
3746 O14l = _mm_add_epi32(O14l, E4l);
3747 O14l = _mm_add_epi32(O14l, E5l);
3748 O14l = _mm_add_epi32(O14l, E6l);
3749 O14l = _mm_add_epi32(O14l, E7l);
3750
3751 O14h = _mm_add_epi32(E0h, E1h);
3752 O14h = _mm_add_epi32(O14h, E2h);
3753 O14h = _mm_add_epi32(O14h, E3h);
3754 O14h = _mm_add_epi32(O14h, E4h);
3755 O14h = _mm_add_epi32(O14h, E5h);
3756 O14h = _mm_add_epi32(O14h, E6h);
3757 O14h = _mm_add_epi32(O14h, E7h);
3758
3759         /* Compute O15 */
3760
3761 E0l = _mm_madd_epi16(m128Tmp0,
3762 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3763 E0h = _mm_madd_epi16(m128Tmp1,
3764 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3765 E1l = _mm_madd_epi16(m128Tmp2,
3766 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3767 E1h = _mm_madd_epi16(m128Tmp3,
3768 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3769 E2l = _mm_madd_epi16(m128Tmp4,
3770 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3771 E2h = _mm_madd_epi16(m128Tmp5,
3772 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3773 E3l = _mm_madd_epi16(m128Tmp6,
3774 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3775 E3h = _mm_madd_epi16(m128Tmp7,
3776 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3777
3778 E4l = _mm_madd_epi16(m128Tmp8,
3779 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3780 E4h = _mm_madd_epi16(m128Tmp9,
3781 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3782 E5l = _mm_madd_epi16(m128Tmp10,
3783 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3784 E5h = _mm_madd_epi16(m128Tmp11,
3785 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3786 E6l = _mm_madd_epi16(m128Tmp12,
3787 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3788 E6h = _mm_madd_epi16(m128Tmp13,
3789 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3790 E7l = _mm_madd_epi16(m128Tmp14,
3791 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3792 E7h = _mm_madd_epi16(m128Tmp15,
3793 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3794
3795 O15l = _mm_add_epi32(E0l, E1l);
3796 O15l = _mm_add_epi32(O15l, E2l);
3797 O15l = _mm_add_epi32(O15l, E3l);
3798 O15l = _mm_add_epi32(O15l, E4l);
3799 O15l = _mm_add_epi32(O15l, E5l);
3800 O15l = _mm_add_epi32(O15l, E6l);
3801 O15l = _mm_add_epi32(O15l, E7l);
3802
3803 O15h = _mm_add_epi32(E0h, E1h);
3804 O15h = _mm_add_epi32(O15h, E2h);
3805 O15h = _mm_add_epi32(O15h, E3h);
3806 O15h = _mm_add_epi32(O15h, E4h);
3807 O15h = _mm_add_epi32(O15h, E5h);
3808 O15h = _mm_add_epi32(O15h, E6h);
3809 O15h = _mm_add_epi32(O15h, E7h);
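        /* Note (reading of the code, not authoritative): the sixteen odd-part
           accumulators O0..O15 (low/high halves) built above appear to follow
           the partial-butterfly decomposition of the 32-point inverse
           transform, each Ok being the sum of products of the odd-indexed
           coefficient rows with column k of transform32x32. */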
3810 /* Compute E0 */
3811
3812 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3813 E0l = _mm_madd_epi16(m128Tmp0,
3814 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3815 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3816 E0h = _mm_madd_epi16(m128Tmp1,
3817 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3818
3819 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3820 E0l = _mm_add_epi32(E0l,
3821 _mm_madd_epi16(m128Tmp2,
3822 _mm_load_si128(
3823 (__m128i *) (transform16x16_1[1][0]))));
3824 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3825 E0h = _mm_add_epi32(E0h,
3826 _mm_madd_epi16(m128Tmp3,
3827 _mm_load_si128(
3828 (__m128i *) (transform16x16_1[1][0]))));
3829
3830 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3831 E0l = _mm_add_epi32(E0l,
3832 _mm_madd_epi16(m128Tmp4,
3833 _mm_load_si128(
3834 (__m128i *) (transform16x16_1[2][0]))));
3835 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3836 E0h = _mm_add_epi32(E0h,
3837 _mm_madd_epi16(m128Tmp5,
3838 _mm_load_si128(
3839 (__m128i *) (transform16x16_1[2][0]))));
3840
3841 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3842 E0l = _mm_add_epi32(E0l,
3843 _mm_madd_epi16(m128Tmp6,
3844 _mm_load_si128(
3845 (__m128i *) (transform16x16_1[3][0]))));
3846 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3847 E0h = _mm_add_epi32(E0h,
3848 _mm_madd_epi16(m128Tmp7,
3849 _mm_load_si128(
3850 (__m128i *) (transform16x16_1[3][0]))));
3851
3852 /* Compute E1 */
3853 E1l = _mm_madd_epi16(m128Tmp0,
3854 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3855 E1h = _mm_madd_epi16(m128Tmp1,
3856 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3857 E1l = _mm_add_epi32(E1l,
3858 _mm_madd_epi16(m128Tmp2,
3859 _mm_load_si128(
3860 (__m128i *) (transform16x16_1[1][1]))));
3861 E1h = _mm_add_epi32(E1h,
3862 _mm_madd_epi16(m128Tmp3,
3863 _mm_load_si128(
3864 (__m128i *) (transform16x16_1[1][1]))));
3865 E1l = _mm_add_epi32(E1l,
3866 _mm_madd_epi16(m128Tmp4,
3867 _mm_load_si128(
3868 (__m128i *) (transform16x16_1[2][1]))));
3869 E1h = _mm_add_epi32(E1h,
3870 _mm_madd_epi16(m128Tmp5,
3871 _mm_load_si128(
3872 (__m128i *) (transform16x16_1[2][1]))));
3873 E1l = _mm_add_epi32(E1l,
3874 _mm_madd_epi16(m128Tmp6,
3875 _mm_load_si128(
3876 (__m128i *) (transform16x16_1[3][1]))));
3877 E1h = _mm_add_epi32(E1h,
3878 _mm_madd_epi16(m128Tmp7,
3879 _mm_load_si128(
3880 (__m128i *) (transform16x16_1[3][1]))));
3881
3882 /* Compute E2 */
3883 E2l = _mm_madd_epi16(m128Tmp0,
3884 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3885 E2h = _mm_madd_epi16(m128Tmp1,
3886 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3887 E2l = _mm_add_epi32(E2l,
3888 _mm_madd_epi16(m128Tmp2,
3889 _mm_load_si128(
3890 (__m128i *) (transform16x16_1[1][2]))));
3891 E2h = _mm_add_epi32(E2h,
3892 _mm_madd_epi16(m128Tmp3,
3893 _mm_load_si128(
3894 (__m128i *) (transform16x16_1[1][2]))));
3895 E2l = _mm_add_epi32(E2l,
3896 _mm_madd_epi16(m128Tmp4,
3897 _mm_load_si128(
3898 (__m128i *) (transform16x16_1[2][2]))));
3899 E2h = _mm_add_epi32(E2h,
3900 _mm_madd_epi16(m128Tmp5,
3901 _mm_load_si128(
3902 (__m128i *) (transform16x16_1[2][2]))));
3903 E2l = _mm_add_epi32(E2l,
3904 _mm_madd_epi16(m128Tmp6,
3905 _mm_load_si128(
3906 (__m128i *) (transform16x16_1[3][2]))));
3907 E2h = _mm_add_epi32(E2h,
3908 _mm_madd_epi16(m128Tmp7,
3909 _mm_load_si128(
3910 (__m128i *) (transform16x16_1[3][2]))));
3911
3912 /* Compute E3 */
3913 E3l = _mm_madd_epi16(m128Tmp0,
3914 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3915 E3h = _mm_madd_epi16(m128Tmp1,
3916 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3917 E3l = _mm_add_epi32(E3l,
3918 _mm_madd_epi16(m128Tmp2,
3919 _mm_load_si128(
3920 (__m128i *) (transform16x16_1[1][3]))));
3921 E3h = _mm_add_epi32(E3h,
3922 _mm_madd_epi16(m128Tmp3,
3923 _mm_load_si128(
3924 (__m128i *) (transform16x16_1[1][3]))));
3925 E3l = _mm_add_epi32(E3l,
3926 _mm_madd_epi16(m128Tmp4,
3927 _mm_load_si128(
3928 (__m128i *) (transform16x16_1[2][3]))));
3929 E3h = _mm_add_epi32(E3h,
3930 _mm_madd_epi16(m128Tmp5,
3931 _mm_load_si128(
3932 (__m128i *) (transform16x16_1[2][3]))));
3933 E3l = _mm_add_epi32(E3l,
3934 _mm_madd_epi16(m128Tmp6,
3935 _mm_load_si128(
3936 (__m128i *) (transform16x16_1[3][3]))));
3937 E3h = _mm_add_epi32(E3h,
3938 _mm_madd_epi16(m128Tmp7,
3939 _mm_load_si128(
3940 (__m128i *) (transform16x16_1[3][3]))));
3941
3942 /* Compute E4 */
3943 E4l = _mm_madd_epi16(m128Tmp0,
3944 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3945 E4h = _mm_madd_epi16(m128Tmp1,
3946 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3947 E4l = _mm_add_epi32(E4l,
3948 _mm_madd_epi16(m128Tmp2,
3949 _mm_load_si128(
3950 (__m128i *) (transform16x16_1[1][4]))));
3951 E4h = _mm_add_epi32(E4h,
3952 _mm_madd_epi16(m128Tmp3,
3953 _mm_load_si128(
3954 (__m128i *) (transform16x16_1[1][4]))));
3955 E4l = _mm_add_epi32(E4l,
3956 _mm_madd_epi16(m128Tmp4,
3957 _mm_load_si128(
3958 (__m128i *) (transform16x16_1[2][4]))));
3959 E4h = _mm_add_epi32(E4h,
3960 _mm_madd_epi16(m128Tmp5,
3961 _mm_load_si128(
3962 (__m128i *) (transform16x16_1[2][4]))));
3963 E4l = _mm_add_epi32(E4l,
3964 _mm_madd_epi16(m128Tmp6,
3965 _mm_load_si128(
3966 (__m128i *) (transform16x16_1[3][4]))));
3967 E4h = _mm_add_epi32(E4h,
3968 _mm_madd_epi16(m128Tmp7,
3969 _mm_load_si128(
3970 (__m128i *) (transform16x16_1[3][4]))));
3971
3972         /* Compute E5 */
3973 E5l = _mm_madd_epi16(m128Tmp0,
3974 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3975 E5h = _mm_madd_epi16(m128Tmp1,
3976 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3977 E5l = _mm_add_epi32(E5l,
3978 _mm_madd_epi16(m128Tmp2,
3979 _mm_load_si128(
3980 (__m128i *) (transform16x16_1[1][5]))));
3981 E5h = _mm_add_epi32(E5h,
3982 _mm_madd_epi16(m128Tmp3,
3983 _mm_load_si128(
3984 (__m128i *) (transform16x16_1[1][5]))));
3985 E5l = _mm_add_epi32(E5l,
3986 _mm_madd_epi16(m128Tmp4,
3987 _mm_load_si128(
3988 (__m128i *) (transform16x16_1[2][5]))));
3989 E5h = _mm_add_epi32(E5h,
3990 _mm_madd_epi16(m128Tmp5,
3991 _mm_load_si128(
3992 (__m128i *) (transform16x16_1[2][5]))));
3993 E5l = _mm_add_epi32(E5l,
3994 _mm_madd_epi16(m128Tmp6,
3995 _mm_load_si128(
3996 (__m128i *) (transform16x16_1[3][5]))));
3997 E5h = _mm_add_epi32(E5h,
3998 _mm_madd_epi16(m128Tmp7,
3999 _mm_load_si128(
4000 (__m128i *) (transform16x16_1[3][5]))));
4001
4002 /* Compute E6 */
4003 E6l = _mm_madd_epi16(m128Tmp0,
4004 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4005 E6h = _mm_madd_epi16(m128Tmp1,
4006 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4007 E6l = _mm_add_epi32(E6l,
4008 _mm_madd_epi16(m128Tmp2,
4009 _mm_load_si128(
4010 (__m128i *) (transform16x16_1[1][6]))));
4011 E6h = _mm_add_epi32(E6h,
4012 _mm_madd_epi16(m128Tmp3,
4013 _mm_load_si128(
4014 (__m128i *) (transform16x16_1[1][6]))));
4015 E6l = _mm_add_epi32(E6l,
4016 _mm_madd_epi16(m128Tmp4,
4017 _mm_load_si128(
4018 (__m128i *) (transform16x16_1[2][6]))));
4019 E6h = _mm_add_epi32(E6h,
4020 _mm_madd_epi16(m128Tmp5,
4021 _mm_load_si128(
4022 (__m128i *) (transform16x16_1[2][6]))));
4023 E6l = _mm_add_epi32(E6l,
4024 _mm_madd_epi16(m128Tmp6,
4025 _mm_load_si128(
4026 (__m128i *) (transform16x16_1[3][6]))));
4027 E6h = _mm_add_epi32(E6h,
4028 _mm_madd_epi16(m128Tmp7,
4029 _mm_load_si128(
4030 (__m128i *) (transform16x16_1[3][6]))));
4031
4032 /* Compute E7 */
4033 E7l = _mm_madd_epi16(m128Tmp0,
4034 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4035 E7h = _mm_madd_epi16(m128Tmp1,
4036 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4037 E7l = _mm_add_epi32(E7l,
4038 _mm_madd_epi16(m128Tmp2,
4039 _mm_load_si128(
4040 (__m128i *) (transform16x16_1[1][7]))));
4041 E7h = _mm_add_epi32(E7h,
4042 _mm_madd_epi16(m128Tmp3,
4043 _mm_load_si128(
4044 (__m128i *) (transform16x16_1[1][7]))));
4045 E7l = _mm_add_epi32(E7l,
4046 _mm_madd_epi16(m128Tmp4,
4047 _mm_load_si128(
4048 (__m128i *) (transform16x16_1[2][7]))));
4049 E7h = _mm_add_epi32(E7h,
4050 _mm_madd_epi16(m128Tmp5,
4051 _mm_load_si128(
4052 (__m128i *) (transform16x16_1[2][7]))));
4053 E7l = _mm_add_epi32(E7l,
4054 _mm_madd_epi16(m128Tmp6,
4055 _mm_load_si128(
4056 (__m128i *) (transform16x16_1[3][7]))));
4057 E7h = _mm_add_epi32(E7h,
4058 _mm_madd_epi16(m128Tmp7,
4059 _mm_load_si128(
4060 (__m128i *) (transform16x16_1[3][7]))));
4061
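        /* Note (assumed structure): E0..E7 computed above are presumably the
           odd half of the 16-point even stage, formed from coefficient rows
           2, 6, ..., 30 via the transform16x16_1 factors. */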
4062         /* Compute E00 to E03 */
4063
4064 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4065 E00l = _mm_madd_epi16(m128Tmp0,
4066 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4067 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4068 E00h = _mm_madd_epi16(m128Tmp1,
4069 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4070
4071 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4072 E00l = _mm_add_epi32(E00l,
4073 _mm_madd_epi16(m128Tmp2,
4074 _mm_load_si128(
4075 (__m128i *) (transform16x16_2[1][0]))));
4076 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4077 E00h = _mm_add_epi32(E00h,
4078 _mm_madd_epi16(m128Tmp3,
4079 _mm_load_si128(
4080 (__m128i *) (transform16x16_2[1][0]))));
4081
4082 E01l = _mm_madd_epi16(m128Tmp0,
4083 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4084 E01h = _mm_madd_epi16(m128Tmp1,
4085 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4086 E01l = _mm_add_epi32(E01l,
4087 _mm_madd_epi16(m128Tmp2,
4088 _mm_load_si128(
4089 (__m128i *) (transform16x16_2[1][1]))));
4090 E01h = _mm_add_epi32(E01h,
4091 _mm_madd_epi16(m128Tmp3,
4092 _mm_load_si128(
4093 (__m128i *) (transform16x16_2[1][1]))));
4094
4095 E02l = _mm_madd_epi16(m128Tmp0,
4096 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4097 E02h = _mm_madd_epi16(m128Tmp1,
4098 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4099 E02l = _mm_add_epi32(E02l,
4100 _mm_madd_epi16(m128Tmp2,
4101 _mm_load_si128(
4102 (__m128i *) (transform16x16_2[1][2]))));
4103 E02h = _mm_add_epi32(E02h,
4104 _mm_madd_epi16(m128Tmp3,
4105 _mm_load_si128(
4106 (__m128i *) (transform16x16_2[1][2]))));
4107
4108 E03l = _mm_madd_epi16(m128Tmp0,
4109 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4110 E03h = _mm_madd_epi16(m128Tmp1,
4111 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4112 E03l = _mm_add_epi32(E03l,
4113 _mm_madd_epi16(m128Tmp2,
4114 _mm_load_si128(
4115 (__m128i *) (transform16x16_2[1][3]))));
4116 E03h = _mm_add_epi32(E03h,
4117 _mm_madd_epi16(m128Tmp3,
4118 _mm_load_si128(
4119 (__m128i *) (transform16x16_2[1][3]))));
4120
4121 /* Compute EE0 and EEE */
4122
4123 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4124 EE0l = _mm_madd_epi16(m128Tmp0,
4125 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4126 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4127 EE0h = _mm_madd_epi16(m128Tmp1,
4128 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4129
4130 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4131 EEE0l = _mm_madd_epi16(m128Tmp2,
4132 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4133 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4134 EEE0h = _mm_madd_epi16(m128Tmp3,
4135 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4136
4137 EE1l = _mm_madd_epi16(m128Tmp0,
4138 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4139 EE1h = _mm_madd_epi16(m128Tmp1,
4140 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4141
4142 EEE1l = _mm_madd_epi16(m128Tmp2,
4143 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4144 EEE1h = _mm_madd_epi16(m128Tmp3,
4145 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4146
4147 /* Compute EE */
4148
4149 EE2l = _mm_sub_epi32(EEE1l, EE1l);
4150 EE3l = _mm_sub_epi32(EEE0l, EE0l);
4151 EE2h = _mm_sub_epi32(EEE1h, EE1h);
4152 EE3h = _mm_sub_epi32(EEE0h, EE0h);
4153
4154 EE0l = _mm_add_epi32(EEE0l, EE0l);
4155 EE1l = _mm_add_epi32(EEE1l, EE1l);
4156 EE0h = _mm_add_epi32(EEE0h, EE0h);
4157 EE1h = _mm_add_epi32(EEE1h, EE1h);
4158 /**/
4159
4160 EE7l = _mm_sub_epi32(EE0l, E00l);
4161 EE6l = _mm_sub_epi32(EE1l, E01l);
4162 EE5l = _mm_sub_epi32(EE2l, E02l);
4163 EE4l = _mm_sub_epi32(EE3l, E03l);
4164
4165 EE7h = _mm_sub_epi32(EE0h, E00h);
4166 EE6h = _mm_sub_epi32(EE1h, E01h);
4167 EE5h = _mm_sub_epi32(EE2h, E02h);
4168 EE4h = _mm_sub_epi32(EE3h, E03h);
4169
4170 EE0l = _mm_add_epi32(EE0l, E00l);
4171 EE1l = _mm_add_epi32(EE1l, E01l);
4172 EE2l = _mm_add_epi32(EE2l, E02l);
4173 EE3l = _mm_add_epi32(EE3l, E03l);
4174
4175 EE0h = _mm_add_epi32(EE0h, E00h);
4176 EE1h = _mm_add_epi32(EE1h, E01h);
4177 EE2h = _mm_add_epi32(EE2h, E02h);
4178 EE3h = _mm_add_epi32(EE3h, E03h);
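        /* Note (assumed structure): this appears to build the even-even stage.
           EEE0/EEE1 (rows 0, 16) and the rows-8/24 terms are first combined by
           add/sub into EE0..EE3, which are then extended with E00..E03
           (rows 4, 12, 20, 28) to the eight values EE0..EE7. */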
4179 /* Compute E */
4180
4181 E15l = _mm_sub_epi32(EE0l, E0l);
4182 E15l = _mm_add_epi32(E15l, m128iAdd);
4183 E14l = _mm_sub_epi32(EE1l, E1l);
4184 E14l = _mm_add_epi32(E14l, m128iAdd);
4185 E13l = _mm_sub_epi32(EE2l, E2l);
4186 E13l = _mm_add_epi32(E13l, m128iAdd);
4187 E12l = _mm_sub_epi32(EE3l, E3l);
4188 E12l = _mm_add_epi32(E12l, m128iAdd);
4189 E11l = _mm_sub_epi32(EE4l, E4l);
4190 E11l = _mm_add_epi32(E11l, m128iAdd);
4191 E10l = _mm_sub_epi32(EE5l, E5l);
4192 E10l = _mm_add_epi32(E10l, m128iAdd);
4193 E9l = _mm_sub_epi32(EE6l, E6l);
4194 E9l = _mm_add_epi32(E9l, m128iAdd);
4195 E8l = _mm_sub_epi32(EE7l, E7l);
4196 E8l = _mm_add_epi32(E8l, m128iAdd);
4197
4198 E0l = _mm_add_epi32(EE0l, E0l);
4199 E0l = _mm_add_epi32(E0l, m128iAdd);
4200 E1l = _mm_add_epi32(EE1l, E1l);
4201 E1l = _mm_add_epi32(E1l, m128iAdd);
4202 E2l = _mm_add_epi32(EE2l, E2l);
4203 E2l = _mm_add_epi32(E2l, m128iAdd);
4204 E3l = _mm_add_epi32(EE3l, E3l);
4205 E3l = _mm_add_epi32(E3l, m128iAdd);
4206 E4l = _mm_add_epi32(EE4l, E4l);
4207 E4l = _mm_add_epi32(E4l, m128iAdd);
4208 E5l = _mm_add_epi32(EE5l, E5l);
4209 E5l = _mm_add_epi32(E5l, m128iAdd);
4210 E6l = _mm_add_epi32(EE6l, E6l);
4211 E6l = _mm_add_epi32(E6l, m128iAdd);
4212 E7l = _mm_add_epi32(EE7l, E7l);
4213 E7l = _mm_add_epi32(E7l, m128iAdd);
4214
4215 E15h = _mm_sub_epi32(EE0h, E0h);
4216 E15h = _mm_add_epi32(E15h, m128iAdd);
4217 E14h = _mm_sub_epi32(EE1h, E1h);
4218 E14h = _mm_add_epi32(E14h, m128iAdd);
4219 E13h = _mm_sub_epi32(EE2h, E2h);
4220 E13h = _mm_add_epi32(E13h, m128iAdd);
4221 E12h = _mm_sub_epi32(EE3h, E3h);
4222 E12h = _mm_add_epi32(E12h, m128iAdd);
4223 E11h = _mm_sub_epi32(EE4h, E4h);
4224 E11h = _mm_add_epi32(E11h, m128iAdd);
4225 E10h = _mm_sub_epi32(EE5h, E5h);
4226 E10h = _mm_add_epi32(E10h, m128iAdd);
4227 E9h = _mm_sub_epi32(EE6h, E6h);
4228 E9h = _mm_add_epi32(E9h, m128iAdd);
4229 E8h = _mm_sub_epi32(EE7h, E7h);
4230 E8h = _mm_add_epi32(E8h, m128iAdd);
4231
4232 E0h = _mm_add_epi32(EE0h, E0h);
4233 E0h = _mm_add_epi32(E0h, m128iAdd);
4234 E1h = _mm_add_epi32(EE1h, E1h);
4235 E1h = _mm_add_epi32(E1h, m128iAdd);
4236 E2h = _mm_add_epi32(EE2h, E2h);
4237 E2h = _mm_add_epi32(E2h, m128iAdd);
4238 E3h = _mm_add_epi32(EE3h, E3h);
4239 E3h = _mm_add_epi32(E3h, m128iAdd);
4240 E4h = _mm_add_epi32(EE4h, E4h);
4241 E4h = _mm_add_epi32(E4h, m128iAdd);
4242 E5h = _mm_add_epi32(EE5h, E5h);
4243 E5h = _mm_add_epi32(E5h, m128iAdd);
4244 E6h = _mm_add_epi32(EE6h, E6h);
4245 E6h = _mm_add_epi32(E6h, m128iAdd);
4246 E7h = _mm_add_epi32(EE7h, E7h);
4247 E7h = _mm_add_epi32(E7h, m128iAdd);
4248
4249 m128iS0 = _mm_packs_epi32(
4250 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4251 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4252 m128iS1 = _mm_packs_epi32(
4253 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4254 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4255 m128iS2 = _mm_packs_epi32(
4256 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4257 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4258 m128iS3 = _mm_packs_epi32(
4259 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4260 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4261 m128iS4 = _mm_packs_epi32(
4262 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4263 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4264 m128iS5 = _mm_packs_epi32(
4265 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4266 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4267 m128iS6 = _mm_packs_epi32(
4268 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4269 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4270 m128iS7 = _mm_packs_epi32(
4271 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4272 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4273 m128iS8 = _mm_packs_epi32(
4274 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4275 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4276 m128iS9 = _mm_packs_epi32(
4277 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4278 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4279 m128iS10 = _mm_packs_epi32(
4280 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4281 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4282 m128iS11 = _mm_packs_epi32(
4283 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4284 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4285 m128iS12 = _mm_packs_epi32(
4286 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4287 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4288 m128iS13 = _mm_packs_epi32(
4289 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4290 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4291 m128iS14 = _mm_packs_epi32(
4292 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4293 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4294 m128iS15 = _mm_packs_epi32(
4295 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4296 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4297
4298 m128iS31 = _mm_packs_epi32(
4299 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4300 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4301 m128iS30 = _mm_packs_epi32(
4302 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4303 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4304 m128iS29 = _mm_packs_epi32(
4305 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4306 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4307 m128iS28 = _mm_packs_epi32(
4308 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4309 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4310 m128iS27 = _mm_packs_epi32(
4311 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4312 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4313 m128iS26 = _mm_packs_epi32(
4314 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4315 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4316 m128iS25 = _mm_packs_epi32(
4317 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4318 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4319 m128iS24 = _mm_packs_epi32(
4320 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4321 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4322 m128iS23 = _mm_packs_epi32(
4323 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4324 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4325 m128iS22 = _mm_packs_epi32(
4326 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4327 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4328 m128iS21 = _mm_packs_epi32(
4329 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4330 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4331 m128iS20 = _mm_packs_epi32(
4332 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4333 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4334 m128iS19 = _mm_packs_epi32(
4335 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4336 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4337 m128iS18 = _mm_packs_epi32(
4338 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4339 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4340 m128iS17 = _mm_packs_epi32(
4341 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4342 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4343 m128iS16 = _mm_packs_epi32(
4344 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4345 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4346
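            /* Note (assumed two-pass layout): in the first pass (j == 0) the
               per-column results are transposed and parked in the r0..r127
               registers; the second pass re-reads them row-wise, adds the
               8-bit prediction and stores the reconstructed samples. */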
4347 if (!j) {
4348                 /* Transpose the matrix */
4349 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4350 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4351 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4352 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4353 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4354 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4355 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4356 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4357 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4358 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4359 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4360 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4361 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4362 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4363 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4364 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4365
4366 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4367 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4368 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4369 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4370 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4371 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4372 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4373 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4374 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4375 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4376 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4377 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4378 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4379 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4380 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4381 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4382
4383 E0h = _mm_unpacklo_epi16(E0l, E8l);
4384 E1h = _mm_unpacklo_epi16(E1l, E9l);
4385 E2h = _mm_unpacklo_epi16(E2l, E10l);
4386 E3h = _mm_unpacklo_epi16(E3l, E11l);
4387 E4h = _mm_unpacklo_epi16(E4l, E12l);
4388 E5h = _mm_unpacklo_epi16(E5l, E13l);
4389 E6h = _mm_unpacklo_epi16(E6l, E14l);
4390 E7h = _mm_unpacklo_epi16(E7l, E15l);
4391
4392 E8h = _mm_unpackhi_epi16(E0l, E8l);
4393 E9h = _mm_unpackhi_epi16(E1l, E9l);
4394 E10h = _mm_unpackhi_epi16(E2l, E10l);
4395 E11h = _mm_unpackhi_epi16(E3l, E11l);
4396 E12h = _mm_unpackhi_epi16(E4l, E12l);
4397 E13h = _mm_unpackhi_epi16(E5l, E13l);
4398 E14h = _mm_unpackhi_epi16(E6l, E14l);
4399 E15h = _mm_unpackhi_epi16(E7l, E15l);
4400
4401 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4402 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4403 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4404 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4405
4406 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4407 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4408 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4409 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4410
4411 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4412 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4413 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4414 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4415
4416 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4417 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4418 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4419 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4420
4421 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4422 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4423 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4424 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4425
4426 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4427 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4428 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4429 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4430
4431 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4432 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4433 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4434 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4435
4436 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4437 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4438 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4439 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4440
4441 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4442 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4443 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4444 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4445
4446 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4447 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4448 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4449 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4450
4451 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4452 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4453 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4454 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4455
4456 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4457 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4458 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4459 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4460
4461 /* */
4462 E0h = _mm_unpacklo_epi16(O0l, O8l);
4463 E1h = _mm_unpacklo_epi16(O1l, O9l);
4464 E2h = _mm_unpacklo_epi16(O2l, O10l);
4465 E3h = _mm_unpacklo_epi16(O3l, O11l);
4466 E4h = _mm_unpacklo_epi16(O4l, O12l);
4467 E5h = _mm_unpacklo_epi16(O5l, O13l);
4468 E6h = _mm_unpacklo_epi16(O6l, O14l);
4469 E7h = _mm_unpacklo_epi16(O7l, O15l);
4470
4471 E8h = _mm_unpackhi_epi16(O0l, O8l);
4472 E9h = _mm_unpackhi_epi16(O1l, O9l);
4473 E10h = _mm_unpackhi_epi16(O2l, O10l);
4474 E11h = _mm_unpackhi_epi16(O3l, O11l);
4475 E12h = _mm_unpackhi_epi16(O4l, O12l);
4476 E13h = _mm_unpackhi_epi16(O5l, O13l);
4477 E14h = _mm_unpackhi_epi16(O6l, O14l);
4478 E15h = _mm_unpackhi_epi16(O7l, O15l);
4479
4480 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4481 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4482 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4483 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4484
4485 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4486 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4487 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4488 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4489
4490 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4491 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4492 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4493 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4494
4495 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4496 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4497 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4498 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4499
4500 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4501 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4502 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4503 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4504
4505 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4506 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4507 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4508 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4509
4510 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4511 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4512 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4513 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4514
4515 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4516 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4517 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4518 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4519
4520 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4521 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4522 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4523 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4524
4525 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4526 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4527 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4528 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4529
4530 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4531 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4532 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4533 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4534
4535 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4536 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4537 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4538 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4539
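                /* Note (assumption): the i counter (0, 8, 16, 24) seems to
                   select which 8-column slice of the 32x32 block was just
                   processed; its transposed result is saved into the matching
                   bank of r0..r127 and the next slice of source coefficients
                   is loaded below. */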
4540 if(i==0){
4541 int k = 8;
4542 r0=m128iS0;
4543 r1=m128iS1;
4544 r2=m128iS2;
4545 r3=m128iS3;
4546 r4=m128iS4;
4547 r5=m128iS5;
4548 r6=m128iS6;
4549 r7=m128iS7;
4550 r8=m128iS8;
4551 r9=m128iS9;
4552 r10=m128iS10;
4553 r11=m128iS11;
4554 r12=m128iS12;
4555 r13=m128iS13;
4556 r14=m128iS14;
4557 r15=m128iS15;
4558 r16=m128iS16;
4559 r17=m128iS17;
4560 r18=m128iS18;
4561 r19=m128iS19;
4562 r20=m128iS20;
4563 r21=m128iS21;
4564 r22=m128iS22;
4565 r23=m128iS23;
4566 r24=m128iS24;
4567 r25=m128iS25;
4568 r26=m128iS26;
4569 r27=m128iS27;
4570 r28=m128iS28;
4571 r29=m128iS29;
4572 r30=m128iS30;
4573 r31=m128iS31;
4574 m128iS0 = _mm_load_si128((__m128i *) (src + k));
4575 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4576 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4577 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4578 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4579 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4580 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4581 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4582 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4583 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4584 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4585 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4586 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4587 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4588 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4589 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4590
4591 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4592 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4593 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4594 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4595 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4596 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4597 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4598 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4599 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4600 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4601 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4602 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4603 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4604 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4605 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4606 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4607
4608 }else if(i ==8){
4609
4610 r32=m128iS0;
4611 r33=m128iS1;
4612 r34=m128iS2;
4613 r35=m128iS3;
4614 r36=m128iS4;
4615 r37=m128iS5;
4616 r38=m128iS6;
4617 r39=m128iS7;
4618 r40=m128iS8;
4619 r41=m128iS9;
4620 r42=m128iS10;
4621 r43=m128iS11;
4622 r44=m128iS12;
4623 r45=m128iS13;
4624 r46=m128iS14;
4625 r47=m128iS15;
4626 r48=m128iS16;
4627 r49=m128iS17;
4628 r50=m128iS18;
4629 r51=m128iS19;
4630 r52=m128iS20;
4631 r53=m128iS21;
4632 r54=m128iS22;
4633 r55=m128iS23;
4634 r56=m128iS24;
4635 r57=m128iS25;
4636 r58=m128iS26;
4637 r59=m128iS27;
4638 r60=m128iS28;
4639 r61=m128iS29;
4640 r62=m128iS30;
4641 r63=m128iS31;
4642
4643 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4644 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4645 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4646 m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4647 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4648 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4649 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4650 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4651 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4652 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4653 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4654 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4655 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4656 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4657 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4658 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4659
4660 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4661 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4662 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4663 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4664 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4665 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4666 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4667 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4668 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4669 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4670 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4671 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4672 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4673 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4674 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4675 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4676
4677
4678 }else if(i ==16){
4679
4680 r64=m128iS0;
4681 r65=m128iS1;
4682 r66=m128iS2;
4683 r67=m128iS3;
4684 r68=m128iS4;
4685 r69=m128iS5;
4686 r70=m128iS6;
4687 r71=m128iS7;
4688 r72=m128iS8;
4689 r73=m128iS9;
4690 r74=m128iS10;
4691 r75=m128iS11;
4692 r76=m128iS12;
4693 r77=m128iS13;
4694 r78=m128iS14;
4695 r79=m128iS15;
4696 r80=m128iS16;
4697 r81=m128iS17;
4698 r82=m128iS18;
4699 r83=m128iS19;
4700 r84=m128iS20;
4701 r85=m128iS21;
4702 r86=m128iS22;
4703 r87=m128iS23;
4704 r88=m128iS24;
4705 r89=m128iS25;
4706 r90=m128iS26;
4707 r91=m128iS27;
4708 r92=m128iS28;
4709 r93=m128iS29;
4710 r94=m128iS30;
4711 r95=m128iS31;
4712
4713 m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4714 m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4715 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4716 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4717 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4718 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4719 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4720 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4721 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4722 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4723 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4724 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4725 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4726 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4727 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4728 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4729
4730 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4731 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4732 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4733 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4734 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4735 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4736 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4737 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4738 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4739 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4740 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4741 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4742 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4743 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4744 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4745 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4746
4747 }else{
4748 r96=m128iS0;
4749 r97=m128iS1;
4750 r98=m128iS2;
4751 r99=m128iS3;
4752 r100=m128iS4;
4753 r101=m128iS5;
4754 r102=m128iS6;
4755 r103=m128iS7;
4756 r104=m128iS8;
4757 r105=m128iS9;
4758 r106=m128iS10;
4759 r107=m128iS11;
4760 r108=m128iS12;
4761 r109=m128iS13;
4762 r110=m128iS14;
4763 r111=m128iS15;
4764 r112=m128iS16;
4765 r113=m128iS17;
4766 r114=m128iS18;
4767 r115=m128iS19;
4768 r116=m128iS20;
4769 r117=m128iS21;
4770 r118=m128iS22;
4771 r119=m128iS23;
4772 r120=m128iS24;
4773 r121=m128iS25;
4774 r122=m128iS26;
4775 r123=m128iS27;
4776 r124=m128iS28;
4777 r125=m128iS29;
4778 r126=m128iS30;
4779 r127=m128iS31;
4780
4781 //load data for next j :
4782 m128iS0 = r0;
4783 m128iS1 = r4;
4784 m128iS2 = r8;
4785 m128iS3 = r12;
4786 m128iS4 = r16;
4787 m128iS5 = r20;
4788 m128iS6 = r24;
4789 m128iS7 = r28;
4790 m128iS8 = r32;
4791 m128iS9 = r36;
4792 m128iS10 = r40;
4793 m128iS11 = r44;
4794 m128iS12 = r48;
4795 m128iS13 = r52;
4796 m128iS14 = r56;
4797 m128iS15 = r60;
4798 m128iS16 = r64;
4799 m128iS17 = r68;
4800 m128iS18 = r72;
4801 m128iS19 = r76;
4802 m128iS20 = r80;
4803 m128iS21 = r84;
4804 m128iS22 = r88;
4805 m128iS23 = r92;
4806 m128iS24 = r96;
4807 m128iS25 = r100;
4808 m128iS26 = r104;
4809 m128iS27 = r108;
4810 m128iS28 = r112;
4811 m128iS29 = r116;
4812 m128iS30 = r120;
4813 m128iS31 =r124;
4814 shift = shift_2nd;
4815 m128iAdd = _mm_set1_epi32(add_2nd);
4816
4817
4818 }
4819
4820 } else {
4821
4822 //Transpose Matrix
4823
4824 E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4825 E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4826 E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4827 E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4828 E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4829 E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4830 E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4831 E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4832 E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4833 E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4834 E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4835 E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4836 E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4837 E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4838 E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4839 E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4840
4841
4842 E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4843 E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4844 E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4845 E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4846 E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4847 E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4848 E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4849 E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4850 E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4851 E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4852 E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4853 E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4854 E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4855 E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4856 E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4857 E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4858
4859 m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4860 m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4861 m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4862 m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4863 m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4864 m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4865 m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4866 m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4867
4868 m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4869 m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4870
4871
4872 m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4873 m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4874
4875 //second row
4876
4877 m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4878 m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4879
4880 m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4881 m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4882
4883 //third row
4884
4885 m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4886 m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4887 m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4888 m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4889 m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4890 m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4891 m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4892 m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4893
4894
4895 m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4896 m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4897
4898 m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4899 m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4900
4901 //fourth row
4902
4903 m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4904 m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4905
4906 m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4907 m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4908
4909                 //fifth row
4910
4911 m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4912 m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4913 m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4914 m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4915 m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4916 m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4917 m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4918 m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4919
4920 m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4921 m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4922
4923
4924 m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4925 m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4926
4927 //sixth row
4928
4929 m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4930 m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4931
4932
4933 m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4934 m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4935
4936 //seventh row
4937
4938 m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4939 m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4940 m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4941 m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4942 m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4943 m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4944 m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4945 m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4946
4947
4948 m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4949 m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4950
4951
4952 m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4953 m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4954
4955 //last row
4956
4957
4958 m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4959 m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4960
4961 m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4962 m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4963
4964
4965 m128Tmp0=_mm_setzero_si128();
4966
4967
4968 //store
4969 dst = (uint8_t*) _dst + i*stride;
4970
4971
4972 E0l= _mm_load_si128((__m128i*)dst); //16 values
4973 E1l= _mm_load_si128((__m128i*)(dst+16));
4974 E2l= _mm_load_si128((__m128i*)(dst+stride));
4975 E3l= _mm_load_si128((__m128i*)(dst+stride+16));
4976 E4l= _mm_load_si128((__m128i*)(dst+2*stride));
4977 E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
4978 E6l= _mm_load_si128((__m128i*)(dst+3*stride));
4979 E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
4980 E8l= _mm_load_si128((__m128i*)(dst+4*stride));
4981 E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
4982 E10l= _mm_load_si128((__m128i*)(dst+5*stride));
4983 E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
4984 E12l= _mm_load_si128((__m128i*)(dst+6*stride));
4985 E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
4986 E14l= _mm_load_si128((__m128i*)(dst+7*stride));
4987 E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
4988
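                /* Note (reading of the code): residual/prediction merge for the
                   second pass. The 8-bit destination rows loaded above are
                   widened against the zero register, added to the 16-bit
                   results with signed saturation (_mm_adds_epi16), and packed
                   back to 8 bits with unsigned saturation (_mm_packus_epi16)
                   before the stores below. */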
4989 m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
4990 m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
4991 m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
4992
4993 m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
4994 m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
4995 m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
4996
4997 m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
4998 m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
4999 m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5000
5001 m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5002 m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5003 m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5004
5005 m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5006 m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5007 m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5008
5009 m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5010 m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5011 m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5012
5013 m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5014 m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5015 m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5016
5017 m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5018 m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5019 m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5020
5021 m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5022 m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5023 m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5024
5025 m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5026 m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5027 m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5028
5029 m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5030 m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5031 m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5032
5033 m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5034 m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5035 m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5036
5037 m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5038 m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5039 m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5040
5041 m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5042 m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5043 m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5044
5045 m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5046 m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5047 m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5048
5049 m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5050 m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5051 m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5052
5053
5054 _mm_store_si128((__m128i*)dst,m128iS0);
5055 _mm_store_si128((__m128i*)(dst+16),m128iS2);
5056 _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5057 _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5058 _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5059 _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5060 _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5061 _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5062 _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5063 _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5064 _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5065 _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5066 _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5067 _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5068 _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5069 _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5070
5071
5072 if(i==0){
5073 //load next values :
5074 m128iS0 = r1;
5075 m128iS1 = r5;
5076 m128iS2 = r9;
5077 m128iS3 = r13;
5078 m128iS4 = r17;
5079 m128iS5 = r21;
5080 m128iS6 = r25;
5081 m128iS7 = r29;
5082 m128iS8 = r33;
5083 m128iS9 = r37;
5084 m128iS10 = r41;
5085 m128iS11 = r45;
5086 m128iS12 = r49;
5087 m128iS13 = r53;
5088 m128iS14 = r57;
5089 m128iS15 = r61;
5090 m128iS16 = r65;
5091 m128iS17 = r69;
5092 m128iS18 = r73;
5093 m128iS19 = r77;
5094 m128iS20 = r81;
5095 m128iS21 = r85;
5096 m128iS22 = r89;
5097 m128iS23 = r93;
5098 m128iS24 = r97;
5099 m128iS25 = r101;
5100 m128iS26 = r105;
5101 m128iS27 = r109;
5102 m128iS28 = r113;
5103 m128iS29 = r117;
5104 m128iS30 = r121;
5105                     m128iS31 = r125;
5106
5107                 }else if(i==8){
5108 //load next values :
5109 m128iS0 = r2;
5110 m128iS1 = r6;
5111 m128iS2 = r10;
5112 m128iS3 = r14;
5113 m128iS4 = r18;
5114 m128iS5 = r22;
5115 m128iS6 = r26;
5116 m128iS7 = r30;
5117 m128iS8 = r34;
5118 m128iS9 = r38;
5119 m128iS10 = r42;
5120 m128iS11 = r46;
5121 m128iS12 = r50;
5122 m128iS13 = r54;
5123 m128iS14 = r58;
5124 m128iS15 = r62;
5125 m128iS16 = r66;
5126 m128iS17 = r70;
5127 m128iS18 = r74;
5128 m128iS19 = r78;
5129 m128iS20 = r82;
5130 m128iS21 = r86;
5131 m128iS22 = r90;
5132 m128iS23 = r94;
5133 m128iS24 = r98;
5134 m128iS25 = r102;
5135 m128iS26 = r106;
5136 m128iS27 = r110;
5137 m128iS28 = r114;
5138 m128iS29 = r118;
5139 m128iS30 = r122;
5140                     m128iS31 = r126;
5141
5142 }else if(i==16)
5143 {
5144 //load next values :
5145 m128iS0 = r3;
5146 m128iS1 = r7;
5147 m128iS2 = r11;
5148 m128iS3 = r15;
5149 m128iS4 = r19;
5150 m128iS5 = r23;
5151 m128iS6 = r27;
5152 m128iS7 = r31;
5153 m128iS8 = r35;
5154 m128iS9 = r39;
5155 m128iS10 = r43;
5156 m128iS11 = r47;
5157 m128iS12 = r51;
5158 m128iS13 = r55;
5159 m128iS14 = r59;
5160 m128iS15 = r63;
5161 m128iS16 = r67;
5162 m128iS17 = r71;
5163 m128iS18 = r75;
5164 m128iS19 = r79;
5165 m128iS20 = r83;
5166 m128iS21 = r87;
5167 m128iS22 = r91;
5168 m128iS23 = r95;
5169 m128iS24 = r99;
5170 m128iS25 = r103;
5171 m128iS26 = r107;
5172 m128iS27 = r111;
5173 m128iS28 = r115;
5174 m128iS29 = r119;
5175 m128iS30 = r123;
5176                     m128iS31 = r127;
5177 }
5178 }
5179 }
5180 }
5181 }
5182 #endif
5183
5184
5185 #if 0
5186 void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
5187 ptrdiff_t _stride) {
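    /* 32x32 inverse transform and add for 10-bit samples; this variant is currently disabled by the enclosing #if 0 */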
5188 int i, j;
5189 uint16_t *dst = (uint16_t*) _dst;
5190 ptrdiff_t stride = _stride / 2;
5191 int shift;
5192     uint8_t shift_2nd = 10; // 20 - bit depth (bit depth = 10)
5193     uint16_t add_2nd = 1 << 9; // 1 << (shift_2nd - 1)
5194 int16_t *src = coeffs;
5195
5196 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5197 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5198 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5199 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5200 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5201 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5202 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5203 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5204 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5205 EEE0l, EEE1l, EEE0h, EEE1h;
5206 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5207 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5208 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5209 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5210 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5211 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5212 EE4l, EE7h, EE6h, EE5h, EE4h;
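    /* load the first eight coefficients of each of the 32 input rows (row stride of 32 int16_t) */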
5213 m128iS0 = _mm_load_si128((__m128i *) (src));
5214 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5215 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5216 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5217     m128iS4 = _mm_load_si128((__m128i *) (src + 128));
5218 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5219 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5220 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5221 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5222 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5223 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5224 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5225     m128iS12 = _mm_load_si128((__m128i *) (src + 384));
5226 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5227 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5228 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5229 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5230 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5231 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5232 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5233 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5234 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5235 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5236 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5237 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5238 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5239 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5240 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5241 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5242 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5243 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5244 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5245
5246 shift = shift_1st;
5247 m128iAdd = _mm_set1_epi32(add_1st);
5248
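    /* two passes: j == 0 applies the vertical transform with shift_1st/add_1st and transposes its results;
       j == 1 applies the horizontal transform (presumably switching to shift_2nd/add_2nd further below) */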
5249 for (j = 0; j < 2; j++) {
5250 for (i = 0; i < 32; i += 8) {
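            /* odd part: O0..O15 are dot products of the 16 odd-indexed rows (S1, S3, ..., S31) with the transform32x32 coefficient tables */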
5251 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5252 E0l = _mm_madd_epi16(m128Tmp0,
5253 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5254 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5255 E0h = _mm_madd_epi16(m128Tmp1,
5256 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5257
5258 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5259 E1l = _mm_madd_epi16(m128Tmp2,
5260 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5261 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5262 E1h = _mm_madd_epi16(m128Tmp3,
5263 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5264
5265 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5266 E2l = _mm_madd_epi16(m128Tmp4,
5267 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5268 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5269 E2h = _mm_madd_epi16(m128Tmp5,
5270 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5271
5272 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5273 E3l = _mm_madd_epi16(m128Tmp6,
5274 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5275 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5276 E3h = _mm_madd_epi16(m128Tmp7,
5277 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5278
5279 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5280 E4l = _mm_madd_epi16(m128Tmp8,
5281 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5282 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5283 E4h = _mm_madd_epi16(m128Tmp9,
5284 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5285
5286 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5287 E5l = _mm_madd_epi16(m128Tmp10,
5288 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5289 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5290 E5h = _mm_madd_epi16(m128Tmp11,
5291 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5292
5293 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5294 E6l = _mm_madd_epi16(m128Tmp12,
5295 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5296 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5297 E6h = _mm_madd_epi16(m128Tmp13,
5298 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5299
5300 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5301 E7l = _mm_madd_epi16(m128Tmp14,
5302 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5303 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5304 E7h = _mm_madd_epi16(m128Tmp15,
5305 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5306
5307 O0l = _mm_add_epi32(E0l, E1l);
5308 O0l = _mm_add_epi32(O0l, E2l);
5309 O0l = _mm_add_epi32(O0l, E3l);
5310 O0l = _mm_add_epi32(O0l, E4l);
5311 O0l = _mm_add_epi32(O0l, E5l);
5312 O0l = _mm_add_epi32(O0l, E6l);
5313 O0l = _mm_add_epi32(O0l, E7l);
5314
5315 O0h = _mm_add_epi32(E0h, E1h);
5316 O0h = _mm_add_epi32(O0h, E2h);
5317 O0h = _mm_add_epi32(O0h, E3h);
5318 O0h = _mm_add_epi32(O0h, E4h);
5319 O0h = _mm_add_epi32(O0h, E5h);
5320 O0h = _mm_add_epi32(O0h, E6h);
5321 O0h = _mm_add_epi32(O0h, E7h);
5322
5323 /* Compute O1*/
5324 E0l = _mm_madd_epi16(m128Tmp0,
5325 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5326 E0h = _mm_madd_epi16(m128Tmp1,
5327 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5328 E1l = _mm_madd_epi16(m128Tmp2,
5329 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5330 E1h = _mm_madd_epi16(m128Tmp3,
5331 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5332 E2l = _mm_madd_epi16(m128Tmp4,
5333 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5334 E2h = _mm_madd_epi16(m128Tmp5,
5335 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5336 E3l = _mm_madd_epi16(m128Tmp6,
5337 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5338 E3h = _mm_madd_epi16(m128Tmp7,
5339 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5340
5341 E4l = _mm_madd_epi16(m128Tmp8,
5342 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5343 E4h = _mm_madd_epi16(m128Tmp9,
5344 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5345 E5l = _mm_madd_epi16(m128Tmp10,
5346 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5347 E5h = _mm_madd_epi16(m128Tmp11,
5348 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5349 E6l = _mm_madd_epi16(m128Tmp12,
5350 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5351 E6h = _mm_madd_epi16(m128Tmp13,
5352 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5353 E7l = _mm_madd_epi16(m128Tmp14,
5354 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5355 E7h = _mm_madd_epi16(m128Tmp15,
5356 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5357
5358 O1l = _mm_add_epi32(E0l, E1l);
5359 O1l = _mm_add_epi32(O1l, E2l);
5360 O1l = _mm_add_epi32(O1l, E3l);
5361 O1l = _mm_add_epi32(O1l, E4l);
5362 O1l = _mm_add_epi32(O1l, E5l);
5363 O1l = _mm_add_epi32(O1l, E6l);
5364 O1l = _mm_add_epi32(O1l, E7l);
5365
5366 O1h = _mm_add_epi32(E0h, E1h);
5367 O1h = _mm_add_epi32(O1h, E2h);
5368 O1h = _mm_add_epi32(O1h, E3h);
5369 O1h = _mm_add_epi32(O1h, E4h);
5370 O1h = _mm_add_epi32(O1h, E5h);
5371 O1h = _mm_add_epi32(O1h, E6h);
5372 O1h = _mm_add_epi32(O1h, E7h);
5373 /* Compute O2*/
5374 E0l = _mm_madd_epi16(m128Tmp0,
5375 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5376 E0h = _mm_madd_epi16(m128Tmp1,
5377 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5378 E1l = _mm_madd_epi16(m128Tmp2,
5379 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5380 E1h = _mm_madd_epi16(m128Tmp3,
5381 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5382 E2l = _mm_madd_epi16(m128Tmp4,
5383 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5384 E2h = _mm_madd_epi16(m128Tmp5,
5385 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5386 E3l = _mm_madd_epi16(m128Tmp6,
5387 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5388 E3h = _mm_madd_epi16(m128Tmp7,
5389 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5390
5391 E4l = _mm_madd_epi16(m128Tmp8,
5392 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5393 E4h = _mm_madd_epi16(m128Tmp9,
5394 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5395 E5l = _mm_madd_epi16(m128Tmp10,
5396 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5397 E5h = _mm_madd_epi16(m128Tmp11,
5398 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5399 E6l = _mm_madd_epi16(m128Tmp12,
5400 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5401 E6h = _mm_madd_epi16(m128Tmp13,
5402 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5403 E7l = _mm_madd_epi16(m128Tmp14,
5404 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5405 E7h = _mm_madd_epi16(m128Tmp15,
5406 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5407
5408 O2l = _mm_add_epi32(E0l, E1l);
5409 O2l = _mm_add_epi32(O2l, E2l);
5410 O2l = _mm_add_epi32(O2l, E3l);
5411 O2l = _mm_add_epi32(O2l, E4l);
5412 O2l = _mm_add_epi32(O2l, E5l);
5413 O2l = _mm_add_epi32(O2l, E6l);
5414 O2l = _mm_add_epi32(O2l, E7l);
5415
5416 O2h = _mm_add_epi32(E0h, E1h);
5417 O2h = _mm_add_epi32(O2h, E2h);
5418 O2h = _mm_add_epi32(O2h, E3h);
5419 O2h = _mm_add_epi32(O2h, E4h);
5420 O2h = _mm_add_epi32(O2h, E5h);
5421 O2h = _mm_add_epi32(O2h, E6h);
5422 O2h = _mm_add_epi32(O2h, E7h);
5423 /* Compute O3*/
5424 E0l = _mm_madd_epi16(m128Tmp0,
5425 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5426 E0h = _mm_madd_epi16(m128Tmp1,
5427 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5428 E1l = _mm_madd_epi16(m128Tmp2,
5429 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5430 E1h = _mm_madd_epi16(m128Tmp3,
5431 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5432 E2l = _mm_madd_epi16(m128Tmp4,
5433 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5434 E2h = _mm_madd_epi16(m128Tmp5,
5435 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5436 E3l = _mm_madd_epi16(m128Tmp6,
5437 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5438 E3h = _mm_madd_epi16(m128Tmp7,
5439 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5440
5441 E4l = _mm_madd_epi16(m128Tmp8,
5442 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5443 E4h = _mm_madd_epi16(m128Tmp9,
5444 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5445 E5l = _mm_madd_epi16(m128Tmp10,
5446 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5447 E5h = _mm_madd_epi16(m128Tmp11,
5448 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5449 E6l = _mm_madd_epi16(m128Tmp12,
5450 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5451 E6h = _mm_madd_epi16(m128Tmp13,
5452 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5453 E7l = _mm_madd_epi16(m128Tmp14,
5454 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5455 E7h = _mm_madd_epi16(m128Tmp15,
5456 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5457
5458 O3l = _mm_add_epi32(E0l, E1l);
5459 O3l = _mm_add_epi32(O3l, E2l);
5460 O3l = _mm_add_epi32(O3l, E3l);
5461 O3l = _mm_add_epi32(O3l, E4l);
5462 O3l = _mm_add_epi32(O3l, E5l);
5463 O3l = _mm_add_epi32(O3l, E6l);
5464 O3l = _mm_add_epi32(O3l, E7l);
5465
5466 O3h = _mm_add_epi32(E0h, E1h);
5467 O3h = _mm_add_epi32(O3h, E2h);
5468 O3h = _mm_add_epi32(O3h, E3h);
5469 O3h = _mm_add_epi32(O3h, E4h);
5470 O3h = _mm_add_epi32(O3h, E5h);
5471 O3h = _mm_add_epi32(O3h, E6h);
5472 O3h = _mm_add_epi32(O3h, E7h);
5473 /* Compute O4*/
5474
5475 E0l = _mm_madd_epi16(m128Tmp0,
5476 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5477 E0h = _mm_madd_epi16(m128Tmp1,
5478 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5479 E1l = _mm_madd_epi16(m128Tmp2,
5480 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5481 E1h = _mm_madd_epi16(m128Tmp3,
5482 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5483 E2l = _mm_madd_epi16(m128Tmp4,
5484 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5485 E2h = _mm_madd_epi16(m128Tmp5,
5486 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5487 E3l = _mm_madd_epi16(m128Tmp6,
5488 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5489 E3h = _mm_madd_epi16(m128Tmp7,
5490 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5491
5492 E4l = _mm_madd_epi16(m128Tmp8,
5493 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5494 E4h = _mm_madd_epi16(m128Tmp9,
5495 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5496 E5l = _mm_madd_epi16(m128Tmp10,
5497 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5498 E5h = _mm_madd_epi16(m128Tmp11,
5499 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5500 E6l = _mm_madd_epi16(m128Tmp12,
5501 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5502 E6h = _mm_madd_epi16(m128Tmp13,
5503 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5504 E7l = _mm_madd_epi16(m128Tmp14,
5505 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5506 E7h = _mm_madd_epi16(m128Tmp15,
5507 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5508
5509 O4l = _mm_add_epi32(E0l, E1l);
5510 O4l = _mm_add_epi32(O4l, E2l);
5511 O4l = _mm_add_epi32(O4l, E3l);
5512 O4l = _mm_add_epi32(O4l, E4l);
5513 O4l = _mm_add_epi32(O4l, E5l);
5514 O4l = _mm_add_epi32(O4l, E6l);
5515 O4l = _mm_add_epi32(O4l, E7l);
5516
5517 O4h = _mm_add_epi32(E0h, E1h);
5518 O4h = _mm_add_epi32(O4h, E2h);
5519 O4h = _mm_add_epi32(O4h, E3h);
5520 O4h = _mm_add_epi32(O4h, E4h);
5521 O4h = _mm_add_epi32(O4h, E5h);
5522 O4h = _mm_add_epi32(O4h, E6h);
5523 O4h = _mm_add_epi32(O4h, E7h);
5524
5525 /* Compute O5*/
5526 E0l = _mm_madd_epi16(m128Tmp0,
5527 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5528 E0h = _mm_madd_epi16(m128Tmp1,
5529 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5530 E1l = _mm_madd_epi16(m128Tmp2,
5531 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5532 E1h = _mm_madd_epi16(m128Tmp3,
5533 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5534 E2l = _mm_madd_epi16(m128Tmp4,
5535 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5536 E2h = _mm_madd_epi16(m128Tmp5,
5537 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5538 E3l = _mm_madd_epi16(m128Tmp6,
5539 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5540 E3h = _mm_madd_epi16(m128Tmp7,
5541 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5542
5543 E4l = _mm_madd_epi16(m128Tmp8,
5544 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5545 E4h = _mm_madd_epi16(m128Tmp9,
5546 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5547 E5l = _mm_madd_epi16(m128Tmp10,
5548 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5549 E5h = _mm_madd_epi16(m128Tmp11,
5550 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5551 E6l = _mm_madd_epi16(m128Tmp12,
5552 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5553 E6h = _mm_madd_epi16(m128Tmp13,
5554 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5555 E7l = _mm_madd_epi16(m128Tmp14,
5556 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5557 E7h = _mm_madd_epi16(m128Tmp15,
5558 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5559
5560 O5l = _mm_add_epi32(E0l, E1l);
5561 O5l = _mm_add_epi32(O5l, E2l);
5562 O5l = _mm_add_epi32(O5l, E3l);
5563 O5l = _mm_add_epi32(O5l, E4l);
5564 O5l = _mm_add_epi32(O5l, E5l);
5565 O5l = _mm_add_epi32(O5l, E6l);
5566 O5l = _mm_add_epi32(O5l, E7l);
5567
5568 O5h = _mm_add_epi32(E0h, E1h);
5569 O5h = _mm_add_epi32(O5h, E2h);
5570 O5h = _mm_add_epi32(O5h, E3h);
5571 O5h = _mm_add_epi32(O5h, E4h);
5572 O5h = _mm_add_epi32(O5h, E5h);
5573 O5h = _mm_add_epi32(O5h, E6h);
5574 O5h = _mm_add_epi32(O5h, E7h);
5575
5576 /* Compute O6*/
5577
5578 E0l = _mm_madd_epi16(m128Tmp0,
5579 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5580 E0h = _mm_madd_epi16(m128Tmp1,
5581 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5582 E1l = _mm_madd_epi16(m128Tmp2,
5583 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5584 E1h = _mm_madd_epi16(m128Tmp3,
5585 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5586 E2l = _mm_madd_epi16(m128Tmp4,
5587 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5588 E2h = _mm_madd_epi16(m128Tmp5,
5589 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5590 E3l = _mm_madd_epi16(m128Tmp6,
5591 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5592 E3h = _mm_madd_epi16(m128Tmp7,
5593 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5594
5595 E4l = _mm_madd_epi16(m128Tmp8,
5596 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5597 E4h = _mm_madd_epi16(m128Tmp9,
5598 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5599 E5l = _mm_madd_epi16(m128Tmp10,
5600 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5601 E5h = _mm_madd_epi16(m128Tmp11,
5602 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5603 E6l = _mm_madd_epi16(m128Tmp12,
5604 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5605 E6h = _mm_madd_epi16(m128Tmp13,
5606 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5607 E7l = _mm_madd_epi16(m128Tmp14,
5608 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5609 E7h = _mm_madd_epi16(m128Tmp15,
5610 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5611
5612 O6l = _mm_add_epi32(E0l, E1l);
5613 O6l = _mm_add_epi32(O6l, E2l);
5614 O6l = _mm_add_epi32(O6l, E3l);
5615 O6l = _mm_add_epi32(O6l, E4l);
5616 O6l = _mm_add_epi32(O6l, E5l);
5617 O6l = _mm_add_epi32(O6l, E6l);
5618 O6l = _mm_add_epi32(O6l, E7l);
5619
5620 O6h = _mm_add_epi32(E0h, E1h);
5621 O6h = _mm_add_epi32(O6h, E2h);
5622 O6h = _mm_add_epi32(O6h, E3h);
5623 O6h = _mm_add_epi32(O6h, E4h);
5624 O6h = _mm_add_epi32(O6h, E5h);
5625 O6h = _mm_add_epi32(O6h, E6h);
5626 O6h = _mm_add_epi32(O6h, E7h);
5627
5628 /* Compute O7*/
5629
5630 E0l = _mm_madd_epi16(m128Tmp0,
5631 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5632 E0h = _mm_madd_epi16(m128Tmp1,
5633 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5634 E1l = _mm_madd_epi16(m128Tmp2,
5635 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5636 E1h = _mm_madd_epi16(m128Tmp3,
5637 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5638 E2l = _mm_madd_epi16(m128Tmp4,
5639 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5640 E2h = _mm_madd_epi16(m128Tmp5,
5641 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5642 E3l = _mm_madd_epi16(m128Tmp6,
5643 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5644 E3h = _mm_madd_epi16(m128Tmp7,
5645 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5646
5647 E4l = _mm_madd_epi16(m128Tmp8,
5648 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5649 E4h = _mm_madd_epi16(m128Tmp9,
5650 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5651 E5l = _mm_madd_epi16(m128Tmp10,
5652 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5653 E5h = _mm_madd_epi16(m128Tmp11,
5654 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5655 E6l = _mm_madd_epi16(m128Tmp12,
5656 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5657 E6h = _mm_madd_epi16(m128Tmp13,
5658 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5659 E7l = _mm_madd_epi16(m128Tmp14,
5660 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5661 E7h = _mm_madd_epi16(m128Tmp15,
5662 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5663
5664 O7l = _mm_add_epi32(E0l, E1l);
5665 O7l = _mm_add_epi32(O7l, E2l);
5666 O7l = _mm_add_epi32(O7l, E3l);
5667 O7l = _mm_add_epi32(O7l, E4l);
5668 O7l = _mm_add_epi32(O7l, E5l);
5669 O7l = _mm_add_epi32(O7l, E6l);
5670 O7l = _mm_add_epi32(O7l, E7l);
5671
5672 O7h = _mm_add_epi32(E0h, E1h);
5673 O7h = _mm_add_epi32(O7h, E2h);
5674 O7h = _mm_add_epi32(O7h, E3h);
5675 O7h = _mm_add_epi32(O7h, E4h);
5676 O7h = _mm_add_epi32(O7h, E5h);
5677 O7h = _mm_add_epi32(O7h, E6h);
5678 O7h = _mm_add_epi32(O7h, E7h);
5679
5680 /* Compute O8*/
5681
5682 E0l = _mm_madd_epi16(m128Tmp0,
5683 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5684 E0h = _mm_madd_epi16(m128Tmp1,
5685 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5686 E1l = _mm_madd_epi16(m128Tmp2,
5687 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5688 E1h = _mm_madd_epi16(m128Tmp3,
5689 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5690 E2l = _mm_madd_epi16(m128Tmp4,
5691 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5692 E2h = _mm_madd_epi16(m128Tmp5,
5693 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5694 E3l = _mm_madd_epi16(m128Tmp6,
5695 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5696 E3h = _mm_madd_epi16(m128Tmp7,
5697 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5698
5699 E4l = _mm_madd_epi16(m128Tmp8,
5700 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5701 E4h = _mm_madd_epi16(m128Tmp9,
5702 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5703 E5l = _mm_madd_epi16(m128Tmp10,
5704 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5705 E5h = _mm_madd_epi16(m128Tmp11,
5706 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5707 E6l = _mm_madd_epi16(m128Tmp12,
5708 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5709 E6h = _mm_madd_epi16(m128Tmp13,
5710 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5711 E7l = _mm_madd_epi16(m128Tmp14,
5712 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5713 E7h = _mm_madd_epi16(m128Tmp15,
5714 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5715
5716 O8l = _mm_add_epi32(E0l, E1l);
5717 O8l = _mm_add_epi32(O8l, E2l);
5718 O8l = _mm_add_epi32(O8l, E3l);
5719 O8l = _mm_add_epi32(O8l, E4l);
5720 O8l = _mm_add_epi32(O8l, E5l);
5721 O8l = _mm_add_epi32(O8l, E6l);
5722 O8l = _mm_add_epi32(O8l, E7l);
5723
5724 O8h = _mm_add_epi32(E0h, E1h);
5725 O8h = _mm_add_epi32(O8h, E2h);
5726 O8h = _mm_add_epi32(O8h, E3h);
5727 O8h = _mm_add_epi32(O8h, E4h);
5728 O8h = _mm_add_epi32(O8h, E5h);
5729 O8h = _mm_add_epi32(O8h, E6h);
5730 O8h = _mm_add_epi32(O8h, E7h);
5731
5732 /* Compute O9*/
5733
5734 E0l = _mm_madd_epi16(m128Tmp0,
5735 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5736 E0h = _mm_madd_epi16(m128Tmp1,
5737 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5738 E1l = _mm_madd_epi16(m128Tmp2,
5739 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5740 E1h = _mm_madd_epi16(m128Tmp3,
5741 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5742 E2l = _mm_madd_epi16(m128Tmp4,
5743 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5744 E2h = _mm_madd_epi16(m128Tmp5,
5745 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5746 E3l = _mm_madd_epi16(m128Tmp6,
5747 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5748 E3h = _mm_madd_epi16(m128Tmp7,
5749 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5750
5751 E4l = _mm_madd_epi16(m128Tmp8,
5752 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5753 E4h = _mm_madd_epi16(m128Tmp9,
5754 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5755 E5l = _mm_madd_epi16(m128Tmp10,
5756 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5757 E5h = _mm_madd_epi16(m128Tmp11,
5758 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5759 E6l = _mm_madd_epi16(m128Tmp12,
5760 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5761 E6h = _mm_madd_epi16(m128Tmp13,
5762 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5763 E7l = _mm_madd_epi16(m128Tmp14,
5764 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5765 E7h = _mm_madd_epi16(m128Tmp15,
5766 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5767
5768 O9l = _mm_add_epi32(E0l, E1l);
5769 O9l = _mm_add_epi32(O9l, E2l);
5770 O9l = _mm_add_epi32(O9l, E3l);
5771 O9l = _mm_add_epi32(O9l, E4l);
5772 O9l = _mm_add_epi32(O9l, E5l);
5773 O9l = _mm_add_epi32(O9l, E6l);
5774 O9l = _mm_add_epi32(O9l, E7l);
5775
5776 O9h = _mm_add_epi32(E0h, E1h);
5777 O9h = _mm_add_epi32(O9h, E2h);
5778 O9h = _mm_add_epi32(O9h, E3h);
5779 O9h = _mm_add_epi32(O9h, E4h);
5780 O9h = _mm_add_epi32(O9h, E5h);
5781 O9h = _mm_add_epi32(O9h, E6h);
5782 O9h = _mm_add_epi32(O9h, E7h);
5783
5784             /* Compute O10 */
5785
5786 E0l = _mm_madd_epi16(m128Tmp0,
5787 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5788 E0h = _mm_madd_epi16(m128Tmp1,
5789 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5790 E1l = _mm_madd_epi16(m128Tmp2,
5791 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5792 E1h = _mm_madd_epi16(m128Tmp3,
5793 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5794 E2l = _mm_madd_epi16(m128Tmp4,
5795 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5796 E2h = _mm_madd_epi16(m128Tmp5,
5797 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5798 E3l = _mm_madd_epi16(m128Tmp6,
5799 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5800 E3h = _mm_madd_epi16(m128Tmp7,
5801 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5802
5803 E4l = _mm_madd_epi16(m128Tmp8,
5804 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5805 E4h = _mm_madd_epi16(m128Tmp9,
5806 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5807 E5l = _mm_madd_epi16(m128Tmp10,
5808 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5809 E5h = _mm_madd_epi16(m128Tmp11,
5810 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5811 E6l = _mm_madd_epi16(m128Tmp12,
5812 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5813 E6h = _mm_madd_epi16(m128Tmp13,
5814 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5815 E7l = _mm_madd_epi16(m128Tmp14,
5816 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5817 E7h = _mm_madd_epi16(m128Tmp15,
5818 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5819
5820 O10l = _mm_add_epi32(E0l, E1l);
5821 O10l = _mm_add_epi32(O10l, E2l);
5822 O10l = _mm_add_epi32(O10l, E3l);
5823 O10l = _mm_add_epi32(O10l, E4l);
5824 O10l = _mm_add_epi32(O10l, E5l);
5825 O10l = _mm_add_epi32(O10l, E6l);
5826 O10l = _mm_add_epi32(O10l, E7l);
5827
5828 O10h = _mm_add_epi32(E0h, E1h);
5829 O10h = _mm_add_epi32(O10h, E2h);
5830 O10h = _mm_add_epi32(O10h, E3h);
5831 O10h = _mm_add_epi32(O10h, E4h);
5832 O10h = _mm_add_epi32(O10h, E5h);
5833 O10h = _mm_add_epi32(O10h, E6h);
5834 O10h = _mm_add_epi32(O10h, E7h);
5835
5836             /* Compute O11 */
5837
5838 E0l = _mm_madd_epi16(m128Tmp0,
5839 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5840 E0h = _mm_madd_epi16(m128Tmp1,
5841 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5842 E1l = _mm_madd_epi16(m128Tmp2,
5843 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5844 E1h = _mm_madd_epi16(m128Tmp3,
5845 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5846 E2l = _mm_madd_epi16(m128Tmp4,
5847 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5848 E2h = _mm_madd_epi16(m128Tmp5,
5849 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5850 E3l = _mm_madd_epi16(m128Tmp6,
5851 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5852 E3h = _mm_madd_epi16(m128Tmp7,
5853 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5854
5855 E4l = _mm_madd_epi16(m128Tmp8,
5856 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5857 E4h = _mm_madd_epi16(m128Tmp9,
5858 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5859 E5l = _mm_madd_epi16(m128Tmp10,
5860 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5861 E5h = _mm_madd_epi16(m128Tmp11,
5862 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5863 E6l = _mm_madd_epi16(m128Tmp12,
5864 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5865 E6h = _mm_madd_epi16(m128Tmp13,
5866 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5867 E7l = _mm_madd_epi16(m128Tmp14,
5868 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5869 E7h = _mm_madd_epi16(m128Tmp15,
5870 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5871
5872 O11l = _mm_add_epi32(E0l, E1l);
5873 O11l = _mm_add_epi32(O11l, E2l);
5874 O11l = _mm_add_epi32(O11l, E3l);
5875 O11l = _mm_add_epi32(O11l, E4l);
5876 O11l = _mm_add_epi32(O11l, E5l);
5877 O11l = _mm_add_epi32(O11l, E6l);
5878 O11l = _mm_add_epi32(O11l, E7l);
5879
5880 O11h = _mm_add_epi32(E0h, E1h);
5881 O11h = _mm_add_epi32(O11h, E2h);
5882 O11h = _mm_add_epi32(O11h, E3h);
5883 O11h = _mm_add_epi32(O11h, E4h);
5884 O11h = _mm_add_epi32(O11h, E5h);
5885 O11h = _mm_add_epi32(O11h, E6h);
5886 O11h = _mm_add_epi32(O11h, E7h);
5887
5888             /* Compute O12 */
5889
5890 E0l = _mm_madd_epi16(m128Tmp0,
5891 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5892 E0h = _mm_madd_epi16(m128Tmp1,
5893 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5894 E1l = _mm_madd_epi16(m128Tmp2,
5895 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5896 E1h = _mm_madd_epi16(m128Tmp3,
5897 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5898 E2l = _mm_madd_epi16(m128Tmp4,
5899 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5900 E2h = _mm_madd_epi16(m128Tmp5,
5901 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5902 E3l = _mm_madd_epi16(m128Tmp6,
5903 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5904 E3h = _mm_madd_epi16(m128Tmp7,
5905 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5906
5907 E4l = _mm_madd_epi16(m128Tmp8,
5908 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5909 E4h = _mm_madd_epi16(m128Tmp9,
5910 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5911 E5l = _mm_madd_epi16(m128Tmp10,
5912 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5913 E5h = _mm_madd_epi16(m128Tmp11,
5914 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5915 E6l = _mm_madd_epi16(m128Tmp12,
5916 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5917 E6h = _mm_madd_epi16(m128Tmp13,
5918 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5919 E7l = _mm_madd_epi16(m128Tmp14,
5920 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5921 E7h = _mm_madd_epi16(m128Tmp15,
5922 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5923
5924 O12l = _mm_add_epi32(E0l, E1l);
5925 O12l = _mm_add_epi32(O12l, E2l);
5926 O12l = _mm_add_epi32(O12l, E3l);
5927 O12l = _mm_add_epi32(O12l, E4l);
5928 O12l = _mm_add_epi32(O12l, E5l);
5929 O12l = _mm_add_epi32(O12l, E6l);
5930 O12l = _mm_add_epi32(O12l, E7l);
5931
5932 O12h = _mm_add_epi32(E0h, E1h);
5933 O12h = _mm_add_epi32(O12h, E2h);
5934 O12h = _mm_add_epi32(O12h, E3h);
5935 O12h = _mm_add_epi32(O12h, E4h);
5936 O12h = _mm_add_epi32(O12h, E5h);
5937 O12h = _mm_add_epi32(O12h, E6h);
5938 O12h = _mm_add_epi32(O12h, E7h);
5939
5940             /* Compute O13 */
5941
5942 E0l = _mm_madd_epi16(m128Tmp0,
5943 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5944 E0h = _mm_madd_epi16(m128Tmp1,
5945 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5946 E1l = _mm_madd_epi16(m128Tmp2,
5947 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5948 E1h = _mm_madd_epi16(m128Tmp3,
5949 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5950 E2l = _mm_madd_epi16(m128Tmp4,
5951 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5952 E2h = _mm_madd_epi16(m128Tmp5,
5953 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5954 E3l = _mm_madd_epi16(m128Tmp6,
5955 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5956 E3h = _mm_madd_epi16(m128Tmp7,
5957 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5958
5959 E4l = _mm_madd_epi16(m128Tmp8,
5960 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5961 E4h = _mm_madd_epi16(m128Tmp9,
5962 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5963 E5l = _mm_madd_epi16(m128Tmp10,
5964 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5965 E5h = _mm_madd_epi16(m128Tmp11,
5966 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5967 E6l = _mm_madd_epi16(m128Tmp12,
5968 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5969 E6h = _mm_madd_epi16(m128Tmp13,
5970 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5971 E7l = _mm_madd_epi16(m128Tmp14,
5972 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5973 E7h = _mm_madd_epi16(m128Tmp15,
5974 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5975
5976 O13l = _mm_add_epi32(E0l, E1l);
5977 O13l = _mm_add_epi32(O13l, E2l);
5978 O13l = _mm_add_epi32(O13l, E3l);
5979 O13l = _mm_add_epi32(O13l, E4l);
5980 O13l = _mm_add_epi32(O13l, E5l);
5981 O13l = _mm_add_epi32(O13l, E6l);
5982 O13l = _mm_add_epi32(O13l, E7l);
5983
5984 O13h = _mm_add_epi32(E0h, E1h);
5985 O13h = _mm_add_epi32(O13h, E2h);
5986 O13h = _mm_add_epi32(O13h, E3h);
5987 O13h = _mm_add_epi32(O13h, E4h);
5988 O13h = _mm_add_epi32(O13h, E5h);
5989 O13h = _mm_add_epi32(O13h, E6h);
5990 O13h = _mm_add_epi32(O13h, E7h);
5991
5992 /* Compute O14 */
5993
5994 E0l = _mm_madd_epi16(m128Tmp0,
5995 _mm_load_si128((__m128i *) (transform32x32[0][14])));
5996 E0h = _mm_madd_epi16(m128Tmp1,
5997 _mm_load_si128((__m128i *) (transform32x32[0][14])));
5998 E1l = _mm_madd_epi16(m128Tmp2,
5999 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6000 E1h = _mm_madd_epi16(m128Tmp3,
6001 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6002 E2l = _mm_madd_epi16(m128Tmp4,
6003 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6004 E2h = _mm_madd_epi16(m128Tmp5,
6005 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6006 E3l = _mm_madd_epi16(m128Tmp6,
6007 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6008 E3h = _mm_madd_epi16(m128Tmp7,
6009 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6010
6011 E4l = _mm_madd_epi16(m128Tmp8,
6012 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6013 E4h = _mm_madd_epi16(m128Tmp9,
6014 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6015 E5l = _mm_madd_epi16(m128Tmp10,
6016 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6017 E5h = _mm_madd_epi16(m128Tmp11,
6018 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6019 E6l = _mm_madd_epi16(m128Tmp12,
6020 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6021 E6h = _mm_madd_epi16(m128Tmp13,
6022 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6023 E7l = _mm_madd_epi16(m128Tmp14,
6024 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6025 E7h = _mm_madd_epi16(m128Tmp15,
6026 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6027
6028 O14l = _mm_add_epi32(E0l, E1l);
6029 O14l = _mm_add_epi32(O14l, E2l);
6030 O14l = _mm_add_epi32(O14l, E3l);
6031 O14l = _mm_add_epi32(O14l, E4l);
6032 O14l = _mm_add_epi32(O14l, E5l);
6033 O14l = _mm_add_epi32(O14l, E6l);
6034 O14l = _mm_add_epi32(O14l, E7l);
6035
6036 O14h = _mm_add_epi32(E0h, E1h);
6037 O14h = _mm_add_epi32(O14h, E2h);
6038 O14h = _mm_add_epi32(O14h, E3h);
6039 O14h = _mm_add_epi32(O14h, E4h);
6040 O14h = _mm_add_epi32(O14h, E5h);
6041 O14h = _mm_add_epi32(O14h, E6h);
6042 O14h = _mm_add_epi32(O14h, E7h);
6043
6044 /* Compute O15*/
6045
6046 E0l = _mm_madd_epi16(m128Tmp0,
6047 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6048 E0h = _mm_madd_epi16(m128Tmp1,
6049 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6050 E1l = _mm_madd_epi16(m128Tmp2,
6051 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6052 E1h = _mm_madd_epi16(m128Tmp3,
6053 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6054 E2l = _mm_madd_epi16(m128Tmp4,
6055 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6056 E2h = _mm_madd_epi16(m128Tmp5,
6057 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6058 E3l = _mm_madd_epi16(m128Tmp6,
6059 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6060 E3h = _mm_madd_epi16(m128Tmp7,
6061 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6062
6063 E4l = _mm_madd_epi16(m128Tmp8,
6064 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6065 E4h = _mm_madd_epi16(m128Tmp9,
6066 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6067 E5l = _mm_madd_epi16(m128Tmp10,
6068 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6069 E5h = _mm_madd_epi16(m128Tmp11,
6070 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6071 E6l = _mm_madd_epi16(m128Tmp12,
6072 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6073 E6h = _mm_madd_epi16(m128Tmp13,
6074 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6075 E7l = _mm_madd_epi16(m128Tmp14,
6076 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6077 E7h = _mm_madd_epi16(m128Tmp15,
6078 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6079
6080 O15l = _mm_add_epi32(E0l, E1l);
6081 O15l = _mm_add_epi32(O15l, E2l);
6082 O15l = _mm_add_epi32(O15l, E3l);
6083 O15l = _mm_add_epi32(O15l, E4l);
6084 O15l = _mm_add_epi32(O15l, E5l);
6085 O15l = _mm_add_epi32(O15l, E6l);
6086 O15l = _mm_add_epi32(O15l, E7l);
6087
6088 O15h = _mm_add_epi32(E0h, E1h);
6089 O15h = _mm_add_epi32(O15h, E2h);
6090 O15h = _mm_add_epi32(O15h, E3h);
6091 O15h = _mm_add_epi32(O15h, E4h);
6092 O15h = _mm_add_epi32(O15h, E5h);
6093 O15h = _mm_add_epi32(O15h, E6h);
6094 O15h = _mm_add_epi32(O15h, E7h);
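            /* even part: E0..E7 come from rows S2, S6, ..., S30 via the transform16x16_1 coefficient tables */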
6095 /* Compute E0 */
6096
6097 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6098 E0l = _mm_madd_epi16(m128Tmp0,
6099 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6100 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6101 E0h = _mm_madd_epi16(m128Tmp1,
6102 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6103
6104 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6105 E0l = _mm_add_epi32(E0l,
6106 _mm_madd_epi16(m128Tmp2,
6107 _mm_load_si128(
6108 (__m128i *) (transform16x16_1[1][0]))));
6109 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6110 E0h = _mm_add_epi32(E0h,
6111 _mm_madd_epi16(m128Tmp3,
6112 _mm_load_si128(
6113 (__m128i *) (transform16x16_1[1][0]))));
6114
6115 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6116 E0l = _mm_add_epi32(E0l,
6117 _mm_madd_epi16(m128Tmp4,
6118 _mm_load_si128(
6119 (__m128i *) (transform16x16_1[2][0]))));
6120 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6121 E0h = _mm_add_epi32(E0h,
6122 _mm_madd_epi16(m128Tmp5,
6123 _mm_load_si128(
6124 (__m128i *) (transform16x16_1[2][0]))));
6125
6126 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6127 E0l = _mm_add_epi32(E0l,
6128 _mm_madd_epi16(m128Tmp6,
6129 _mm_load_si128(
6130 (__m128i *) (transform16x16_1[3][0]))));
6131 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6132 E0h = _mm_add_epi32(E0h,
6133 _mm_madd_epi16(m128Tmp7,
6134 _mm_load_si128(
6135 (__m128i *) (transform16x16_1[3][0]))));
6136
6137 /* Compute E1 */
6138 E1l = _mm_madd_epi16(m128Tmp0,
6139 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6140 E1h = _mm_madd_epi16(m128Tmp1,
6141 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6142 E1l = _mm_add_epi32(E1l,
6143 _mm_madd_epi16(m128Tmp2,
6144 _mm_load_si128(
6145 (__m128i *) (transform16x16_1[1][1]))));
6146 E1h = _mm_add_epi32(E1h,
6147 _mm_madd_epi16(m128Tmp3,
6148 _mm_load_si128(
6149 (__m128i *) (transform16x16_1[1][1]))));
6150 E1l = _mm_add_epi32(E1l,
6151 _mm_madd_epi16(m128Tmp4,
6152 _mm_load_si128(
6153 (__m128i *) (transform16x16_1[2][1]))));
6154 E1h = _mm_add_epi32(E1h,
6155 _mm_madd_epi16(m128Tmp5,
6156 _mm_load_si128(
6157 (__m128i *) (transform16x16_1[2][1]))));
6158 E1l = _mm_add_epi32(E1l,
6159 _mm_madd_epi16(m128Tmp6,
6160 _mm_load_si128(
6161 (__m128i *) (transform16x16_1[3][1]))));
6162 E1h = _mm_add_epi32(E1h,
6163 _mm_madd_epi16(m128Tmp7,
6164 _mm_load_si128(
6165 (__m128i *) (transform16x16_1[3][1]))));
6166
6167 /* Compute E2 */
6168 E2l = _mm_madd_epi16(m128Tmp0,
6169 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6170 E2h = _mm_madd_epi16(m128Tmp1,
6171 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6172 E2l = _mm_add_epi32(E2l,
6173 _mm_madd_epi16(m128Tmp2,
6174 _mm_load_si128(
6175 (__m128i *) (transform16x16_1[1][2]))));
6176 E2h = _mm_add_epi32(E2h,
6177 _mm_madd_epi16(m128Tmp3,
6178 _mm_load_si128(
6179 (__m128i *) (transform16x16_1[1][2]))));
6180 E2l = _mm_add_epi32(E2l,
6181 _mm_madd_epi16(m128Tmp4,
6182 _mm_load_si128(
6183 (__m128i *) (transform16x16_1[2][2]))));
6184 E2h = _mm_add_epi32(E2h,
6185 _mm_madd_epi16(m128Tmp5,
6186 _mm_load_si128(
6187 (__m128i *) (transform16x16_1[2][2]))));
6188 E2l = _mm_add_epi32(E2l,
6189 _mm_madd_epi16(m128Tmp6,
6190 _mm_load_si128(
6191 (__m128i *) (transform16x16_1[3][2]))));
6192 E2h = _mm_add_epi32(E2h,
6193 _mm_madd_epi16(m128Tmp7,
6194 _mm_load_si128(
6195 (__m128i *) (transform16x16_1[3][2]))));
6196
6197 /* Compute E3 */
6198 E3l = _mm_madd_epi16(m128Tmp0,
6199 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6200 E3h = _mm_madd_epi16(m128Tmp1,
6201 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6202 E3l = _mm_add_epi32(E3l,
6203 _mm_madd_epi16(m128Tmp2,
6204 _mm_load_si128(
6205 (__m128i *) (transform16x16_1[1][3]))));
6206 E3h = _mm_add_epi32(E3h,
6207 _mm_madd_epi16(m128Tmp3,
6208 _mm_load_si128(
6209 (__m128i *) (transform16x16_1[1][3]))));
6210 E3l = _mm_add_epi32(E3l,
6211 _mm_madd_epi16(m128Tmp4,
6212 _mm_load_si128(
6213 (__m128i *) (transform16x16_1[2][3]))));
6214 E3h = _mm_add_epi32(E3h,
6215 _mm_madd_epi16(m128Tmp5,
6216 _mm_load_si128(
6217 (__m128i *) (transform16x16_1[2][3]))));
6218 E3l = _mm_add_epi32(E3l,
6219 _mm_madd_epi16(m128Tmp6,
6220 _mm_load_si128(
6221 (__m128i *) (transform16x16_1[3][3]))));
6222 E3h = _mm_add_epi32(E3h,
6223 _mm_madd_epi16(m128Tmp7,
6224 _mm_load_si128(
6225 (__m128i *) (transform16x16_1[3][3]))));
6226
6227 /* Compute E4 */
6228 E4l = _mm_madd_epi16(m128Tmp0,
6229 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6230 E4h = _mm_madd_epi16(m128Tmp1,
6231 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6232 E4l = _mm_add_epi32(E4l,
6233 _mm_madd_epi16(m128Tmp2,
6234 _mm_load_si128(
6235 (__m128i *) (transform16x16_1[1][4]))));
6236 E4h = _mm_add_epi32(E4h,
6237 _mm_madd_epi16(m128Tmp3,
6238 _mm_load_si128(
6239 (__m128i *) (transform16x16_1[1][4]))));
6240 E4l = _mm_add_epi32(E4l,
6241 _mm_madd_epi16(m128Tmp4,
6242 _mm_load_si128(
6243 (__m128i *) (transform16x16_1[2][4]))));
6244 E4h = _mm_add_epi32(E4h,
6245 _mm_madd_epi16(m128Tmp5,
6246 _mm_load_si128(
6247 (__m128i *) (transform16x16_1[2][4]))));
6248 E4l = _mm_add_epi32(E4l,
6249 _mm_madd_epi16(m128Tmp6,
6250 _mm_load_si128(
6251 (__m128i *) (transform16x16_1[3][4]))));
6252 E4h = _mm_add_epi32(E4h,
6253 _mm_madd_epi16(m128Tmp7,
6254 _mm_load_si128(
6255 (__m128i *) (transform16x16_1[3][4]))));
6256
6257             /* Compute E5 */
6258 E5l = _mm_madd_epi16(m128Tmp0,
6259 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6260 E5h = _mm_madd_epi16(m128Tmp1,
6261 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6262 E5l = _mm_add_epi32(E5l,
6263 _mm_madd_epi16(m128Tmp2,
6264 _mm_load_si128(
6265 (__m128i *) (transform16x16_1[1][5]))));
6266 E5h = _mm_add_epi32(E5h,
6267 _mm_madd_epi16(m128Tmp3,
6268 _mm_load_si128(
6269 (__m128i *) (transform16x16_1[1][5]))));
6270 E5l = _mm_add_epi32(E5l,
6271 _mm_madd_epi16(m128Tmp4,
6272 _mm_load_si128(
6273 (__m128i *) (transform16x16_1[2][5]))));
6274 E5h = _mm_add_epi32(E5h,
6275 _mm_madd_epi16(m128Tmp5,
6276 _mm_load_si128(
6277 (__m128i *) (transform16x16_1[2][5]))));
6278 E5l = _mm_add_epi32(E5l,
6279 _mm_madd_epi16(m128Tmp6,
6280 _mm_load_si128(
6281 (__m128i *) (transform16x16_1[3][5]))));
6282 E5h = _mm_add_epi32(E5h,
6283 _mm_madd_epi16(m128Tmp7,
6284 _mm_load_si128(
6285 (__m128i *) (transform16x16_1[3][5]))));
6286
6287 /* Compute E6 */
6288 E6l = _mm_madd_epi16(m128Tmp0,
6289 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6290 E6h = _mm_madd_epi16(m128Tmp1,
6291 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6292 E6l = _mm_add_epi32(E6l,
6293 _mm_madd_epi16(m128Tmp2,
6294 _mm_load_si128(
6295 (__m128i *) (transform16x16_1[1][6]))));
6296 E6h = _mm_add_epi32(E6h,
6297 _mm_madd_epi16(m128Tmp3,
6298 _mm_load_si128(
6299 (__m128i *) (transform16x16_1[1][6]))));
6300 E6l = _mm_add_epi32(E6l,
6301 _mm_madd_epi16(m128Tmp4,
6302 _mm_load_si128(
6303 (__m128i *) (transform16x16_1[2][6]))));
6304 E6h = _mm_add_epi32(E6h,
6305 _mm_madd_epi16(m128Tmp5,
6306 _mm_load_si128(
6307 (__m128i *) (transform16x16_1[2][6]))));
6308 E6l = _mm_add_epi32(E6l,
6309 _mm_madd_epi16(m128Tmp6,
6310 _mm_load_si128(
6311 (__m128i *) (transform16x16_1[3][6]))));
6312 E6h = _mm_add_epi32(E6h,
6313 _mm_madd_epi16(m128Tmp7,
6314 _mm_load_si128(
6315 (__m128i *) (transform16x16_1[3][6]))));
6316
6317 /* Compute E7 */
6318 E7l = _mm_madd_epi16(m128Tmp0,
6319 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6320 E7h = _mm_madd_epi16(m128Tmp1,
6321 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6322 E7l = _mm_add_epi32(E7l,
6323 _mm_madd_epi16(m128Tmp2,
6324 _mm_load_si128(
6325 (__m128i *) (transform16x16_1[1][7]))));
6326 E7h = _mm_add_epi32(E7h,
6327 _mm_madd_epi16(m128Tmp3,
6328 _mm_load_si128(
6329 (__m128i *) (transform16x16_1[1][7]))));
6330 E7l = _mm_add_epi32(E7l,
6331 _mm_madd_epi16(m128Tmp4,
6332 _mm_load_si128(
6333 (__m128i *) (transform16x16_1[2][7]))));
6334 E7h = _mm_add_epi32(E7h,
6335 _mm_madd_epi16(m128Tmp5,
6336 _mm_load_si128(
6337 (__m128i *) (transform16x16_1[2][7]))));
6338 E7l = _mm_add_epi32(E7l,
6339 _mm_madd_epi16(m128Tmp6,
6340 _mm_load_si128(
6341 (__m128i *) (transform16x16_1[3][7]))));
6342 E7h = _mm_add_epi32(E7h,
6343 _mm_madd_epi16(m128Tmp7,
6344 _mm_load_si128(
6345 (__m128i *) (transform16x16_1[3][7]))));
6346
6347             /* Compute E00..E03 */
6348
6349 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6350 E00l = _mm_madd_epi16(m128Tmp0,
6351 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6352 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6353 E00h = _mm_madd_epi16(m128Tmp1,
6354 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6355
6356 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6357 E00l = _mm_add_epi32(E00l,
6358 _mm_madd_epi16(m128Tmp2,
6359 _mm_load_si128(
6360 (__m128i *) (transform16x16_2[1][0]))));
6361 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6362 E00h = _mm_add_epi32(E00h,
6363 _mm_madd_epi16(m128Tmp3,
6364 _mm_load_si128(
6365 (__m128i *) (transform16x16_2[1][0]))));
6366
6367 E01l = _mm_madd_epi16(m128Tmp0,
6368 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6369 E01h = _mm_madd_epi16(m128Tmp1,
6370 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6371 E01l = _mm_add_epi32(E01l,
6372 _mm_madd_epi16(m128Tmp2,
6373 _mm_load_si128(
6374 (__m128i *) (transform16x16_2[1][1]))));
6375 E01h = _mm_add_epi32(E01h,
6376 _mm_madd_epi16(m128Tmp3,
6377 _mm_load_si128(
6378 (__m128i *) (transform16x16_2[1][1]))));
6379
6380 E02l = _mm_madd_epi16(m128Tmp0,
6381 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6382 E02h = _mm_madd_epi16(m128Tmp1,
6383 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6384 E02l = _mm_add_epi32(E02l,
6385 _mm_madd_epi16(m128Tmp2,
6386 _mm_load_si128(
6387 (__m128i *) (transform16x16_2[1][2]))));
6388 E02h = _mm_add_epi32(E02h,
6389 _mm_madd_epi16(m128Tmp3,
6390 _mm_load_si128(
6391 (__m128i *) (transform16x16_2[1][2]))));
6392
6393 E03l = _mm_madd_epi16(m128Tmp0,
6394 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6395 E03h = _mm_madd_epi16(m128Tmp1,
6396 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6397 E03l = _mm_add_epi32(E03l,
6398 _mm_madd_epi16(m128Tmp2,
6399 _mm_load_si128(
6400 (__m128i *) (transform16x16_2[1][3]))));
6401 E03h = _mm_add_epi32(E03h,
6402 _mm_madd_epi16(m128Tmp3,
6403 _mm_load_si128(
6404 (__m128i *) (transform16x16_2[1][3]))));
6405
6406 /* Compute EE0 and EEE */
6407
6408 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6409 EE0l = _mm_madd_epi16(m128Tmp0,
6410 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6411 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6412 EE0h = _mm_madd_epi16(m128Tmp1,
6413 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6414
6415 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6416 EEE0l = _mm_madd_epi16(m128Tmp2,
6417 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6418 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6419 EEE0h = _mm_madd_epi16(m128Tmp3,
6420 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6421
6422 EE1l = _mm_madd_epi16(m128Tmp0,
6423 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6424 EE1h = _mm_madd_epi16(m128Tmp1,
6425 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6426
6427 EEE1l = _mm_madd_epi16(m128Tmp2,
6428 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6429 EEE1h = _mm_madd_epi16(m128Tmp3,
6430 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6431
6432 /* Compute EE */
6433
6434 EE2l = _mm_sub_epi32(EEE1l, EE1l);
6435 EE3l = _mm_sub_epi32(EEE0l, EE0l);
6436 EE2h = _mm_sub_epi32(EEE1h, EE1h);
6437 EE3h = _mm_sub_epi32(EEE0h, EE0h);
6438
6439 EE0l = _mm_add_epi32(EEE0l, EE0l);
6440 EE1l = _mm_add_epi32(EEE1l, EE1l);
6441 EE0h = _mm_add_epi32(EEE0h, EE0h);
6442 EE1h = _mm_add_epi32(EEE1h, EE1h);
6443 /**/
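            /* butterfly: EE[7-k] = EE[k] - E0k and EE[k] += E0k, for k = 0..3 */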
6444
6445 EE7l = _mm_sub_epi32(EE0l, E00l);
6446 EE6l = _mm_sub_epi32(EE1l, E01l);
6447 EE5l = _mm_sub_epi32(EE2l, E02l);
6448 EE4l = _mm_sub_epi32(EE3l, E03l);
6449
6450 EE7h = _mm_sub_epi32(EE0h, E00h);
6451 EE6h = _mm_sub_epi32(EE1h, E01h);
6452 EE5h = _mm_sub_epi32(EE2h, E02h);
6453 EE4h = _mm_sub_epi32(EE3h, E03h);
6454
6455 EE0l = _mm_add_epi32(EE0l, E00l);
6456 EE1l = _mm_add_epi32(EE1l, E01l);
6457 EE2l = _mm_add_epi32(EE2l, E02l);
6458 EE3l = _mm_add_epi32(EE3l, E03l);
6459
6460 EE0h = _mm_add_epi32(EE0h, E00h);
6461 EE1h = _mm_add_epi32(EE1h, E01h);
6462 EE2h = _mm_add_epi32(EE2h, E02h);
6463 EE3h = _mm_add_epi32(EE3h, E03h);
6464 /* Compute E */
6465
6466 E15l = _mm_sub_epi32(EE0l, E0l);
6467 E15l = _mm_add_epi32(E15l, m128iAdd);
6468 E14l = _mm_sub_epi32(EE1l, E1l);
6469 E14l = _mm_add_epi32(E14l, m128iAdd);
6470 E13l = _mm_sub_epi32(EE2l, E2l);
6471 E13l = _mm_add_epi32(E13l, m128iAdd);
6472 E12l = _mm_sub_epi32(EE3l, E3l);
6473 E12l = _mm_add_epi32(E12l, m128iAdd);
6474 E11l = _mm_sub_epi32(EE4l, E4l);
6475 E11l = _mm_add_epi32(E11l, m128iAdd);
6476 E10l = _mm_sub_epi32(EE5l, E5l);
6477 E10l = _mm_add_epi32(E10l, m128iAdd);
6478 E9l = _mm_sub_epi32(EE6l, E6l);
6479 E9l = _mm_add_epi32(E9l, m128iAdd);
6480 E8l = _mm_sub_epi32(EE7l, E7l);
6481 E8l = _mm_add_epi32(E8l, m128iAdd);
6482
6483 E0l = _mm_add_epi32(EE0l, E0l);
6484 E0l = _mm_add_epi32(E0l, m128iAdd);
6485 E1l = _mm_add_epi32(EE1l, E1l);
6486 E1l = _mm_add_epi32(E1l, m128iAdd);
6487 E2l = _mm_add_epi32(EE2l, E2l);
6488 E2l = _mm_add_epi32(E2l, m128iAdd);
6489 E3l = _mm_add_epi32(EE3l, E3l);
6490 E3l = _mm_add_epi32(E3l, m128iAdd);
6491 E4l = _mm_add_epi32(EE4l, E4l);
6492 E4l = _mm_add_epi32(E4l, m128iAdd);
6493 E5l = _mm_add_epi32(EE5l, E5l);
6494 E5l = _mm_add_epi32(E5l, m128iAdd);
6495 E6l = _mm_add_epi32(EE6l, E6l);
6496 E6l = _mm_add_epi32(E6l, m128iAdd);
6497 E7l = _mm_add_epi32(EE7l, E7l);
6498 E7l = _mm_add_epi32(E7l, m128iAdd);
6499
6500 E15h = _mm_sub_epi32(EE0h, E0h);
6501 E15h = _mm_add_epi32(E15h, m128iAdd);
6502 E14h = _mm_sub_epi32(EE1h, E1h);
6503 E14h = _mm_add_epi32(E14h, m128iAdd);
6504 E13h = _mm_sub_epi32(EE2h, E2h);
6505 E13h = _mm_add_epi32(E13h, m128iAdd);
6506 E12h = _mm_sub_epi32(EE3h, E3h);
6507 E12h = _mm_add_epi32(E12h, m128iAdd);
6508 E11h = _mm_sub_epi32(EE4h, E4h);
6509 E11h = _mm_add_epi32(E11h, m128iAdd);
6510 E10h = _mm_sub_epi32(EE5h, E5h);
6511 E10h = _mm_add_epi32(E10h, m128iAdd);
6512 E9h = _mm_sub_epi32(EE6h, E6h);
6513 E9h = _mm_add_epi32(E9h, m128iAdd);
6514 E8h = _mm_sub_epi32(EE7h, E7h);
6515 E8h = _mm_add_epi32(E8h, m128iAdd);
6516
6517 E0h = _mm_add_epi32(EE0h, E0h);
6518 E0h = _mm_add_epi32(E0h, m128iAdd);
6519 E1h = _mm_add_epi32(EE1h, E1h);
6520 E1h = _mm_add_epi32(E1h, m128iAdd);
6521 E2h = _mm_add_epi32(EE2h, E2h);
6522 E2h = _mm_add_epi32(E2h, m128iAdd);
6523 E3h = _mm_add_epi32(EE3h, E3h);
6524 E3h = _mm_add_epi32(E3h, m128iAdd);
6525 E4h = _mm_add_epi32(EE4h, E4h);
6526 E4h = _mm_add_epi32(E4h, m128iAdd);
6527 E5h = _mm_add_epi32(EE5h, E5h);
6528 E5h = _mm_add_epi32(E5h, m128iAdd);
6529 E6h = _mm_add_epi32(EE6h, E6h);
6530 E6h = _mm_add_epi32(E6h, m128iAdd);
6531 E7h = _mm_add_epi32(EE7h, E7h);
6532 E7h = _mm_add_epi32(E7h, m128iAdd);
6533
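            /* final butterfly and scaling: row k = (E_k + O_k) >> shift, row 31-k = (E_k - O_k) >> shift (E_k already includes the rounding offset), saturated back to 16 bit */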
6534 m128iS0 = _mm_packs_epi32(
6535 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6536 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6537 m128iS1 = _mm_packs_epi32(
6538 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6539 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6540 m128iS2 = _mm_packs_epi32(
6541 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6542 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6543 m128iS3 = _mm_packs_epi32(
6544 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6545 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6546 m128iS4 = _mm_packs_epi32(
6547 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6548 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6549 m128iS5 = _mm_packs_epi32(
6550 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6551 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6552 m128iS6 = _mm_packs_epi32(
6553 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6554 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6555 m128iS7 = _mm_packs_epi32(
6556 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6557 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6558 m128iS8 = _mm_packs_epi32(
6559 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6560 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6561 m128iS9 = _mm_packs_epi32(
6562 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6563 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6564 m128iS10 = _mm_packs_epi32(
6565 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6566 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6567 m128iS11 = _mm_packs_epi32(
6568 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6569 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6570 m128iS12 = _mm_packs_epi32(
6571 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6572 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6573 m128iS13 = _mm_packs_epi32(
6574 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6575 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6576 m128iS14 = _mm_packs_epi32(
6577 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6578 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6579 m128iS15 = _mm_packs_epi32(
6580 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6581 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6582
6583 m128iS31 = _mm_packs_epi32(
6584 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6585 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6586 m128iS30 = _mm_packs_epi32(
6587 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6588 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6589 m128iS29 = _mm_packs_epi32(
6590 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6591 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6592 m128iS28 = _mm_packs_epi32(
6593 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6594 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6595 m128iS27 = _mm_packs_epi32(
6596 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6597 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6598 m128iS26 = _mm_packs_epi32(
6599 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6600 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6601 m128iS25 = _mm_packs_epi32(
6602 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6603 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6604 m128iS24 = _mm_packs_epi32(
6605 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6606 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6607 m128iS23 = _mm_packs_epi32(
6608 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6609 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6610 m128iS22 = _mm_packs_epi32(
6611 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6612 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6613 m128iS21 = _mm_packs_epi32(
6614 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6615 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6616 m128iS20 = _mm_packs_epi32(
6617 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6618 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6619 m128iS19 = _mm_packs_epi32(
6620 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6621 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6622 m128iS18 = _mm_packs_epi32(
6623 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6624 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6625 m128iS17 = _mm_packs_epi32(
6626 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6627 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6628 m128iS16 = _mm_packs_epi32(
6629 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6630 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6631
6632 if (!j) {
6633 /* Transpose the matrix for the second (horizontal) pass */
6634 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6635 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6636 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6637 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6638 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6639 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6640 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6641 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6642 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6643 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6644 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6645 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6646 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6647 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6648 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6649 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6650
6651 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6652 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6653 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6654 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6655 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6656 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6657 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6658 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6659 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6660 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6661 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6662 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6663 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6664 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6665 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6666 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6667
6668 E0h = _mm_unpacklo_epi16(E0l, E8l);
6669 E1h = _mm_unpacklo_epi16(E1l, E9l);
6670 E2h = _mm_unpacklo_epi16(E2l, E10l);
6671 E3h = _mm_unpacklo_epi16(E3l, E11l);
6672 E4h = _mm_unpacklo_epi16(E4l, E12l);
6673 E5h = _mm_unpacklo_epi16(E5l, E13l);
6674 E6h = _mm_unpacklo_epi16(E6l, E14l);
6675 E7h = _mm_unpacklo_epi16(E7l, E15l);
6676
6677 E8h = _mm_unpackhi_epi16(E0l, E8l);
6678 E9h = _mm_unpackhi_epi16(E1l, E9l);
6679 E10h = _mm_unpackhi_epi16(E2l, E10l);
6680 E11h = _mm_unpackhi_epi16(E3l, E11l);
6681 E12h = _mm_unpackhi_epi16(E4l, E12l);
6682 E13h = _mm_unpackhi_epi16(E5l, E13l);
6683 E14h = _mm_unpackhi_epi16(E6l, E14l);
6684 E15h = _mm_unpackhi_epi16(E7l, E15l);
6685
6686 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6687 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6688 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6689 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6690
6691 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6692 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6693 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6694 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6695
6696 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6697 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6698 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6699 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6700
6701 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6702 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6703 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6704 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6705
6706 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6707 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6708 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6709 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6710
6711 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6712 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6713 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6714 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6715
6716 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6717 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6718 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6719 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6720
6721 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6722 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6723 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6724 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6725
6726 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6727 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6728 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6729 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6730
6731 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6732 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6733 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6734 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6735
6736 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6737 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6738 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6739 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6740
6741 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6742 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6743 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6744 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6745
6746 /* */
6747 E0h = _mm_unpacklo_epi16(O0l, O8l);
6748 E1h = _mm_unpacklo_epi16(O1l, O9l);
6749 E2h = _mm_unpacklo_epi16(O2l, O10l);
6750 E3h = _mm_unpacklo_epi16(O3l, O11l);
6751 E4h = _mm_unpacklo_epi16(O4l, O12l);
6752 E5h = _mm_unpacklo_epi16(O5l, O13l);
6753 E6h = _mm_unpacklo_epi16(O6l, O14l);
6754 E7h = _mm_unpacklo_epi16(O7l, O15l);
6755
6756 E8h = _mm_unpackhi_epi16(O0l, O8l);
6757 E9h = _mm_unpackhi_epi16(O1l, O9l);
6758 E10h = _mm_unpackhi_epi16(O2l, O10l);
6759 E11h = _mm_unpackhi_epi16(O3l, O11l);
6760 E12h = _mm_unpackhi_epi16(O4l, O12l);
6761 E13h = _mm_unpackhi_epi16(O5l, O13l);
6762 E14h = _mm_unpackhi_epi16(O6l, O14l);
6763 E15h = _mm_unpackhi_epi16(O7l, O15l);
6764
6765 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6766 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6767 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6768 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6769
6770 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6771 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6772 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6773 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6774
6775 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6776 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6777 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6778 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6779
6780 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6781 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6782 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6783 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6784
6785 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6786 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6787 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6788 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6789
6790 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6791 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6792 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6793 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6794
6795 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6796 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6797 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6798 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6799
6800 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6801 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6802 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6803 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6804
6805 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6806 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6807 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6808 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6809
6810 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6811 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6812 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6813 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6814
6815 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6816 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6817 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6818 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6819
6820 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6821 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6822 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6823 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6824 /* */
6825 _mm_store_si128((__m128i *) (src + i), m128iS0);
6826 _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6827 _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6828 _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6829 _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6830 _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6831 _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6832 _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6833 _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6834 _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6835 _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6836 _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6837 _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6838 _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6839 _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6840 _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6841 _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6842 _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6843 _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6844 _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6845 _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6846 _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6847 _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6848 _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6849 _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6850 _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6851 _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6852 _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6853 _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6854 _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6855 _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6856 _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6857
6858 if (i <= 16) {
6859 int k = i + 8;
6860 m128iS0 = _mm_load_si128((__m128i *) (src + k));
6861 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6862 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6863 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6864 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6865 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6866 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6867 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6868 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6869 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6870 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6871 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6872 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6873 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6874 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6875 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6876
6877 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6878 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6879 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6880 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6881 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6882 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6883 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6884 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6885 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6886 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6887 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6888 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6889 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6890 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6891 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6892 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6893 } else {
6894 m128iS0 = _mm_load_si128((__m128i *) (src));
6895 m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6896 m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6897 m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6898 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6899 m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6900 m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6901 m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6902 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6903 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6904 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6905 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6906 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6907 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6908 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6909 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6910 m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6911 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6912 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6913 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6914 m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6915 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6916 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6917 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6918 m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6919 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6920 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6921 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6922 m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6923 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6924 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6925 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6926 shift = shift_2nd;
6927 m128iAdd = _mm_set1_epi32(add_2nd);
6928 }
6929
6930 } else {
6931 int k, m = 0;
6932 _mm_storeu_si128((__m128i *) (src), m128iS0);
6933 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6934 _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6935 _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6936 _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6937 _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6938 _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6939 _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6940 _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6941 _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6942 _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6943 _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6944 _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6945 _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6946 _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6947 _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6948
6949 _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6950 _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6951 _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6952 _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6953 _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6954 _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6955 _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6956 _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6957 _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6958 _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6959 _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6960 _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6961 _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6962 _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6963 _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6964 _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6965 dst = (uint16_t*) _dst + (i * stride);
6966 for (k = 0; k < 8; k++) {
6967 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6968 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6969 dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6970 dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6971 dst[4] = av_clip_uintp2(
6972 dst[4] + src[m + 128],10);
6973 dst[5] = av_clip_uintp2(
6974 dst[5] + src[m + 128 + 8],10);
6975 dst[6] = av_clip_uintp2(
6976 dst[6] + src[m + 128 + 16],10);
6977 dst[7] = av_clip_uintp2(
6978 dst[7] + src[m + 128 + 24],10);
6979
6980 dst[8] = av_clip_uintp2(
6981 dst[8] + src[m + 256],10);
6982 dst[9] = av_clip_uintp2(
6983 dst[9] + src[m + 256 + 8],10);
6984 dst[10] = av_clip_uintp2(
6985 dst[10] + src[m + 256 + 16],10);
6986 dst[11] = av_clip_uintp2(
6987 dst[11] + src[m + 256 + 24],10);
6988 dst[12] = av_clip_uintp2(
6989 dst[12] + src[m + 384],10);
6990 dst[13] = av_clip_uintp2(
6991 dst[13] + src[m + 384 + 8],10);
6992 dst[14] = av_clip_uintp2(
6993 dst[14] + src[m + 384 + 16],10);
6994 dst[15] = av_clip_uintp2(
6995 dst[15] + src[m + 384 + 24],10);
6996
6997 dst[16] = av_clip_uintp2(
6998 dst[16] + src[m + 512],10);
6999 dst[17] = av_clip_uintp2(
7000 dst[17] + src[m + 512 + 8],10);
7001 dst[18] = av_clip_uintp2(
7002 dst[18] + src[m + 512 + 16],10);
7003 dst[19] = av_clip_uintp2(
7004 dst[19] + src[m + 512 + 24],10);
7005 dst[20] = av_clip_uintp2(
7006 dst[20] + src[m + 640],10);
7007 dst[21] = av_clip_uintp2(
7008 dst[21] + src[m + 640 + 8],10);
7009 dst[22] = av_clip_uintp2(
7010 dst[22] + src[m + 640 + 16],10);
7011 dst[23] = av_clip_uintp2(
7012 dst[23] + src[m + 640 + 24],10);
7013
7014 dst[24] = av_clip_uintp2(
7015 dst[24] + src[m + 768],10);
7016 dst[25] = av_clip_uintp2(
7017 dst[25] + src[m + 768 + 8],10);
7018 dst[26] = av_clip_uintp2(
7019 dst[26] + src[m + 768 + 16],10);
7020 dst[27] = av_clip_uintp2(
7021 dst[27] + src[m + 768 + 24],10);
7022 dst[28] = av_clip_uintp2(
7023 dst[28] + src[m + 896],10);
7024 dst[29] = av_clip_uintp2(
7025 dst[29] + src[m + 896 + 8],10);
7026 dst[30] = av_clip_uintp2(
7027 dst[30] + src[m + 896 + 16],10);
7028 dst[31] = av_clip_uintp2(
7029 dst[31] + src[m + 896 + 24],10);
7030
7031 m += 1;
7032 dst += stride;
7033 }
7034 if (i <= 16) {
7035 int k = (i + 8) * 4;
7036 m128iS0 = _mm_load_si128((__m128i *) (src + k));
7037 m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7038 m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7039 m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7040 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7041 m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7042 m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7043 m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7044 m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7045 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7046 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7047 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7048 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7049 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7050 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7051 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7052 m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7053 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7054 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7055 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7056 m128iS20 = _mm_loadu_si128(
7057 (__m128i *) (src + 512 + 16 + k));
7058 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7059 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7060 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7061 m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7062 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7063 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7064 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7065 m128iS28 = _mm_loadu_si128(
7066 (__m128i *) (src + 512 + 24 + k));
7067 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7068 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7069 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7070 }
7071 }
7072 }
7073 }
7074 }
7075 #endif
7076
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 openHEVC contributors
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "x86/sse-dct.h"
22 #include "libde265/util.h"
23
24 #ifdef HAVE_CONFIG_H
25 #include "config.h"
26 #endif
27
28 #include <emmintrin.h> // SSE2
29 #include <tmmintrin.h> // SSSE3
30
31 #if HAVE_SSE4_1
32 #include <smmintrin.h> // SSE4.1
33 #endif
34
35
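/* Editorial note: each row of the coefficient tables below stores one pair
 * of 16-bit transform coefficients repeated four times, so that
 * _mm_madd_epi16 can multiply-accumulate that pair against four interleaved
 * sample pairs in a single instruction. The values themselves are the
 * standard HEVC inverse-transform coefficients. */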
36 ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
37 {
38 { 29, +84, 29, +84, 29, +84, 29, +84 },
39 { +74, +55, +74, +55, +74, +55, +74, +55 },
40 { 55, -29, 55, -29, 55, -29, 55, -29 },
41 { +74, -84, +74, -84, +74, -84, +74, -84 },
42 { 74, -74, 74, -74, 74, -74, 74, -74 },
43 { 0, +74, 0, +74, 0, +74, 0, +74 },
44 { 84, +55, 84, +55, 84, +55, 84, +55 },
45 { -74, -29, -74, -29, -74, -29, -74, -29 }
46 };
47
48 ALIGNED_16(static const int16_t) transform4x4[4][8] = {
49 { 64, 64, 64, 64, 64, 64, 64, 64 },
50 { 64, -64, 64, -64, 64, -64, 64, -64 },
51 { 83, 36, 83, 36, 83, 36, 83, 36 },
52 { 36, -83, 36, -83, 36, -83, 36, -83 }
53 };
54
55 ALIGNED_16(static const int16_t) transform8x8[12][8] =
56 {
57 { 89, 75, 89, 75, 89, 75, 89, 75 },
58 { 50, 18, 50, 18, 50, 18, 50, 18 },
59 { 75, -18, 75, -18, 75, -18, 75, -18 },
60 { -89, -50, -89, -50, -89, -50, -89, -50 },
61 { 50, -89, 50, -89, 50, -89, 50, -89 },
62 { 18, 75, 18, 75, 18, 75, 18, 75 },
63 { 18, -50, 18, -50, 18, -50, 18, -50 },
64 { 75, -89, 75, -89, 75, -89, 75, -89 },
65 { 64, 64, 64, 64, 64, 64, 64, 64 },
66 { 64, -64, 64, -64, 64, -64, 64, -64 },
67 { 83, 36, 83, 36, 83, 36, 83, 36 },
68 { 36, -83, 36, -83, 36, -83, 36, -83 }
69 };
70
71 ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
72 {
73 {/*1-3*/ /*2-6*/
74 { 90, 87, 90, 87, 90, 87, 90, 87 },
75 { 87, 57, 87, 57, 87, 57, 87, 57 },
76 { 80, 9, 80, 9, 80, 9, 80, 9 },
77 { 70, -43, 70, -43, 70, -43, 70, -43 },
78 { 57, -80, 57, -80, 57, -80, 57, -80 },
79 { 43, -90, 43, -90, 43, -90, 43, -90 },
80 { 25, -70, 25, -70, 25, -70, 25, -70 },
81 { 9, -25, 9, -25, 9, -25, 9, -25 },
82 },{ /*5-7*/ /*10-14*/
83 { 80, 70, 80, 70, 80, 70, 80, 70 },
84 { 9, -43, 9, -43, 9, -43, 9, -43 },
85 { -70, -87, -70, -87, -70, -87, -70, -87 },
86 { -87, 9, -87, 9, -87, 9, -87, 9 },
87 { -25, 90, -25, 90, -25, 90, -25, 90 },
88 { 57, 25, 57, 25, 57, 25, 57, 25 },
89 { 90, -80, 90, -80, 90, -80, 90, -80 },
90 { 43, -57, 43, -57, 43, -57, 43, -57 },
91 },{ /*9-11*/ /*18-22*/
92 { 57, 43, 57, 43, 57, 43, 57, 43 },
93 { -80, -90, -80, -90, -80, -90, -80, -90 },
94 { -25, 57, -25, 57, -25, 57, -25, 57 },
95 { 90, 25, 90, 25, 90, 25, 90, 25 },
96 { -9, -87, -9, -87, -9, -87, -9, -87 },
97 { -87, 70, -87, 70, -87, 70, -87, 70 },
98 { 43, 9, 43, 9, 43, 9, 43, 9 },
99 { 70, -80, 70, -80, 70, -80, 70, -80 },
100 },{/*13-15*/ /* 26-30 */
101 { 25, 9, 25, 9, 25, 9, 25, 9 },
102 { -70, -25, -70, -25, -70, -25, -70, -25 },
103 { 90, 43, 90, 43, 90, 43, 90, 43 },
104 { -80, -57, -80, -57, -80, -57, -80, -57 },
105 { 43, 70, 43, 70, 43, 70, 43, 70 },
106 { 9, -80, 9, -80, 9, -80, 9, -80 },
107 { -57, 87, -57, 87, -57, 87, -57, 87 },
108 { 87, -90, 87, -90, 87, -90, 87, -90 },
109 }
110 };
111
112 ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
113 {
114 { /*2-6*/ /*4-12*/
115 { 89, 75, 89, 75, 89, 75, 89, 75 },
116 { 75, -18, 75, -18, 75, -18, 75, -18 },
117 { 50, -89, 50, -89, 50, -89, 50, -89 },
118 { 18, -50, 18, -50, 18, -50, 18, -50 },
119 },{ /*10-14*/ /*20-28*/
120 { 50, 18, 50, 18, 50, 18, 50, 18 },
121 { -89, -50, -89, -50, -89, -50, -89, -50 },
122 { 18, 75, 18, 75, 18, 75, 18, 75 },
123 { 75, -89, 75, -89, 75, -89, 75, -89 },
124 }
125 };
126
127 ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
128 {
129 {/*4-12*/ /*8-24*/
130 { 83, 36, 83, 36, 83, 36, 83, 36 },
131 { 36, -83, 36, -83, 36, -83, 36, -83 },
132 },{ /*0-8*/ /*0-16*/
133 { 64, 64, 64, 64, 64, 64, 64, 64 },
134 { 64, -64, 64, -64, 64, -64, 64, -64 },
135 }
136 };
137
138
139 ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
140 {
141 { /* 1-3 */
142 { 90, 90, 90, 90, 90, 90, 90, 90 },
143 { 90, 82, 90, 82, 90, 82, 90, 82 },
144 { 88, 67, 88, 67, 88, 67, 88, 67 },
145 { 85, 46, 85, 46, 85, 46, 85, 46 },
146 { 82, 22, 82, 22, 82, 22, 82, 22 },
147 { 78, -4, 78, -4, 78, -4, 78, -4 },
148 { 73, -31, 73, -31, 73, -31, 73, -31 },
149 { 67, -54, 67, -54, 67, -54, 67, -54 },
150 { 61, -73, 61, -73, 61, -73, 61, -73 },
151 { 54, -85, 54, -85, 54, -85, 54, -85 },
152 { 46, -90, 46, -90, 46, -90, 46, -90 },
153 { 38, -88, 38, -88, 38, -88, 38, -88 },
154 { 31, -78, 31, -78, 31, -78, 31, -78 },
155 { 22, -61, 22, -61, 22, -61, 22, -61 },
156 { 13, -38, 13, -38, 13, -38, 13, -38 },
157 { 4, -13, 4, -13, 4, -13, 4, -13 },
158 },{/* 5-7 */
159 { 88, 85, 88, 85, 88, 85, 88, 85 },
160 { 67, 46, 67, 46, 67, 46, 67, 46 },
161 { 31, -13, 31, -13, 31, -13, 31, -13 },
162 { -13, -67, -13, -67, -13, -67, -13, -67 },
163 { -54, -90, -54, -90, -54, -90, -54, -90 },
164 { -82, -73, -82, -73, -82, -73, -82, -73 },
165 { -90, -22, -90, -22, -90, -22, -90, -22 },
166 { -78, 38, -78, 38, -78, 38, -78, 38 },
167 { -46, 82, -46, 82, -46, 82, -46, 82 },
168 { -4, 88, -4, 88, -4, 88, -4, 88 },
169 { 38, 54, 38, 54, 38, 54, 38, 54 },
170 { 73, -4, 73, -4, 73, -4, 73, -4 },
171 { 90, -61, 90, -61, 90, -61, 90, -61 },
172 { 85, -90, 85, -90, 85, -90, 85, -90 },
173 { 61, -78, 61, -78, 61, -78, 61, -78 },
174 { 22, -31, 22, -31, 22, -31, 22, -31 },
175 },{/* 9-11 */
176 { 82, 78, 82, 78, 82, 78, 82, 78 },
177 { 22, -4, 22, -4, 22, -4, 22, -4 },
178 { -54, -82, -54, -82, -54, -82, -54, -82 },
179 { -90, -73, -90, -73, -90, -73, -90, -73 },
180 { -61, 13, -61, 13, -61, 13, -61, 13 },
181 { 13, 85, 13, 85, 13, 85, 13, 85 },
182 { 78, 67, 78, 67, 78, 67, 78, 67 },
183 { 85, -22, 85, -22, 85, -22, 85, -22 },
184 { 31, -88, 31, -88, 31, -88, 31, -88 },
185 { -46, -61, -46, -61, -46, -61, -46, -61 },
186 { -90, 31, -90, 31, -90, 31, -90, 31 },
187 { -67, 90, -67, 90, -67, 90, -67, 90 },
188 { 4, 54, 4, 54, 4, 54, 4, 54 },
189 { 73, -38, 73, -38, 73, -38, 73, -38 },
190 { 88, -90, 88, -90, 88, -90, 88, -90 },
191 { 38, -46, 38, -46, 38, -46, 38, -46 },
192 },{/* 13-15 */
193 { 73, 67, 73, 67, 73, 67, 73, 67 },
194 { -31, -54, -31, -54, -31, -54, -31, -54 },
195 { -90, -78, -90, -78, -90, -78, -90, -78 },
196 { -22, 38, -22, 38, -22, 38, -22, 38 },
197 { 78, 85, 78, 85, 78, 85, 78, 85 },
198 { 67, -22, 67, -22, 67, -22, 67, -22 },
199 { -38, -90, -38, -90, -38, -90, -38, -90 },
200 { -90, 4, -90, 4, -90, 4, -90, 4 },
201 { -13, 90, -13, 90, -13, 90, -13, 90 },
202 { 82, 13, 82, 13, 82, 13, 82, 13 },
203 { 61, -88, 61, -88, 61, -88, 61, -88 },
204 { -46, -31, -46, -31, -46, -31, -46, -31 },
205 { -88, 82, -88, 82, -88, 82, -88, 82 },
206 { -4, 46, -4, 46, -4, 46, -4, 46 },
207 { 85, -73, 85, -73, 85, -73, 85, -73 },
208 { 54, -61, 54, -61, 54, -61, 54, -61 },
209 },{/* 17-19 */
210 { 61, 54, 61, 54, 61, 54, 61, 54 },
211 { -73, -85, -73, -85, -73, -85, -73, -85 },
212 { -46, -4, -46, -4, -46, -4, -46, -4 },
213 { 82, 88, 82, 88, 82, 88, 82, 88 },
214 { 31, -46, 31, -46, 31, -46, 31, -46 },
215 { -88, -61, -88, -61, -88, -61, -88, -61 },
216 { -13, 82, -13, 82, -13, 82, -13, 82 },
217 { 90, 13, 90, 13, 90, 13, 90, 13 },
218 { -4, -90, -4, -90, -4, -90, -4, -90 },
219 { -90, 38, -90, 38, -90, 38, -90, 38 },
220 { 22, 67, 22, 67, 22, 67, 22, 67 },
221 { 85, -78, 85, -78, 85, -78, 85, -78 },
222 { -38, -22, -38, -22, -38, -22, -38, -22 },
223 { -78, 90, -78, 90, -78, 90, -78, 90 },
224 { 54, -31, 54, -31, 54, -31, 54, -31 },
225 { 67, -73, 67, -73, 67, -73, 67, -73 },
226 },{ /* 21-23 */
227 { 46, 38, 46, 38, 46, 38, 46, 38 },
228 { -90, -88, -90, -88, -90, -88, -90, -88 },
229 { 38, 73, 38, 73, 38, 73, 38, 73 },
230 { 54, -4, 54, -4, 54, -4, 54, -4 },
231 { -90, -67, -90, -67, -90, -67, -90, -67 },
232 { 31, 90, 31, 90, 31, 90, 31, 90 },
233 { 61, -46, 61, -46, 61, -46, 61, -46 },
234 { -88, -31, -88, -31, -88, -31, -88, -31 },
235 { 22, 85, 22, 85, 22, 85, 22, 85 },
236 { 67, -78, 67, -78, 67, -78, 67, -78 },
237 { -85, 13, -85, 13, -85, 13, -85, 13 },
238 { 13, 61, 13, 61, 13, 61, 13, 61 },
239 { 73, -90, 73, -90, 73, -90, 73, -90 },
240 { -82, 54, -82, 54, -82, 54, -82, 54 },
241 { 4, 22, 4, 22, 4, 22, 4, 22 },
242 { 78, -82, 78, -82, 78, -82, 78, -82 },
243 },{ /* 25-27 */
244 { 31, 22, 31, 22, 31, 22, 31, 22 },
245 { -78, -61, -78, -61, -78, -61, -78, -61 },
246 { 90, 85, 90, 85, 90, 85, 90, 85 },
247 { -61, -90, -61, -90, -61, -90, -61, -90 },
248 { 4, 73, 4, 73, 4, 73, 4, 73 },
249 { 54, -38, 54, -38, 54, -38, 54, -38 },
250 { -88, -4, -88, -4, -88, -4, -88, -4 },
251 { 82, 46, 82, 46, 82, 46, 82, 46 },
252 { -38, -78, -38, -78, -38, -78, -38, -78 },
253 { -22, 90, -22, 90, -22, 90, -22, 90 },
254 { 73, -82, 73, -82, 73, -82, 73, -82 },
255 { -90, 54, -90, 54, -90, 54, -90, 54 },
256 { 67, -13, 67, -13, 67, -13, 67, -13 },
257 { -13, -31, -13, -31, -13, -31, -13, -31 },
258 { -46, 67, -46, 67, -46, 67, -46, 67 },
259 { 85, -88, 85, -88, 85, -88, 85, -88 },
260 },{/* 29-31 */
261 { 13, 4, 13, 4, 13, 4, 13, 4 },
262 { -38, -13, -38, -13, -38, -13, -38, -13 },
263 { 61, 22, 61, 22, 61, 22, 61, 22 },
264 { -78, -31, -78, -31, -78, -31, -78, -31 },
265 { 88, 38, 88, 38, 88, 38, 88, 38 },
266 { -90, -46, -90, -46, -90, -46, -90, -46 },
267 { 85, 54, 85, 54, 85, 54, 85, 54 },
268 { -73, -61, -73, -61, -73, -61, -73, -61 },
269 { 54, 67, 54, 67, 54, 67, 54, 67 },
270 { -31, -73, -31, -73, -31, -73, -31, -73 },
271 { 4, 78, 4, 78, 4, 78, 4, 78 },
272 { 22, -82, 22, -82, 22, -82, 22, -82 },
273 { -46, 85, -46, 85, -46, 85, -46, 85 },
274 { 67, -88, 67, -88, 67, -88, 67, -88 },
275 { -82, 90, -82, 90, -82, 90, -82, 90 },
276 { 90, -90, 90, -90, 90, -90, 90, -90 },
277 }
278 };
279
280 #define shift_1st 7
281 #define add_1st (1 << (shift_1st - 1))
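/* Editorial note: the first (vertical) pass always uses shift_1st = 7; the
 * second pass uses shift_2nd = 20 - bit depth (12 for the 8-bit functions,
 * 10 for the disabled 10-bit variants), and each pass rounds with an offset
 * of 1 << (shift - 1), as set up inside the individual functions below. */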
282
283
284 void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
285 {
286 uint8_t *dst = (uint8_t*)_dst;
287 ptrdiff_t stride = _stride;
288 int shift = 5;
289 int offset = 16;
290 __m128i r0,r1,r2,r3,r4,r5,r6,r9;
291
292 r9= _mm_setzero_si128();
293 //r8= _mm_set_epi32(0,0,0,-1);
294 r2= _mm_set1_epi16(offset);
295
296 r0= _mm_load_si128((__m128i*)(coeffs));
297 r1= _mm_load_si128((__m128i*)(coeffs+8));
298
299
300 r0= _mm_adds_epi16(r0,r2);
301 r1= _mm_adds_epi16(r1,r2);
302
303 r0= _mm_srai_epi16(r0,shift);
304 r1= _mm_srai_epi16(r1,shift);
305
306 r3= _mm_loadl_epi64((__m128i*)(dst));
307 r4= _mm_loadl_epi64((__m128i*)(dst + stride));
308 r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
309 r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));
310
311 r3= _mm_unpacklo_epi8(r3,r9);
312 r4= _mm_unpacklo_epi8(r4,r9);
313 r5= _mm_unpacklo_epi8(r5,r9);
314 r6= _mm_unpacklo_epi8(r6,r9);
315 r3= _mm_unpacklo_epi64(r3,r4);
316 r4= _mm_unpacklo_epi64(r5,r6);
317
318
319 r3= _mm_adds_epi16(r3,r0);
320 r4= _mm_adds_epi16(r4,r1);
321
322 r3= _mm_packus_epi16(r3,r4);
323 //r8= _mm_set_epi32(0,0,0,-1);
324
325 //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
326 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);
327
328 r3= _mm_srli_si128(r3,4);
329 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
330 *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);
331
332 r3= _mm_srli_si128(r3,4);
333 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
334 *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);
335
336 r3= _mm_srli_si128(r3,4);
337 //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
338 *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
339 }
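/* Editorial note: the SSE routine above is equivalent to the following
 * scalar reference for an 8-bit 4x4 transform-skip block. It is an
 * illustrative sketch only; the helper name and the fixed 4x4 block size
 * are assumptions derived from the vector code, not part of the library
 * API. */
#if 0
static void transform_skip_8_scalar_ref(uint8_t *dst, const int16_t *coeffs,
                                        ptrdiff_t stride)
{
    const int shift  = 5;   /* same as 'shift' in the SSE version */
    const int offset = 16;  /* rounding offset = 1 << (shift - 1) */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++) {
            int v = dst[y * stride + x] + ((coeffs[y * 4 + x] + offset) >> shift);
            /* clip to the 8-bit range, like _mm_packus_epi16 does */
            dst[y * stride + x] = v < 0 ? 0 : (v > 255 ? 255 : v);
        }
}
#endif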
340
341
342
343 #if HAVE_SSE4_1
344 void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
345 ptrdiff_t _stride) {
346
347 uint8_t shift_2nd = 12; // 20 - Bit depth
348 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
349
350 uint8_t *dst = (uint8_t*) _dst;
351 ptrdiff_t stride = _stride;
352 int16_t *src = coeffs;
353 __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
354 m128iD;
355 m128iAdd = _mm_set1_epi32(64);
356
357 S0 = _mm_load_si128((__m128i *) (src));
358 S8 = _mm_load_si128((__m128i *) (src + 8));
359
360 m128iAC = _mm_unpacklo_epi16(S0, S8);
361 m128iBD = _mm_unpackhi_epi16(S0, S8);
362
363 m128iTmp1 = _mm_madd_epi16(m128iAC,
364 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
365 m128iTmp2 = _mm_madd_epi16(m128iBD,
366 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
367 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
368 S0 = _mm_add_epi32(S0, m128iAdd);
369 S0 = _mm_srai_epi32(S0, shift_1st);
370
371 m128iTmp1 = _mm_madd_epi16(m128iAC,
372 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
373 m128iTmp2 = _mm_madd_epi16(m128iBD,
374 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
375 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
376 S8 = _mm_add_epi32(S8, m128iAdd);
377 S8 = _mm_srai_epi32(S8, shift_1st);
378
379 m128iA = _mm_packs_epi32(S0, S8);
380
381 m128iTmp1 = _mm_madd_epi16(m128iAC,
382 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
383 m128iTmp2 = _mm_madd_epi16(m128iBD,
384 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
385 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
386 S0 = _mm_add_epi32(S0, m128iAdd);
387 S0 = _mm_srai_epi32(S0, shift_1st);
388
389 m128iTmp1 = _mm_madd_epi16(m128iAC,
390 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
391 m128iTmp2 = _mm_madd_epi16(m128iBD,
392 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
393 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
394 S8 = _mm_add_epi32(S8, m128iAdd);
395 S8 = _mm_srai_epi32(S8, shift_1st);
396
397 m128iD = _mm_packs_epi32(S0, S8);
398
399 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
400 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
401
402 m128iA = _mm_unpacklo_epi16(S0, S8);
403 m128iD = _mm_unpackhi_epi16(S0, S8);
404
405 /* ################### */
406 m128iAdd = _mm_set1_epi32(add_2nd);
407
408 m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
409 m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
410
411 m128iTmp1 = _mm_madd_epi16(m128iAC,
412 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
413 m128iTmp2 = _mm_madd_epi16(m128iBD,
414 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
415 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
416 S0 = _mm_add_epi32(S0, m128iAdd);
417 S0 = _mm_srai_epi32(S0, shift_2nd);
418
419 m128iTmp1 = _mm_madd_epi16(m128iAC,
420 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
421 m128iTmp2 = _mm_madd_epi16(m128iBD,
422 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
423 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
424 S8 = _mm_add_epi32(S8, m128iAdd);
425 S8 = _mm_srai_epi32(S8, shift_2nd);
426
427 m128iA = _mm_packs_epi32(S0, S8);
428
429 m128iTmp1 = _mm_madd_epi16(m128iAC,
430 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
431 m128iTmp2 = _mm_madd_epi16(m128iBD,
432 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
433 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
434 S0 = _mm_add_epi32(S0, m128iAdd);
435 S0 = _mm_srai_epi32(S0, shift_2nd);
436
437 m128iTmp1 = _mm_madd_epi16(m128iAC,
438 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
439 m128iTmp2 = _mm_madd_epi16(m128iBD,
440 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
441 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
442 S8 = _mm_add_epi32(S8, m128iAdd);
443 S8 = _mm_srai_epi32(S8, shift_2nd);
444
445 m128iD = _mm_packs_epi32(S0, S8);
446
447 // _mm_storeu_si128((__m128i *) (src), m128iA);
448 // _mm_storeu_si128((__m128i *) (src + 8), m128iD);
449
450 S0 = _mm_move_epi64(m128iA); //contains row 0
451 S8 = _mm_move_epi64(m128iD); //row 2
452 m128iA = _mm_srli_si128(m128iA, 8); // row 1
453 m128iD = _mm_srli_si128(m128iD, 8); // row 3
454 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
455 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
456 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
457 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
458
459 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
460
461 m128iA = _mm_loadl_epi64((__m128i *) dst);
462 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
463 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
464 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
465 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
466 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
467
468 dst += stride;
469
470 m128iA = _mm_loadl_epi64((__m128i *) dst);
471 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
472 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
473 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
474 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
475 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
476
477 dst += stride;
478
479 m128iA = _mm_loadl_epi64((__m128i *) dst);
480 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
481 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
482 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
483 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
484 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
485
486 dst += stride;
487
488 m128iA = _mm_loadl_epi64((__m128i *) dst);
489 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
490 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
491 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
492 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
493 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
494 }
495 #endif // SSE4.1
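/* Editorial note: ff_hevc_transform_4x4_luma_add_8_sse4() above performs the
 * 4x4 inverse DST used for intra luma blocks as two madd passes followed by
 * the reconstruction add. A scalar sketch of one pass is given below for
 * reference; the helper name and argument layout are assumptions, only the
 * 29/55/74/84 coefficient pattern comes from transform4x4_luma[] above.
 * Saturation to int16 (packs_epi32 in the SSE code) is omitted for brevity. */
#if 0
/* One pass of the 4x4 inverse DST: 'src' is read column-wise, 'dst' is
 * written row-wise; shift is 7 for the first pass and 12 for the second
 * (8-bit output). */
static void idst4_pass_ref(int16_t *dst, const int16_t *src, int shift)
{
    const int add = 1 << (shift - 1);
    for (int i = 0; i < 4; i++) {
        int c0 = src[i]     + src[8 + i];
        int c1 = src[8 + i] + src[12 + i];
        int c2 = src[i]     - src[12 + i];
        int c3 = 74 * src[4 + i];
        dst[4 * i + 0] = (29 * c0 + 55 * c1 + c3 + add) >> shift;
        dst[4 * i + 1] = (55 * c2 - 29 * c1 + c3 + add) >> shift;
        dst[4 * i + 2] = (74 * (src[i] - src[8 + i] + src[12 + i]) + add) >> shift;
        dst[4 * i + 3] = (55 * c0 + 29 * c2 - c3 + add) >> shift;
    }
}
#endif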
496
497 #if 0
498 void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
499 ptrdiff_t _stride) {
500 int i,j;
501 uint8_t shift_2nd = 10; // 20 - Bit depth
502 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
503
504 uint16_t *dst = (uint16_t*) _dst;
505 ptrdiff_t stride = _stride/(sizeof(uint16_t));
506 int16_t *src = coeffs;
507 __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
508 m128iD;
509
510 m128iAdd = _mm_set1_epi32(64);
511
512 S0 = _mm_loadu_si128((__m128i *) (src));
513 S8 = _mm_loadu_si128((__m128i *) (src + 8));
514
515 m128iAC = _mm_unpacklo_epi16(S0, S8);
516 m128iBD = _mm_unpackhi_epi16(S0, S8);
517
518 m128iTmp1 = _mm_madd_epi16(m128iAC,
519 _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
520 m128iTmp2 = _mm_madd_epi16(m128iBD,
521 _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
522 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
523 S0 = _mm_add_epi32(S0, m128iAdd);
524 S0 = _mm_srai_epi32(S0, shift_1st);
525
526 m128iTmp1 = _mm_madd_epi16(m128iAC,
527 _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
528 m128iTmp2 = _mm_madd_epi16(m128iBD,
529 _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
530 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
531 S8 = _mm_add_epi32(S8, m128iAdd);
532 S8 = _mm_srai_epi32(S8, shift_1st);
533
534 m128iA = _mm_packs_epi32(S0, S8);
535
536 m128iTmp1 = _mm_madd_epi16(m128iAC,
537 _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
538 m128iTmp2 = _mm_madd_epi16(m128iBD,
539 _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
540 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
541 S0 = _mm_add_epi32(S0, m128iAdd);
542 S0 = _mm_srai_epi32(S0, shift_1st);
543
544 m128iTmp1 = _mm_madd_epi16(m128iAC,
545 _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
546 m128iTmp2 = _mm_madd_epi16(m128iBD,
547 _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
548 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
549 S8 = _mm_add_epi32(S8, m128iAdd);
550 S8 = _mm_srai_epi32(S8, shift_1st);
551
552 m128iD = _mm_packs_epi32(S0, S8);
553
554 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
555 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
556
557 m128iA = _mm_unpacklo_epi16(S0, S8);
558 m128iD = _mm_unpackhi_epi16(S0, S8);
559
560 /* ################### */
561 m128iAdd = _mm_set1_epi32(add_2nd);
562
563 m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
564 m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
565
566 m128iTmp1 = _mm_madd_epi16(m128iAC,
567 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
568 m128iTmp2 = _mm_madd_epi16(m128iBD,
569 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
570 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
571 S0 = _mm_add_epi32(S0, m128iAdd);
572 S0 = _mm_srai_epi32(S0, shift_2nd);
573
574 m128iTmp1 = _mm_madd_epi16(m128iAC,
575 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
576 m128iTmp2 = _mm_madd_epi16(m128iBD,
577 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
578 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
579 S8 = _mm_add_epi32(S8, m128iAdd);
580 S8 = _mm_srai_epi32(S8, shift_2nd);
581
582 m128iA = _mm_packs_epi32(S0, S8);
583
584 m128iTmp1 = _mm_madd_epi16(m128iAC,
585 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
586 m128iTmp2 = _mm_madd_epi16(m128iBD,
587 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
588 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
589 S0 = _mm_add_epi32(S0, m128iAdd);
590 S0 = _mm_srai_epi32(S0, shift_2nd);
591
592 m128iTmp1 = _mm_madd_epi16(m128iAC,
593 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
594 m128iTmp2 = _mm_madd_epi16(m128iBD,
595 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
596 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
597 S8 = _mm_add_epi32(S8, m128iAdd);
598 S8 = _mm_srai_epi32(S8, shift_2nd);
599
600 m128iD = _mm_packs_epi32(S0, S8);
601
602 _mm_storeu_si128((__m128i *) (src), m128iA);
603 _mm_storeu_si128((__m128i *) (src + 8), m128iD);
604 j = 0;
605 for (i = 0; i < 2; i++) {
606 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
607 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
608 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
609 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
610 j += 1;
611 dst += stride;
612 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
613 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
614 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
615 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
616 j += 1;
617 dst += stride;
618 }
619
620 }
621 #endif
622
623
624 #if HAVE_SSE4_1
625 void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
626 ptrdiff_t _stride) {
627 uint8_t shift_2nd = 12; // 20 - Bit depth
628 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
629
630 uint8_t *dst = (uint8_t*) _dst;
631 ptrdiff_t stride = _stride;
632 int16_t *src = coeffs;
633
634 __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
635 S0 = _mm_load_si128((__m128i *) (src));
636 S8 = _mm_load_si128((__m128i *) (src + 8));
637 m128iAdd = _mm_set1_epi32(add_1st);
638
639 m128Tmp = _mm_unpacklo_epi16(S0, S8);
640 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
641 E1 = _mm_add_epi32(E1, m128iAdd);
642
643 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
644 E2 = _mm_add_epi32(E2, m128iAdd);
645
646 m128Tmp = _mm_unpackhi_epi16(S0, S8);
647 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
648 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
649
650 m128iA = _mm_add_epi32(E1, O1);
651 m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
652 m128Tmp = _mm_add_epi32(E2, O2);
653 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
654 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
655
656 m128iD = _mm_sub_epi32(E2, O2);
657 m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum
658
659 m128Tmp = _mm_sub_epi32(E1, O1);
660 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
661
662 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
663
664 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
665 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
666
667 m128iA = _mm_unpacklo_epi16(S0, S8);
668 m128iD = _mm_unpackhi_epi16(S0, S8);
669
670 /* ########################## */
671
672 m128iAdd = _mm_set1_epi32(add_2nd);
673 m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
674 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
675 E1 = _mm_add_epi32(E1, m128iAdd);
676
677 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
678 E2 = _mm_add_epi32(E2, m128iAdd);
679
680 m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
681 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
682 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
683
684 m128iA = _mm_add_epi32(E1, O1);
685 m128iA = _mm_srai_epi32(m128iA, shift_2nd);
686 m128Tmp = _mm_add_epi32(E2, O2);
687 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
688 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
689
690 m128iD = _mm_sub_epi32(E2, O2);
691 m128iD = _mm_srai_epi32(m128iD, shift_2nd);
692
693 m128Tmp = _mm_sub_epi32(E1, O1);
694 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
695
696 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
697
698 S0 = _mm_move_epi64(m128iA); //contains row 0
699 S8 = _mm_move_epi64(m128iD); //row 2
700 m128iA = _mm_srli_si128(m128iA, 8); // row 1
701 m128iD = _mm_srli_si128(m128iD, 8); // row 3
702 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
703 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
704 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
705 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
706
707 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
708
709 m128iA = _mm_loadl_epi64((__m128i *) dst);
710 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
711 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
712 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
713 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
714 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
715
716 dst += stride;
717
718 m128iA = _mm_loadl_epi64((__m128i *) dst);
719 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
720 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
721 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
722 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
723 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
724
725 dst += stride;
726
727 m128iA = _mm_loadl_epi64((__m128i *) dst);
728 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
729 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
730 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
731 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
732 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
733
734 dst += stride;
735
736 m128iA = _mm_loadl_epi64((__m128i *) dst);
737 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
738 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
739 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
740 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
741 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
742 }
743 #endif
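/* Editorial note: the even/odd ("partial butterfly") structure used in
 * ff_hevc_transform_4x4_add_8_sse4() above corresponds to the scalar pass
 * below. The helper name is an assumption; the 64/83/36 coefficients match
 * the transform4x4[] table. Saturation to int16 (packs_epi32) is omitted. */
#if 0
static void idct4_pass_ref(int16_t *dst, const int16_t *src, int shift)
{
    const int add = 1 << (shift - 1);
    for (int i = 0; i < 4; i++) {
        int o0 = 83 * src[4 + i] + 36 * src[12 + i];
        int o1 = 36 * src[4 + i] - 83 * src[12 + i];
        int e0 = 64 * (src[i] + src[8 + i]);
        int e1 = 64 * (src[i] - src[8 + i]);
        dst[4 * i + 0] = (e0 + o0 + add) >> shift;
        dst[4 * i + 1] = (e1 + o1 + add) >> shift;
        dst[4 * i + 2] = (e1 - o1 + add) >> shift;
        dst[4 * i + 3] = (e0 - o0 + add) >> shift;
    }
}
#endif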
744
745 #if 0
746 void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
747 ptrdiff_t _stride) {
748 int i;
749 uint8_t shift_2nd = 10; // 20 - Bit depth
750 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
751
752 uint16_t *dst = (uint16_t*) _dst;
753 ptrdiff_t stride = _stride/2;
754 int16_t *src = coeffs;
755
756 int j;
757 __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
758 S0 = _mm_load_si128((__m128i *) (src));
759 S8 = _mm_load_si128((__m128i *) (src + 8));
760 m128iAdd = _mm_set1_epi32(add_1st);
761
762 m128Tmp = _mm_unpacklo_epi16(S0, S8);
763 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
764 E1 = _mm_add_epi32(E1, m128iAdd);
765
766 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
767 E2 = _mm_add_epi32(E2, m128iAdd);
768
769 m128Tmp = _mm_unpackhi_epi16(S0, S8);
770 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
771 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
772
773 m128iA = _mm_add_epi32(E1, O1);
774 m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
775 m128Tmp = _mm_add_epi32(E2, O2);
776 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
777 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
778
779 m128iD = _mm_sub_epi32(E2, O2);
780 m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum
781
782 m128Tmp = _mm_sub_epi32(E1, O1);
783 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
784
785 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
786
787 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
788 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
789
790 m128iA = _mm_unpacklo_epi16(S0, S8);
791 m128iD = _mm_unpackhi_epi16(S0, S8);
792
793 /* ########################## */
794
795 m128iAdd = _mm_set1_epi32(add_2nd);
796 m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
797 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
798 E1 = _mm_add_epi32(E1, m128iAdd);
799
800 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
801 E2 = _mm_add_epi32(E2, m128iAdd);
802
803 m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
804 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
805 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
806
807 m128iA = _mm_add_epi32(E1, O1);
808 m128iA = _mm_srai_epi32(m128iA, shift_2nd);
809 m128Tmp = _mm_add_epi32(E2, O2);
810 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
811 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
812
813 m128iD = _mm_sub_epi32(E2, O2);
814 m128iD = _mm_srai_epi32(m128iD, shift_2nd);
815
816 m128Tmp = _mm_sub_epi32(E1, O1);
817 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
818
819 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
820 _mm_storeu_si128((__m128i *) (src), m128iA);
821 _mm_storeu_si128((__m128i *) (src + 8), m128iD);
822 j = 0;
823 for (i = 0; i < 2; i++) {
824 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
825 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
826 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
827 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
828 j += 1;
829 dst += stride;
830 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
831 dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
832 dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
833 dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
834 j += 1;
835 dst += stride;
836 }
837 }
838 #endif
839
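/* Editorial note: the 8x8 inverse transform below uses the same recursive
 * even/odd split as the reference partial butterfly: the odd part O0..O3 is
 * built from input rows 1, 3, 5 and 7 (transform8x8[0..7]); the even part
 * reuses the 4-point structure, with EE0/EE1 taken from rows 0 and 4 (the
 * 64/-64 rows) and E00/E01 from rows 2 and 6 (the 83/36 rows), combined as
 * E = EE +/- E0x. Each output is then (E +/- O + add) >> shift, first down
 * the columns and, after the transpose, along the rows. */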
840 #if HAVE_SSE4_1
841 void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
842 ptrdiff_t _stride) {
843 uint8_t shift_2nd = 12; // 20 - Bit depth
844 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
845
846 uint8_t *dst = (uint8_t*) _dst;
847 ptrdiff_t stride = _stride / sizeof(uint8_t);
848 int16_t *src = coeffs;
849 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
850 m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
851 E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
852
853 O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
854 T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
855 T0= _mm_load_si128((__m128i *) (transform8x8[0]));
856 T1= _mm_load_si128((__m128i *) (transform8x8[1]));
857 T2= _mm_load_si128((__m128i *) (transform8x8[2]));
858 T3= _mm_load_si128((__m128i *) (transform8x8[3]));
859 T4= _mm_load_si128((__m128i *) (transform8x8[4]));
860 T5= _mm_load_si128((__m128i *) (transform8x8[5]));
861 T6= _mm_load_si128((__m128i *) (transform8x8[6]));
862 T7= _mm_load_si128((__m128i *) (transform8x8[7]));
863 T8= _mm_load_si128((__m128i *) (transform8x8[8]));
864 T9= _mm_load_si128((__m128i *) (transform8x8[9]));
865 T10= _mm_load_si128((__m128i *) (transform8x8[10]));
866 T11= _mm_load_si128((__m128i *) (transform8x8[11]));
867
868 m128iAdd = _mm_set1_epi32(add_1st);
869
870 m128iS1 = _mm_load_si128((__m128i *) (src + 8));
871 m128iS3 = _mm_load_si128((__m128i *) (src + 24));
872 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
873 E1l = _mm_madd_epi16(m128Tmp0, T0);
874 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
875 E1h = _mm_madd_epi16(m128Tmp1, T0);
876 m128iS5 = _mm_load_si128((__m128i *) (src + 40));
877 m128iS7 = _mm_load_si128((__m128i *) (src + 56));
878 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
879 E2l = _mm_madd_epi16(m128Tmp2, T1);
880 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
881 E2h = _mm_madd_epi16(m128Tmp3, T1);
882 O0l = _mm_add_epi32(E1l, E2l);
883 O0h = _mm_add_epi32(E1h, E2h);
884
885 E1l = _mm_madd_epi16(m128Tmp0, T2);
886 E1h = _mm_madd_epi16(m128Tmp1, T2);
887 E2l = _mm_madd_epi16(m128Tmp2, T3);
888 E2h = _mm_madd_epi16(m128Tmp3, T3);
889
890 O1l = _mm_add_epi32(E1l, E2l);
891 O1h = _mm_add_epi32(E1h, E2h);
892
893 E1l = _mm_madd_epi16(m128Tmp0, T4);
894 E1h = _mm_madd_epi16(m128Tmp1, T4);
895 E2l = _mm_madd_epi16(m128Tmp2, T5);
896 E2h = _mm_madd_epi16(m128Tmp3, T5);
897 O2l = _mm_add_epi32(E1l, E2l);
898 O2h = _mm_add_epi32(E1h, E2h);
899
900 E1l = _mm_madd_epi16(m128Tmp0, T6);
901 E1h = _mm_madd_epi16(m128Tmp1, T6);
902 E2l = _mm_madd_epi16(m128Tmp2, T7);
903 E2h = _mm_madd_epi16(m128Tmp3, T7);
904 O3h = _mm_add_epi32(E1h, E2h);
905 O3l = _mm_add_epi32(E1l, E2l);
906
907 /* ------- */
908
909 m128iS0 = _mm_load_si128((__m128i *) (src + 0));
910 m128iS4 = _mm_load_si128((__m128i *) (src + 32));
911 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
912 EE0l = _mm_madd_epi16(m128Tmp0, T8);
913 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
914 EE0h = _mm_madd_epi16(m128Tmp1, T8);
915
916 EE1l = _mm_madd_epi16(m128Tmp0, T9);
917 EE1h = _mm_madd_epi16(m128Tmp1, T9);
918
919 /* ------- */
920
921 m128iS2 = _mm_load_si128((__m128i *) (src + 16));
922 m128iS6 = _mm_load_si128((__m128i *) (src + 48));
923 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
924 E00l = _mm_madd_epi16(m128Tmp0, T10);
925 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
926 E00h = _mm_madd_epi16(m128Tmp1, T10);
927 E01l = _mm_madd_epi16(m128Tmp0, T11);
928 E01h = _mm_madd_epi16(m128Tmp1, T11);
929 E0l = _mm_add_epi32(EE0l, E00l);
930 E0l = _mm_add_epi32(E0l, m128iAdd);
931 E0h = _mm_add_epi32(EE0h, E00h);
932 E0h = _mm_add_epi32(E0h, m128iAdd);
933 E3l = _mm_sub_epi32(EE0l, E00l);
934 E3l = _mm_add_epi32(E3l, m128iAdd);
935 E3h = _mm_sub_epi32(EE0h, E00h);
936 E3h = _mm_add_epi32(E3h, m128iAdd);
937
938 E1l = _mm_add_epi32(EE1l, E01l);
939 E1l = _mm_add_epi32(E1l, m128iAdd);
940 E1h = _mm_add_epi32(EE1h, E01h);
941 E1h = _mm_add_epi32(E1h, m128iAdd);
942 E2l = _mm_sub_epi32(EE1l, E01l);
943 E2l = _mm_add_epi32(E2l, m128iAdd);
944 E2h = _mm_sub_epi32(EE1h, E01h);
945 E2h = _mm_add_epi32(E2h, m128iAdd);
946 m128iS0 = _mm_packs_epi32(
947 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
948 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
949 m128iS1 = _mm_packs_epi32(
950 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
951 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
952 m128iS2 = _mm_packs_epi32(
953 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
954 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
955 m128iS3 = _mm_packs_epi32(
956 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
957 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
958 m128iS4 = _mm_packs_epi32(
959 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
960 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
961 m128iS5 = _mm_packs_epi32(
962 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
963 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
964 m128iS6 = _mm_packs_epi32(
965 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
966 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
967 m128iS7 = _mm_packs_epi32(
968 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
969 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
970                 /* Transpose the matrix for the second pass */
971
972 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
973 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
974 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
975 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
976 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
977 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
978 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
979 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
980 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
981 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
982 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
983 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
984 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
985 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
986 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
987 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
988 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
989 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
990 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
991 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
992 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
993 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
994 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
995 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
996
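    /* Second pass: the same butterfly is applied to the transposed data, now
       rounding with add_2nd and shifting by shift_2nd. */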
997 m128iAdd = _mm_set1_epi32(add_2nd);
998
999 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1000 E1l = _mm_madd_epi16(m128Tmp0, T0);
1001 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1002 E1h = _mm_madd_epi16(m128Tmp1, T0);
1003 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1004 E2l = _mm_madd_epi16(m128Tmp2, T1);
1005 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1006 E2h = _mm_madd_epi16(m128Tmp3, T1);
1007 O0l = _mm_add_epi32(E1l, E2l);
1008 O0h = _mm_add_epi32(E1h, E2h);
1009 E1l = _mm_madd_epi16(m128Tmp0, T2);
1010 E1h = _mm_madd_epi16(m128Tmp1, T2);
1011 E2l = _mm_madd_epi16(m128Tmp2, T3);
1012 E2h = _mm_madd_epi16(m128Tmp3, T3);
1013 O1l = _mm_add_epi32(E1l, E2l);
1014 O1h = _mm_add_epi32(E1h, E2h);
1015 E1l = _mm_madd_epi16(m128Tmp0, T4);
1016 E1h = _mm_madd_epi16(m128Tmp1, T4);
1017 E2l = _mm_madd_epi16(m128Tmp2, T5);
1018 E2h = _mm_madd_epi16(m128Tmp3, T5);
1019 O2l = _mm_add_epi32(E1l, E2l);
1020 O2h = _mm_add_epi32(E1h, E2h);
1021 E1l = _mm_madd_epi16(m128Tmp0, T6);
1022 E1h = _mm_madd_epi16(m128Tmp1, T6);
1023 E2l = _mm_madd_epi16(m128Tmp2, T7);
1024 E2h = _mm_madd_epi16(m128Tmp3, T7);
1025 O3h = _mm_add_epi32(E1h, E2h);
1026 O3l = _mm_add_epi32(E1l, E2l);
1027
1028 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1029 EE0l = _mm_madd_epi16(m128Tmp0, T8);
1030 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1031 EE0h = _mm_madd_epi16(m128Tmp1, T8);
1032 EE1l = _mm_madd_epi16(m128Tmp0, T9);
1033 EE1h = _mm_madd_epi16(m128Tmp1, T9);
1034
1035 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1036 E00l = _mm_madd_epi16(m128Tmp0, T10);
1037 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1038 E00h = _mm_madd_epi16(m128Tmp1, T10);
1039 E01l = _mm_madd_epi16(m128Tmp0, T11);
1040 E01h = _mm_madd_epi16(m128Tmp1, T11);
1041 E0l = _mm_add_epi32(EE0l, E00l);
1042 E0l = _mm_add_epi32(E0l, m128iAdd);
1043 E0h = _mm_add_epi32(EE0h, E00h);
1044 E0h = _mm_add_epi32(E0h, m128iAdd);
1045 E3l = _mm_sub_epi32(EE0l, E00l);
1046 E3l = _mm_add_epi32(E3l, m128iAdd);
1047 E3h = _mm_sub_epi32(EE0h, E00h);
1048 E3h = _mm_add_epi32(E3h, m128iAdd);
1049 E1l = _mm_add_epi32(EE1l, E01l);
1050 E1l = _mm_add_epi32(E1l, m128iAdd);
1051 E1h = _mm_add_epi32(EE1h, E01h);
1052 E1h = _mm_add_epi32(E1h, m128iAdd);
1053 E2l = _mm_sub_epi32(EE1l, E01l);
1054 E2l = _mm_add_epi32(E2l, m128iAdd);
1055 E2h = _mm_sub_epi32(EE1h, E01h);
1056 E2h = _mm_add_epi32(E2h, m128iAdd);
1057
1058 m128iS0 = _mm_packs_epi32(
1059 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1060 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1061 m128iS1 = _mm_packs_epi32(
1062 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1063 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1064 m128iS2 = _mm_packs_epi32(
1065 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1066 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1067 m128iS3 = _mm_packs_epi32(
1068 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1069 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1070 m128iS4 = _mm_packs_epi32(
1071 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1072 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1073 m128iS5 = _mm_packs_epi32(
1074 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1075 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1076 m128iS6 = _mm_packs_epi32(
1077 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1078 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1079 m128iS7 = _mm_packs_epi32(
1080 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1081 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1082
1083 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1084 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1085 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1086 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1087 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1088 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1089 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1090 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1091 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1092 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1093 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1094 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1095 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1096 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1097 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1098 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1099 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1100 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1101 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1102 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1103 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1104 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1105 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1106 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1107
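    /* Reconstruction: for each of the 8 rows, load 8 prediction samples,
       widen them to 16 bit, add the residual with signed saturation and pack
       back to 8 bit with unsigned saturation before storing. */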
1108 E0l = _mm_loadl_epi64((__m128i *) dst);
1109 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1110
1111 E0l = _mm_adds_epi16(E0l, m128iS0);
1112 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1113 _mm_storel_epi64((__m128i *) dst, E0l);
1114 dst += stride;
1115
1116 E0l = _mm_loadl_epi64((__m128i *) dst);
1117 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1118
1119 E0l = _mm_adds_epi16(E0l, m128iS1);
1120 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1121 _mm_storel_epi64((__m128i *) dst, E0l);
1122 dst += stride;
1123
1124 E0l = _mm_loadl_epi64((__m128i *) dst);
1125 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1126
1127 E0l = _mm_adds_epi16(E0l, m128iS2);
1128 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1129 _mm_storel_epi64((__m128i *) dst, E0l);
1130 dst += stride;
1131
1132 E0l = _mm_loadl_epi64((__m128i *) dst);
1133 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1134
1135 E0l = _mm_adds_epi16(E0l, m128iS3);
1136 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1137 _mm_storel_epi64((__m128i *) dst, E0l);
1138 dst += stride;
1139
1140 E0l = _mm_loadl_epi64((__m128i *) dst);
1141 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1142
1143 E0l = _mm_adds_epi16(E0l, m128iS4);
1144 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1145 _mm_storel_epi64((__m128i *) dst, E0l);
1146 dst += stride;
1147
1148 E0l = _mm_loadl_epi64((__m128i *) dst);
1149 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1150
1151 E0l = _mm_adds_epi16(E0l, m128iS5);
1152 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1153 _mm_storel_epi64((__m128i *) dst, E0l);
1154 dst += stride;
1155
1156 E0l = _mm_loadl_epi64((__m128i *) dst);
1157 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1158
1159 E0l = _mm_adds_epi16(E0l, m128iS6);
1160 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1161 _mm_storel_epi64((__m128i *) dst, E0l);
1162 dst += stride;
1163
1164 E0l = _mm_loadl_epi64((__m128i *) dst);
1165 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1166
1167 E0l = _mm_adds_epi16(E0l, m128iS7);
1168 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1169 _mm_storel_epi64((__m128i *) dst, E0l);
1170 dst += stride;
1171
1172 }
1173 #endif
1174
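/* The 10-bit 8x8 variant below is currently disabled (#if 0); it writes the
   intermediate results back into the coefficient buffer and clips the
   reconstructed samples to 10 bit with av_clip_uintp2 in a scalar loop. */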
1175 #if 0
1176 void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
1177 ptrdiff_t _stride) {
1178 int i;
1179 uint16_t *dst = (uint16_t*) _dst;
1180 ptrdiff_t stride = _stride / sizeof(uint16_t);
1181 int16_t *src = coeffs;
1182 uint8_t shift_2nd = 10; // 20 - Bit depth
1183 uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1184
1185 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1186 m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1187 E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1188 O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1189 int j;
1190 m128iAdd = _mm_set1_epi32(add_1st);
1191
1192 m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1193 m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1194 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1195 E1l = _mm_madd_epi16(m128Tmp0,
1196 _mm_load_si128((__m128i *) (transform8x8[0])));
1197 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1198 E1h = _mm_madd_epi16(m128Tmp1,
1199 _mm_load_si128((__m128i *) (transform8x8[0])));
1200 m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1201 m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1202 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1203 E2l = _mm_madd_epi16(m128Tmp2,
1204 _mm_load_si128((__m128i *) (transform8x8[1])));
1205 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1206 E2h = _mm_madd_epi16(m128Tmp3,
1207 _mm_load_si128((__m128i *) (transform8x8[1])));
1208 O0l = _mm_add_epi32(E1l, E2l);
1209 O0h = _mm_add_epi32(E1h, E2h);
1210
1211 E1l = _mm_madd_epi16(m128Tmp0,
1212 _mm_load_si128((__m128i *) (transform8x8[2])));
1213 E1h = _mm_madd_epi16(m128Tmp1,
1214 _mm_load_si128((__m128i *) (transform8x8[2])));
1215 E2l = _mm_madd_epi16(m128Tmp2,
1216 _mm_load_si128((__m128i *) (transform8x8[3])));
1217 E2h = _mm_madd_epi16(m128Tmp3,
1218 _mm_load_si128((__m128i *) (transform8x8[3])));
1219
1220 O1l = _mm_add_epi32(E1l, E2l);
1221 O1h = _mm_add_epi32(E1h, E2h);
1222
1223 E1l = _mm_madd_epi16(m128Tmp0,
1224 _mm_load_si128((__m128i *) (transform8x8[4])));
1225 E1h = _mm_madd_epi16(m128Tmp1,
1226 _mm_load_si128((__m128i *) (transform8x8[4])));
1227 E2l = _mm_madd_epi16(m128Tmp2,
1228 _mm_load_si128((__m128i *) (transform8x8[5])));
1229 E2h = _mm_madd_epi16(m128Tmp3,
1230 _mm_load_si128((__m128i *) (transform8x8[5])));
1231 O2l = _mm_add_epi32(E1l, E2l);
1232 O2h = _mm_add_epi32(E1h, E2h);
1233
1234 E1l = _mm_madd_epi16(m128Tmp0,
1235 _mm_load_si128((__m128i *) (transform8x8[6])));
1236 E1h = _mm_madd_epi16(m128Tmp1,
1237 _mm_load_si128((__m128i *) (transform8x8[6])));
1238 E2l = _mm_madd_epi16(m128Tmp2,
1239 _mm_load_si128((__m128i *) (transform8x8[7])));
1240 E2h = _mm_madd_epi16(m128Tmp3,
1241 _mm_load_si128((__m128i *) (transform8x8[7])));
1242 O3h = _mm_add_epi32(E1h, E2h);
1243 O3l = _mm_add_epi32(E1l, E2l);
1244
1245 /* ------- */
1246
1247 m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1248 m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1249 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1250 EE0l = _mm_madd_epi16(m128Tmp0,
1251 _mm_load_si128((__m128i *) (transform8x8[8])));
1252 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1253 EE0h = _mm_madd_epi16(m128Tmp1,
1254 _mm_load_si128((__m128i *) (transform8x8[8])));
1255
1256 EE1l = _mm_madd_epi16(m128Tmp0,
1257 _mm_load_si128((__m128i *) (transform8x8[9])));
1258 EE1h = _mm_madd_epi16(m128Tmp1,
1259 _mm_load_si128((__m128i *) (transform8x8[9])));
1260
1261 /* ------- */
1262
1263 m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1264 m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1265 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1266 E00l = _mm_madd_epi16(m128Tmp0,
1267 _mm_load_si128((__m128i *) (transform8x8[10])));
1268 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1269 E00h = _mm_madd_epi16(m128Tmp1,
1270 _mm_load_si128((__m128i *) (transform8x8[10])));
1271 E01l = _mm_madd_epi16(m128Tmp0,
1272 _mm_load_si128((__m128i *) (transform8x8[11])));
1273 E01h = _mm_madd_epi16(m128Tmp1,
1274 _mm_load_si128((__m128i *) (transform8x8[11])));
1275 E0l = _mm_add_epi32(EE0l, E00l);
1276 E0l = _mm_add_epi32(E0l, m128iAdd);
1277 E0h = _mm_add_epi32(EE0h, E00h);
1278 E0h = _mm_add_epi32(E0h, m128iAdd);
1279 E3l = _mm_sub_epi32(EE0l, E00l);
1280 E3l = _mm_add_epi32(E3l, m128iAdd);
1281 E3h = _mm_sub_epi32(EE0h, E00h);
1282 E3h = _mm_add_epi32(E3h, m128iAdd);
1283
1284 E1l = _mm_add_epi32(EE1l, E01l);
1285 E1l = _mm_add_epi32(E1l, m128iAdd);
1286 E1h = _mm_add_epi32(EE1h, E01h);
1287 E1h = _mm_add_epi32(E1h, m128iAdd);
1288 E2l = _mm_sub_epi32(EE1l, E01l);
1289 E2l = _mm_add_epi32(E2l, m128iAdd);
1290 E2h = _mm_sub_epi32(EE1h, E01h);
1291 E2h = _mm_add_epi32(E2h, m128iAdd);
1292 m128iS0 = _mm_packs_epi32(
1293 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1294 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1295 m128iS1 = _mm_packs_epi32(
1296 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1297 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1298 m128iS2 = _mm_packs_epi32(
1299 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1300 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1301 m128iS3 = _mm_packs_epi32(
1302 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1303 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1304 m128iS4 = _mm_packs_epi32(
1305 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1306 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1307 m128iS5 = _mm_packs_epi32(
1308 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1309 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1310 m128iS6 = _mm_packs_epi32(
1311 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1312 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1313 m128iS7 = _mm_packs_epi32(
1314 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1315 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1316                /* Transpose the matrix for the second pass */
1317
1318 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1319 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1320 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1321 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1322 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1323 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1324 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1325 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1326 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1327 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1328 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1329 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1330 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1331 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1332 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1333 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1334 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1335 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1336 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1337 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1338 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1339 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1340 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1341 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1342
1343 m128iAdd = _mm_set1_epi32(add_2nd);
1344
1345 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1346 E1l = _mm_madd_epi16(m128Tmp0,
1347 _mm_load_si128((__m128i *) (transform8x8[0])));
1348 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1349 E1h = _mm_madd_epi16(m128Tmp1,
1350 _mm_load_si128((__m128i *) (transform8x8[0])));
1351 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1352 E2l = _mm_madd_epi16(m128Tmp2,
1353 _mm_load_si128((__m128i *) (transform8x8[1])));
1354 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1355 E2h = _mm_madd_epi16(m128Tmp3,
1356 _mm_load_si128((__m128i *) (transform8x8[1])));
1357 O0l = _mm_add_epi32(E1l, E2l);
1358 O0h = _mm_add_epi32(E1h, E2h);
1359 E1l = _mm_madd_epi16(m128Tmp0,
1360 _mm_load_si128((__m128i *) (transform8x8[2])));
1361 E1h = _mm_madd_epi16(m128Tmp1,
1362 _mm_load_si128((__m128i *) (transform8x8[2])));
1363 E2l = _mm_madd_epi16(m128Tmp2,
1364 _mm_load_si128((__m128i *) (transform8x8[3])));
1365 E2h = _mm_madd_epi16(m128Tmp3,
1366 _mm_load_si128((__m128i *) (transform8x8[3])));
1367 O1l = _mm_add_epi32(E1l, E2l);
1368 O1h = _mm_add_epi32(E1h, E2h);
1369 E1l = _mm_madd_epi16(m128Tmp0,
1370 _mm_load_si128((__m128i *) (transform8x8[4])));
1371 E1h = _mm_madd_epi16(m128Tmp1,
1372 _mm_load_si128((__m128i *) (transform8x8[4])));
1373 E2l = _mm_madd_epi16(m128Tmp2,
1374 _mm_load_si128((__m128i *) (transform8x8[5])));
1375 E2h = _mm_madd_epi16(m128Tmp3,
1376 _mm_load_si128((__m128i *) (transform8x8[5])));
1377 O2l = _mm_add_epi32(E1l, E2l);
1378 O2h = _mm_add_epi32(E1h, E2h);
1379 E1l = _mm_madd_epi16(m128Tmp0,
1380 _mm_load_si128((__m128i *) (transform8x8[6])));
1381 E1h = _mm_madd_epi16(m128Tmp1,
1382 _mm_load_si128((__m128i *) (transform8x8[6])));
1383 E2l = _mm_madd_epi16(m128Tmp2,
1384 _mm_load_si128((__m128i *) (transform8x8[7])));
1385 E2h = _mm_madd_epi16(m128Tmp3,
1386 _mm_load_si128((__m128i *) (transform8x8[7])));
1387 O3h = _mm_add_epi32(E1h, E2h);
1388 O3l = _mm_add_epi32(E1l, E2l);
1389
1390 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1391 EE0l = _mm_madd_epi16(m128Tmp0,
1392 _mm_load_si128((__m128i *) (transform8x8[8])));
1393 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1394 EE0h = _mm_madd_epi16(m128Tmp1,
1395 _mm_load_si128((__m128i *) (transform8x8[8])));
1396 EE1l = _mm_madd_epi16(m128Tmp0,
1397 _mm_load_si128((__m128i *) (transform8x8[9])));
1398 EE1h = _mm_madd_epi16(m128Tmp1,
1399 _mm_load_si128((__m128i *) (transform8x8[9])));
1400
1401 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1402 E00l = _mm_madd_epi16(m128Tmp0,
1403 _mm_load_si128((__m128i *) (transform8x8[10])));
1404 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1405 E00h = _mm_madd_epi16(m128Tmp1,
1406 _mm_load_si128((__m128i *) (transform8x8[10])));
1407 E01l = _mm_madd_epi16(m128Tmp0,
1408 _mm_load_si128((__m128i *) (transform8x8[11])));
1409 E01h = _mm_madd_epi16(m128Tmp1,
1410 _mm_load_si128((__m128i *) (transform8x8[11])));
1411 E0l = _mm_add_epi32(EE0l, E00l);
1412 E0l = _mm_add_epi32(E0l, m128iAdd);
1413 E0h = _mm_add_epi32(EE0h, E00h);
1414 E0h = _mm_add_epi32(E0h, m128iAdd);
1415 E3l = _mm_sub_epi32(EE0l, E00l);
1416 E3l = _mm_add_epi32(E3l, m128iAdd);
1417 E3h = _mm_sub_epi32(EE0h, E00h);
1418 E3h = _mm_add_epi32(E3h, m128iAdd);
1419 E1l = _mm_add_epi32(EE1l, E01l);
1420 E1l = _mm_add_epi32(E1l, m128iAdd);
1421 E1h = _mm_add_epi32(EE1h, E01h);
1422 E1h = _mm_add_epi32(E1h, m128iAdd);
1423 E2l = _mm_sub_epi32(EE1l, E01l);
1424 E2l = _mm_add_epi32(E2l, m128iAdd);
1425 E2h = _mm_sub_epi32(EE1h, E01h);
1426 E2h = _mm_add_epi32(E2h, m128iAdd);
1427
1428 m128iS0 = _mm_packs_epi32(
1429 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1430 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1431 m128iS1 = _mm_packs_epi32(
1432 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1433 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1434 m128iS2 = _mm_packs_epi32(
1435 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1436 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1437 m128iS3 = _mm_packs_epi32(
1438 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1439 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1440 m128iS4 = _mm_packs_epi32(
1441 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1442 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1443 m128iS5 = _mm_packs_epi32(
1444 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1445 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1446 m128iS6 = _mm_packs_epi32(
1447 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1448 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1449 m128iS7 = _mm_packs_epi32(
1450 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1451 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1452
1453 _mm_store_si128((__m128i *) (src), m128iS0);
1454 _mm_store_si128((__m128i *) (src + 8), m128iS1);
1455 _mm_store_si128((__m128i *) (src + 16), m128iS2);
1456 _mm_store_si128((__m128i *) (src + 24), m128iS3);
1457 _mm_store_si128((__m128i *) (src + 32), m128iS4);
1458 _mm_store_si128((__m128i *) (src + 40), m128iS5);
1459 _mm_store_si128((__m128i *) (src + 48), m128iS6);
1460 _mm_store_si128((__m128i *) (src + 56), m128iS7);
1461
1462 j = 0;
1463 for (i = 0; i < 4; i++) {
1464 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1465 dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1466 dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1467 dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1468 dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1469 dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1470 dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1471 dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1472 j += 1;
1473 dst += stride;
1474 dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1475 dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1476 dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1477 dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1478 dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1479 dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1480 dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1481 dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1482 j += 1;
1483 dst += stride;
1484 }
1485
1486 }
1487 #endif
1488
1489
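/*
 * 8-bit 16x16 inverse transform + add (SSE4.1): same two-pass structure as the
 * 8x8 case, but each pass is split into two halves of eight columns (loop over
 * i), and the odd part now needs eight accumulators O0..O7 fed from coefficient
 * rows 1,3,...,15 via the transform16x16_1/2/3 factor tables. The registers
 * r0..r31 spill the transposed halves between the passes; the second pass adds
 * the result directly to the 16 prediction bytes of each output row.
 */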
1490 #if HAVE_SSE4_1
1491 void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
1492 ptrdiff_t _stride) {
1493 uint8_t shift_2nd = 12; // 20 - Bit depth
1494 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1495 int i;
1496 uint8_t *dst = (uint8_t*) _dst;
1497 ptrdiff_t stride = _stride / sizeof(uint8_t);
1498 int16_t *src = coeffs;
1499 int32_t shift;
1500 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1501 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1502 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1503 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1504 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1505 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1506 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1507 __m128i E4l, E5l, E6l, E7l;
1508 __m128i E4h, E5h, E6h, E7h;
1509 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1510 __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1511
1512
1513 /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1514 __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1515 __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1516 __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1517
1518 __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1519
1520 __m128i V00,V01, V10, V11;*/
1521
1522
1523 const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1524 const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1525 const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1526 const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1527 const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1528 const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1529 const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1530 const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1531 const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1532 const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1533 const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1534 const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1535 const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1536 const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1537 const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1538 const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1539 const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1540 const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1541 const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1542 const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1543 const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1544 const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1545 const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1546 const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1547 const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1548 const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1549 const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1550 const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1551 const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1552 const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1553 const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1554 const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1555
1556 const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1557 const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1558 const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1559 const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1560 const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1561 const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1562 const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1563 const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1564
1565 const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1566 const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1567 const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1568 const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1569
1570
1571
1572 int j;
1573 m128iS0 = _mm_load_si128((__m128i *) (src));
1574 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1575 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1576 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1577 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1578 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1579 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1580 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1581 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1582 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1583 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1584 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1585 m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1586 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1587 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1588 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1589 shift = shift_1st;
1590 m128iAdd = _mm_set1_epi32(add_1st);
1591
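    /* j selects the pass (0: first pass with shift_1st/add_1st, 1: second pass
       with shift_2nd/add_2nd); i selects which half of the 16 columns is being
       processed within the current pass. */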
1592 for (j = 0; j < 2; j++) {
1593 for (i = 0; i < 16; i += 8) {
1594
1595 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1596 E0l = _mm_madd_epi16(m128Tmp0,T00);
1597 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1598 E0h = _mm_madd_epi16(m128Tmp1,T00);
1599
1600 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1601 E1l = _mm_madd_epi16(m128Tmp2,T10);
1602 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1603 E1h = _mm_madd_epi16(m128Tmp3,T10);
1604
1605 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1606 E2l = _mm_madd_epi16(m128Tmp4,T20);
1607 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1608 E2h = _mm_madd_epi16(m128Tmp5,T20);
1609
1610 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1611 E3l = _mm_madd_epi16(m128Tmp6,T30);
1612 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1613 E3h = _mm_madd_epi16(m128Tmp7,T30);
1614
1615 O0l = _mm_add_epi32(E0l, E1l);
1616 O0l = _mm_add_epi32(O0l, E2l);
1617 O0l = _mm_add_epi32(O0l, E3l);
1618
1619 O0h = _mm_add_epi32(E0h, E1h);
1620 O0h = _mm_add_epi32(O0h, E2h);
1621 O0h = _mm_add_epi32(O0h, E3h);
1622
1623 /* Compute O1*/
1624 E0l = _mm_madd_epi16(m128Tmp0,T01);
1625 E0h = _mm_madd_epi16(m128Tmp1,T01);
1626 E1l = _mm_madd_epi16(m128Tmp2,T11);
1627 E1h = _mm_madd_epi16(m128Tmp3,T11);
1628 E2l = _mm_madd_epi16(m128Tmp4,T21);
1629 E2h = _mm_madd_epi16(m128Tmp5,T21);
1630 E3l = _mm_madd_epi16(m128Tmp6,T31);
1631 E3h = _mm_madd_epi16(m128Tmp7,T31);
1632 O1l = _mm_add_epi32(E0l, E1l);
1633 O1l = _mm_add_epi32(O1l, E2l);
1634 O1l = _mm_add_epi32(O1l, E3l);
1635 O1h = _mm_add_epi32(E0h, E1h);
1636 O1h = _mm_add_epi32(O1h, E2h);
1637 O1h = _mm_add_epi32(O1h, E3h);
1638
1639 /* Compute O2*/
1640 E0l = _mm_madd_epi16(m128Tmp0,T02);
1641 E0h = _mm_madd_epi16(m128Tmp1,T02);
1642 E1l = _mm_madd_epi16(m128Tmp2,T12);
1643 E1h = _mm_madd_epi16(m128Tmp3,T12);
1644 E2l = _mm_madd_epi16(m128Tmp4,T22);
1645 E2h = _mm_madd_epi16(m128Tmp5,T22);
1646 E3l = _mm_madd_epi16(m128Tmp6,T32);
1647 E3h = _mm_madd_epi16(m128Tmp7,T32);
1648 O2l = _mm_add_epi32(E0l, E1l);
1649 O2l = _mm_add_epi32(O2l, E2l);
1650 O2l = _mm_add_epi32(O2l, E3l);
1651
1652 O2h = _mm_add_epi32(E0h, E1h);
1653 O2h = _mm_add_epi32(O2h, E2h);
1654 O2h = _mm_add_epi32(O2h, E3h);
1655
1656 /* Compute O3*/
1657 E0l = _mm_madd_epi16(m128Tmp0,T03);
1658 E0h = _mm_madd_epi16(m128Tmp1,T03);
1659 E1l = _mm_madd_epi16(m128Tmp2,T13);
1660 E1h = _mm_madd_epi16(m128Tmp3,T13);
1661 E2l = _mm_madd_epi16(m128Tmp4,T23);
1662 E2h = _mm_madd_epi16(m128Tmp5,T23);
1663 E3l = _mm_madd_epi16(m128Tmp6,T33);
1664 E3h = _mm_madd_epi16(m128Tmp7,T33);
1665
1666 O3l = _mm_add_epi32(E0l, E1l);
1667 O3l = _mm_add_epi32(O3l, E2l);
1668 O3l = _mm_add_epi32(O3l, E3l);
1669
1670 O3h = _mm_add_epi32(E0h, E1h);
1671 O3h = _mm_add_epi32(O3h, E2h);
1672 O3h = _mm_add_epi32(O3h, E3h);
1673
1674 /* Compute O4*/
1675
1676 E0l = _mm_madd_epi16(m128Tmp0,T04);
1677 E0h = _mm_madd_epi16(m128Tmp1,T04);
1678 E1l = _mm_madd_epi16(m128Tmp2,T14);
1679 E1h = _mm_madd_epi16(m128Tmp3,T14);
1680 E2l = _mm_madd_epi16(m128Tmp4,T24);
1681 E2h = _mm_madd_epi16(m128Tmp5,T24);
1682 E3l = _mm_madd_epi16(m128Tmp6,T34);
1683 E3h = _mm_madd_epi16(m128Tmp7,T34);
1684
1685 O4l = _mm_add_epi32(E0l, E1l);
1686 O4l = _mm_add_epi32(O4l, E2l);
1687 O4l = _mm_add_epi32(O4l, E3l);
1688
1689 O4h = _mm_add_epi32(E0h, E1h);
1690 O4h = _mm_add_epi32(O4h, E2h);
1691 O4h = _mm_add_epi32(O4h, E3h);
1692
1693 /* Compute O5*/
1694 E0l = _mm_madd_epi16(m128Tmp0,T05);
1695 E0h = _mm_madd_epi16(m128Tmp1,T05);
1696 E1l = _mm_madd_epi16(m128Tmp2,T15);
1697 E1h = _mm_madd_epi16(m128Tmp3,T15);
1698 E2l = _mm_madd_epi16(m128Tmp4,T25);
1699 E2h = _mm_madd_epi16(m128Tmp5,T25);
1700 E3l = _mm_madd_epi16(m128Tmp6,T35);
1701 E3h = _mm_madd_epi16(m128Tmp7,T35);
1702
1703 O5l = _mm_add_epi32(E0l, E1l);
1704 O5l = _mm_add_epi32(O5l, E2l);
1705 O5l = _mm_add_epi32(O5l, E3l);
1706
1707 O5h = _mm_add_epi32(E0h, E1h);
1708 O5h = _mm_add_epi32(O5h, E2h);
1709 O5h = _mm_add_epi32(O5h, E3h);
1710
1711 /* Compute O6*/
1712
1713 E0l = _mm_madd_epi16(m128Tmp0,T06);
1714 E0h = _mm_madd_epi16(m128Tmp1,T06);
1715 E1l = _mm_madd_epi16(m128Tmp2,T16);
1716 E1h = _mm_madd_epi16(m128Tmp3,T16);
1717 E2l = _mm_madd_epi16(m128Tmp4,T26);
1718 E2h = _mm_madd_epi16(m128Tmp5,T26);
1719 E3l = _mm_madd_epi16(m128Tmp6,T36);
1720 E3h = _mm_madd_epi16(m128Tmp7,T36);
1721
1722 O6l = _mm_add_epi32(E0l, E1l);
1723 O6l = _mm_add_epi32(O6l, E2l);
1724 O6l = _mm_add_epi32(O6l, E3l);
1725
1726 O6h = _mm_add_epi32(E0h, E1h);
1727 O6h = _mm_add_epi32(O6h, E2h);
1728 O6h = _mm_add_epi32(O6h, E3h);
1729
1730 /* Compute O7*/
1731
1732 E0l = _mm_madd_epi16(m128Tmp0,T07);
1733 E0h = _mm_madd_epi16(m128Tmp1,T07);
1734 E1l = _mm_madd_epi16(m128Tmp2,T17);
1735 E1h = _mm_madd_epi16(m128Tmp3,T17);
1736 E2l = _mm_madd_epi16(m128Tmp4,T27);
1737 E2h = _mm_madd_epi16(m128Tmp5,T27);
1738 E3l = _mm_madd_epi16(m128Tmp6,T37);
1739 E3h = _mm_madd_epi16(m128Tmp7,T37);
1740
1741 O7l = _mm_add_epi32(E0l, E1l);
1742 O7l = _mm_add_epi32(O7l, E2l);
1743 O7l = _mm_add_epi32(O7l, E3l);
1744
1745 O7h = _mm_add_epi32(E0h, E1h);
1746 O7h = _mm_add_epi32(O7h, E2h);
1747 O7h = _mm_add_epi32(O7h, E3h);
1748
1749 /* Compute E0 */
1750
1751
1752
1753 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1754 E0l = _mm_madd_epi16(m128Tmp0,U00);
1755 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1756 E0h = _mm_madd_epi16(m128Tmp1,U00);
1757
1758 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1759 E0l = _mm_add_epi32(E0l,
1760 _mm_madd_epi16(m128Tmp2,U10));
1761 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1762 E0h = _mm_add_epi32(E0h,
1763 _mm_madd_epi16(m128Tmp3,U10));
1764
1765 /* Compute E1 */
1766 E1l = _mm_madd_epi16(m128Tmp0,U01);
1767 E1h = _mm_madd_epi16(m128Tmp1,U01);
1768 E1l = _mm_add_epi32(E1l,
1769 _mm_madd_epi16(m128Tmp2,U11));
1770 E1h = _mm_add_epi32(E1h,
1771 _mm_madd_epi16(m128Tmp3,U11));
1772
1773 /* Compute E2 */
1774 E2l = _mm_madd_epi16(m128Tmp0,U02);
1775 E2h = _mm_madd_epi16(m128Tmp1,U02);
1776 E2l = _mm_add_epi32(E2l,
1777 _mm_madd_epi16(m128Tmp2,U12));
1778 E2h = _mm_add_epi32(E2h,
1779 _mm_madd_epi16(m128Tmp3,U12));
1780 /* Compute E3 */
1781 E3l = _mm_madd_epi16(m128Tmp0,U03);
1782 E3h = _mm_madd_epi16(m128Tmp1,U03);
1783 E3l = _mm_add_epi32(E3l,
1784 _mm_madd_epi16(m128Tmp2,U13));
1785 E3h = _mm_add_epi32(E3h,
1786 _mm_madd_epi16(m128Tmp3,U13));
1787
1788 /* Compute EE0 and EEE */
1789
1790 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1791 E00l = _mm_madd_epi16(m128Tmp0,V00);
1792 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1793 E00h = _mm_madd_epi16(m128Tmp1,V00);
1794
1795 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1796 EE0l = _mm_madd_epi16(m128Tmp2,V10);
1797 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1798 EE0h = _mm_madd_epi16(m128Tmp3,V10);
1799
1800 E01l = _mm_madd_epi16(m128Tmp0,V01);
1801 E01h = _mm_madd_epi16(m128Tmp1,V01);
1802
1803 EE1l = _mm_madd_epi16(m128Tmp2,V11);
1804 EE1h = _mm_madd_epi16(m128Tmp3,V11);
1805
1806 /* Compute EE */
1807 EE2l = _mm_sub_epi32(EE1l, E01l);
1808 EE3l = _mm_sub_epi32(EE0l, E00l);
1809 EE2h = _mm_sub_epi32(EE1h, E01h);
1810 EE3h = _mm_sub_epi32(EE0h, E00h);
1811
1812 EE0l = _mm_add_epi32(EE0l, E00l);
1813 EE1l = _mm_add_epi32(EE1l, E01l);
1814 EE0h = _mm_add_epi32(EE0h, E00h);
1815 EE1h = _mm_add_epi32(EE1h, E01h);
1816
1817 /* Compute E */
1818
1819 E4l = _mm_sub_epi32(EE3l, E3l);
1820 E4l = _mm_add_epi32(E4l, m128iAdd);
1821
1822 E5l = _mm_sub_epi32(EE2l, E2l);
1823 E5l = _mm_add_epi32(E5l, m128iAdd);
1824
1825 E6l = _mm_sub_epi32(EE1l, E1l);
1826 E6l = _mm_add_epi32(E6l, m128iAdd);
1827
1828 E7l = _mm_sub_epi32(EE0l, E0l);
1829 E7l = _mm_add_epi32(E7l, m128iAdd);
1830
1831 E4h = _mm_sub_epi32(EE3h, E3h);
1832 E4h = _mm_add_epi32(E4h, m128iAdd);
1833
1834 E5h = _mm_sub_epi32(EE2h, E2h);
1835 E5h = _mm_add_epi32(E5h, m128iAdd);
1836
1837 E6h = _mm_sub_epi32(EE1h, E1h);
1838 E6h = _mm_add_epi32(E6h, m128iAdd);
1839
1840 E7h = _mm_sub_epi32(EE0h, E0h);
1841 E7h = _mm_add_epi32(E7h, m128iAdd);
1842
1843 E0l = _mm_add_epi32(EE0l, E0l);
1844 E0l = _mm_add_epi32(E0l, m128iAdd);
1845
1846 E1l = _mm_add_epi32(EE1l, E1l);
1847 E1l = _mm_add_epi32(E1l, m128iAdd);
1848
1849 E2l = _mm_add_epi32(EE2l, E2l);
1850 E2l = _mm_add_epi32(E2l, m128iAdd);
1851
1852 E3l = _mm_add_epi32(EE3l, E3l);
1853 E3l = _mm_add_epi32(E3l, m128iAdd);
1854
1855 E0h = _mm_add_epi32(EE0h, E0h);
1856 E0h = _mm_add_epi32(E0h, m128iAdd);
1857
1858 E1h = _mm_add_epi32(EE1h, E1h);
1859 E1h = _mm_add_epi32(E1h, m128iAdd);
1860
1861 E2h = _mm_add_epi32(EE2h, E2h);
1862 E2h = _mm_add_epi32(E2h, m128iAdd);
1863
1864 E3h = _mm_add_epi32(EE3h, E3h);
1865 E3h = _mm_add_epi32(E3h, m128iAdd);
1866
1867 m128iS0 = _mm_packs_epi32(
1868 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1869 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1870 m128iS1 = _mm_packs_epi32(
1871 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1872 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1873 m128iS2 = _mm_packs_epi32(
1874 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1875 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1876 m128iS3 = _mm_packs_epi32(
1877 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1878 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1879
1880 m128iS4 = _mm_packs_epi32(
1881 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1882 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1883 m128iS5 = _mm_packs_epi32(
1884 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1885 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1886 m128iS6 = _mm_packs_epi32(
1887 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1888 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1889 m128iS7 = _mm_packs_epi32(
1890 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1891 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1892
1893 m128iS15 = _mm_packs_epi32(
1894 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1895 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1896 m128iS14 = _mm_packs_epi32(
1897 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1898 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1899 m128iS13 = _mm_packs_epi32(
1900 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1901 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1902 m128iS12 = _mm_packs_epi32(
1903 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1904 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1905
1906 m128iS11 = _mm_packs_epi32(
1907 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1908 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1909 m128iS10 = _mm_packs_epi32(
1910 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1911 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1912 m128iS9 = _mm_packs_epi32(
1913 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1914 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1915 m128iS8 = _mm_packs_epi32(
1916 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1917 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1918
1919
1920
1921 if (!j) { //first pass
1922
1923                    /* Transpose this half of the matrix for the next pass */
1924 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1925 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1926 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1927 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1928 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1929 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1930 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1931 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1932
1933 E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1934 E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1935 E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1936 E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1937 E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1938 E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1939 E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1940 E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1941
1942 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1943 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1944 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1945 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1946
1947 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1948 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1949 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1950 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1951
1952 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1953 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1954 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1955 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1956
1957 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1958 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1959 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1960 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1961
1962 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1963 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1964 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1965 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1966
1967 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1968 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1969 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1970 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1971
1972 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1973 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1974 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1975 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1976
1977 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1978 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1979 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1980 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1981
1982 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1983 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1984 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1985 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1986
1987 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1988 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1989 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1990 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1991
1992 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1993 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1994 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1995 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1996
1997 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1998 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1999 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2000 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2001
2002 if (!i) {
2003
2004 r0= m128iS0; //0
2005 r1= m128iS1; //16
2006 r2= m128iS2; //32
2007 r3= m128iS3; //48
2008 r4= m128iS4; //64
2009 r5= m128iS5; //80
2010 r6= m128iS6; //96
2011 r7= m128iS7; //112
2012 r8= m128iS8; //128
2013 r9= m128iS9; //144
2014 r10= m128iS10; //160
2015 r11= m128iS11; //176
2016 r12= m128iS12; //192
2017 r13= m128iS13; //208
2018 r14= m128iS14; //224
2019 r15= m128iS15; //240
2020
2021
2022
2023 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2024 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2025 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2026 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2027 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2028 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2029 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2030 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2031 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2032 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2033 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2034 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2035 m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2036 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2037 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2038 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2039 } else {
2040
2041 r16= m128iS0; //8
2042 r17= m128iS1; //24
2043 r18= m128iS2; //40
2044 r19= m128iS3; //56
2045 r20= m128iS4; //72
2046 r21= m128iS5; //88
2047 r22= m128iS6; //104
2048 r23= m128iS7; //120
2049 r24= m128iS8; //136
2050 r25= m128iS9; //152
2051 r26= m128iS10; //168
2052 r27= m128iS11; //184
2053 r28= m128iS12; //200
2054 r29= m128iS13; //216
2055 r30= m128iS14; //232
2056 r31= m128iS15; //248
2057
2057                        //prepare the next iteration:
2059
2060 m128iS0= r0;
2061 m128iS1= r2;
2062 m128iS2= r4;
2063 m128iS3= r6;
2064 m128iS4= r8;
2065 m128iS5= r10;
2066 m128iS6= r12;
2067 m128iS7= r14;
2068 m128iS8= r16;
2069 m128iS9= r18;
2070 m128iS10=r20;
2071 m128iS11=r22;
2072 m128iS12=r24;
2073 m128iS13=r26;
2074 m128iS14=r28;
2075 m128iS15=r30;
2076
2077 shift = shift_2nd;
2078 m128iAdd = _mm_set1_epi32(add_2nd);
2079 }
2080
2081 } else {
2082
2083                    //transpose half of the matrix:
2084                    //instead of one register holding one half-column,
2085                    //one register holds one half-row.
2086 E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2087 E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2088 E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2089 E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2090 E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2091 E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2092 E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2093 E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2094
2095 O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2096 O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2097 O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2098 O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2099 O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2100 O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2101 O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2102 O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2103
2104
2105 m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2106 m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2107
2108 m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2109 m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2110
2111 r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row
2112 r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row
2113
2114
2115 r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row
2116                    r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row
2117
2118 m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2119 m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2120 m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2121 m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2122
2123
2124 r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2125 r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2126
2127 r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2128 r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2129
2130 m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2131 m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2132 m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2133 m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2134
2135 r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2136 r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2137
2138
2139 r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2140 r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2141
2142 m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2143 m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2144 m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2145 m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2146
2147 r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2148 r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2149
2150
2151 r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2152 r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2153
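                    /* Second-pass output: add the eight transposed result rows
                       to the corresponding 16-byte prediction rows with
                       saturation and store them back. */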
2154 dst = (uint8_t*) (_dst + (i*stride));
2155 m128Tmp0= _mm_setzero_si128();
2156 m128Tmp1= _mm_load_si128((__m128i*)dst);
2157 m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2158 m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2159 m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2160 m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2161 m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2162 m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2163 E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2164
2165
2166 r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2167 r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2168 r0= _mm_packus_epi16(r0,r2);
2169
2170
2171
2172
2173 r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2174 r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2175 r4= _mm_packus_epi16(r4,r6);
2176
2177
2178 r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2179 r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2180 r8= _mm_packus_epi16(r8,r10);
2181
2182
2183 r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2184 r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2185 r12= _mm_packus_epi16(r12,r14);
2186
2187
2188 r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2189 r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2190 r16= _mm_packus_epi16(r16,r18);
2191
2192
2193 r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2194 r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2195 r20= _mm_packus_epi16(r20,r22);
2196
2197
2198 r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2199 r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2200 r24= _mm_packus_epi16(r24,r26);
2201
2202
2203
2204 r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2205 r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2206 r28= _mm_packus_epi16(r28,r30);
2207
2208 _mm_store_si128((__m128i*)dst,r0);
2209 _mm_store_si128((__m128i*)(dst+stride),r4);
2210 _mm_store_si128((__m128i*)(dst+2*stride),r8);
2211 _mm_store_si128((__m128i*)(dst+3*stride),r12);
2212 _mm_store_si128((__m128i*)(dst+4*stride),r16);
2213 _mm_store_si128((__m128i*)(dst+5*stride),r20);
2214 _mm_store_si128((__m128i*)(dst+6*stride),r24);
2215 _mm_store_si128((__m128i*)(dst+7*stride),r28);
2216
2217
2218
2219 if (!i) {
2220                        //first 8 output rows stored; reload the saved second-half columns for the next iteration
2221
2222
2223 m128iS0= r1;
2224 m128iS1= r3;
2225 m128iS2= r5;
2226 m128iS3= r7;
2227 m128iS4= r9;
2228 m128iS5= r11;
2229 m128iS6= r13;
2230 m128iS7= r15;
2231 m128iS8= r17;
2232 m128iS9= r19;
2233 m128iS10=r21;
2234 m128iS11=r23;
2235 m128iS12=r25;
2236 m128iS13=r27;
2237 m128iS14=r29;
2238 m128iS15=r31;
2239 }
2240 }
2241 }
2242 }
2243 }
2244 #endif
2245
2246
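/* The 10-bit 16x16 variant below is likewise disabled (#if 0) and reloads the
   transform16x16 factor tables from memory on every use instead of caching
   them in registers. */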
2247 #if 0
2248 void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
2249 ptrdiff_t _stride) {
2250 int i;
2251 uint16_t *dst = (uint16_t*) _dst;
2252 ptrdiff_t stride = _stride / 2;
2253 int16_t *src = coeffs;
2254 int32_t shift;
2255 uint8_t shift_2nd = 10; //20 - bit depth
2256        uint16_t add_2nd = 1 << 9; // 1 << (shift_2nd - 1)
2257 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2258 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2259 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2260 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2261 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2262 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2263 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2264 __m128i E4l, E5l, E6l, E7l;
2265 __m128i E4h, E5h, E6h, E7h;
2266 int j;
2267 m128iS0 = _mm_load_si128((__m128i *) (src));
2268 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2269 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2270 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2271 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2272 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2273 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2274 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2275 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2276 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2277 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2278 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2279 m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2280 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2281 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2282 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2283 shift = shift_1st;
2284 m128iAdd = _mm_set1_epi32(add_1st);
2285
2286 for (j = 0; j < 2; j++) {
2287 for (i = 0; i < 16; i += 8) {
2288
2289 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2290 E0l = _mm_madd_epi16(m128Tmp0,
2291 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2292 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2293 E0h = _mm_madd_epi16(m128Tmp1,
2294 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2295
2296 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2297 E1l = _mm_madd_epi16(m128Tmp2,
2298 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2299 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2300 E1h = _mm_madd_epi16(m128Tmp3,
2301 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2302
2303 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2304 E2l = _mm_madd_epi16(m128Tmp4,
2305 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2306 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2307 E2h = _mm_madd_epi16(m128Tmp5,
2308 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2309
2310 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2311 E3l = _mm_madd_epi16(m128Tmp6,
2312 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2313 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2314 E3h = _mm_madd_epi16(m128Tmp7,
2315 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2316
2317 O0l = _mm_add_epi32(E0l, E1l);
2318 O0l = _mm_add_epi32(O0l, E2l);
2319 O0l = _mm_add_epi32(O0l, E3l);
2320
2321 O0h = _mm_add_epi32(E0h, E1h);
2322 O0h = _mm_add_epi32(O0h, E2h);
2323 O0h = _mm_add_epi32(O0h, E3h);
2324
2325 /* Compute O1*/
2326 E0l = _mm_madd_epi16(m128Tmp0,
2327 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2328 E0h = _mm_madd_epi16(m128Tmp1,
2329 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2330 E1l = _mm_madd_epi16(m128Tmp2,
2331 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2332 E1h = _mm_madd_epi16(m128Tmp3,
2333 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2334 E2l = _mm_madd_epi16(m128Tmp4,
2335 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2336 E2h = _mm_madd_epi16(m128Tmp5,
2337 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2338 E3l = _mm_madd_epi16(m128Tmp6,
2339 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2340 E3h = _mm_madd_epi16(m128Tmp7,
2341 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2342 O1l = _mm_add_epi32(E0l, E1l);
2343 O1l = _mm_add_epi32(O1l, E2l);
2344 O1l = _mm_add_epi32(O1l, E3l);
2345 O1h = _mm_add_epi32(E0h, E1h);
2346 O1h = _mm_add_epi32(O1h, E2h);
2347 O1h = _mm_add_epi32(O1h, E3h);
2348
2349 /* Compute O2*/
2350 E0l = _mm_madd_epi16(m128Tmp0,
2351 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2352 E0h = _mm_madd_epi16(m128Tmp1,
2353 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2354 E1l = _mm_madd_epi16(m128Tmp2,
2355 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2356 E1h = _mm_madd_epi16(m128Tmp3,
2357 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2358 E2l = _mm_madd_epi16(m128Tmp4,
2359 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2360 E2h = _mm_madd_epi16(m128Tmp5,
2361 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2362 E3l = _mm_madd_epi16(m128Tmp6,
2363 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2364 E3h = _mm_madd_epi16(m128Tmp7,
2365 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2366 O2l = _mm_add_epi32(E0l, E1l);
2367 O2l = _mm_add_epi32(O2l, E2l);
2368 O2l = _mm_add_epi32(O2l, E3l);
2369
2370 O2h = _mm_add_epi32(E0h, E1h);
2371 O2h = _mm_add_epi32(O2h, E2h);
2372 O2h = _mm_add_epi32(O2h, E3h);
2373
2374 /* Compute O3*/
2375 E0l = _mm_madd_epi16(m128Tmp0,
2376 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2377 E0h = _mm_madd_epi16(m128Tmp1,
2378 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2379 E1l = _mm_madd_epi16(m128Tmp2,
2380 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2381 E1h = _mm_madd_epi16(m128Tmp3,
2382 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2383 E2l = _mm_madd_epi16(m128Tmp4,
2384 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2385 E2h = _mm_madd_epi16(m128Tmp5,
2386 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2387 E3l = _mm_madd_epi16(m128Tmp6,
2388 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2389 E3h = _mm_madd_epi16(m128Tmp7,
2390 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2391
2392 O3l = _mm_add_epi32(E0l, E1l);
2393 O3l = _mm_add_epi32(O3l, E2l);
2394 O3l = _mm_add_epi32(O3l, E3l);
2395
2396 O3h = _mm_add_epi32(E0h, E1h);
2397 O3h = _mm_add_epi32(O3h, E2h);
2398 O3h = _mm_add_epi32(O3h, E3h);
2399
2400 /* Compute O4*/
2401
2402 E0l = _mm_madd_epi16(m128Tmp0,
2403 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2404 E0h = _mm_madd_epi16(m128Tmp1,
2405 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2406 E1l = _mm_madd_epi16(m128Tmp2,
2407 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2408 E1h = _mm_madd_epi16(m128Tmp3,
2409 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2410 E2l = _mm_madd_epi16(m128Tmp4,
2411 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2412 E2h = _mm_madd_epi16(m128Tmp5,
2413 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2414 E3l = _mm_madd_epi16(m128Tmp6,
2415 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2416 E3h = _mm_madd_epi16(m128Tmp7,
2417 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2418
2419 O4l = _mm_add_epi32(E0l, E1l);
2420 O4l = _mm_add_epi32(O4l, E2l);
2421 O4l = _mm_add_epi32(O4l, E3l);
2422
2423 O4h = _mm_add_epi32(E0h, E1h);
2424 O4h = _mm_add_epi32(O4h, E2h);
2425 O4h = _mm_add_epi32(O4h, E3h);
2426
2427 /* Compute O5*/
2428 E0l = _mm_madd_epi16(m128Tmp0,
2429 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2430 E0h = _mm_madd_epi16(m128Tmp1,
2431 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2432 E1l = _mm_madd_epi16(m128Tmp2,
2433 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2434 E1h = _mm_madd_epi16(m128Tmp3,
2435 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2436 E2l = _mm_madd_epi16(m128Tmp4,
2437 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2438 E2h = _mm_madd_epi16(m128Tmp5,
2439 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2440 E3l = _mm_madd_epi16(m128Tmp6,
2441 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2442 E3h = _mm_madd_epi16(m128Tmp7,
2443 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2444
2445 O5l = _mm_add_epi32(E0l, E1l);
2446 O5l = _mm_add_epi32(O5l, E2l);
2447 O5l = _mm_add_epi32(O5l, E3l);
2448
2449 O5h = _mm_add_epi32(E0h, E1h);
2450 O5h = _mm_add_epi32(O5h, E2h);
2451 O5h = _mm_add_epi32(O5h, E3h);
2452
2453 /* Compute O6*/
2454
2455 E0l = _mm_madd_epi16(m128Tmp0,
2456 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2457 E0h = _mm_madd_epi16(m128Tmp1,
2458 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2459 E1l = _mm_madd_epi16(m128Tmp2,
2460 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2461 E1h = _mm_madd_epi16(m128Tmp3,
2462 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2463 E2l = _mm_madd_epi16(m128Tmp4,
2464 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2465 E2h = _mm_madd_epi16(m128Tmp5,
2466 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2467 E3l = _mm_madd_epi16(m128Tmp6,
2468 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2469 E3h = _mm_madd_epi16(m128Tmp7,
2470 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2471
2472 O6l = _mm_add_epi32(E0l, E1l);
2473 O6l = _mm_add_epi32(O6l, E2l);
2474 O6l = _mm_add_epi32(O6l, E3l);
2475
2476 O6h = _mm_add_epi32(E0h, E1h);
2477 O6h = _mm_add_epi32(O6h, E2h);
2478 O6h = _mm_add_epi32(O6h, E3h);
2479
2480 /* Compute O7*/
2481
2482 E0l = _mm_madd_epi16(m128Tmp0,
2483 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2484 E0h = _mm_madd_epi16(m128Tmp1,
2485 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2486 E1l = _mm_madd_epi16(m128Tmp2,
2487 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2488 E1h = _mm_madd_epi16(m128Tmp3,
2489 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2490 E2l = _mm_madd_epi16(m128Tmp4,
2491 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2492 E2h = _mm_madd_epi16(m128Tmp5,
2493 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2494 E3l = _mm_madd_epi16(m128Tmp6,
2495 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2496 E3h = _mm_madd_epi16(m128Tmp7,
2497 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2498
2499 O7l = _mm_add_epi32(E0l, E1l);
2500 O7l = _mm_add_epi32(O7l, E2l);
2501 O7l = _mm_add_epi32(O7l, E3l);
2502
2503 O7h = _mm_add_epi32(E0h, E1h);
2504 O7h = _mm_add_epi32(O7h, E2h);
2505 O7h = _mm_add_epi32(O7h, E3h);
2506
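                /* O0..O7 above are the odd-part sums of the 16-point inverse
                 * transform: each accumulates the four madd products of the
                 * odd-indexed rows (m128iS1/3/.../15, paired in m128Tmp0..7)
                 * with one column of transform16x16_1, kept as separate
                 * low/high 32-bit halves (Okl / Okh). */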
2507 /* Compute E0 */
2508
2509 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2510 E0l = _mm_madd_epi16(m128Tmp0,
2511 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2512 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2513 E0h = _mm_madd_epi16(m128Tmp1,
2514 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2515
2516 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2517 E0l = _mm_add_epi32(E0l,
2518 _mm_madd_epi16(m128Tmp2,
2519 _mm_load_si128(
2520 (__m128i *) (transform16x16_2[1][0]))));
2521 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2522 E0h = _mm_add_epi32(E0h,
2523 _mm_madd_epi16(m128Tmp3,
2524 _mm_load_si128(
2525 (__m128i *) (transform16x16_2[1][0]))));
2526
2527 /* Compute E1 */
2528 E1l = _mm_madd_epi16(m128Tmp0,
2529 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2530 E1h = _mm_madd_epi16(m128Tmp1,
2531 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2532 E1l = _mm_add_epi32(E1l,
2533 _mm_madd_epi16(m128Tmp2,
2534 _mm_load_si128(
2535 (__m128i *) (transform16x16_2[1][1]))));
2536 E1h = _mm_add_epi32(E1h,
2537 _mm_madd_epi16(m128Tmp3,
2538 _mm_load_si128(
2539 (__m128i *) (transform16x16_2[1][1]))));
2540
2541 /* Compute E2 */
2542 E2l = _mm_madd_epi16(m128Tmp0,
2543 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2544 E2h = _mm_madd_epi16(m128Tmp1,
2545 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2546 E2l = _mm_add_epi32(E2l,
2547 _mm_madd_epi16(m128Tmp2,
2548 _mm_load_si128(
2549 (__m128i *) (transform16x16_2[1][2]))));
2550 E2h = _mm_add_epi32(E2h,
2551 _mm_madd_epi16(m128Tmp3,
2552 _mm_load_si128(
2553 (__m128i *) (transform16x16_2[1][2]))));
2554 /* Compute E3 */
2555 E3l = _mm_madd_epi16(m128Tmp0,
2556 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2557 E3h = _mm_madd_epi16(m128Tmp1,
2558 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2559 E3l = _mm_add_epi32(E3l,
2560 _mm_madd_epi16(m128Tmp2,
2561 _mm_load_si128(
2562 (__m128i *) (transform16x16_2[1][3]))));
2563 E3h = _mm_add_epi32(E3h,
2564 _mm_madd_epi16(m128Tmp3,
2565 _mm_load_si128(
2566 (__m128i *) (transform16x16_2[1][3]))));
2567
2568 /* Compute EE0 and EEE */
2569
2570 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2571 E00l = _mm_madd_epi16(m128Tmp0,
2572 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2573 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2574 E00h = _mm_madd_epi16(m128Tmp1,
2575 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2576
2577 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2578 EE0l = _mm_madd_epi16(m128Tmp2,
2579 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2580 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2581 EE0h = _mm_madd_epi16(m128Tmp3,
2582 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2583
2584 E01l = _mm_madd_epi16(m128Tmp0,
2585 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2586 E01h = _mm_madd_epi16(m128Tmp1,
2587 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2588
2589 EE1l = _mm_madd_epi16(m128Tmp2,
2590 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2591 EE1h = _mm_madd_epi16(m128Tmp3,
2592 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2593
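                /* EE0/EE1 come from rows 0 and 8 (the 4-point even-even part),
                 * E00/E01 from rows 4 and 12; the sums and differences below
                 * expand them into the four EE terms, which are then combined
                 * with E0..E3 (rows 2/6/10/14) to form the eight even outputs. */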
2594 /* Compute EE */
2595 EE2l = _mm_sub_epi32(EE1l, E01l);
2596 EE3l = _mm_sub_epi32(EE0l, E00l);
2597 EE2h = _mm_sub_epi32(EE1h, E01h);
2598 EE3h = _mm_sub_epi32(EE0h, E00h);
2599
2600 EE0l = _mm_add_epi32(EE0l, E00l);
2601 EE1l = _mm_add_epi32(EE1l, E01l);
2602 EE0h = _mm_add_epi32(EE0h, E00h);
2603 EE1h = _mm_add_epi32(EE1h, E01h);
2604
2605 /* Compute E */
2606
2607 E4l = _mm_sub_epi32(EE3l, E3l);
2608 E4l = _mm_add_epi32(E4l, m128iAdd);
2609
2610 E5l = _mm_sub_epi32(EE2l, E2l);
2611 E5l = _mm_add_epi32(E5l, m128iAdd);
2612
2613 E6l = _mm_sub_epi32(EE1l, E1l);
2614 E6l = _mm_add_epi32(E6l, m128iAdd);
2615
2616 E7l = _mm_sub_epi32(EE0l, E0l);
2617 E7l = _mm_add_epi32(E7l, m128iAdd);
2618
2619 E4h = _mm_sub_epi32(EE3h, E3h);
2620 E4h = _mm_add_epi32(E4h, m128iAdd);
2621
2622 E5h = _mm_sub_epi32(EE2h, E2h);
2623 E5h = _mm_add_epi32(E5h, m128iAdd);
2624
2625 E6h = _mm_sub_epi32(EE1h, E1h);
2626 E6h = _mm_add_epi32(E6h, m128iAdd);
2627
2628 E7h = _mm_sub_epi32(EE0h, E0h);
2629 E7h = _mm_add_epi32(E7h, m128iAdd);
2630
2631 E0l = _mm_add_epi32(EE0l, E0l);
2632 E0l = _mm_add_epi32(E0l, m128iAdd);
2633
2634 E1l = _mm_add_epi32(EE1l, E1l);
2635 E1l = _mm_add_epi32(E1l, m128iAdd);
2636
2637 E2l = _mm_add_epi32(EE2l, E2l);
2638 E2l = _mm_add_epi32(E2l, m128iAdd);
2639
2640 E3l = _mm_add_epi32(EE3l, E3l);
2641 E3l = _mm_add_epi32(E3l, m128iAdd);
2642
2643 E0h = _mm_add_epi32(EE0h, E0h);
2644 E0h = _mm_add_epi32(E0h, m128iAdd);
2645
2646 E1h = _mm_add_epi32(EE1h, E1h);
2647 E1h = _mm_add_epi32(E1h, m128iAdd);
2648
2649 E2h = _mm_add_epi32(EE2h, E2h);
2650 E2h = _mm_add_epi32(E2h, m128iAdd);
2651
2652 E3h = _mm_add_epi32(EE3h, E3h);
2653 E3h = _mm_add_epi32(E3h, m128iAdd);
2654
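                /* Final butterfly for this pass; per coefficient this is
                 *   row[k]      = clip_int16((E[k] + O[k]) >> shift);
                 *   row[15 - k] = clip_int16((E[k] - O[k]) >> shift);
                 * with the rounding offset (m128iAdd) already folded into E[k]
                 * above; _mm_packs_epi32 provides the saturation to int16. */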
2655 m128iS0 = _mm_packs_epi32(
2656 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2657 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2658 m128iS1 = _mm_packs_epi32(
2659 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2660 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2661 m128iS2 = _mm_packs_epi32(
2662 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2663 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2664 m128iS3 = _mm_packs_epi32(
2665 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2666 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2667
2668 m128iS4 = _mm_packs_epi32(
2669 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2670 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2671 m128iS5 = _mm_packs_epi32(
2672 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2673 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2674 m128iS6 = _mm_packs_epi32(
2675 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2676 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2677 m128iS7 = _mm_packs_epi32(
2678 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2679 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2680
2681 m128iS15 = _mm_packs_epi32(
2682 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2683 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2684 m128iS14 = _mm_packs_epi32(
2685 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2686 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2687 m128iS13 = _mm_packs_epi32(
2688 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2689 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2690 m128iS12 = _mm_packs_epi32(
2691 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2692 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2693
2694 m128iS11 = _mm_packs_epi32(
2695 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2696 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2697 m128iS10 = _mm_packs_epi32(
2698 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2699 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2700 m128iS9 = _mm_packs_epi32(
2701 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2702 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2703 m128iS8 = _mm_packs_epi32(
2704 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2705 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2706
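                /* Two passes over the block: after the first pass (j == 0) the
                 * partial result is transposed and written back to src so the
                 * same code can run the second pass; after the second pass the
                 * rows are added to the destination with 10-bit clipping below. */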
2707 if (!j) {
2708 /* Transpose the matrix for the second pass */
2709 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2710 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2711 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2712 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2713 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2714 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2715 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2716 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2717
2718 O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2719 O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2720 O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2721 O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2722 O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2723 O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2724 O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2725 O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2726
2727 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2728 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2729 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2730 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2731
2732 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2733 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2734 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2735 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2736
2737 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2738 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2739 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2740 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2741
2742 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2743 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2744 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2745 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2746
2747 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2748 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2749 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2750 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2751
2752 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2753 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2754 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2755 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2756
2757 m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2758 m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2759 m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2760 m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2761
2762 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2763 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2764 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2765 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2766
2767 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2768 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2769 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2770 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2771
2772 m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2773 m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2774 m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2775 m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2776
2777 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2778 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2779 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2780 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2781
2782 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2783 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2784 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2785 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2786
2787 /* Store the transposed rows back into the coefficient buffer */
2788 _mm_store_si128((__m128i *) (src + i), m128iS0);
2789 _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2790 _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2791 _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2792 _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2793 _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2794 _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2795 _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2796 _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2797 _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2798 _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2799 _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2800 _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2801 _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2802 _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2803 _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2804
2805 if (!i) {
2806 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2807 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2808 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2809 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2810 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2811 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2812 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2813 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2814 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2815 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2816 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2817 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2818 m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2819 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2820 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2821 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2822 } else {
2823 m128iS0 = _mm_load_si128((__m128i *) (src));
2824 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2825 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2826 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2827 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2828 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2829 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2830 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2831 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2832 m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2833 m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2834 m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2835 m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2836 m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2837 m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2838 m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2839 shift = shift_2nd;
2840 m128iAdd = _mm_set1_epi32(add_2nd);
2841 }
2842
2843 } else {
2844 int k, m = 0;
2845 _mm_storeu_si128((__m128i *) (src), m128iS0);
2846 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2847 _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2848 _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2849 _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2850 _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2851 _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2852 _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2853 _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2854 _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2855 _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2856 _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2857 _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2858 _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2859 _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2860 _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2861 dst = (uint16_t*) _dst + (i * stride);
2862
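                    /* Second-pass output: add the reconstructed residual to the
                     * 16-bit destination samples and clip to the 10-bit range,
                     * i.e. dst[x] = av_clip_uintp2(dst[x] + residual, 10). */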
2863 for (k = 0; k < 8; k++) {
2864 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2865 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2866 dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2867 dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2868 dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2869 dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2870 dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2871 dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2872
2873 dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2874 dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2875 dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2876 dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2877 dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2878 dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2879 dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2880 dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2881 m += 1;
2882 dst += stride;
2883 }
2884 if (!i) {
2885 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2886 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2887 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2888 m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2889 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2890 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2891 m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2892 m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2893 m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2894 m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2895 m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2896 m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2897 m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2898 m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2899 m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2900 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2901 }
2902 }
2903 }
2904 }
2905
2906 }
2907 #endif
2908
2909
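/* The 32x32 variant below follows the same two-pass structure as the 16x16
 * functions above, but with sixteen odd-part sums (O0..O15) per column group
 * and an 8-bit destination (hence shift_2nd = 20 - 8 = 12). */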
2910 #if HAVE_SSE4_1
2911 void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
2912 ptrdiff_t _stride) {
2913 uint8_t shift_2nd = 12; // 20 - Bit depth
2914 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2915 int i, j;
2916 uint8_t *dst = (uint8_t*) _dst;
2917 ptrdiff_t stride = _stride / sizeof(uint8_t);
2918 int shift;
2919 int16_t *src = coeffs;
2920
2921 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2922 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2923 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2924 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2925 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2926 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2927 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2928 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2929 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2930 EEE0l, EEE1l, EEE0h, EEE1h;
2931 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2932 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2933 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2934 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2935 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2936 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2937 EE4l, EE7h, EE6h, EE5h, EE4h;
2938
2939 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2940 __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2941 __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2942 __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2943
2944
2945 m128iS0 = _mm_load_si128((__m128i *) (src));
2946 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2947 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2948 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2949 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2950 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2951 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2952 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2953 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2954 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2955 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2956 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2957 m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2958 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2959 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2960 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2961 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2962 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2963 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2964 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2965 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2966 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2967 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2968 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2969 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2970 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2971 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2972 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2973 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2974 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2975 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2976 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2977
2978 shift = shift_1st;
2979 m128iAdd = _mm_set1_epi32(add_1st);
2980
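    /* Outer loop: j == 0 is the first (column) pass, j == 1 the second pass.
     * Inner loop: i advances by 8 coefficients, one __m128i of eight int16
     * values per row, covering the 32 columns in four chunks. */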
2981 for (j = 0; j < 2; j++) {
2982 for (i = 0; i < 32; i += 8) {
2983 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2984 E0l = _mm_madd_epi16(m128Tmp0,
2985 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2986 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2987 E0h = _mm_madd_epi16(m128Tmp1,
2988 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2989
2990 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2991 E1l = _mm_madd_epi16(m128Tmp2,
2992 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2993 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2994 E1h = _mm_madd_epi16(m128Tmp3,
2995 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2996
2997 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2998 E2l = _mm_madd_epi16(m128Tmp4,
2999 _mm_load_si128((__m128i *) (transform32x32[2][0])));
3000 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3001 E2h = _mm_madd_epi16(m128Tmp5,
3002 _mm_load_si128((__m128i *) (transform32x32[2][0])));
3003
3004 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3005 E3l = _mm_madd_epi16(m128Tmp6,
3006 _mm_load_si128((__m128i *) (transform32x32[3][0])));
3007 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3008 E3h = _mm_madd_epi16(m128Tmp7,
3009 _mm_load_si128((__m128i *) (transform32x32[3][0])));
3010
3011 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3012 E4l = _mm_madd_epi16(m128Tmp8,
3013 _mm_load_si128((__m128i *) (transform32x32[4][0])));
3014 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3015 E4h = _mm_madd_epi16(m128Tmp9,
3016 _mm_load_si128((__m128i *) (transform32x32[4][0])));
3017
3018 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3019 E5l = _mm_madd_epi16(m128Tmp10,
3020 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3021 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3022 E5h = _mm_madd_epi16(m128Tmp11,
3023 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3024
3025 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3026 E6l = _mm_madd_epi16(m128Tmp12,
3027 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3028 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3029 E6h = _mm_madd_epi16(m128Tmp13,
3030 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3031
3032 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3033 E7l = _mm_madd_epi16(m128Tmp14,
3034 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3035 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3036 E7h = _mm_madd_epi16(m128Tmp15,
3037 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3038
3039 O0l = _mm_add_epi32(E0l, E1l);
3040 O0l = _mm_add_epi32(O0l, E2l);
3041 O0l = _mm_add_epi32(O0l, E3l);
3042 O0l = _mm_add_epi32(O0l, E4l);
3043 O0l = _mm_add_epi32(O0l, E5l);
3044 O0l = _mm_add_epi32(O0l, E6l);
3045 O0l = _mm_add_epi32(O0l, E7l);
3046
3047 O0h = _mm_add_epi32(E0h, E1h);
3048 O0h = _mm_add_epi32(O0h, E2h);
3049 O0h = _mm_add_epi32(O0h, E3h);
3050 O0h = _mm_add_epi32(O0h, E4h);
3051 O0h = _mm_add_epi32(O0h, E5h);
3052 O0h = _mm_add_epi32(O0h, E6h);
3053 O0h = _mm_add_epi32(O0h, E7h);
3054
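            /* As in the 16x16 case, each Ok (k = 0..15) accumulates the madd
             * products of the sixteen odd-indexed rows, taken eight pairs at a
             * time (m128Tmp0..15) against one column of transform32x32. */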
3055 /* Compute O1*/
3056 E0l = _mm_madd_epi16(m128Tmp0,
3057 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3058 E0h = _mm_madd_epi16(m128Tmp1,
3059 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3060 E1l = _mm_madd_epi16(m128Tmp2,
3061 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3062 E1h = _mm_madd_epi16(m128Tmp3,
3063 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3064 E2l = _mm_madd_epi16(m128Tmp4,
3065 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3066 E2h = _mm_madd_epi16(m128Tmp5,
3067 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3068 E3l = _mm_madd_epi16(m128Tmp6,
3069 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3070 E3h = _mm_madd_epi16(m128Tmp7,
3071 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3072
3073 E4l = _mm_madd_epi16(m128Tmp8,
3074 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3075 E4h = _mm_madd_epi16(m128Tmp9,
3076 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3077 E5l = _mm_madd_epi16(m128Tmp10,
3078 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3079 E5h = _mm_madd_epi16(m128Tmp11,
3080 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3081 E6l = _mm_madd_epi16(m128Tmp12,
3082 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3083 E6h = _mm_madd_epi16(m128Tmp13,
3084 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3085 E7l = _mm_madd_epi16(m128Tmp14,
3086 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3087 E7h = _mm_madd_epi16(m128Tmp15,
3088 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3089
3090 O1l = _mm_add_epi32(E0l, E1l);
3091 O1l = _mm_add_epi32(O1l, E2l);
3092 O1l = _mm_add_epi32(O1l, E3l);
3093 O1l = _mm_add_epi32(O1l, E4l);
3094 O1l = _mm_add_epi32(O1l, E5l);
3095 O1l = _mm_add_epi32(O1l, E6l);
3096 O1l = _mm_add_epi32(O1l, E7l);
3097
3098 O1h = _mm_add_epi32(E0h, E1h);
3099 O1h = _mm_add_epi32(O1h, E2h);
3100 O1h = _mm_add_epi32(O1h, E3h);
3101 O1h = _mm_add_epi32(O1h, E4h);
3102 O1h = _mm_add_epi32(O1h, E5h);
3103 O1h = _mm_add_epi32(O1h, E6h);
3104 O1h = _mm_add_epi32(O1h, E7h);
3105 /* Compute O2*/
3106 E0l = _mm_madd_epi16(m128Tmp0,
3107 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3108 E0h = _mm_madd_epi16(m128Tmp1,
3109 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3110 E1l = _mm_madd_epi16(m128Tmp2,
3111 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3112 E1h = _mm_madd_epi16(m128Tmp3,
3113 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3114 E2l = _mm_madd_epi16(m128Tmp4,
3115 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3116 E2h = _mm_madd_epi16(m128Tmp5,
3117 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3118 E3l = _mm_madd_epi16(m128Tmp6,
3119 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3120 E3h = _mm_madd_epi16(m128Tmp7,
3121 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3122
3123 E4l = _mm_madd_epi16(m128Tmp8,
3124 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3125 E4h = _mm_madd_epi16(m128Tmp9,
3126 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3127 E5l = _mm_madd_epi16(m128Tmp10,
3128 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3129 E5h = _mm_madd_epi16(m128Tmp11,
3130 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3131 E6l = _mm_madd_epi16(m128Tmp12,
3132 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3133 E6h = _mm_madd_epi16(m128Tmp13,
3134 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3135 E7l = _mm_madd_epi16(m128Tmp14,
3136 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3137 E7h = _mm_madd_epi16(m128Tmp15,
3138 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3139
3140 O2l = _mm_add_epi32(E0l, E1l);
3141 O2l = _mm_add_epi32(O2l, E2l);
3142 O2l = _mm_add_epi32(O2l, E3l);
3143 O2l = _mm_add_epi32(O2l, E4l);
3144 O2l = _mm_add_epi32(O2l, E5l);
3145 O2l = _mm_add_epi32(O2l, E6l);
3146 O2l = _mm_add_epi32(O2l, E7l);
3147
3148 O2h = _mm_add_epi32(E0h, E1h);
3149 O2h = _mm_add_epi32(O2h, E2h);
3150 O2h = _mm_add_epi32(O2h, E3h);
3151 O2h = _mm_add_epi32(O2h, E4h);
3152 O2h = _mm_add_epi32(O2h, E5h);
3153 O2h = _mm_add_epi32(O2h, E6h);
3154 O2h = _mm_add_epi32(O2h, E7h);
3155 /* Compute O3*/
3156 E0l = _mm_madd_epi16(m128Tmp0,
3157 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3158 E0h = _mm_madd_epi16(m128Tmp1,
3159 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3160 E1l = _mm_madd_epi16(m128Tmp2,
3161 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3162 E1h = _mm_madd_epi16(m128Tmp3,
3163 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3164 E2l = _mm_madd_epi16(m128Tmp4,
3165 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3166 E2h = _mm_madd_epi16(m128Tmp5,
3167 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3168 E3l = _mm_madd_epi16(m128Tmp6,
3169 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3170 E3h = _mm_madd_epi16(m128Tmp7,
3171 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3172
3173 E4l = _mm_madd_epi16(m128Tmp8,
3174 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3175 E4h = _mm_madd_epi16(m128Tmp9,
3176 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3177 E5l = _mm_madd_epi16(m128Tmp10,
3178 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3179 E5h = _mm_madd_epi16(m128Tmp11,
3180 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3181 E6l = _mm_madd_epi16(m128Tmp12,
3182 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3183 E6h = _mm_madd_epi16(m128Tmp13,
3184 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3185 E7l = _mm_madd_epi16(m128Tmp14,
3186 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3187 E7h = _mm_madd_epi16(m128Tmp15,
3188 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3189
3190 O3l = _mm_add_epi32(E0l, E1l);
3191 O3l = _mm_add_epi32(O3l, E2l);
3192 O3l = _mm_add_epi32(O3l, E3l);
3193 O3l = _mm_add_epi32(O3l, E4l);
3194 O3l = _mm_add_epi32(O3l, E5l);
3195 O3l = _mm_add_epi32(O3l, E6l);
3196 O3l = _mm_add_epi32(O3l, E7l);
3197
3198 O3h = _mm_add_epi32(E0h, E1h);
3199 O3h = _mm_add_epi32(O3h, E2h);
3200 O3h = _mm_add_epi32(O3h, E3h);
3201 O3h = _mm_add_epi32(O3h, E4h);
3202 O3h = _mm_add_epi32(O3h, E5h);
3203 O3h = _mm_add_epi32(O3h, E6h);
3204 O3h = _mm_add_epi32(O3h, E7h);
3205 /* Compute O4*/
3206
3207 E0l = _mm_madd_epi16(m128Tmp0,
3208 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3209 E0h = _mm_madd_epi16(m128Tmp1,
3210 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3211 E1l = _mm_madd_epi16(m128Tmp2,
3212 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3213 E1h = _mm_madd_epi16(m128Tmp3,
3214 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3215 E2l = _mm_madd_epi16(m128Tmp4,
3216 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3217 E2h = _mm_madd_epi16(m128Tmp5,
3218 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3219 E3l = _mm_madd_epi16(m128Tmp6,
3220 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3221 E3h = _mm_madd_epi16(m128Tmp7,
3222 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3223
3224 E4l = _mm_madd_epi16(m128Tmp8,
3225 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3226 E4h = _mm_madd_epi16(m128Tmp9,
3227 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3228 E5l = _mm_madd_epi16(m128Tmp10,
3229 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3230 E5h = _mm_madd_epi16(m128Tmp11,
3231 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3232 E6l = _mm_madd_epi16(m128Tmp12,
3233 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3234 E6h = _mm_madd_epi16(m128Tmp13,
3235 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3236 E7l = _mm_madd_epi16(m128Tmp14,
3237 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3238 E7h = _mm_madd_epi16(m128Tmp15,
3239 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3240
3241 O4l = _mm_add_epi32(E0l, E1l);
3242 O4l = _mm_add_epi32(O4l, E2l);
3243 O4l = _mm_add_epi32(O4l, E3l);
3244 O4l = _mm_add_epi32(O4l, E4l);
3245 O4l = _mm_add_epi32(O4l, E5l);
3246 O4l = _mm_add_epi32(O4l, E6l);
3247 O4l = _mm_add_epi32(O4l, E7l);
3248
3249 O4h = _mm_add_epi32(E0h, E1h);
3250 O4h = _mm_add_epi32(O4h, E2h);
3251 O4h = _mm_add_epi32(O4h, E3h);
3252 O4h = _mm_add_epi32(O4h, E4h);
3253 O4h = _mm_add_epi32(O4h, E5h);
3254 O4h = _mm_add_epi32(O4h, E6h);
3255 O4h = _mm_add_epi32(O4h, E7h);
3256
3257 /* Compute O5*/
3258 E0l = _mm_madd_epi16(m128Tmp0,
3259 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3260 E0h = _mm_madd_epi16(m128Tmp1,
3261 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3262 E1l = _mm_madd_epi16(m128Tmp2,
3263 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3264 E1h = _mm_madd_epi16(m128Tmp3,
3265 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3266 E2l = _mm_madd_epi16(m128Tmp4,
3267 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3268 E2h = _mm_madd_epi16(m128Tmp5,
3269 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3270 E3l = _mm_madd_epi16(m128Tmp6,
3271 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3272 E3h = _mm_madd_epi16(m128Tmp7,
3273 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3274
3275 E4l = _mm_madd_epi16(m128Tmp8,
3276 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3277 E4h = _mm_madd_epi16(m128Tmp9,
3278 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3279 E5l = _mm_madd_epi16(m128Tmp10,
3280 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3281 E5h = _mm_madd_epi16(m128Tmp11,
3282 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3283 E6l = _mm_madd_epi16(m128Tmp12,
3284 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3285 E6h = _mm_madd_epi16(m128Tmp13,
3286 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3287 E7l = _mm_madd_epi16(m128Tmp14,
3288 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3289 E7h = _mm_madd_epi16(m128Tmp15,
3290 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3291
3292 O5l = _mm_add_epi32(E0l, E1l);
3293 O5l = _mm_add_epi32(O5l, E2l);
3294 O5l = _mm_add_epi32(O5l, E3l);
3295 O5l = _mm_add_epi32(O5l, E4l);
3296 O5l = _mm_add_epi32(O5l, E5l);
3297 O5l = _mm_add_epi32(O5l, E6l);
3298 O5l = _mm_add_epi32(O5l, E7l);
3299
3300 O5h = _mm_add_epi32(E0h, E1h);
3301 O5h = _mm_add_epi32(O5h, E2h);
3302 O5h = _mm_add_epi32(O5h, E3h);
3303 O5h = _mm_add_epi32(O5h, E4h);
3304 O5h = _mm_add_epi32(O5h, E5h);
3305 O5h = _mm_add_epi32(O5h, E6h);
3306 O5h = _mm_add_epi32(O5h, E7h);
3307
3308 /* Compute O6*/
3309
3310 E0l = _mm_madd_epi16(m128Tmp0,
3311 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3312 E0h = _mm_madd_epi16(m128Tmp1,
3313 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3314 E1l = _mm_madd_epi16(m128Tmp2,
3315 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3316 E1h = _mm_madd_epi16(m128Tmp3,
3317 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3318 E2l = _mm_madd_epi16(m128Tmp4,
3319 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3320 E2h = _mm_madd_epi16(m128Tmp5,
3321 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3322 E3l = _mm_madd_epi16(m128Tmp6,
3323 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3324 E3h = _mm_madd_epi16(m128Tmp7,
3325 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3326
3327 E4l = _mm_madd_epi16(m128Tmp8,
3328 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3329 E4h = _mm_madd_epi16(m128Tmp9,
3330 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3331 E5l = _mm_madd_epi16(m128Tmp10,
3332 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3333 E5h = _mm_madd_epi16(m128Tmp11,
3334 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3335 E6l = _mm_madd_epi16(m128Tmp12,
3336 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3337 E6h = _mm_madd_epi16(m128Tmp13,
3338 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3339 E7l = _mm_madd_epi16(m128Tmp14,
3340 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3341 E7h = _mm_madd_epi16(m128Tmp15,
3342 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3343
3344 O6l = _mm_add_epi32(E0l, E1l);
3345 O6l = _mm_add_epi32(O6l, E2l);
3346 O6l = _mm_add_epi32(O6l, E3l);
3347 O6l = _mm_add_epi32(O6l, E4l);
3348 O6l = _mm_add_epi32(O6l, E5l);
3349 O6l = _mm_add_epi32(O6l, E6l);
3350 O6l = _mm_add_epi32(O6l, E7l);
3351
3352 O6h = _mm_add_epi32(E0h, E1h);
3353 O6h = _mm_add_epi32(O6h, E2h);
3354 O6h = _mm_add_epi32(O6h, E3h);
3355 O6h = _mm_add_epi32(O6h, E4h);
3356 O6h = _mm_add_epi32(O6h, E5h);
3357 O6h = _mm_add_epi32(O6h, E6h);
3358 O6h = _mm_add_epi32(O6h, E7h);
3359
3360 /* Compute O7*/
3361
3362 E0l = _mm_madd_epi16(m128Tmp0,
3363 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3364 E0h = _mm_madd_epi16(m128Tmp1,
3365 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3366 E1l = _mm_madd_epi16(m128Tmp2,
3367 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3368 E1h = _mm_madd_epi16(m128Tmp3,
3369 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3370 E2l = _mm_madd_epi16(m128Tmp4,
3371 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3372 E2h = _mm_madd_epi16(m128Tmp5,
3373 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3374 E3l = _mm_madd_epi16(m128Tmp6,
3375 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3376 E3h = _mm_madd_epi16(m128Tmp7,
3377 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3378
3379 E4l = _mm_madd_epi16(m128Tmp8,
3380 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3381 E4h = _mm_madd_epi16(m128Tmp9,
3382 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3383 E5l = _mm_madd_epi16(m128Tmp10,
3384 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3385 E5h = _mm_madd_epi16(m128Tmp11,
3386 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3387 E6l = _mm_madd_epi16(m128Tmp12,
3388 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3389 E6h = _mm_madd_epi16(m128Tmp13,
3390 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3391 E7l = _mm_madd_epi16(m128Tmp14,
3392 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3393 E7h = _mm_madd_epi16(m128Tmp15,
3394 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3395
3396 O7l = _mm_add_epi32(E0l, E1l);
3397 O7l = _mm_add_epi32(O7l, E2l);
3398 O7l = _mm_add_epi32(O7l, E3l);
3399 O7l = _mm_add_epi32(O7l, E4l);
3400 O7l = _mm_add_epi32(O7l, E5l);
3401 O7l = _mm_add_epi32(O7l, E6l);
3402 O7l = _mm_add_epi32(O7l, E7l);
3403
3404 O7h = _mm_add_epi32(E0h, E1h);
3405 O7h = _mm_add_epi32(O7h, E2h);
3406 O7h = _mm_add_epi32(O7h, E3h);
3407 O7h = _mm_add_epi32(O7h, E4h);
3408 O7h = _mm_add_epi32(O7h, E5h);
3409 O7h = _mm_add_epi32(O7h, E6h);
3410 O7h = _mm_add_epi32(O7h, E7h);
3411
3412 /* Compute O8*/
3413
3414 E0l = _mm_madd_epi16(m128Tmp0,
3415 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3416 E0h = _mm_madd_epi16(m128Tmp1,
3417 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3418 E1l = _mm_madd_epi16(m128Tmp2,
3419 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3420 E1h = _mm_madd_epi16(m128Tmp3,
3421 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3422 E2l = _mm_madd_epi16(m128Tmp4,
3423 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3424 E2h = _mm_madd_epi16(m128Tmp5,
3425 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3426 E3l = _mm_madd_epi16(m128Tmp6,
3427 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3428 E3h = _mm_madd_epi16(m128Tmp7,
3429 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3430
3431 E4l = _mm_madd_epi16(m128Tmp8,
3432 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3433 E4h = _mm_madd_epi16(m128Tmp9,
3434 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3435 E5l = _mm_madd_epi16(m128Tmp10,
3436 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3437 E5h = _mm_madd_epi16(m128Tmp11,
3438 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3439 E6l = _mm_madd_epi16(m128Tmp12,
3440 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3441 E6h = _mm_madd_epi16(m128Tmp13,
3442 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3443 E7l = _mm_madd_epi16(m128Tmp14,
3444 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3445 E7h = _mm_madd_epi16(m128Tmp15,
3446 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3447
3448 O8l = _mm_add_epi32(E0l, E1l);
3449 O8l = _mm_add_epi32(O8l, E2l);
3450 O8l = _mm_add_epi32(O8l, E3l);
3451 O8l = _mm_add_epi32(O8l, E4l);
3452 O8l = _mm_add_epi32(O8l, E5l);
3453 O8l = _mm_add_epi32(O8l, E6l);
3454 O8l = _mm_add_epi32(O8l, E7l);
3455
3456 O8h = _mm_add_epi32(E0h, E1h);
3457 O8h = _mm_add_epi32(O8h, E2h);
3458 O8h = _mm_add_epi32(O8h, E3h);
3459 O8h = _mm_add_epi32(O8h, E4h);
3460 O8h = _mm_add_epi32(O8h, E5h);
3461 O8h = _mm_add_epi32(O8h, E6h);
3462 O8h = _mm_add_epi32(O8h, E7h);
3463
3464 /* Compute O9*/
3465
3466 E0l = _mm_madd_epi16(m128Tmp0,
3467 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3468 E0h = _mm_madd_epi16(m128Tmp1,
3469 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3470 E1l = _mm_madd_epi16(m128Tmp2,
3471 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3472 E1h = _mm_madd_epi16(m128Tmp3,
3473 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3474 E2l = _mm_madd_epi16(m128Tmp4,
3475 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3476 E2h = _mm_madd_epi16(m128Tmp5,
3477 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3478 E3l = _mm_madd_epi16(m128Tmp6,
3479 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3480 E3h = _mm_madd_epi16(m128Tmp7,
3481 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3482
3483 E4l = _mm_madd_epi16(m128Tmp8,
3484 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3485 E4h = _mm_madd_epi16(m128Tmp9,
3486 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3487 E5l = _mm_madd_epi16(m128Tmp10,
3488 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3489 E5h = _mm_madd_epi16(m128Tmp11,
3490 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3491 E6l = _mm_madd_epi16(m128Tmp12,
3492 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3493 E6h = _mm_madd_epi16(m128Tmp13,
3494 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3495 E7l = _mm_madd_epi16(m128Tmp14,
3496 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3497 E7h = _mm_madd_epi16(m128Tmp15,
3498 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3499
3500 O9l = _mm_add_epi32(E0l, E1l);
3501 O9l = _mm_add_epi32(O9l, E2l);
3502 O9l = _mm_add_epi32(O9l, E3l);
3503 O9l = _mm_add_epi32(O9l, E4l);
3504 O9l = _mm_add_epi32(O9l, E5l);
3505 O9l = _mm_add_epi32(O9l, E6l);
3506 O9l = _mm_add_epi32(O9l, E7l);
3507
3508 O9h = _mm_add_epi32(E0h, E1h);
3509 O9h = _mm_add_epi32(O9h, E2h);
3510 O9h = _mm_add_epi32(O9h, E3h);
3511 O9h = _mm_add_epi32(O9h, E4h);
3512 O9h = _mm_add_epi32(O9h, E5h);
3513 O9h = _mm_add_epi32(O9h, E6h);
3514 O9h = _mm_add_epi32(O9h, E7h);
3515
3516 /* Compute O10 */
3517
3518 E0l = _mm_madd_epi16(m128Tmp0,
3519 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3520 E0h = _mm_madd_epi16(m128Tmp1,
3521 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3522 E1l = _mm_madd_epi16(m128Tmp2,
3523 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3524 E1h = _mm_madd_epi16(m128Tmp3,
3525 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3526 E2l = _mm_madd_epi16(m128Tmp4,
3527 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3528 E2h = _mm_madd_epi16(m128Tmp5,
3529 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3530 E3l = _mm_madd_epi16(m128Tmp6,
3531 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3532 E3h = _mm_madd_epi16(m128Tmp7,
3533 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3534
3535 E4l = _mm_madd_epi16(m128Tmp8,
3536 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3537 E4h = _mm_madd_epi16(m128Tmp9,
3538 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3539 E5l = _mm_madd_epi16(m128Tmp10,
3540 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3541 E5h = _mm_madd_epi16(m128Tmp11,
3542 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3543 E6l = _mm_madd_epi16(m128Tmp12,
3544 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3545 E6h = _mm_madd_epi16(m128Tmp13,
3546 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3547 E7l = _mm_madd_epi16(m128Tmp14,
3548 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3549 E7h = _mm_madd_epi16(m128Tmp15,
3550 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3551
3552 O10l = _mm_add_epi32(E0l, E1l);
3553 O10l = _mm_add_epi32(O10l, E2l);
3554 O10l = _mm_add_epi32(O10l, E3l);
3555 O10l = _mm_add_epi32(O10l, E4l);
3556 O10l = _mm_add_epi32(O10l, E5l);
3557 O10l = _mm_add_epi32(O10l, E6l);
3558 O10l = _mm_add_epi32(O10l, E7l);
3559
3560 O10h = _mm_add_epi32(E0h, E1h);
3561 O10h = _mm_add_epi32(O10h, E2h);
3562 O10h = _mm_add_epi32(O10h, E3h);
3563 O10h = _mm_add_epi32(O10h, E4h);
3564 O10h = _mm_add_epi32(O10h, E5h);
3565 O10h = _mm_add_epi32(O10h, E6h);
3566 O10h = _mm_add_epi32(O10h, E7h);
3567
3568 /* Compute O11 */
3569
3570 E0l = _mm_madd_epi16(m128Tmp0,
3571 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3572 E0h = _mm_madd_epi16(m128Tmp1,
3573 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3574 E1l = _mm_madd_epi16(m128Tmp2,
3575 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3576 E1h = _mm_madd_epi16(m128Tmp3,
3577 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3578 E2l = _mm_madd_epi16(m128Tmp4,
3579 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3580 E2h = _mm_madd_epi16(m128Tmp5,
3581 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3582 E3l = _mm_madd_epi16(m128Tmp6,
3583 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3584 E3h = _mm_madd_epi16(m128Tmp7,
3585 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3586
3587 E4l = _mm_madd_epi16(m128Tmp8,
3588 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3589 E4h = _mm_madd_epi16(m128Tmp9,
3590 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3591 E5l = _mm_madd_epi16(m128Tmp10,
3592 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3593 E5h = _mm_madd_epi16(m128Tmp11,
3594 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3595 E6l = _mm_madd_epi16(m128Tmp12,
3596 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3597 E6h = _mm_madd_epi16(m128Tmp13,
3598 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3599 E7l = _mm_madd_epi16(m128Tmp14,
3600 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3601 E7h = _mm_madd_epi16(m128Tmp15,
3602 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3603
3604 O11l = _mm_add_epi32(E0l, E1l);
3605 O11l = _mm_add_epi32(O11l, E2l);
3606 O11l = _mm_add_epi32(O11l, E3l);
3607 O11l = _mm_add_epi32(O11l, E4l);
3608 O11l = _mm_add_epi32(O11l, E5l);
3609 O11l = _mm_add_epi32(O11l, E6l);
3610 O11l = _mm_add_epi32(O11l, E7l);
3611
3612 O11h = _mm_add_epi32(E0h, E1h);
3613 O11h = _mm_add_epi32(O11h, E2h);
3614 O11h = _mm_add_epi32(O11h, E3h);
3615 O11h = _mm_add_epi32(O11h, E4h);
3616 O11h = _mm_add_epi32(O11h, E5h);
3617 O11h = _mm_add_epi32(O11h, E6h);
3618 O11h = _mm_add_epi32(O11h, E7h);
3619
3620 /* Compute O12 */
3621
3622 E0l = _mm_madd_epi16(m128Tmp0,
3623 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3624 E0h = _mm_madd_epi16(m128Tmp1,
3625 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3626 E1l = _mm_madd_epi16(m128Tmp2,
3627 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3628 E1h = _mm_madd_epi16(m128Tmp3,
3629 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3630 E2l = _mm_madd_epi16(m128Tmp4,
3631 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3632 E2h = _mm_madd_epi16(m128Tmp5,
3633 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3634 E3l = _mm_madd_epi16(m128Tmp6,
3635 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3636 E3h = _mm_madd_epi16(m128Tmp7,
3637 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3638
3639 E4l = _mm_madd_epi16(m128Tmp8,
3640 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3641 E4h = _mm_madd_epi16(m128Tmp9,
3642 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3643 E5l = _mm_madd_epi16(m128Tmp10,
3644 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3645 E5h = _mm_madd_epi16(m128Tmp11,
3646 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3647 E6l = _mm_madd_epi16(m128Tmp12,
3648 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3649 E6h = _mm_madd_epi16(m128Tmp13,
3650 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3651 E7l = _mm_madd_epi16(m128Tmp14,
3652 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3653 E7h = _mm_madd_epi16(m128Tmp15,
3654 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3655
3656 O12l = _mm_add_epi32(E0l, E1l);
3657 O12l = _mm_add_epi32(O12l, E2l);
3658 O12l = _mm_add_epi32(O12l, E3l);
3659 O12l = _mm_add_epi32(O12l, E4l);
3660 O12l = _mm_add_epi32(O12l, E5l);
3661 O12l = _mm_add_epi32(O12l, E6l);
3662 O12l = _mm_add_epi32(O12l, E7l);
3663
3664 O12h = _mm_add_epi32(E0h, E1h);
3665 O12h = _mm_add_epi32(O12h, E2h);
3666 O12h = _mm_add_epi32(O12h, E3h);
3667 O12h = _mm_add_epi32(O12h, E4h);
3668 O12h = _mm_add_epi32(O12h, E5h);
3669 O12h = _mm_add_epi32(O12h, E6h);
3670 O12h = _mm_add_epi32(O12h, E7h);
3671
3672 /* Compute O13 */
3673
3674 E0l = _mm_madd_epi16(m128Tmp0,
3675 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3676 E0h = _mm_madd_epi16(m128Tmp1,
3677 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3678 E1l = _mm_madd_epi16(m128Tmp2,
3679 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3680 E1h = _mm_madd_epi16(m128Tmp3,
3681 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3682 E2l = _mm_madd_epi16(m128Tmp4,
3683 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3684 E2h = _mm_madd_epi16(m128Tmp5,
3685 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3686 E3l = _mm_madd_epi16(m128Tmp6,
3687 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3688 E3h = _mm_madd_epi16(m128Tmp7,
3689 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3690
3691 E4l = _mm_madd_epi16(m128Tmp8,
3692 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3693 E4h = _mm_madd_epi16(m128Tmp9,
3694 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3695 E5l = _mm_madd_epi16(m128Tmp10,
3696 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3697 E5h = _mm_madd_epi16(m128Tmp11,
3698 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3699 E6l = _mm_madd_epi16(m128Tmp12,
3700 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3701 E6h = _mm_madd_epi16(m128Tmp13,
3702 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3703 E7l = _mm_madd_epi16(m128Tmp14,
3704 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3705 E7h = _mm_madd_epi16(m128Tmp15,
3706 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3707
3708 O13l = _mm_add_epi32(E0l, E1l);
3709 O13l = _mm_add_epi32(O13l, E2l);
3710 O13l = _mm_add_epi32(O13l, E3l);
3711 O13l = _mm_add_epi32(O13l, E4l);
3712 O13l = _mm_add_epi32(O13l, E5l);
3713 O13l = _mm_add_epi32(O13l, E6l);
3714 O13l = _mm_add_epi32(O13l, E7l);
3715
3716 O13h = _mm_add_epi32(E0h, E1h);
3717 O13h = _mm_add_epi32(O13h, E2h);
3718 O13h = _mm_add_epi32(O13h, E3h);
3719 O13h = _mm_add_epi32(O13h, E4h);
3720 O13h = _mm_add_epi32(O13h, E5h);
3721 O13h = _mm_add_epi32(O13h, E6h);
3722 O13h = _mm_add_epi32(O13h, E7h);
3723
3724 /* Compute O14 */
3725
3726 E0l = _mm_madd_epi16(m128Tmp0,
3727 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3728 E0h = _mm_madd_epi16(m128Tmp1,
3729 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3730 E1l = _mm_madd_epi16(m128Tmp2,
3731 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3732 E1h = _mm_madd_epi16(m128Tmp3,
3733 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3734 E2l = _mm_madd_epi16(m128Tmp4,
3735 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3736 E2h = _mm_madd_epi16(m128Tmp5,
3737 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3738 E3l = _mm_madd_epi16(m128Tmp6,
3739 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3740 E3h = _mm_madd_epi16(m128Tmp7,
3741 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3742
3743 E4l = _mm_madd_epi16(m128Tmp8,
3744 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3745 E4h = _mm_madd_epi16(m128Tmp9,
3746 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3747 E5l = _mm_madd_epi16(m128Tmp10,
3748 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3749 E5h = _mm_madd_epi16(m128Tmp11,
3750 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3751 E6l = _mm_madd_epi16(m128Tmp12,
3752 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3753 E6h = _mm_madd_epi16(m128Tmp13,
3754 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3755 E7l = _mm_madd_epi16(m128Tmp14,
3756 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3757 E7h = _mm_madd_epi16(m128Tmp15,
3758 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3759
3760 O14l = _mm_add_epi32(E0l, E1l);
3761 O14l = _mm_add_epi32(O14l, E2l);
3762 O14l = _mm_add_epi32(O14l, E3l);
3763 O14l = _mm_add_epi32(O14l, E4l);
3764 O14l = _mm_add_epi32(O14l, E5l);
3765 O14l = _mm_add_epi32(O14l, E6l);
3766 O14l = _mm_add_epi32(O14l, E7l);
3767
3768 O14h = _mm_add_epi32(E0h, E1h);
3769 O14h = _mm_add_epi32(O14h, E2h);
3770 O14h = _mm_add_epi32(O14h, E3h);
3771 O14h = _mm_add_epi32(O14h, E4h);
3772 O14h = _mm_add_epi32(O14h, E5h);
3773 O14h = _mm_add_epi32(O14h, E6h);
3774 O14h = _mm_add_epi32(O14h, E7h);
3775
3776 /* Compute O15 */
3777
3778 E0l = _mm_madd_epi16(m128Tmp0,
3779 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3780 E0h = _mm_madd_epi16(m128Tmp1,
3781 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3782 E1l = _mm_madd_epi16(m128Tmp2,
3783 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3784 E1h = _mm_madd_epi16(m128Tmp3,
3785 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3786 E2l = _mm_madd_epi16(m128Tmp4,
3787 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3788 E2h = _mm_madd_epi16(m128Tmp5,
3789 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3790 E3l = _mm_madd_epi16(m128Tmp6,
3791 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3792 E3h = _mm_madd_epi16(m128Tmp7,
3793 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3794
3795 E4l = _mm_madd_epi16(m128Tmp8,
3796 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3797 E4h = _mm_madd_epi16(m128Tmp9,
3798 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3799 E5l = _mm_madd_epi16(m128Tmp10,
3800 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3801 E5h = _mm_madd_epi16(m128Tmp11,
3802 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3803 E6l = _mm_madd_epi16(m128Tmp12,
3804 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3805 E6h = _mm_madd_epi16(m128Tmp13,
3806 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3807 E7l = _mm_madd_epi16(m128Tmp14,
3808 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3809 E7h = _mm_madd_epi16(m128Tmp15,
3810 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3811
3812 O15l = _mm_add_epi32(E0l, E1l);
3813 O15l = _mm_add_epi32(O15l, E2l);
3814 O15l = _mm_add_epi32(O15l, E3l);
3815 O15l = _mm_add_epi32(O15l, E4l);
3816 O15l = _mm_add_epi32(O15l, E5l);
3817 O15l = _mm_add_epi32(O15l, E6l);
3818 O15l = _mm_add_epi32(O15l, E7l);
3819
3820 O15h = _mm_add_epi32(E0h, E1h);
3821 O15h = _mm_add_epi32(O15h, E2h);
3822 O15h = _mm_add_epi32(O15h, E3h);
3823 O15h = _mm_add_epi32(O15h, E4h);
3824 O15h = _mm_add_epi32(O15h, E5h);
3825 O15h = _mm_add_epi32(O15h, E6h);
3826 O15h = _mm_add_epi32(O15h, E7h);
3827 /* Compute E0 */
3828
3829 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3830 E0l = _mm_madd_epi16(m128Tmp0,
3831 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3832 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3833 E0h = _mm_madd_epi16(m128Tmp1,
3834 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3835
3836 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3837 E0l = _mm_add_epi32(E0l,
3838 _mm_madd_epi16(m128Tmp2,
3839 _mm_load_si128(
3840 (__m128i *) (transform16x16_1[1][0]))));
3841 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3842 E0h = _mm_add_epi32(E0h,
3843 _mm_madd_epi16(m128Tmp3,
3844 _mm_load_si128(
3845 (__m128i *) (transform16x16_1[1][0]))));
3846
3847 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3848 E0l = _mm_add_epi32(E0l,
3849 _mm_madd_epi16(m128Tmp4,
3850 _mm_load_si128(
3851 (__m128i *) (transform16x16_1[2][0]))));
3852 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3853 E0h = _mm_add_epi32(E0h,
3854 _mm_madd_epi16(m128Tmp5,
3855 _mm_load_si128(
3856 (__m128i *) (transform16x16_1[2][0]))));
3857
3858 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3859 E0l = _mm_add_epi32(E0l,
3860 _mm_madd_epi16(m128Tmp6,
3861 _mm_load_si128(
3862 (__m128i *) (transform16x16_1[3][0]))));
3863 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3864 E0h = _mm_add_epi32(E0h,
3865 _mm_madd_epi16(m128Tmp7,
3866 _mm_load_si128(
3867 (__m128i *) (transform16x16_1[3][0]))));
3868
3869 /* Compute E1 */
3870 E1l = _mm_madd_epi16(m128Tmp0,
3871 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3872 E1h = _mm_madd_epi16(m128Tmp1,
3873 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3874 E1l = _mm_add_epi32(E1l,
3875 _mm_madd_epi16(m128Tmp2,
3876 _mm_load_si128(
3877 (__m128i *) (transform16x16_1[1][1]))));
3878 E1h = _mm_add_epi32(E1h,
3879 _mm_madd_epi16(m128Tmp3,
3880 _mm_load_si128(
3881 (__m128i *) (transform16x16_1[1][1]))));
3882 E1l = _mm_add_epi32(E1l,
3883 _mm_madd_epi16(m128Tmp4,
3884 _mm_load_si128(
3885 (__m128i *) (transform16x16_1[2][1]))));
3886 E1h = _mm_add_epi32(E1h,
3887 _mm_madd_epi16(m128Tmp5,
3888 _mm_load_si128(
3889 (__m128i *) (transform16x16_1[2][1]))));
3890 E1l = _mm_add_epi32(E1l,
3891 _mm_madd_epi16(m128Tmp6,
3892 _mm_load_si128(
3893 (__m128i *) (transform16x16_1[3][1]))));
3894 E1h = _mm_add_epi32(E1h,
3895 _mm_madd_epi16(m128Tmp7,
3896 _mm_load_si128(
3897 (__m128i *) (transform16x16_1[3][1]))));
3898
3899 /* Compute E2 */
3900 E2l = _mm_madd_epi16(m128Tmp0,
3901 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3902 E2h = _mm_madd_epi16(m128Tmp1,
3903 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3904 E2l = _mm_add_epi32(E2l,
3905 _mm_madd_epi16(m128Tmp2,
3906 _mm_load_si128(
3907 (__m128i *) (transform16x16_1[1][2]))));
3908 E2h = _mm_add_epi32(E2h,
3909 _mm_madd_epi16(m128Tmp3,
3910 _mm_load_si128(
3911 (__m128i *) (transform16x16_1[1][2]))));
3912 E2l = _mm_add_epi32(E2l,
3913 _mm_madd_epi16(m128Tmp4,
3914 _mm_load_si128(
3915 (__m128i *) (transform16x16_1[2][2]))));
3916 E2h = _mm_add_epi32(E2h,
3917 _mm_madd_epi16(m128Tmp5,
3918 _mm_load_si128(
3919 (__m128i *) (transform16x16_1[2][2]))));
3920 E2l = _mm_add_epi32(E2l,
3921 _mm_madd_epi16(m128Tmp6,
3922 _mm_load_si128(
3923 (__m128i *) (transform16x16_1[3][2]))));
3924 E2h = _mm_add_epi32(E2h,
3925 _mm_madd_epi16(m128Tmp7,
3926 _mm_load_si128(
3927 (__m128i *) (transform16x16_1[3][2]))));
3928
3929 /* Compute E3 */
3930 E3l = _mm_madd_epi16(m128Tmp0,
3931 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3932 E3h = _mm_madd_epi16(m128Tmp1,
3933 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3934 E3l = _mm_add_epi32(E3l,
3935 _mm_madd_epi16(m128Tmp2,
3936 _mm_load_si128(
3937 (__m128i *) (transform16x16_1[1][3]))));
3938 E3h = _mm_add_epi32(E3h,
3939 _mm_madd_epi16(m128Tmp3,
3940 _mm_load_si128(
3941 (__m128i *) (transform16x16_1[1][3]))));
3942 E3l = _mm_add_epi32(E3l,
3943 _mm_madd_epi16(m128Tmp4,
3944 _mm_load_si128(
3945 (__m128i *) (transform16x16_1[2][3]))));
3946 E3h = _mm_add_epi32(E3h,
3947 _mm_madd_epi16(m128Tmp5,
3948 _mm_load_si128(
3949 (__m128i *) (transform16x16_1[2][3]))));
3950 E3l = _mm_add_epi32(E3l,
3951 _mm_madd_epi16(m128Tmp6,
3952 _mm_load_si128(
3953 (__m128i *) (transform16x16_1[3][3]))));
3954 E3h = _mm_add_epi32(E3h,
3955 _mm_madd_epi16(m128Tmp7,
3956 _mm_load_si128(
3957 (__m128i *) (transform16x16_1[3][3]))));
3958
3959 /* Compute E4 */
3960 E4l = _mm_madd_epi16(m128Tmp0,
3961 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3962 E4h = _mm_madd_epi16(m128Tmp1,
3963 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3964 E4l = _mm_add_epi32(E4l,
3965 _mm_madd_epi16(m128Tmp2,
3966 _mm_load_si128(
3967 (__m128i *) (transform16x16_1[1][4]))));
3968 E4h = _mm_add_epi32(E4h,
3969 _mm_madd_epi16(m128Tmp3,
3970 _mm_load_si128(
3971 (__m128i *) (transform16x16_1[1][4]))));
3972 E4l = _mm_add_epi32(E4l,
3973 _mm_madd_epi16(m128Tmp4,
3974 _mm_load_si128(
3975 (__m128i *) (transform16x16_1[2][4]))));
3976 E4h = _mm_add_epi32(E4h,
3977 _mm_madd_epi16(m128Tmp5,
3978 _mm_load_si128(
3979 (__m128i *) (transform16x16_1[2][4]))));
3980 E4l = _mm_add_epi32(E4l,
3981 _mm_madd_epi16(m128Tmp6,
3982 _mm_load_si128(
3983 (__m128i *) (transform16x16_1[3][4]))));
3984 E4h = _mm_add_epi32(E4h,
3985 _mm_madd_epi16(m128Tmp7,
3986 _mm_load_si128(
3987 (__m128i *) (transform16x16_1[3][4]))));
3988
3989 /* Compute E5 */
3990 E5l = _mm_madd_epi16(m128Tmp0,
3991 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3992 E5h = _mm_madd_epi16(m128Tmp1,
3993 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3994 E5l = _mm_add_epi32(E5l,
3995 _mm_madd_epi16(m128Tmp2,
3996 _mm_load_si128(
3997 (__m128i *) (transform16x16_1[1][5]))));
3998 E5h = _mm_add_epi32(E5h,
3999 _mm_madd_epi16(m128Tmp3,
4000 _mm_load_si128(
4001 (__m128i *) (transform16x16_1[1][5]))));
4002 E5l = _mm_add_epi32(E5l,
4003 _mm_madd_epi16(m128Tmp4,
4004 _mm_load_si128(
4005 (__m128i *) (transform16x16_1[2][5]))));
4006 E5h = _mm_add_epi32(E5h,
4007 _mm_madd_epi16(m128Tmp5,
4008 _mm_load_si128(
4009 (__m128i *) (transform16x16_1[2][5]))));
4010 E5l = _mm_add_epi32(E5l,
4011 _mm_madd_epi16(m128Tmp6,
4012 _mm_load_si128(
4013 (__m128i *) (transform16x16_1[3][5]))));
4014 E5h = _mm_add_epi32(E5h,
4015 _mm_madd_epi16(m128Tmp7,
4016 _mm_load_si128(
4017 (__m128i *) (transform16x16_1[3][5]))));
4018
4019 /* Compute E6 */
4020 E6l = _mm_madd_epi16(m128Tmp0,
4021 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4022 E6h = _mm_madd_epi16(m128Tmp1,
4023 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4024 E6l = _mm_add_epi32(E6l,
4025 _mm_madd_epi16(m128Tmp2,
4026 _mm_load_si128(
4027 (__m128i *) (transform16x16_1[1][6]))));
4028 E6h = _mm_add_epi32(E6h,
4029 _mm_madd_epi16(m128Tmp3,
4030 _mm_load_si128(
4031 (__m128i *) (transform16x16_1[1][6]))));
4032 E6l = _mm_add_epi32(E6l,
4033 _mm_madd_epi16(m128Tmp4,
4034 _mm_load_si128(
4035 (__m128i *) (transform16x16_1[2][6]))));
4036 E6h = _mm_add_epi32(E6h,
4037 _mm_madd_epi16(m128Tmp5,
4038 _mm_load_si128(
4039 (__m128i *) (transform16x16_1[2][6]))));
4040 E6l = _mm_add_epi32(E6l,
4041 _mm_madd_epi16(m128Tmp6,
4042 _mm_load_si128(
4043 (__m128i *) (transform16x16_1[3][6]))));
4044 E6h = _mm_add_epi32(E6h,
4045 _mm_madd_epi16(m128Tmp7,
4046 _mm_load_si128(
4047 (__m128i *) (transform16x16_1[3][6]))));
4048
4049 /* Compute E7 */
4050 E7l = _mm_madd_epi16(m128Tmp0,
4051 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4052 E7h = _mm_madd_epi16(m128Tmp1,
4053 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4054 E7l = _mm_add_epi32(E7l,
4055 _mm_madd_epi16(m128Tmp2,
4056 _mm_load_si128(
4057 (__m128i *) (transform16x16_1[1][7]))));
4058 E7h = _mm_add_epi32(E7h,
4059 _mm_madd_epi16(m128Tmp3,
4060 _mm_load_si128(
4061 (__m128i *) (transform16x16_1[1][7]))));
4062 E7l = _mm_add_epi32(E7l,
4063 _mm_madd_epi16(m128Tmp4,
4064 _mm_load_si128(
4065 (__m128i *) (transform16x16_1[2][7]))));
4066 E7h = _mm_add_epi32(E7h,
4067 _mm_madd_epi16(m128Tmp5,
4068 _mm_load_si128(
4069 (__m128i *) (transform16x16_1[2][7]))));
4070 E7l = _mm_add_epi32(E7l,
4071 _mm_madd_epi16(m128Tmp6,
4072 _mm_load_si128(
4073 (__m128i *) (transform16x16_1[3][7]))));
4074 E7h = _mm_add_epi32(E7h,
4075 _mm_madd_epi16(m128Tmp7,
4076 _mm_load_si128(
4077 (__m128i *) (transform16x16_1[3][7]))));
4078
4079 /* Compute E00 - E03 */
4080
4081 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4082 E00l = _mm_madd_epi16(m128Tmp0,
4083 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4084 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4085 E00h = _mm_madd_epi16(m128Tmp1,
4086 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4087
4088 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4089 E00l = _mm_add_epi32(E00l,
4090 _mm_madd_epi16(m128Tmp2,
4091 _mm_load_si128(
4092 (__m128i *) (transform16x16_2[1][0]))));
4093 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4094 E00h = _mm_add_epi32(E00h,
4095 _mm_madd_epi16(m128Tmp3,
4096 _mm_load_si128(
4097 (__m128i *) (transform16x16_2[1][0]))));
4098
4099 E01l = _mm_madd_epi16(m128Tmp0,
4100 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4101 E01h = _mm_madd_epi16(m128Tmp1,
4102 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4103 E01l = _mm_add_epi32(E01l,
4104 _mm_madd_epi16(m128Tmp2,
4105 _mm_load_si128(
4106 (__m128i *) (transform16x16_2[1][1]))));
4107 E01h = _mm_add_epi32(E01h,
4108 _mm_madd_epi16(m128Tmp3,
4109 _mm_load_si128(
4110 (__m128i *) (transform16x16_2[1][1]))));
4111
4112 E02l = _mm_madd_epi16(m128Tmp0,
4113 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4114 E02h = _mm_madd_epi16(m128Tmp1,
4115 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4116 E02l = _mm_add_epi32(E02l,
4117 _mm_madd_epi16(m128Tmp2,
4118 _mm_load_si128(
4119 (__m128i *) (transform16x16_2[1][2]))));
4120 E02h = _mm_add_epi32(E02h,
4121 _mm_madd_epi16(m128Tmp3,
4122 _mm_load_si128(
4123 (__m128i *) (transform16x16_2[1][2]))));
4124
4125 E03l = _mm_madd_epi16(m128Tmp0,
4126 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4127 E03h = _mm_madd_epi16(m128Tmp1,
4128 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4129 E03l = _mm_add_epi32(E03l,
4130 _mm_madd_epi16(m128Tmp2,
4131 _mm_load_si128(
4132 (__m128i *) (transform16x16_2[1][3]))));
4133 E03h = _mm_add_epi32(E03h,
4134 _mm_madd_epi16(m128Tmp3,
4135 _mm_load_si128(
4136 (__m128i *) (transform16x16_2[1][3]))));
4137
4138 /* Compute EE0 and EEE */
4139
4140 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4141 EE0l = _mm_madd_epi16(m128Tmp0,
4142 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4143 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4144 EE0h = _mm_madd_epi16(m128Tmp1,
4145 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4146
4147 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4148 EEE0l = _mm_madd_epi16(m128Tmp2,
4149 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4150 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4151 EEE0h = _mm_madd_epi16(m128Tmp3,
4152 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4153
4154 EE1l = _mm_madd_epi16(m128Tmp0,
4155 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4156 EE1h = _mm_madd_epi16(m128Tmp1,
4157 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4158
4159 EEE1l = _mm_madd_epi16(m128Tmp2,
4160 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4161 EEE1h = _mm_madd_epi16(m128Tmp3,
4162 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4163
4164 /* Compute EE */
4165
4166 EE2l = _mm_sub_epi32(EEE1l, EE1l);
4167 EE3l = _mm_sub_epi32(EEE0l, EE0l);
4168 EE2h = _mm_sub_epi32(EEE1h, EE1h);
4169 EE3h = _mm_sub_epi32(EEE0h, EE0h);
4170
4171 EE0l = _mm_add_epi32(EEE0l, EE0l);
4172 EE1l = _mm_add_epi32(EEE1l, EE1l);
4173 EE0h = _mm_add_epi32(EEE0h, EE0h);
4174 EE1h = _mm_add_epi32(EEE1h, EE1h);
4175 /**/
4176
4177 EE7l = _mm_sub_epi32(EE0l, E00l);
4178 EE6l = _mm_sub_epi32(EE1l, E01l);
4179 EE5l = _mm_sub_epi32(EE2l, E02l);
4180 EE4l = _mm_sub_epi32(EE3l, E03l);
4181
4182 EE7h = _mm_sub_epi32(EE0h, E00h);
4183 EE6h = _mm_sub_epi32(EE1h, E01h);
4184 EE5h = _mm_sub_epi32(EE2h, E02h);
4185 EE4h = _mm_sub_epi32(EE3h, E03h);
4186
4187 EE0l = _mm_add_epi32(EE0l, E00l);
4188 EE1l = _mm_add_epi32(EE1l, E01l);
4189 EE2l = _mm_add_epi32(EE2l, E02l);
4190 EE3l = _mm_add_epi32(EE3l, E03l);
4191
4192 EE0h = _mm_add_epi32(EE0h, E00h);
4193 EE1h = _mm_add_epi32(EE1h, E01h);
4194 EE2h = _mm_add_epi32(EE2h, E02h);
4195 EE3h = _mm_add_epi32(EE3h, E03h);
4196 /* Compute E */
4197
4198 E15l = _mm_sub_epi32(EE0l, E0l);
4199 E15l = _mm_add_epi32(E15l, m128iAdd);
4200 E14l = _mm_sub_epi32(EE1l, E1l);
4201 E14l = _mm_add_epi32(E14l, m128iAdd);
4202 E13l = _mm_sub_epi32(EE2l, E2l);
4203 E13l = _mm_add_epi32(E13l, m128iAdd);
4204 E12l = _mm_sub_epi32(EE3l, E3l);
4205 E12l = _mm_add_epi32(E12l, m128iAdd);
4206 E11l = _mm_sub_epi32(EE4l, E4l);
4207 E11l = _mm_add_epi32(E11l, m128iAdd);
4208 E10l = _mm_sub_epi32(EE5l, E5l);
4209 E10l = _mm_add_epi32(E10l, m128iAdd);
4210 E9l = _mm_sub_epi32(EE6l, E6l);
4211 E9l = _mm_add_epi32(E9l, m128iAdd);
4212 E8l = _mm_sub_epi32(EE7l, E7l);
4213 E8l = _mm_add_epi32(E8l, m128iAdd);
4214
4215 E0l = _mm_add_epi32(EE0l, E0l);
4216 E0l = _mm_add_epi32(E0l, m128iAdd);
4217 E1l = _mm_add_epi32(EE1l, E1l);
4218 E1l = _mm_add_epi32(E1l, m128iAdd);
4219 E2l = _mm_add_epi32(EE2l, E2l);
4220 E2l = _mm_add_epi32(E2l, m128iAdd);
4221 E3l = _mm_add_epi32(EE3l, E3l);
4222 E3l = _mm_add_epi32(E3l, m128iAdd);
4223 E4l = _mm_add_epi32(EE4l, E4l);
4224 E4l = _mm_add_epi32(E4l, m128iAdd);
4225 E5l = _mm_add_epi32(EE5l, E5l);
4226 E5l = _mm_add_epi32(E5l, m128iAdd);
4227 E6l = _mm_add_epi32(EE6l, E6l);
4228 E6l = _mm_add_epi32(E6l, m128iAdd);
4229 E7l = _mm_add_epi32(EE7l, E7l);
4230 E7l = _mm_add_epi32(E7l, m128iAdd);
4231
4232 E15h = _mm_sub_epi32(EE0h, E0h);
4233 E15h = _mm_add_epi32(E15h, m128iAdd);
4234 E14h = _mm_sub_epi32(EE1h, E1h);
4235 E14h = _mm_add_epi32(E14h, m128iAdd);
4236 E13h = _mm_sub_epi32(EE2h, E2h);
4237 E13h = _mm_add_epi32(E13h, m128iAdd);
4238 E12h = _mm_sub_epi32(EE3h, E3h);
4239 E12h = _mm_add_epi32(E12h, m128iAdd);
4240 E11h = _mm_sub_epi32(EE4h, E4h);
4241 E11h = _mm_add_epi32(E11h, m128iAdd);
4242 E10h = _mm_sub_epi32(EE5h, E5h);
4243 E10h = _mm_add_epi32(E10h, m128iAdd);
4244 E9h = _mm_sub_epi32(EE6h, E6h);
4245 E9h = _mm_add_epi32(E9h, m128iAdd);
4246 E8h = _mm_sub_epi32(EE7h, E7h);
4247 E8h = _mm_add_epi32(E8h, m128iAdd);
4248
4249 E0h = _mm_add_epi32(EE0h, E0h);
4250 E0h = _mm_add_epi32(E0h, m128iAdd);
4251 E1h = _mm_add_epi32(EE1h, E1h);
4252 E1h = _mm_add_epi32(E1h, m128iAdd);
4253 E2h = _mm_add_epi32(EE2h, E2h);
4254 E2h = _mm_add_epi32(E2h, m128iAdd);
4255 E3h = _mm_add_epi32(EE3h, E3h);
4256 E3h = _mm_add_epi32(E3h, m128iAdd);
4257 E4h = _mm_add_epi32(EE4h, E4h);
4258 E4h = _mm_add_epi32(E4h, m128iAdd);
4259 E5h = _mm_add_epi32(EE5h, E5h);
4260 E5h = _mm_add_epi32(E5h, m128iAdd);
4261 E6h = _mm_add_epi32(EE6h, E6h);
4262 E6h = _mm_add_epi32(E6h, m128iAdd);
4263 E7h = _mm_add_epi32(EE7h, E7h);
4264 E7h = _mm_add_epi32(E7h, m128iAdd);
4265
4266 m128iS0 = _mm_packs_epi32(
4267 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4268 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4269 m128iS1 = _mm_packs_epi32(
4270 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4271 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4272 m128iS2 = _mm_packs_epi32(
4273 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4274 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4275 m128iS3 = _mm_packs_epi32(
4276 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4277 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4278 m128iS4 = _mm_packs_epi32(
4279 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4280 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4281 m128iS5 = _mm_packs_epi32(
4282 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4283 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4284 m128iS6 = _mm_packs_epi32(
4285 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4286 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4287 m128iS7 = _mm_packs_epi32(
4288 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4289 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4290 m128iS8 = _mm_packs_epi32(
4291 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4292 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4293 m128iS9 = _mm_packs_epi32(
4294 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4295 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4296 m128iS10 = _mm_packs_epi32(
4297 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4298 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4299 m128iS11 = _mm_packs_epi32(
4300 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4301 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4302 m128iS12 = _mm_packs_epi32(
4303 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4304 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4305 m128iS13 = _mm_packs_epi32(
4306 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4307 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4308 m128iS14 = _mm_packs_epi32(
4309 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4310 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4311 m128iS15 = _mm_packs_epi32(
4312 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4313 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4314
4315 m128iS31 = _mm_packs_epi32(
4316 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4317 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4318 m128iS30 = _mm_packs_epi32(
4319 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4320 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4321 m128iS29 = _mm_packs_epi32(
4322 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4323 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4324 m128iS28 = _mm_packs_epi32(
4325 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4326 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4327 m128iS27 = _mm_packs_epi32(
4328 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4329 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4330 m128iS26 = _mm_packs_epi32(
4331 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4332 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4333 m128iS25 = _mm_packs_epi32(
4334 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4335 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4336 m128iS24 = _mm_packs_epi32(
4337 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4338 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4339 m128iS23 = _mm_packs_epi32(
4340 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4341 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4342 m128iS22 = _mm_packs_epi32(
4343 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4344 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4345 m128iS21 = _mm_packs_epi32(
4346 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4347 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4348 m128iS20 = _mm_packs_epi32(
4349 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4350 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4351 m128iS19 = _mm_packs_epi32(
4352 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4353 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4354 m128iS18 = _mm_packs_epi32(
4355 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4356 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4357 m128iS17 = _mm_packs_epi32(
4358 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4359 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4360 m128iS16 = _mm_packs_epi32(
4361 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4362 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4363
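                    /* Two-pass structure: when j == 0 (first 1-D pass) the
                     * intermediate results are transposed and parked in
                     * r0..r127 until all four eight-column groups
                     * (i = 0, 8, 16, 24) are done; when j == 1 (second pass)
                     * the results are transposed back, added to the 8-bit
                     * destination with saturation, and stored. */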
4364 if (!j) {
4365 /* Transpose the matrix */
4366 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4367 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4368 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4369 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4370 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4371 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4372 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4373 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4374 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4375 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4376 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4377 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4378 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4379 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4380 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4381 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4382
4383 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4384 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4385 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4386 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4387 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4388 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4389 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4390 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4391 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4392 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4393 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4394 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4395 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4396 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4397 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4398 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4399
4400 E0h = _mm_unpacklo_epi16(E0l, E8l);
4401 E1h = _mm_unpacklo_epi16(E1l, E9l);
4402 E2h = _mm_unpacklo_epi16(E2l, E10l);
4403 E3h = _mm_unpacklo_epi16(E3l, E11l);
4404 E4h = _mm_unpacklo_epi16(E4l, E12l);
4405 E5h = _mm_unpacklo_epi16(E5l, E13l);
4406 E6h = _mm_unpacklo_epi16(E6l, E14l);
4407 E7h = _mm_unpacklo_epi16(E7l, E15l);
4408
4409 E8h = _mm_unpackhi_epi16(E0l, E8l);
4410 E9h = _mm_unpackhi_epi16(E1l, E9l);
4411 E10h = _mm_unpackhi_epi16(E2l, E10l);
4412 E11h = _mm_unpackhi_epi16(E3l, E11l);
4413 E12h = _mm_unpackhi_epi16(E4l, E12l);
4414 E13h = _mm_unpackhi_epi16(E5l, E13l);
4415 E14h = _mm_unpackhi_epi16(E6l, E14l);
4416 E15h = _mm_unpackhi_epi16(E7l, E15l);
4417
4418 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4419 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4420 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4421 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4422
4423 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4424 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4425 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4426 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4427
4428 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4429 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4430 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4431 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4432
4433 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4434 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4435 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4436 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4437
4438 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4439 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4440 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4441 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4442
4443 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4444 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4445 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4446 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4447
4448 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4449 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4450 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4451 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4452
4453 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4454 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4455 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4456 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4457
4458 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4459 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4460 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4461 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4462
4463 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4464 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4465 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4466 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4467
4468 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4469 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4470 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4471 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4472
4473 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4474 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4475 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4476 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4477
4478 /* */
4479 E0h = _mm_unpacklo_epi16(O0l, O8l);
4480 E1h = _mm_unpacklo_epi16(O1l, O9l);
4481 E2h = _mm_unpacklo_epi16(O2l, O10l);
4482 E3h = _mm_unpacklo_epi16(O3l, O11l);
4483 E4h = _mm_unpacklo_epi16(O4l, O12l);
4484 E5h = _mm_unpacklo_epi16(O5l, O13l);
4485 E6h = _mm_unpacklo_epi16(O6l, O14l);
4486 E7h = _mm_unpacklo_epi16(O7l, O15l);
4487
4488 E8h = _mm_unpackhi_epi16(O0l, O8l);
4489 E9h = _mm_unpackhi_epi16(O1l, O9l);
4490 E10h = _mm_unpackhi_epi16(O2l, O10l);
4491 E11h = _mm_unpackhi_epi16(O3l, O11l);
4492 E12h = _mm_unpackhi_epi16(O4l, O12l);
4493 E13h = _mm_unpackhi_epi16(O5l, O13l);
4494 E14h = _mm_unpackhi_epi16(O6l, O14l);
4495 E15h = _mm_unpackhi_epi16(O7l, O15l);
4496
4497 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4498 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4499 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4500 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4501
4502 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4503 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4504 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4505 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4506
4507 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4508 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4509 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4510 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4511
4512 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4513 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4514 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4515 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4516
4517 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4518 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4519 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4520 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4521
4522 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4523 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4524 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4525 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4526
4527 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4528 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4529 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4530 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4531
4532 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4533 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4534 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4535 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4536
4537 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4538 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4539 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4540 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4541
4542 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4543 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4544 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4545 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4546
4547 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4548 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4549 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4550 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4551
4552 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4553 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4554 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4555 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4556
4557 if(i==0){
4558 int k = 8;
4559 r0=m128iS0;
4560 r1=m128iS1;
4561 r2=m128iS2;
4562 r3=m128iS3;
4563 r4=m128iS4;
4564 r5=m128iS5;
4565 r6=m128iS6;
4566 r7=m128iS7;
4567 r8=m128iS8;
4568 r9=m128iS9;
4569 r10=m128iS10;
4570 r11=m128iS11;
4571 r12=m128iS12;
4572 r13=m128iS13;
4573 r14=m128iS14;
4574 r15=m128iS15;
4575 r16=m128iS16;
4576 r17=m128iS17;
4577 r18=m128iS18;
4578 r19=m128iS19;
4579 r20=m128iS20;
4580 r21=m128iS21;
4581 r22=m128iS22;
4582 r23=m128iS23;
4583 r24=m128iS24;
4584 r25=m128iS25;
4585 r26=m128iS26;
4586 r27=m128iS27;
4587 r28=m128iS28;
4588 r29=m128iS29;
4589 r30=m128iS30;
4590 r31=m128iS31;
4591 m128iS0 = _mm_load_si128((__m128i *) (src + k));
4592 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4593 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4594 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4595 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4596 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4597 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4598 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4599 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4600 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4601 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4602 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4603 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4604 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4605 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4606 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4607
4608 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4609 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4610 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4611 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4612 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4613 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4614 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4615 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4616 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4617 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4618 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4619 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4620 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4621 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4622 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4623 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4624
4625 }else if(i ==8){
4626
4627 r32=m128iS0;
4628 r33=m128iS1;
4629 r34=m128iS2;
4630 r35=m128iS3;
4631 r36=m128iS4;
4632 r37=m128iS5;
4633 r38=m128iS6;
4634 r39=m128iS7;
4635 r40=m128iS8;
4636 r41=m128iS9;
4637 r42=m128iS10;
4638 r43=m128iS11;
4639 r44=m128iS12;
4640 r45=m128iS13;
4641 r46=m128iS14;
4642 r47=m128iS15;
4643 r48=m128iS16;
4644 r49=m128iS17;
4645 r50=m128iS18;
4646 r51=m128iS19;
4647 r52=m128iS20;
4648 r53=m128iS21;
4649 r54=m128iS22;
4650 r55=m128iS23;
4651 r56=m128iS24;
4652 r57=m128iS25;
4653 r58=m128iS26;
4654 r59=m128iS27;
4655 r60=m128iS28;
4656 r61=m128iS29;
4657 r62=m128iS30;
4658 r63=m128iS31;
4659
4660 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4661 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4662 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4663 m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4664 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4665 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4666 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4667 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4668 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4669 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4670 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4671 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4672 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4673 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4674 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4675 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4676
4677 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4678 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4679 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4680 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4681 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4682 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4683 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4684 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4685 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4686 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4687 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4688 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4689 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4690 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4691 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4692 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4693
4694
4695 }else if(i ==16){
4696
4697 r64=m128iS0;
4698 r65=m128iS1;
4699 r66=m128iS2;
4700 r67=m128iS3;
4701 r68=m128iS4;
4702 r69=m128iS5;
4703 r70=m128iS6;
4704 r71=m128iS7;
4705 r72=m128iS8;
4706 r73=m128iS9;
4707 r74=m128iS10;
4708 r75=m128iS11;
4709 r76=m128iS12;
4710 r77=m128iS13;
4711 r78=m128iS14;
4712 r79=m128iS15;
4713 r80=m128iS16;
4714 r81=m128iS17;
4715 r82=m128iS18;
4716 r83=m128iS19;
4717 r84=m128iS20;
4718 r85=m128iS21;
4719 r86=m128iS22;
4720 r87=m128iS23;
4721 r88=m128iS24;
4722 r89=m128iS25;
4723 r90=m128iS26;
4724 r91=m128iS27;
4725 r92=m128iS28;
4726 r93=m128iS29;
4727 r94=m128iS30;
4728 r95=m128iS31;
4729
4730 m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4731 m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4732 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4733 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4734 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4735 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4736 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4737 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4738 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4739 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4740 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4741 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4742 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4743 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4744 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4745 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4746
4747 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4748 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4749 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4750 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4751 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4752 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4753 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4754 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4755 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4756 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4757 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4758 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4759 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4760 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4761 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4762 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4763
4764 }else{
4765 r96=m128iS0;
4766 r97=m128iS1;
4767 r98=m128iS2;
4768 r99=m128iS3;
4769 r100=m128iS4;
4770 r101=m128iS5;
4771 r102=m128iS6;
4772 r103=m128iS7;
4773 r104=m128iS8;
4774 r105=m128iS9;
4775 r106=m128iS10;
4776 r107=m128iS11;
4777 r108=m128iS12;
4778 r109=m128iS13;
4779 r110=m128iS14;
4780 r111=m128iS15;
4781 r112=m128iS16;
4782 r113=m128iS17;
4783 r114=m128iS18;
4784 r115=m128iS19;
4785 r116=m128iS20;
4786 r117=m128iS21;
4787 r118=m128iS22;
4788 r119=m128iS23;
4789 r120=m128iS24;
4790 r121=m128iS25;
4791 r122=m128iS26;
4792 r123=m128iS27;
4793 r124=m128iS28;
4794 r125=m128iS29;
4795 r126=m128iS30;
4796 r127=m128iS31;
4797
4798 //load data for next j :
4799 m128iS0 = r0;
4800 m128iS1 = r4;
4801 m128iS2 = r8;
4802 m128iS3 = r12;
4803 m128iS4 = r16;
4804 m128iS5 = r20;
4805 m128iS6 = r24;
4806 m128iS7 = r28;
4807 m128iS8 = r32;
4808 m128iS9 = r36;
4809 m128iS10 = r40;
4810 m128iS11 = r44;
4811 m128iS12 = r48;
4812 m128iS13 = r52;
4813 m128iS14 = r56;
4814 m128iS15 = r60;
4815 m128iS16 = r64;
4816 m128iS17 = r68;
4817 m128iS18 = r72;
4818 m128iS19 = r76;
4819 m128iS20 = r80;
4820 m128iS21 = r84;
4821 m128iS22 = r88;
4822 m128iS23 = r92;
4823 m128iS24 = r96;
4824 m128iS25 = r100;
4825 m128iS26 = r104;
4826 m128iS27 = r108;
4827 m128iS28 = r112;
4828 m128iS29 = r116;
4829 m128iS30 = r120;
4830 m128iS31 = r124;
4831 shift = shift_2nd;
4832 m128iAdd = _mm_set1_epi32(add_2nd);
4833
4834
4835 }
4836
4837 } else {
4838
4839 //Transpose Matrix
4840
4841 E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4842 E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4843 E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4844 E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4845 E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4846 E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4847 E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4848 E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4849 E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4850 E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4851 E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4852 E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4853 E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4854 E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4855 E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4856 E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4857
4858
4859 E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4860 E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4861 E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4862 E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4863 E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4864 E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4865 E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4866 E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4867 E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4868 E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4869 E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4870 E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4871 E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4872 E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4873 E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4874 E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4875
4876 m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4877 m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4878 m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4879 m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4880 m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4881 m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4882 m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4883 m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4884
4885 m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4886 m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4887
4888
4889 m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4890 m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4891
4892 //second row
4893
4894 m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4895 m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4896
4897 m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4898 m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4899
4900 //third row
4901
4902 m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4903 m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4904 m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4905 m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4906 m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4907 m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4908 m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4909 m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4910
4911
4912 m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4913 m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4914
4915 m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4916 m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4917
4918 //fourth row
4919
4920 m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4921 m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4922
4923 m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4924 m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4925
4926 //fifth row
4927
4928 m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4929 m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4930 m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4931 m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4932 m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4933 m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4934 m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4935 m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4936
4937 m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4938 m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4939
4940
4941 m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4942 m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4943
4944 //sixth row
4945
4946 m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4947 m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4948
4949
4950 m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4951 m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4952
4953 //seventh row
4954
4955 m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4956 m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4957 m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4958 m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4959 m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4960 m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4961 m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4962 m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4963
4964
4965 m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4966 m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4967
4968
4969 m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4970 m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4971
4972 //last row
4973
4974
4975 m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4976 m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4977
4978 m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4979 m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4980
4981
4982 m128Tmp0=_mm_setzero_si128();
4983
4984
4985 //store
4986 dst = (uint8_t*) _dst + i*stride;
4987
4988
4989 E0l= _mm_load_si128((__m128i*)dst); //16 values
4990 E1l= _mm_load_si128((__m128i*)(dst+16));
4991 E2l= _mm_load_si128((__m128i*)(dst+stride));
4992 E3l= _mm_load_si128((__m128i*)(dst+stride+16));
4993 E4l= _mm_load_si128((__m128i*)(dst+2*stride));
4994 E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
4995 E6l= _mm_load_si128((__m128i*)(dst+3*stride));
4996 E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
4997 E8l= _mm_load_si128((__m128i*)(dst+4*stride));
4998 E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
4999 E10l= _mm_load_si128((__m128i*)(dst+5*stride));
5000 E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
5001 E12l= _mm_load_si128((__m128i*)(dst+6*stride));
5002 E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
5003 E14l= _mm_load_si128((__m128i*)(dst+7*stride));
5004 E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
5005
5006 m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
5007 m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
5008 m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
5009
5010 m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
5011 m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
5012 m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
5013
5014 m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
5015 m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
5016 m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5017
5018 m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5019 m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5020 m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5021
5022 m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5023 m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5024 m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5025
5026 m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5027 m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5028 m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5029
5030 m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5031 m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5032 m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5033
5034 m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5035 m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5036 m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5037
5038 m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5039 m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5040 m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5041
5042 m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5043 m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5044 m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5045
5046 m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5047 m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5048 m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5049
5050 m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5051 m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5052 m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5053
5054 m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5055 m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5056 m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5057
5058 m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5059 m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5060 m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5061
5062 m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5063 m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5064 m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5065
5066 m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5067 m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5068 m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5069
5070
5071 _mm_store_si128((__m128i*)dst,m128iS0);
5072 _mm_store_si128((__m128i*)(dst+16),m128iS2);
5073 _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5074 _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5075 _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5076 _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5077 _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5078 _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5079 _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5080 _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5081 _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5082 _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5083 _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5084 _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5085 _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5086 _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5087
5088
5089 if(i==0){
5090 //load next values :
5091 m128iS0 = r1;
5092 m128iS1 = r5;
5093 m128iS2 = r9;
5094 m128iS3 = r13;
5095 m128iS4 = r17;
5096 m128iS5 = r21;
5097 m128iS6 = r25;
5098 m128iS7 = r29;
5099 m128iS8 = r33;
5100 m128iS9 = r37;
5101 m128iS10 = r41;
5102 m128iS11 = r45;
5103 m128iS12 = r49;
5104 m128iS13 = r53;
5105 m128iS14 = r57;
5106 m128iS15 = r61;
5107 m128iS16 = r65;
5108 m128iS17 = r69;
5109 m128iS18 = r73;
5110 m128iS19 = r77;
5111 m128iS20 = r81;
5112 m128iS21 = r85;
5113 m128iS22 = r89;
5114 m128iS23 = r93;
5115 m128iS24 = r97;
5116 m128iS25 = r101;
5117 m128iS26 = r105;
5118 m128iS27 = r109;
5119 m128iS28 = r113;
5120 m128iS29 = r117;
5121 m128iS30 = r121;
5122 m128iS31 = r125;
5123
5124 }else if(i ==8){
5125 //load next values :
5126 m128iS0 = r2;
5127 m128iS1 = r6;
5128 m128iS2 = r10;
5129 m128iS3 = r14;
5130 m128iS4 = r18;
5131 m128iS5 = r22;
5132 m128iS6 = r26;
5133 m128iS7 = r30;
5134 m128iS8 = r34;
5135 m128iS9 = r38;
5136 m128iS10 = r42;
5137 m128iS11 = r46;
5138 m128iS12 = r50;
5139 m128iS13 = r54;
5140 m128iS14 = r58;
5141 m128iS15 = r62;
5142 m128iS16 = r66;
5143 m128iS17 = r70;
5144 m128iS18 = r74;
5145 m128iS19 = r78;
5146 m128iS20 = r82;
5147 m128iS21 = r86;
5148 m128iS22 = r90;
5149 m128iS23 = r94;
5150 m128iS24 = r98;
5151 m128iS25 = r102;
5152 m128iS26 = r106;
5153 m128iS27 = r110;
5154 m128iS28 = r114;
5155 m128iS29 = r118;
5156 m128iS30 = r122;
5157 m128iS31 = r126;
5158
5159 }else if(i==16)
5160 {
5161 //load next values :
5162 m128iS0 = r3;
5163 m128iS1 = r7;
5164 m128iS2 = r11;
5165 m128iS3 = r15;
5166 m128iS4 = r19;
5167 m128iS5 = r23;
5168 m128iS6 = r27;
5169 m128iS7 = r31;
5170 m128iS8 = r35;
5171 m128iS9 = r39;
5172 m128iS10 = r43;
5173 m128iS11 = r47;
5174 m128iS12 = r51;
5175 m128iS13 = r55;
5176 m128iS14 = r59;
5177 m128iS15 = r63;
5178 m128iS16 = r67;
5179 m128iS17 = r71;
5180 m128iS18 = r75;
5181 m128iS19 = r79;
5182 m128iS20 = r83;
5183 m128iS21 = r87;
5184 m128iS22 = r91;
5185 m128iS23 = r95;
5186 m128iS24 = r99;
5187 m128iS25 = r103;
5188 m128iS26 = r107;
5189 m128iS27 = r111;
5190 m128iS28 = r115;
5191 m128iS29 = r119;
5192 m128iS30 = r123;
5193 m128iS31 = r127;
5194 }
5195 }
5196 }
5197 }
5198 }
5199 #endif
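/* The routine above follows the usual even/odd butterfly decomposition of the
 * 32-point inverse DCT: O0..O15 are accumulated from the odd-indexed inputs,
 * E0..E15 from the even-indexed ones (via the nested 16/8/4-point stages), and
 * the two halves are combined pairwise. A scalar sketch of that final stage
 * (illustrative only; out[], E[], O[], add, shift and clip16() are stand-ins,
 * not identifiers from this file):
 *
 *   for (int k = 0; k < 16; k++) {
 *       out[k]      = clip16((E[k] + add + O[k]) >> shift);   // rows 0..15
 *       out[31 - k] = clip16((E[k] + add - O[k]) >> shift);   // rows 31..16
 *   }
 *
 * where add = 1 << (shift - 1) provides rounding and clip16() saturates to
 * int16_t, matching the _mm_packs_epi32 calls in the vector code above.
 */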
5200
5201
5202 #if 0
5203 void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
5204 ptrdiff_t _stride) {
5205 int i, j;
5206 uint16_t *dst = (uint16_t*) _dst;
5207 ptrdiff_t stride = _stride / 2;
5208 int shift;
5209 uint8_t shift_2nd = 10; // = 20 - bit depth (bit depth 10)
5210 uint16_t add_2nd = 1<<9; // = 1 << (shift_2nd - 1)
5211 int16_t *src = coeffs;
5212
5213 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5214 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5215 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5216 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5217 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5218 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5219 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5220 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5221 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5222 EEE0l, EEE1l, EEE0h, EEE1h;
5223 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5224 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5225 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5226 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5227 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5228 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5229 EE4l, EE7h, EE6h, EE5h, EE4h;
5230 m128iS0 = _mm_load_si128((__m128i *) (src));
5231 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5232 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5233 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5234 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5235 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5236 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5237 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5238 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5239 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5240 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5241 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5242 m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5243 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5244 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5245 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5246 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5247 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5248 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5249 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5250 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5251 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5252 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5253 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5254 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5255 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5256 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5257 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5258 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5259 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5260 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5261 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5262
5263 shift = shift_1st;
5264 m128iAdd = _mm_set1_epi32(add_1st);
5265
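    /* Outer loop: the two 1-D transform passes; inner loop: the 32 columns
     * of the block are processed eight at a time (i = 0, 8, 16, 24). */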
5266 for (j = 0; j < 2; j++) {
5267 for (i = 0; i < 32; i += 8) {
5268 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5269 E0l = _mm_madd_epi16(m128Tmp0,
5270 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5271 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5272 E0h = _mm_madd_epi16(m128Tmp1,
5273 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5274
5275 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5276 E1l = _mm_madd_epi16(m128Tmp2,
5277 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5278 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5279 E1h = _mm_madd_epi16(m128Tmp3,
5280 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5281
5282 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5283 E2l = _mm_madd_epi16(m128Tmp4,
5284 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5285 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5286 E2h = _mm_madd_epi16(m128Tmp5,
5287 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5288
5289 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5290 E3l = _mm_madd_epi16(m128Tmp6,
5291 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5292 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5293 E3h = _mm_madd_epi16(m128Tmp7,
5294 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5295
5296 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5297 E4l = _mm_madd_epi16(m128Tmp8,
5298 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5299 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5300 E4h = _mm_madd_epi16(m128Tmp9,
5301 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5302
5303 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5304 E5l = _mm_madd_epi16(m128Tmp10,
5305 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5306 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5307 E5h = _mm_madd_epi16(m128Tmp11,
5308 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5309
5310 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5311 E6l = _mm_madd_epi16(m128Tmp12,
5312 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5313 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5314 E6h = _mm_madd_epi16(m128Tmp13,
5315 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5316
5317 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5318 E7l = _mm_madd_epi16(m128Tmp14,
5319 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5320 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5321 E7h = _mm_madd_epi16(m128Tmp15,
5322 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5323
5324 O0l = _mm_add_epi32(E0l, E1l);
5325 O0l = _mm_add_epi32(O0l, E2l);
5326 O0l = _mm_add_epi32(O0l, E3l);
5327 O0l = _mm_add_epi32(O0l, E4l);
5328 O0l = _mm_add_epi32(O0l, E5l);
5329 O0l = _mm_add_epi32(O0l, E6l);
5330 O0l = _mm_add_epi32(O0l, E7l);
5331
5332 O0h = _mm_add_epi32(E0h, E1h);
5333 O0h = _mm_add_epi32(O0h, E2h);
5334 O0h = _mm_add_epi32(O0h, E3h);
5335 O0h = _mm_add_epi32(O0h, E4h);
5336 O0h = _mm_add_epi32(O0h, E5h);
5337 O0h = _mm_add_epi32(O0h, E6h);
5338 O0h = _mm_add_epi32(O0h, E7h);
5339
5340 /* Compute O1*/
5341 E0l = _mm_madd_epi16(m128Tmp0,
5342 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5343 E0h = _mm_madd_epi16(m128Tmp1,
5344 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5345 E1l = _mm_madd_epi16(m128Tmp2,
5346 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5347 E1h = _mm_madd_epi16(m128Tmp3,
5348 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5349 E2l = _mm_madd_epi16(m128Tmp4,
5350 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5351 E2h = _mm_madd_epi16(m128Tmp5,
5352 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5353 E3l = _mm_madd_epi16(m128Tmp6,
5354 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5355 E3h = _mm_madd_epi16(m128Tmp7,
5356 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5357
5358 E4l = _mm_madd_epi16(m128Tmp8,
5359 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5360 E4h = _mm_madd_epi16(m128Tmp9,
5361 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5362 E5l = _mm_madd_epi16(m128Tmp10,
5363 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5364 E5h = _mm_madd_epi16(m128Tmp11,
5365 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5366 E6l = _mm_madd_epi16(m128Tmp12,
5367 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5368 E6h = _mm_madd_epi16(m128Tmp13,
5369 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5370 E7l = _mm_madd_epi16(m128Tmp14,
5371 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5372 E7h = _mm_madd_epi16(m128Tmp15,
5373 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5374
5375 O1l = _mm_add_epi32(E0l, E1l);
5376 O1l = _mm_add_epi32(O1l, E2l);
5377 O1l = _mm_add_epi32(O1l, E3l);
5378 O1l = _mm_add_epi32(O1l, E4l);
5379 O1l = _mm_add_epi32(O1l, E5l);
5380 O1l = _mm_add_epi32(O1l, E6l);
5381 O1l = _mm_add_epi32(O1l, E7l);
5382
5383 O1h = _mm_add_epi32(E0h, E1h);
5384 O1h = _mm_add_epi32(O1h, E2h);
5385 O1h = _mm_add_epi32(O1h, E3h);
5386 O1h = _mm_add_epi32(O1h, E4h);
5387 O1h = _mm_add_epi32(O1h, E5h);
5388 O1h = _mm_add_epi32(O1h, E6h);
5389 O1h = _mm_add_epi32(O1h, E7h);
5390 /* Compute O2*/
5391 E0l = _mm_madd_epi16(m128Tmp0,
5392 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5393 E0h = _mm_madd_epi16(m128Tmp1,
5394 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5395 E1l = _mm_madd_epi16(m128Tmp2,
5396 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5397 E1h = _mm_madd_epi16(m128Tmp3,
5398 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5399 E2l = _mm_madd_epi16(m128Tmp4,
5400 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5401 E2h = _mm_madd_epi16(m128Tmp5,
5402 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5403 E3l = _mm_madd_epi16(m128Tmp6,
5404 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5405 E3h = _mm_madd_epi16(m128Tmp7,
5406 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5407
5408 E4l = _mm_madd_epi16(m128Tmp8,
5409 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5410 E4h = _mm_madd_epi16(m128Tmp9,
5411 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5412 E5l = _mm_madd_epi16(m128Tmp10,
5413 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5414 E5h = _mm_madd_epi16(m128Tmp11,
5415 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5416 E6l = _mm_madd_epi16(m128Tmp12,
5417 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5418 E6h = _mm_madd_epi16(m128Tmp13,
5419 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5420 E7l = _mm_madd_epi16(m128Tmp14,
5421 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5422 E7h = _mm_madd_epi16(m128Tmp15,
5423 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5424
5425 O2l = _mm_add_epi32(E0l, E1l);
5426 O2l = _mm_add_epi32(O2l, E2l);
5427 O2l = _mm_add_epi32(O2l, E3l);
5428 O2l = _mm_add_epi32(O2l, E4l);
5429 O2l = _mm_add_epi32(O2l, E5l);
5430 O2l = _mm_add_epi32(O2l, E6l);
5431 O2l = _mm_add_epi32(O2l, E7l);
5432
5433 O2h = _mm_add_epi32(E0h, E1h);
5434 O2h = _mm_add_epi32(O2h, E2h);
5435 O2h = _mm_add_epi32(O2h, E3h);
5436 O2h = _mm_add_epi32(O2h, E4h);
5437 O2h = _mm_add_epi32(O2h, E5h);
5438 O2h = _mm_add_epi32(O2h, E6h);
5439 O2h = _mm_add_epi32(O2h, E7h);
5440 /* Compute O3*/
5441 E0l = _mm_madd_epi16(m128Tmp0,
5442 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5443 E0h = _mm_madd_epi16(m128Tmp1,
5444 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5445 E1l = _mm_madd_epi16(m128Tmp2,
5446 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5447 E1h = _mm_madd_epi16(m128Tmp3,
5448 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5449 E2l = _mm_madd_epi16(m128Tmp4,
5450 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5451 E2h = _mm_madd_epi16(m128Tmp5,
5452 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5453 E3l = _mm_madd_epi16(m128Tmp6,
5454 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5455 E3h = _mm_madd_epi16(m128Tmp7,
5456 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5457
5458 E4l = _mm_madd_epi16(m128Tmp8,
5459 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5460 E4h = _mm_madd_epi16(m128Tmp9,
5461 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5462 E5l = _mm_madd_epi16(m128Tmp10,
5463 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5464 E5h = _mm_madd_epi16(m128Tmp11,
5465 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5466 E6l = _mm_madd_epi16(m128Tmp12,
5467 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5468 E6h = _mm_madd_epi16(m128Tmp13,
5469 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5470 E7l = _mm_madd_epi16(m128Tmp14,
5471 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5472 E7h = _mm_madd_epi16(m128Tmp15,
5473 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5474
5475 O3l = _mm_add_epi32(E0l, E1l);
5476 O3l = _mm_add_epi32(O3l, E2l);
5477 O3l = _mm_add_epi32(O3l, E3l);
5478 O3l = _mm_add_epi32(O3l, E4l);
5479 O3l = _mm_add_epi32(O3l, E5l);
5480 O3l = _mm_add_epi32(O3l, E6l);
5481 O3l = _mm_add_epi32(O3l, E7l);
5482
5483 O3h = _mm_add_epi32(E0h, E1h);
5484 O3h = _mm_add_epi32(O3h, E2h);
5485 O3h = _mm_add_epi32(O3h, E3h);
5486 O3h = _mm_add_epi32(O3h, E4h);
5487 O3h = _mm_add_epi32(O3h, E5h);
5488 O3h = _mm_add_epi32(O3h, E6h);
5489 O3h = _mm_add_epi32(O3h, E7h);
5490 /* Compute O4*/
5491
5492 E0l = _mm_madd_epi16(m128Tmp0,
5493 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5494 E0h = _mm_madd_epi16(m128Tmp1,
5495 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5496 E1l = _mm_madd_epi16(m128Tmp2,
5497 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5498 E1h = _mm_madd_epi16(m128Tmp3,
5499 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5500 E2l = _mm_madd_epi16(m128Tmp4,
5501 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5502 E2h = _mm_madd_epi16(m128Tmp5,
5503 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5504 E3l = _mm_madd_epi16(m128Tmp6,
5505 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5506 E3h = _mm_madd_epi16(m128Tmp7,
5507 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5508
5509 E4l = _mm_madd_epi16(m128Tmp8,
5510 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5511 E4h = _mm_madd_epi16(m128Tmp9,
5512 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5513 E5l = _mm_madd_epi16(m128Tmp10,
5514 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5515 E5h = _mm_madd_epi16(m128Tmp11,
5516 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5517 E6l = _mm_madd_epi16(m128Tmp12,
5518 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5519 E6h = _mm_madd_epi16(m128Tmp13,
5520 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5521 E7l = _mm_madd_epi16(m128Tmp14,
5522 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5523 E7h = _mm_madd_epi16(m128Tmp15,
5524 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5525
5526 O4l = _mm_add_epi32(E0l, E1l);
5527 O4l = _mm_add_epi32(O4l, E2l);
5528 O4l = _mm_add_epi32(O4l, E3l);
5529 O4l = _mm_add_epi32(O4l, E4l);
5530 O4l = _mm_add_epi32(O4l, E5l);
5531 O4l = _mm_add_epi32(O4l, E6l);
5532 O4l = _mm_add_epi32(O4l, E7l);
5533
5534 O4h = _mm_add_epi32(E0h, E1h);
5535 O4h = _mm_add_epi32(O4h, E2h);
5536 O4h = _mm_add_epi32(O4h, E3h);
5537 O4h = _mm_add_epi32(O4h, E4h);
5538 O4h = _mm_add_epi32(O4h, E5h);
5539 O4h = _mm_add_epi32(O4h, E6h);
5540 O4h = _mm_add_epi32(O4h, E7h);
5541
5542 /* Compute O5*/
5543 E0l = _mm_madd_epi16(m128Tmp0,
5544 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5545 E0h = _mm_madd_epi16(m128Tmp1,
5546 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5547 E1l = _mm_madd_epi16(m128Tmp2,
5548 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5549 E1h = _mm_madd_epi16(m128Tmp3,
5550 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5551 E2l = _mm_madd_epi16(m128Tmp4,
5552 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5553 E2h = _mm_madd_epi16(m128Tmp5,
5554 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5555 E3l = _mm_madd_epi16(m128Tmp6,
5556 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5557 E3h = _mm_madd_epi16(m128Tmp7,
5558 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5559
5560 E4l = _mm_madd_epi16(m128Tmp8,
5561 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5562 E4h = _mm_madd_epi16(m128Tmp9,
5563 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5564 E5l = _mm_madd_epi16(m128Tmp10,
5565 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5566 E5h = _mm_madd_epi16(m128Tmp11,
5567 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5568 E6l = _mm_madd_epi16(m128Tmp12,
5569 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5570 E6h = _mm_madd_epi16(m128Tmp13,
5571 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5572 E7l = _mm_madd_epi16(m128Tmp14,
5573 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5574 E7h = _mm_madd_epi16(m128Tmp15,
5575 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5576
5577 O5l = _mm_add_epi32(E0l, E1l);
5578 O5l = _mm_add_epi32(O5l, E2l);
5579 O5l = _mm_add_epi32(O5l, E3l);
5580 O5l = _mm_add_epi32(O5l, E4l);
5581 O5l = _mm_add_epi32(O5l, E5l);
5582 O5l = _mm_add_epi32(O5l, E6l);
5583 O5l = _mm_add_epi32(O5l, E7l);
5584
5585 O5h = _mm_add_epi32(E0h, E1h);
5586 O5h = _mm_add_epi32(O5h, E2h);
5587 O5h = _mm_add_epi32(O5h, E3h);
5588 O5h = _mm_add_epi32(O5h, E4h);
5589 O5h = _mm_add_epi32(O5h, E5h);
5590 O5h = _mm_add_epi32(O5h, E6h);
5591 O5h = _mm_add_epi32(O5h, E7h);
5592
5593 /* Compute O6*/
5594
5595 E0l = _mm_madd_epi16(m128Tmp0,
5596 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5597 E0h = _mm_madd_epi16(m128Tmp1,
5598 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5599 E1l = _mm_madd_epi16(m128Tmp2,
5600 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5601 E1h = _mm_madd_epi16(m128Tmp3,
5602 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5603 E2l = _mm_madd_epi16(m128Tmp4,
5604 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5605 E2h = _mm_madd_epi16(m128Tmp5,
5606 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5607 E3l = _mm_madd_epi16(m128Tmp6,
5608 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5609 E3h = _mm_madd_epi16(m128Tmp7,
5610 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5611
5612 E4l = _mm_madd_epi16(m128Tmp8,
5613 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5614 E4h = _mm_madd_epi16(m128Tmp9,
5615 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5616 E5l = _mm_madd_epi16(m128Tmp10,
5617 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5618 E5h = _mm_madd_epi16(m128Tmp11,
5619 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5620 E6l = _mm_madd_epi16(m128Tmp12,
5621 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5622 E6h = _mm_madd_epi16(m128Tmp13,
5623 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5624 E7l = _mm_madd_epi16(m128Tmp14,
5625 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5626 E7h = _mm_madd_epi16(m128Tmp15,
5627 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5628
5629 O6l = _mm_add_epi32(E0l, E1l);
5630 O6l = _mm_add_epi32(O6l, E2l);
5631 O6l = _mm_add_epi32(O6l, E3l);
5632 O6l = _mm_add_epi32(O6l, E4l);
5633 O6l = _mm_add_epi32(O6l, E5l);
5634 O6l = _mm_add_epi32(O6l, E6l);
5635 O6l = _mm_add_epi32(O6l, E7l);
5636
5637 O6h = _mm_add_epi32(E0h, E1h);
5638 O6h = _mm_add_epi32(O6h, E2h);
5639 O6h = _mm_add_epi32(O6h, E3h);
5640 O6h = _mm_add_epi32(O6h, E4h);
5641 O6h = _mm_add_epi32(O6h, E5h);
5642 O6h = _mm_add_epi32(O6h, E6h);
5643 O6h = _mm_add_epi32(O6h, E7h);
5644
5645 /* Compute O7*/
5646
5647 E0l = _mm_madd_epi16(m128Tmp0,
5648 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5649 E0h = _mm_madd_epi16(m128Tmp1,
5650 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5651 E1l = _mm_madd_epi16(m128Tmp2,
5652 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5653 E1h = _mm_madd_epi16(m128Tmp3,
5654 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5655 E2l = _mm_madd_epi16(m128Tmp4,
5656 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5657 E2h = _mm_madd_epi16(m128Tmp5,
5658 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5659 E3l = _mm_madd_epi16(m128Tmp6,
5660 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5661 E3h = _mm_madd_epi16(m128Tmp7,
5662 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5663
5664 E4l = _mm_madd_epi16(m128Tmp8,
5665 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5666 E4h = _mm_madd_epi16(m128Tmp9,
5667 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5668 E5l = _mm_madd_epi16(m128Tmp10,
5669 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5670 E5h = _mm_madd_epi16(m128Tmp11,
5671 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5672 E6l = _mm_madd_epi16(m128Tmp12,
5673 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5674 E6h = _mm_madd_epi16(m128Tmp13,
5675 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5676 E7l = _mm_madd_epi16(m128Tmp14,
5677 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5678 E7h = _mm_madd_epi16(m128Tmp15,
5679 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5680
5681 O7l = _mm_add_epi32(E0l, E1l);
5682 O7l = _mm_add_epi32(O7l, E2l);
5683 O7l = _mm_add_epi32(O7l, E3l);
5684 O7l = _mm_add_epi32(O7l, E4l);
5685 O7l = _mm_add_epi32(O7l, E5l);
5686 O7l = _mm_add_epi32(O7l, E6l);
5687 O7l = _mm_add_epi32(O7l, E7l);
5688
5689 O7h = _mm_add_epi32(E0h, E1h);
5690 O7h = _mm_add_epi32(O7h, E2h);
5691 O7h = _mm_add_epi32(O7h, E3h);
5692 O7h = _mm_add_epi32(O7h, E4h);
5693 O7h = _mm_add_epi32(O7h, E5h);
5694 O7h = _mm_add_epi32(O7h, E6h);
5695 O7h = _mm_add_epi32(O7h, E7h);
5696
5697 /* Compute O8*/
5698
5699 E0l = _mm_madd_epi16(m128Tmp0,
5700 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5701 E0h = _mm_madd_epi16(m128Tmp1,
5702 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5703 E1l = _mm_madd_epi16(m128Tmp2,
5704 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5705 E1h = _mm_madd_epi16(m128Tmp3,
5706 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5707 E2l = _mm_madd_epi16(m128Tmp4,
5708 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5709 E2h = _mm_madd_epi16(m128Tmp5,
5710 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5711 E3l = _mm_madd_epi16(m128Tmp6,
5712 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5713 E3h = _mm_madd_epi16(m128Tmp7,
5714 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5715
5716 E4l = _mm_madd_epi16(m128Tmp8,
5717 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5718 E4h = _mm_madd_epi16(m128Tmp9,
5719 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5720 E5l = _mm_madd_epi16(m128Tmp10,
5721 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5722 E5h = _mm_madd_epi16(m128Tmp11,
5723 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5724 E6l = _mm_madd_epi16(m128Tmp12,
5725 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5726 E6h = _mm_madd_epi16(m128Tmp13,
5727 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5728 E7l = _mm_madd_epi16(m128Tmp14,
5729 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5730 E7h = _mm_madd_epi16(m128Tmp15,
5731 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5732
5733 O8l = _mm_add_epi32(E0l, E1l);
5734 O8l = _mm_add_epi32(O8l, E2l);
5735 O8l = _mm_add_epi32(O8l, E3l);
5736 O8l = _mm_add_epi32(O8l, E4l);
5737 O8l = _mm_add_epi32(O8l, E5l);
5738 O8l = _mm_add_epi32(O8l, E6l);
5739 O8l = _mm_add_epi32(O8l, E7l);
5740
5741 O8h = _mm_add_epi32(E0h, E1h);
5742 O8h = _mm_add_epi32(O8h, E2h);
5743 O8h = _mm_add_epi32(O8h, E3h);
5744 O8h = _mm_add_epi32(O8h, E4h);
5745 O8h = _mm_add_epi32(O8h, E5h);
5746 O8h = _mm_add_epi32(O8h, E6h);
5747 O8h = _mm_add_epi32(O8h, E7h);
5748
5749 /* Compute O9*/
5750
5751 E0l = _mm_madd_epi16(m128Tmp0,
5752 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5753 E0h = _mm_madd_epi16(m128Tmp1,
5754 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5755 E1l = _mm_madd_epi16(m128Tmp2,
5756 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5757 E1h = _mm_madd_epi16(m128Tmp3,
5758 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5759 E2l = _mm_madd_epi16(m128Tmp4,
5760 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5761 E2h = _mm_madd_epi16(m128Tmp5,
5762 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5763 E3l = _mm_madd_epi16(m128Tmp6,
5764 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5765 E3h = _mm_madd_epi16(m128Tmp7,
5766 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5767
5768 E4l = _mm_madd_epi16(m128Tmp8,
5769 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5770 E4h = _mm_madd_epi16(m128Tmp9,
5771 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5772 E5l = _mm_madd_epi16(m128Tmp10,
5773 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5774 E5h = _mm_madd_epi16(m128Tmp11,
5775 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5776 E6l = _mm_madd_epi16(m128Tmp12,
5777 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5778 E6h = _mm_madd_epi16(m128Tmp13,
5779 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5780 E7l = _mm_madd_epi16(m128Tmp14,
5781 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5782 E7h = _mm_madd_epi16(m128Tmp15,
5783 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5784
5785 O9l = _mm_add_epi32(E0l, E1l);
5786 O9l = _mm_add_epi32(O9l, E2l);
5787 O9l = _mm_add_epi32(O9l, E3l);
5788 O9l = _mm_add_epi32(O9l, E4l);
5789 O9l = _mm_add_epi32(O9l, E5l);
5790 O9l = _mm_add_epi32(O9l, E6l);
5791 O9l = _mm_add_epi32(O9l, E7l);
5792
5793 O9h = _mm_add_epi32(E0h, E1h);
5794 O9h = _mm_add_epi32(O9h, E2h);
5795 O9h = _mm_add_epi32(O9h, E3h);
5796 O9h = _mm_add_epi32(O9h, E4h);
5797 O9h = _mm_add_epi32(O9h, E5h);
5798 O9h = _mm_add_epi32(O9h, E6h);
5799 O9h = _mm_add_epi32(O9h, E7h);
5800
5801 /* Compute O10*/
5802
5803 E0l = _mm_madd_epi16(m128Tmp0,
5804 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5805 E0h = _mm_madd_epi16(m128Tmp1,
5806 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5807 E1l = _mm_madd_epi16(m128Tmp2,
5808 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5809 E1h = _mm_madd_epi16(m128Tmp3,
5810 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5811 E2l = _mm_madd_epi16(m128Tmp4,
5812 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5813 E2h = _mm_madd_epi16(m128Tmp5,
5814 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5815 E3l = _mm_madd_epi16(m128Tmp6,
5816 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5817 E3h = _mm_madd_epi16(m128Tmp7,
5818 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5819
5820 E4l = _mm_madd_epi16(m128Tmp8,
5821 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5822 E4h = _mm_madd_epi16(m128Tmp9,
5823 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5824 E5l = _mm_madd_epi16(m128Tmp10,
5825 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5826 E5h = _mm_madd_epi16(m128Tmp11,
5827 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5828 E6l = _mm_madd_epi16(m128Tmp12,
5829 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5830 E6h = _mm_madd_epi16(m128Tmp13,
5831 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5832 E7l = _mm_madd_epi16(m128Tmp14,
5833 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5834 E7h = _mm_madd_epi16(m128Tmp15,
5835 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5836
5837 O10l = _mm_add_epi32(E0l, E1l);
5838 O10l = _mm_add_epi32(O10l, E2l);
5839 O10l = _mm_add_epi32(O10l, E3l);
5840 O10l = _mm_add_epi32(O10l, E4l);
5841 O10l = _mm_add_epi32(O10l, E5l);
5842 O10l = _mm_add_epi32(O10l, E6l);
5843 O10l = _mm_add_epi32(O10l, E7l);
5844
5845 O10h = _mm_add_epi32(E0h, E1h);
5846 O10h = _mm_add_epi32(O10h, E2h);
5847 O10h = _mm_add_epi32(O10h, E3h);
5848 O10h = _mm_add_epi32(O10h, E4h);
5849 O10h = _mm_add_epi32(O10h, E5h);
5850 O10h = _mm_add_epi32(O10h, E6h);
5851 O10h = _mm_add_epi32(O10h, E7h);
5852
5853 /* Compute O11*/
5854
5855 E0l = _mm_madd_epi16(m128Tmp0,
5856 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5857 E0h = _mm_madd_epi16(m128Tmp1,
5858 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5859 E1l = _mm_madd_epi16(m128Tmp2,
5860 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5861 E1h = _mm_madd_epi16(m128Tmp3,
5862 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5863 E2l = _mm_madd_epi16(m128Tmp4,
5864 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5865 E2h = _mm_madd_epi16(m128Tmp5,
5866 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5867 E3l = _mm_madd_epi16(m128Tmp6,
5868 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5869 E3h = _mm_madd_epi16(m128Tmp7,
5870 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5871
5872 E4l = _mm_madd_epi16(m128Tmp8,
5873 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5874 E4h = _mm_madd_epi16(m128Tmp9,
5875 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5876 E5l = _mm_madd_epi16(m128Tmp10,
5877 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5878 E5h = _mm_madd_epi16(m128Tmp11,
5879 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5880 E6l = _mm_madd_epi16(m128Tmp12,
5881 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5882 E6h = _mm_madd_epi16(m128Tmp13,
5883 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5884 E7l = _mm_madd_epi16(m128Tmp14,
5885 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5886 E7h = _mm_madd_epi16(m128Tmp15,
5887 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5888
5889 O11l = _mm_add_epi32(E0l, E1l);
5890 O11l = _mm_add_epi32(O11l, E2l);
5891 O11l = _mm_add_epi32(O11l, E3l);
5892 O11l = _mm_add_epi32(O11l, E4l);
5893 O11l = _mm_add_epi32(O11l, E5l);
5894 O11l = _mm_add_epi32(O11l, E6l);
5895 O11l = _mm_add_epi32(O11l, E7l);
5896
5897 O11h = _mm_add_epi32(E0h, E1h);
5898 O11h = _mm_add_epi32(O11h, E2h);
5899 O11h = _mm_add_epi32(O11h, E3h);
5900 O11h = _mm_add_epi32(O11h, E4h);
5901 O11h = _mm_add_epi32(O11h, E5h);
5902 O11h = _mm_add_epi32(O11h, E6h);
5903 O11h = _mm_add_epi32(O11h, E7h);
5904
5905 /* Compute O12*/
5906
5907 E0l = _mm_madd_epi16(m128Tmp0,
5908 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5909 E0h = _mm_madd_epi16(m128Tmp1,
5910 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5911 E1l = _mm_madd_epi16(m128Tmp2,
5912 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5913 E1h = _mm_madd_epi16(m128Tmp3,
5914 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5915 E2l = _mm_madd_epi16(m128Tmp4,
5916 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5917 E2h = _mm_madd_epi16(m128Tmp5,
5918 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5919 E3l = _mm_madd_epi16(m128Tmp6,
5920 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5921 E3h = _mm_madd_epi16(m128Tmp7,
5922 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5923
5924 E4l = _mm_madd_epi16(m128Tmp8,
5925 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5926 E4h = _mm_madd_epi16(m128Tmp9,
5927 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5928 E5l = _mm_madd_epi16(m128Tmp10,
5929 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5930 E5h = _mm_madd_epi16(m128Tmp11,
5931 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5932 E6l = _mm_madd_epi16(m128Tmp12,
5933 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5934 E6h = _mm_madd_epi16(m128Tmp13,
5935 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5936 E7l = _mm_madd_epi16(m128Tmp14,
5937 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5938 E7h = _mm_madd_epi16(m128Tmp15,
5939 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5940
5941 O12l = _mm_add_epi32(E0l, E1l);
5942 O12l = _mm_add_epi32(O12l, E2l);
5943 O12l = _mm_add_epi32(O12l, E3l);
5944 O12l = _mm_add_epi32(O12l, E4l);
5945 O12l = _mm_add_epi32(O12l, E5l);
5946 O12l = _mm_add_epi32(O12l, E6l);
5947 O12l = _mm_add_epi32(O12l, E7l);
5948
5949 O12h = _mm_add_epi32(E0h, E1h);
5950 O12h = _mm_add_epi32(O12h, E2h);
5951 O12h = _mm_add_epi32(O12h, E3h);
5952 O12h = _mm_add_epi32(O12h, E4h);
5953 O12h = _mm_add_epi32(O12h, E5h);
5954 O12h = _mm_add_epi32(O12h, E6h);
5955 O12h = _mm_add_epi32(O12h, E7h);
5956
5957 /* Compute O13*/
5958
5959 E0l = _mm_madd_epi16(m128Tmp0,
5960 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5961 E0h = _mm_madd_epi16(m128Tmp1,
5962 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5963 E1l = _mm_madd_epi16(m128Tmp2,
5964 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5965 E1h = _mm_madd_epi16(m128Tmp3,
5966 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5967 E2l = _mm_madd_epi16(m128Tmp4,
5968 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5969 E2h = _mm_madd_epi16(m128Tmp5,
5970 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5971 E3l = _mm_madd_epi16(m128Tmp6,
5972 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5973 E3h = _mm_madd_epi16(m128Tmp7,
5974 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5975
5976 E4l = _mm_madd_epi16(m128Tmp8,
5977 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5978 E4h = _mm_madd_epi16(m128Tmp9,
5979 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5980 E5l = _mm_madd_epi16(m128Tmp10,
5981 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5982 E5h = _mm_madd_epi16(m128Tmp11,
5983 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5984 E6l = _mm_madd_epi16(m128Tmp12,
5985 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5986 E6h = _mm_madd_epi16(m128Tmp13,
5987 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5988 E7l = _mm_madd_epi16(m128Tmp14,
5989 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5990 E7h = _mm_madd_epi16(m128Tmp15,
5991 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5992
5993 O13l = _mm_add_epi32(E0l, E1l);
5994 O13l = _mm_add_epi32(O13l, E2l);
5995 O13l = _mm_add_epi32(O13l, E3l);
5996 O13l = _mm_add_epi32(O13l, E4l);
5997 O13l = _mm_add_epi32(O13l, E5l);
5998 O13l = _mm_add_epi32(O13l, E6l);
5999 O13l = _mm_add_epi32(O13l, E7l);
6000
6001 O13h = _mm_add_epi32(E0h, E1h);
6002 O13h = _mm_add_epi32(O13h, E2h);
6003 O13h = _mm_add_epi32(O13h, E3h);
6004 O13h = _mm_add_epi32(O13h, E4h);
6005 O13h = _mm_add_epi32(O13h, E5h);
6006 O13h = _mm_add_epi32(O13h, E6h);
6007 O13h = _mm_add_epi32(O13h, E7h);
6008
6009 /* Compute O14 */
6010
6011 E0l = _mm_madd_epi16(m128Tmp0,
6012 _mm_load_si128((__m128i *) (transform32x32[0][14])));
6013 E0h = _mm_madd_epi16(m128Tmp1,
6014 _mm_load_si128((__m128i *) (transform32x32[0][14])));
6015 E1l = _mm_madd_epi16(m128Tmp2,
6016 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6017 E1h = _mm_madd_epi16(m128Tmp3,
6018 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6019 E2l = _mm_madd_epi16(m128Tmp4,
6020 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6021 E2h = _mm_madd_epi16(m128Tmp5,
6022 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6023 E3l = _mm_madd_epi16(m128Tmp6,
6024 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6025 E3h = _mm_madd_epi16(m128Tmp7,
6026 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6027
6028 E4l = _mm_madd_epi16(m128Tmp8,
6029 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6030 E4h = _mm_madd_epi16(m128Tmp9,
6031 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6032 E5l = _mm_madd_epi16(m128Tmp10,
6033 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6034 E5h = _mm_madd_epi16(m128Tmp11,
6035 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6036 E6l = _mm_madd_epi16(m128Tmp12,
6037 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6038 E6h = _mm_madd_epi16(m128Tmp13,
6039 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6040 E7l = _mm_madd_epi16(m128Tmp14,
6041 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6042 E7h = _mm_madd_epi16(m128Tmp15,
6043 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6044
6045 O14l = _mm_add_epi32(E0l, E1l);
6046 O14l = _mm_add_epi32(O14l, E2l);
6047 O14l = _mm_add_epi32(O14l, E3l);
6048 O14l = _mm_add_epi32(O14l, E4l);
6049 O14l = _mm_add_epi32(O14l, E5l);
6050 O14l = _mm_add_epi32(O14l, E6l);
6051 O14l = _mm_add_epi32(O14l, E7l);
6052
6053 O14h = _mm_add_epi32(E0h, E1h);
6054 O14h = _mm_add_epi32(O14h, E2h);
6055 O14h = _mm_add_epi32(O14h, E3h);
6056 O14h = _mm_add_epi32(O14h, E4h);
6057 O14h = _mm_add_epi32(O14h, E5h);
6058 O14h = _mm_add_epi32(O14h, E6h);
6059 O14h = _mm_add_epi32(O14h, E7h);
6060
6061 /* Compute O15*/
6062
6063 E0l = _mm_madd_epi16(m128Tmp0,
6064 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6065 E0h = _mm_madd_epi16(m128Tmp1,
6066 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6067 E1l = _mm_madd_epi16(m128Tmp2,
6068 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6069 E1h = _mm_madd_epi16(m128Tmp3,
6070 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6071 E2l = _mm_madd_epi16(m128Tmp4,
6072 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6073 E2h = _mm_madd_epi16(m128Tmp5,
6074 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6075 E3l = _mm_madd_epi16(m128Tmp6,
6076 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6077 E3h = _mm_madd_epi16(m128Tmp7,
6078 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6079
6080 E4l = _mm_madd_epi16(m128Tmp8,
6081 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6082 E4h = _mm_madd_epi16(m128Tmp9,
6083 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6084 E5l = _mm_madd_epi16(m128Tmp10,
6085 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6086 E5h = _mm_madd_epi16(m128Tmp11,
6087 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6088 E6l = _mm_madd_epi16(m128Tmp12,
6089 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6090 E6h = _mm_madd_epi16(m128Tmp13,
6091 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6092 E7l = _mm_madd_epi16(m128Tmp14,
6093 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6094 E7h = _mm_madd_epi16(m128Tmp15,
6095 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6096
6097 O15l = _mm_add_epi32(E0l, E1l);
6098 O15l = _mm_add_epi32(O15l, E2l);
6099 O15l = _mm_add_epi32(O15l, E3l);
6100 O15l = _mm_add_epi32(O15l, E4l);
6101 O15l = _mm_add_epi32(O15l, E5l);
6102 O15l = _mm_add_epi32(O15l, E6l);
6103 O15l = _mm_add_epi32(O15l, E7l);
6104
6105 O15h = _mm_add_epi32(E0h, E1h);
6106 O15h = _mm_add_epi32(O15h, E2h);
6107 O15h = _mm_add_epi32(O15h, E3h);
6108 O15h = _mm_add_epi32(O15h, E4h);
6109 O15h = _mm_add_epi32(O15h, E5h);
6110 O15h = _mm_add_epi32(O15h, E6h);
6111 O15h = _mm_add_epi32(O15h, E7h);
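/* Even part: E0..E7 are the odd half of the embedded 16-point transform,
   built from rows 2, 6, 10, ..., 30 against transform16x16_1. */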
6112 /* Compute E0 */
6113
6114 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6115 E0l = _mm_madd_epi16(m128Tmp0,
6116 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6117 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6118 E0h = _mm_madd_epi16(m128Tmp1,
6119 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6120
6121 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6122 E0l = _mm_add_epi32(E0l,
6123 _mm_madd_epi16(m128Tmp2,
6124 _mm_load_si128(
6125 (__m128i *) (transform16x16_1[1][0]))));
6126 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6127 E0h = _mm_add_epi32(E0h,
6128 _mm_madd_epi16(m128Tmp3,
6129 _mm_load_si128(
6130 (__m128i *) (transform16x16_1[1][0]))));
6131
6132 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6133 E0l = _mm_add_epi32(E0l,
6134 _mm_madd_epi16(m128Tmp4,
6135 _mm_load_si128(
6136 (__m128i *) (transform16x16_1[2][0]))));
6137 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6138 E0h = _mm_add_epi32(E0h,
6139 _mm_madd_epi16(m128Tmp5,
6140 _mm_load_si128(
6141 (__m128i *) (transform16x16_1[2][0]))));
6142
6143 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6144 E0l = _mm_add_epi32(E0l,
6145 _mm_madd_epi16(m128Tmp6,
6146 _mm_load_si128(
6147 (__m128i *) (transform16x16_1[3][0]))));
6148 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6149 E0h = _mm_add_epi32(E0h,
6150 _mm_madd_epi16(m128Tmp7,
6151 _mm_load_si128(
6152 (__m128i *) (transform16x16_1[3][0]))));
6153
6154 /* Compute E1 */
6155 E1l = _mm_madd_epi16(m128Tmp0,
6156 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6157 E1h = _mm_madd_epi16(m128Tmp1,
6158 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6159 E1l = _mm_add_epi32(E1l,
6160 _mm_madd_epi16(m128Tmp2,
6161 _mm_load_si128(
6162 (__m128i *) (transform16x16_1[1][1]))));
6163 E1h = _mm_add_epi32(E1h,
6164 _mm_madd_epi16(m128Tmp3,
6165 _mm_load_si128(
6166 (__m128i *) (transform16x16_1[1][1]))));
6167 E1l = _mm_add_epi32(E1l,
6168 _mm_madd_epi16(m128Tmp4,
6169 _mm_load_si128(
6170 (__m128i *) (transform16x16_1[2][1]))));
6171 E1h = _mm_add_epi32(E1h,
6172 _mm_madd_epi16(m128Tmp5,
6173 _mm_load_si128(
6174 (__m128i *) (transform16x16_1[2][1]))));
6175 E1l = _mm_add_epi32(E1l,
6176 _mm_madd_epi16(m128Tmp6,
6177 _mm_load_si128(
6178 (__m128i *) (transform16x16_1[3][1]))));
6179 E1h = _mm_add_epi32(E1h,
6180 _mm_madd_epi16(m128Tmp7,
6181 _mm_load_si128(
6182 (__m128i *) (transform16x16_1[3][1]))));
6183
6184 /* Compute E2 */
6185 E2l = _mm_madd_epi16(m128Tmp0,
6186 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6187 E2h = _mm_madd_epi16(m128Tmp1,
6188 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6189 E2l = _mm_add_epi32(E2l,
6190 _mm_madd_epi16(m128Tmp2,
6191 _mm_load_si128(
6192 (__m128i *) (transform16x16_1[1][2]))));
6193 E2h = _mm_add_epi32(E2h,
6194 _mm_madd_epi16(m128Tmp3,
6195 _mm_load_si128(
6196 (__m128i *) (transform16x16_1[1][2]))));
6197 E2l = _mm_add_epi32(E2l,
6198 _mm_madd_epi16(m128Tmp4,
6199 _mm_load_si128(
6200 (__m128i *) (transform16x16_1[2][2]))));
6201 E2h = _mm_add_epi32(E2h,
6202 _mm_madd_epi16(m128Tmp5,
6203 _mm_load_si128(
6204 (__m128i *) (transform16x16_1[2][2]))));
6205 E2l = _mm_add_epi32(E2l,
6206 _mm_madd_epi16(m128Tmp6,
6207 _mm_load_si128(
6208 (__m128i *) (transform16x16_1[3][2]))));
6209 E2h = _mm_add_epi32(E2h,
6210 _mm_madd_epi16(m128Tmp7,
6211 _mm_load_si128(
6212 (__m128i *) (transform16x16_1[3][2]))));
6213
6214 /* Compute E3 */
6215 E3l = _mm_madd_epi16(m128Tmp0,
6216 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6217 E3h = _mm_madd_epi16(m128Tmp1,
6218 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6219 E3l = _mm_add_epi32(E3l,
6220 _mm_madd_epi16(m128Tmp2,
6221 _mm_load_si128(
6222 (__m128i *) (transform16x16_1[1][3]))));
6223 E3h = _mm_add_epi32(E3h,
6224 _mm_madd_epi16(m128Tmp3,
6225 _mm_load_si128(
6226 (__m128i *) (transform16x16_1[1][3]))));
6227 E3l = _mm_add_epi32(E3l,
6228 _mm_madd_epi16(m128Tmp4,
6229 _mm_load_si128(
6230 (__m128i *) (transform16x16_1[2][3]))));
6231 E3h = _mm_add_epi32(E3h,
6232 _mm_madd_epi16(m128Tmp5,
6233 _mm_load_si128(
6234 (__m128i *) (transform16x16_1[2][3]))));
6235 E3l = _mm_add_epi32(E3l,
6236 _mm_madd_epi16(m128Tmp6,
6237 _mm_load_si128(
6238 (__m128i *) (transform16x16_1[3][3]))));
6239 E3h = _mm_add_epi32(E3h,
6240 _mm_madd_epi16(m128Tmp7,
6241 _mm_load_si128(
6242 (__m128i *) (transform16x16_1[3][3]))));
6243
6244 /* Compute E4 */
6245 E4l = _mm_madd_epi16(m128Tmp0,
6246 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6247 E4h = _mm_madd_epi16(m128Tmp1,
6248 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6249 E4l = _mm_add_epi32(E4l,
6250 _mm_madd_epi16(m128Tmp2,
6251 _mm_load_si128(
6252 (__m128i *) (transform16x16_1[1][4]))));
6253 E4h = _mm_add_epi32(E4h,
6254 _mm_madd_epi16(m128Tmp3,
6255 _mm_load_si128(
6256 (__m128i *) (transform16x16_1[1][4]))));
6257 E4l = _mm_add_epi32(E4l,
6258 _mm_madd_epi16(m128Tmp4,
6259 _mm_load_si128(
6260 (__m128i *) (transform16x16_1[2][4]))));
6261 E4h = _mm_add_epi32(E4h,
6262 _mm_madd_epi16(m128Tmp5,
6263 _mm_load_si128(
6264 (__m128i *) (transform16x16_1[2][4]))));
6265 E4l = _mm_add_epi32(E4l,
6266 _mm_madd_epi16(m128Tmp6,
6267 _mm_load_si128(
6268 (__m128i *) (transform16x16_1[3][4]))));
6269 E4h = _mm_add_epi32(E4h,
6270 _mm_madd_epi16(m128Tmp7,
6271 _mm_load_si128(
6272 (__m128i *) (transform16x16_1[3][4]))));
6273
6274 /* Compute E5 */
6275 E5l = _mm_madd_epi16(m128Tmp0,
6276 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6277 E5h = _mm_madd_epi16(m128Tmp1,
6278 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6279 E5l = _mm_add_epi32(E5l,
6280 _mm_madd_epi16(m128Tmp2,
6281 _mm_load_si128(
6282 (__m128i *) (transform16x16_1[1][5]))));
6283 E5h = _mm_add_epi32(E5h,
6284 _mm_madd_epi16(m128Tmp3,
6285 _mm_load_si128(
6286 (__m128i *) (transform16x16_1[1][5]))));
6287 E5l = _mm_add_epi32(E5l,
6288 _mm_madd_epi16(m128Tmp4,
6289 _mm_load_si128(
6290 (__m128i *) (transform16x16_1[2][5]))));
6291 E5h = _mm_add_epi32(E5h,
6292 _mm_madd_epi16(m128Tmp5,
6293 _mm_load_si128(
6294 (__m128i *) (transform16x16_1[2][5]))));
6295 E5l = _mm_add_epi32(E5l,
6296 _mm_madd_epi16(m128Tmp6,
6297 _mm_load_si128(
6298 (__m128i *) (transform16x16_1[3][5]))));
6299 E5h = _mm_add_epi32(E5h,
6300 _mm_madd_epi16(m128Tmp7,
6301 _mm_load_si128(
6302 (__m128i *) (transform16x16_1[3][5]))));
6303
6304 /* Compute E6 */
6305 E6l = _mm_madd_epi16(m128Tmp0,
6306 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6307 E6h = _mm_madd_epi16(m128Tmp1,
6308 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6309 E6l = _mm_add_epi32(E6l,
6310 _mm_madd_epi16(m128Tmp2,
6311 _mm_load_si128(
6312 (__m128i *) (transform16x16_1[1][6]))));
6313 E6h = _mm_add_epi32(E6h,
6314 _mm_madd_epi16(m128Tmp3,
6315 _mm_load_si128(
6316 (__m128i *) (transform16x16_1[1][6]))));
6317 E6l = _mm_add_epi32(E6l,
6318 _mm_madd_epi16(m128Tmp4,
6319 _mm_load_si128(
6320 (__m128i *) (transform16x16_1[2][6]))));
6321 E6h = _mm_add_epi32(E6h,
6322 _mm_madd_epi16(m128Tmp5,
6323 _mm_load_si128(
6324 (__m128i *) (transform16x16_1[2][6]))));
6325 E6l = _mm_add_epi32(E6l,
6326 _mm_madd_epi16(m128Tmp6,
6327 _mm_load_si128(
6328 (__m128i *) (transform16x16_1[3][6]))));
6329 E6h = _mm_add_epi32(E6h,
6330 _mm_madd_epi16(m128Tmp7,
6331 _mm_load_si128(
6332 (__m128i *) (transform16x16_1[3][6]))));
6333
6334 /* Compute E7 */
6335 E7l = _mm_madd_epi16(m128Tmp0,
6336 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6337 E7h = _mm_madd_epi16(m128Tmp1,
6338 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6339 E7l = _mm_add_epi32(E7l,
6340 _mm_madd_epi16(m128Tmp2,
6341 _mm_load_si128(
6342 (__m128i *) (transform16x16_1[1][7]))));
6343 E7h = _mm_add_epi32(E7h,
6344 _mm_madd_epi16(m128Tmp3,
6345 _mm_load_si128(
6346 (__m128i *) (transform16x16_1[1][7]))));
6347 E7l = _mm_add_epi32(E7l,
6348 _mm_madd_epi16(m128Tmp4,
6349 _mm_load_si128(
6350 (__m128i *) (transform16x16_1[2][7]))));
6351 E7h = _mm_add_epi32(E7h,
6352 _mm_madd_epi16(m128Tmp5,
6353 _mm_load_si128(
6354 (__m128i *) (transform16x16_1[2][7]))));
6355 E7l = _mm_add_epi32(E7l,
6356 _mm_madd_epi16(m128Tmp6,
6357 _mm_load_si128(
6358 (__m128i *) (transform16x16_1[3][7]))));
6359 E7h = _mm_add_epi32(E7h,
6360 _mm_madd_epi16(m128Tmp7,
6361 _mm_load_si128(
6362 (__m128i *) (transform16x16_1[3][7]))));
6363
6364 /* Compute E00..E03 (from rows 4, 12, 20, 28) */
6365
6366 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6367 E00l = _mm_madd_epi16(m128Tmp0,
6368 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6369 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6370 E00h = _mm_madd_epi16(m128Tmp1,
6371 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6372
6373 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6374 E00l = _mm_add_epi32(E00l,
6375 _mm_madd_epi16(m128Tmp2,
6376 _mm_load_si128(
6377 (__m128i *) (transform16x16_2[1][0]))));
6378 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6379 E00h = _mm_add_epi32(E00h,
6380 _mm_madd_epi16(m128Tmp3,
6381 _mm_load_si128(
6382 (__m128i *) (transform16x16_2[1][0]))));
6383
6384 E01l = _mm_madd_epi16(m128Tmp0,
6385 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6386 E01h = _mm_madd_epi16(m128Tmp1,
6387 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6388 E01l = _mm_add_epi32(E01l,
6389 _mm_madd_epi16(m128Tmp2,
6390 _mm_load_si128(
6391 (__m128i *) (transform16x16_2[1][1]))));
6392 E01h = _mm_add_epi32(E01h,
6393 _mm_madd_epi16(m128Tmp3,
6394 _mm_load_si128(
6395 (__m128i *) (transform16x16_2[1][1]))));
6396
6397 E02l = _mm_madd_epi16(m128Tmp0,
6398 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6399 E02h = _mm_madd_epi16(m128Tmp1,
6400 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6401 E02l = _mm_add_epi32(E02l,
6402 _mm_madd_epi16(m128Tmp2,
6403 _mm_load_si128(
6404 (__m128i *) (transform16x16_2[1][2]))));
6405 E02h = _mm_add_epi32(E02h,
6406 _mm_madd_epi16(m128Tmp3,
6407 _mm_load_si128(
6408 (__m128i *) (transform16x16_2[1][2]))));
6409
6410 E03l = _mm_madd_epi16(m128Tmp0,
6411 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6412 E03h = _mm_madd_epi16(m128Tmp1,
6413 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6414 E03l = _mm_add_epi32(E03l,
6415 _mm_madd_epi16(m128Tmp2,
6416 _mm_load_si128(
6417 (__m128i *) (transform16x16_2[1][3]))));
6418 E03h = _mm_add_epi32(E03h,
6419 _mm_madd_epi16(m128Tmp3,
6420 _mm_load_si128(
6421 (__m128i *) (transform16x16_2[1][3]))));
6422
6423 /* Compute EE0, EE1 (rows 8, 24) and EEE0, EEE1 (rows 0, 16) */
6424
6425 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6426 EE0l = _mm_madd_epi16(m128Tmp0,
6427 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6428 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6429 EE0h = _mm_madd_epi16(m128Tmp1,
6430 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6431
6432 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6433 EEE0l = _mm_madd_epi16(m128Tmp2,
6434 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6435 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6436 EEE0h = _mm_madd_epi16(m128Tmp3,
6437 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6438
6439 EE1l = _mm_madd_epi16(m128Tmp0,
6440 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6441 EE1h = _mm_madd_epi16(m128Tmp1,
6442 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6443
6444 EEE1l = _mm_madd_epi16(m128Tmp2,
6445 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6446 EEE1h = _mm_madd_epi16(m128Tmp3,
6447 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6448
6449 /* Compute EE */
6450
6451 EE2l = _mm_sub_epi32(EEE1l, EE1l);
6452 EE3l = _mm_sub_epi32(EEE0l, EE0l);
6453 EE2h = _mm_sub_epi32(EEE1h, EE1h);
6454 EE3h = _mm_sub_epi32(EEE0h, EE0h);
6455
6456 EE0l = _mm_add_epi32(EEE0l, EE0l);
6457 EE1l = _mm_add_epi32(EEE1l, EE1l);
6458 EE0h = _mm_add_epi32(EEE0h, EE0h);
6459 EE1h = _mm_add_epi32(EEE1h, EE1h);
6460 /**/
6461
6462 EE7l = _mm_sub_epi32(EE0l, E00l);
6463 EE6l = _mm_sub_epi32(EE1l, E01l);
6464 EE5l = _mm_sub_epi32(EE2l, E02l);
6465 EE4l = _mm_sub_epi32(EE3l, E03l);
6466
6467 EE7h = _mm_sub_epi32(EE0h, E00h);
6468 EE6h = _mm_sub_epi32(EE1h, E01h);
6469 EE5h = _mm_sub_epi32(EE2h, E02h);
6470 EE4h = _mm_sub_epi32(EE3h, E03h);
6471
6472 EE0l = _mm_add_epi32(EE0l, E00l);
6473 EE1l = _mm_add_epi32(EE1l, E01l);
6474 EE2l = _mm_add_epi32(EE2l, E02l);
6475 EE3l = _mm_add_epi32(EE3l, E03l);
6476
6477 EE0h = _mm_add_epi32(EE0h, E00h);
6478 EE1h = _mm_add_epi32(EE1h, E01h);
6479 EE2h = _mm_add_epi32(EE2h, E02h);
6480 EE3h = _mm_add_epi32(EE3h, E03h);
6481 /* Compute E */
6482
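/* Butterfly of the even part: E[k] = EE[k] + Ek and E[15-k] = EE[k] - Ek
   for k = 0..7, with the rounding offset m128iAdd folded in here. */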
6483 E15l = _mm_sub_epi32(EE0l, E0l);
6484 E15l = _mm_add_epi32(E15l, m128iAdd);
6485 E14l = _mm_sub_epi32(EE1l, E1l);
6486 E14l = _mm_add_epi32(E14l, m128iAdd);
6487 E13l = _mm_sub_epi32(EE2l, E2l);
6488 E13l = _mm_add_epi32(E13l, m128iAdd);
6489 E12l = _mm_sub_epi32(EE3l, E3l);
6490 E12l = _mm_add_epi32(E12l, m128iAdd);
6491 E11l = _mm_sub_epi32(EE4l, E4l);
6492 E11l = _mm_add_epi32(E11l, m128iAdd);
6493 E10l = _mm_sub_epi32(EE5l, E5l);
6494 E10l = _mm_add_epi32(E10l, m128iAdd);
6495 E9l = _mm_sub_epi32(EE6l, E6l);
6496 E9l = _mm_add_epi32(E9l, m128iAdd);
6497 E8l = _mm_sub_epi32(EE7l, E7l);
6498 E8l = _mm_add_epi32(E8l, m128iAdd);
6499
6500 E0l = _mm_add_epi32(EE0l, E0l);
6501 E0l = _mm_add_epi32(E0l, m128iAdd);
6502 E1l = _mm_add_epi32(EE1l, E1l);
6503 E1l = _mm_add_epi32(E1l, m128iAdd);
6504 E2l = _mm_add_epi32(EE2l, E2l);
6505 E2l = _mm_add_epi32(E2l, m128iAdd);
6506 E3l = _mm_add_epi32(EE3l, E3l);
6507 E3l = _mm_add_epi32(E3l, m128iAdd);
6508 E4l = _mm_add_epi32(EE4l, E4l);
6509 E4l = _mm_add_epi32(E4l, m128iAdd);
6510 E5l = _mm_add_epi32(EE5l, E5l);
6511 E5l = _mm_add_epi32(E5l, m128iAdd);
6512 E6l = _mm_add_epi32(EE6l, E6l);
6513 E6l = _mm_add_epi32(E6l, m128iAdd);
6514 E7l = _mm_add_epi32(EE7l, E7l);
6515 E7l = _mm_add_epi32(E7l, m128iAdd);
6516
6517 E15h = _mm_sub_epi32(EE0h, E0h);
6518 E15h = _mm_add_epi32(E15h, m128iAdd);
6519 E14h = _mm_sub_epi32(EE1h, E1h);
6520 E14h = _mm_add_epi32(E14h, m128iAdd);
6521 E13h = _mm_sub_epi32(EE2h, E2h);
6522 E13h = _mm_add_epi32(E13h, m128iAdd);
6523 E12h = _mm_sub_epi32(EE3h, E3h);
6524 E12h = _mm_add_epi32(E12h, m128iAdd);
6525 E11h = _mm_sub_epi32(EE4h, E4h);
6526 E11h = _mm_add_epi32(E11h, m128iAdd);
6527 E10h = _mm_sub_epi32(EE5h, E5h);
6528 E10h = _mm_add_epi32(E10h, m128iAdd);
6529 E9h = _mm_sub_epi32(EE6h, E6h);
6530 E9h = _mm_add_epi32(E9h, m128iAdd);
6531 E8h = _mm_sub_epi32(EE7h, E7h);
6532 E8h = _mm_add_epi32(E8h, m128iAdd);
6533
6534 E0h = _mm_add_epi32(EE0h, E0h);
6535 E0h = _mm_add_epi32(E0h, m128iAdd);
6536 E1h = _mm_add_epi32(EE1h, E1h);
6537 E1h = _mm_add_epi32(E1h, m128iAdd);
6538 E2h = _mm_add_epi32(EE2h, E2h);
6539 E2h = _mm_add_epi32(E2h, m128iAdd);
6540 E3h = _mm_add_epi32(EE3h, E3h);
6541 E3h = _mm_add_epi32(E3h, m128iAdd);
6542 E4h = _mm_add_epi32(EE4h, E4h);
6543 E4h = _mm_add_epi32(E4h, m128iAdd);
6544 E5h = _mm_add_epi32(EE5h, E5h);
6545 E5h = _mm_add_epi32(E5h, m128iAdd);
6546 E6h = _mm_add_epi32(EE6h, E6h);
6547 E6h = _mm_add_epi32(E6h, m128iAdd);
6548 E7h = _mm_add_epi32(EE7h, E7h);
6549 E7h = _mm_add_epi32(E7h, m128iAdd);
6550
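/* Combine even and odd parts, shift and saturate back to 16 bit:
   output k = (E[k] + O[k]) >> shift, output 31-k = (E[k] - O[k]) >> shift. */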
6551 m128iS0 = _mm_packs_epi32(
6552 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6553 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6554 m128iS1 = _mm_packs_epi32(
6555 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6556 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6557 m128iS2 = _mm_packs_epi32(
6558 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6559 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6560 m128iS3 = _mm_packs_epi32(
6561 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6562 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6563 m128iS4 = _mm_packs_epi32(
6564 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6565 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6566 m128iS5 = _mm_packs_epi32(
6567 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6568 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6569 m128iS6 = _mm_packs_epi32(
6570 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6571 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6572 m128iS7 = _mm_packs_epi32(
6573 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6574 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6575 m128iS8 = _mm_packs_epi32(
6576 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6577 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6578 m128iS9 = _mm_packs_epi32(
6579 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6580 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6581 m128iS10 = _mm_packs_epi32(
6582 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6583 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6584 m128iS11 = _mm_packs_epi32(
6585 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6586 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6587 m128iS12 = _mm_packs_epi32(
6588 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6589 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6590 m128iS13 = _mm_packs_epi32(
6591 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6592 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6593 m128iS14 = _mm_packs_epi32(
6594 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6595 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6596 m128iS15 = _mm_packs_epi32(
6597 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6598 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6599
6600 m128iS31 = _mm_packs_epi32(
6601 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6602 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6603 m128iS30 = _mm_packs_epi32(
6604 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6605 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6606 m128iS29 = _mm_packs_epi32(
6607 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6608 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6609 m128iS28 = _mm_packs_epi32(
6610 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6611 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6612 m128iS27 = _mm_packs_epi32(
6613 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6614 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6615 m128iS26 = _mm_packs_epi32(
6616 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6617 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6618 m128iS25 = _mm_packs_epi32(
6619 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6620 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6621 m128iS24 = _mm_packs_epi32(
6622 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6623 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6624 m128iS23 = _mm_packs_epi32(
6625 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6626 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6627 m128iS22 = _mm_packs_epi32(
6628 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6629 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6630 m128iS21 = _mm_packs_epi32(
6631 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6632 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6633 m128iS20 = _mm_packs_epi32(
6634 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6635 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6636 m128iS19 = _mm_packs_epi32(
6637 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6638 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6639 m128iS18 = _mm_packs_epi32(
6640 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6641 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6642 m128iS17 = _mm_packs_epi32(
6643 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6644 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6645 m128iS16 = _mm_packs_epi32(
6646 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6647 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6648
6649 if (!j) {
6650 /* Transpose the intermediate 32x8 block so the second pass operates on the other dimension */
6651 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6652 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6653 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6654 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6655 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6656 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6657 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6658 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6659 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6660 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6661 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6662 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6663 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6664 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6665 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6666 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6667
6668 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6669 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6670 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6671 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6672 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6673 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6674 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6675 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6676 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6677 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6678 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6679 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6680 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6681 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6682 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6683 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6684
6685 E0h = _mm_unpacklo_epi16(E0l, E8l);
6686 E1h = _mm_unpacklo_epi16(E1l, E9l);
6687 E2h = _mm_unpacklo_epi16(E2l, E10l);
6688 E3h = _mm_unpacklo_epi16(E3l, E11l);
6689 E4h = _mm_unpacklo_epi16(E4l, E12l);
6690 E5h = _mm_unpacklo_epi16(E5l, E13l);
6691 E6h = _mm_unpacklo_epi16(E6l, E14l);
6692 E7h = _mm_unpacklo_epi16(E7l, E15l);
6693
6694 E8h = _mm_unpackhi_epi16(E0l, E8l);
6695 E9h = _mm_unpackhi_epi16(E1l, E9l);
6696 E10h = _mm_unpackhi_epi16(E2l, E10l);
6697 E11h = _mm_unpackhi_epi16(E3l, E11l);
6698 E12h = _mm_unpackhi_epi16(E4l, E12l);
6699 E13h = _mm_unpackhi_epi16(E5l, E13l);
6700 E14h = _mm_unpackhi_epi16(E6l, E14l);
6701 E15h = _mm_unpackhi_epi16(E7l, E15l);
6702
6703 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6704 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6705 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6706 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6707
6708 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6709 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6710 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6711 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6712
6713 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6714 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6715 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6716 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6717
6718 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6719 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6720 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6721 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6722
6723 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6724 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6725 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6726 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6727
6728 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6729 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6730 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6731 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6732
6733 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6734 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6735 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6736 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6737
6738 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6739 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6740 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6741 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6742
6743 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6744 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6745 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6746 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6747
6748 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6749 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6750 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6751 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6752
6753 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6754 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6755 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6756 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6757
6758 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6759 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6760 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6761 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6762
6763 /* */
6764 E0h = _mm_unpacklo_epi16(O0l, O8l);
6765 E1h = _mm_unpacklo_epi16(O1l, O9l);
6766 E2h = _mm_unpacklo_epi16(O2l, O10l);
6767 E3h = _mm_unpacklo_epi16(O3l, O11l);
6768 E4h = _mm_unpacklo_epi16(O4l, O12l);
6769 E5h = _mm_unpacklo_epi16(O5l, O13l);
6770 E6h = _mm_unpacklo_epi16(O6l, O14l);
6771 E7h = _mm_unpacklo_epi16(O7l, O15l);
6772
6773 E8h = _mm_unpackhi_epi16(O0l, O8l);
6774 E9h = _mm_unpackhi_epi16(O1l, O9l);
6775 E10h = _mm_unpackhi_epi16(O2l, O10l);
6776 E11h = _mm_unpackhi_epi16(O3l, O11l);
6777 E12h = _mm_unpackhi_epi16(O4l, O12l);
6778 E13h = _mm_unpackhi_epi16(O5l, O13l);
6779 E14h = _mm_unpackhi_epi16(O6l, O14l);
6780 E15h = _mm_unpackhi_epi16(O7l, O15l);
6781
6782 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6783 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6784 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6785 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6786
6787 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6788 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6789 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6790 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6791
6792 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6793 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6794 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6795 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6796
6797 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6798 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6799 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6800 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6801
6802 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6803 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6804 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6805 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6806
6807 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6808 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6809 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6810 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6811
6812 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6813 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6814 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6815 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6816
6817 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6818 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6819 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6820 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6821
6822 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6823 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6824 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6825 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6826
6827 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6828 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6829 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6830 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6831
6832 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6833 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6834 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6835 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6836
6837 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6838 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6839 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6840 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6841 /* */
6842 _mm_store_si128((__m128i *) (src + i), m128iS0);
6843 _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6844 _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6845 _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6846 _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6847 _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6848 _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6849 _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6850 _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6851 _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6852 _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6853 _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6854 _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6855 _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6856 _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6857 _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6858 _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6859 _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6860 _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6861 _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6862 _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6863 _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6864 _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6865 _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6866 _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6867 _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6868 _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6869 _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6870 _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6871 _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6872 _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6873 _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6874
6875 if (i <= 16) {
6876 int k = i + 8;
6877 m128iS0 = _mm_load_si128((__m128i *) (src + k));
6878 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6879 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6880 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6881 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6882 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6883 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6884 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6885 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6886 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6887 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6888 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6889 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6890 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6891 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6892 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6893
6894 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6895 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6896 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6897 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6898 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6899 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6900 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6901 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6902 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6903 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6904 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6905 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6906 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6907 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6908 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6909 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6910 } else {
6911 m128iS0 = _mm_load_si128((__m128i *) (src));
6912 m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6913 m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6914 m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6915 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6916 m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6917 m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6918 m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6919 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6920 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6921 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6922 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6923 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6924 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6925 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6926 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6927 m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6928 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6929 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6930 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6931 m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6932 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6933 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6934 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6935 m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6936 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6937 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6938 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6939 m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6940 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6941 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6942 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6943 shift = shift_2nd;
6944 m128iAdd = _mm_set1_epi32(add_2nd);
6945 }
6946
6947 } else {
6948 int k, m = 0;
6949 _mm_storeu_si128((__m128i *) (src), m128iS0);
6950 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6951 _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6952 _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6953 _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6954 _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6955 _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6956 _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6957 _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6958 _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6959 _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6960 _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6961 _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6962 _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6963 _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6964 _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6965
6966 _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6967 _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6968 _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6969 _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6970 _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6971 _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6972 _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6973 _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6974 _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6975 _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6976 _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6977 _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6978 _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6979 _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6980 _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6981 _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6982 dst = (uint16_t*) _dst + (i * stride);
6983 for (k = 0; k < 8; k++) {
6984 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6985 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6986 dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6987 dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6988 dst[4] = av_clip_uintp2(
6989 dst[4] + src[m + 128],10);
6990 dst[5] = av_clip_uintp2(
6991 dst[5] + src[m + 128 + 8],10);
6992 dst[6] = av_clip_uintp2(
6993 dst[6] + src[m + 128 + 16],10);
6994 dst[7] = av_clip_uintp2(
6995 dst[7] + src[m + 128 + 24],10);
6996
6997 dst[8] = av_clip_uintp2(
6998 dst[8] + src[m + 256],10);
6999 dst[9] = av_clip_uintp2(
7000 dst[9] + src[m + 256 + 8],10);
7001 dst[10] = av_clip_uintp2(
7002 dst[10] + src[m + 256 + 16],10);
7003 dst[11] = av_clip_uintp2(
7004 dst[11] + src[m + 256 + 24],10);
7005 dst[12] = av_clip_uintp2(
7006 dst[12] + src[m + 384],10);
7007 dst[13] = av_clip_uintp2(
7008 dst[13] + src[m + 384 + 8],10);
7009 dst[14] = av_clip_uintp2(
7010 dst[14] + src[m + 384 + 16],10);
7011 dst[15] = av_clip_uintp2(
7012 dst[15] + src[m + 384 + 24],10);
7013
7014 dst[16] = av_clip_uintp2(
7015 dst[16] + src[m + 512],10);
7016 dst[17] = av_clip_uintp2(
7017 dst[17] + src[m + 512 + 8],10);
7018 dst[18] = av_clip_uintp2(
7019 dst[18] + src[m + 512 + 16],10);
7020 dst[19] = av_clip_uintp2(
7021 dst[19] + src[m + 512 + 24],10);
7022 dst[20] = av_clip_uintp2(
7023 dst[20] + src[m + 640],10);
7024 dst[21] = av_clip_uintp2(
7025 dst[21] + src[m + 640 + 8],10);
7026 dst[22] = av_clip_uintp2(
7027 dst[22] + src[m + 640 + 16],10);
7028 dst[23] = av_clip_uintp2(
7029 dst[23] + src[m + 640 + 24],10);
7030
7031 dst[24] = av_clip_uintp2(
7032 dst[24] + src[m + 768],10);
7033 dst[25] = av_clip_uintp2(
7034 dst[25] + src[m + 768 + 8],10);
7035 dst[26] = av_clip_uintp2(
7036 dst[26] + src[m + 768 + 16],10);
7037 dst[27] = av_clip_uintp2(
7038 dst[27] + src[m + 768 + 24],10);
7039 dst[28] = av_clip_uintp2(
7040 dst[28] + src[m + 896],10);
7041 dst[29] = av_clip_uintp2(
7042 dst[29] + src[m + 896 + 8],10);
7043 dst[30] = av_clip_uintp2(
7044 dst[30] + src[m + 896 + 16],10);
7045 dst[31] = av_clip_uintp2(
7046 dst[31] + src[m + 896 + 24],10);
7047
7048 m += 1;
7049 dst += stride;
7050 }
7051 if (i <= 16) {
7052 int k = (i + 8) * 4;
7053 m128iS0 = _mm_load_si128((__m128i *) (src + k));
7054 m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7055 m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7056 m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7057 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7058 m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7059 m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7060 m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7061 m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7062 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7063 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7064 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7065 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7066 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7067 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7068 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7069 m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7070 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7071 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7072 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7073 m128iS20 = _mm_loadu_si128(
7074 (__m128i *) (src + 512 + 16 + k));
7075 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7076 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7077 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7078 m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7079 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7080 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7081 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7082 m128iS28 = _mm_loadu_si128(
7083 (__m128i *) (src + 512 + 24 + k));
7084 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7085 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7086 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7087 }
7088 }
7089 }
7090 }
7091 }
7092 #endif
7093
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 openHEVC contributors
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
020
121 #ifndef SSE_DCT_H
222 #define SSE_DCT_H
+0
-4939
libde265/x86/sse-motion.c
0 /*
1 Code was taken over from openHEVC and slightly modified.
2 */
3
4 #include <stdio.h>
5 #include <emmintrin.h>
6 #include <tmmintrin.h> // SSSE3
7 #if HAVE_SSE4_1
8 #include <smmintrin.h>
9 #endif
10
11 #include "sse-motion.h"
12 #include "libde265/util.h"
13
14
15 ALIGNED_16(const int8_t) epel_filters[7][16] = {
16 { -2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2 },
17 { -4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2 },
18 { -6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4 },
19 { -4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4 },
20 { -4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6 },
21 { -2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4 },
22 { -2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2 },
23 };
24
25 static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 };
26 static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 };
27 static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 };
28
29 static const int epel_extra_before = 1;
30 static const int epel_extra_after = 2;
31 static const int epel_extra = 3;
32
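/* A brief orientation note for the tables above (a scalar sketch, not
   upstream documentation): epel_filters[mx-1] holds the 4-tap HEVC chroma
   interpolation filter for fractional phase mx = 1..7, with the four taps
   repeated four times so that one 16-byte vector covers four output samples.
   In scalar terms, one horizontally filtered sample is
       sum = f[0]*s[x-1] + f[1]*s[x] + f[2]*s[x+1] + f[3]*s[x+2]
   stored unshifted in the int16_t intermediate buffer (the 8-bit paths need
   no >> (BIT_DEPTH - 8); the 10-bit paths shift right by 2, see below).
   epel_extra_before/after give how many extra source rows/columns the filter
   reads before/after the block (1 and 2, i.e. epel_extra = 3 in total). */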
33 #define MAX_PB_SIZE 64
34
35 #define MASKMOVE 0
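/* With MASKMOVE == 0 the narrow-width tails below use plain 32/16-bit scalar
   stores via _mm_cvtsi128_si32 instead of _mm_maskmoveu_si128. */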
36
37 void print128(const char* prefix, __m128i r)
38 {
39 unsigned char buf[16];
40
41   _mm_storeu_si128((__m128i*) buf, r);  /* unaligned store; buf is not guaranteed 16-byte aligned */
42
43 printf("%s ",prefix);
44 for (int i=0;i<16;i++)
45 {
46 if (i>0) { printf(":"); }
47 printf("%02x", buf[i]);
48 }
49
50 printf("\n");
51 }
52
53
54 void printm32(const char* prefix, unsigned char* p)
55 {
56 printf("%s ",prefix);
57
58 for (int i=0;i<4;i++)
59 {
60 if (i>0) { printf(":"); }
61 printf("%02x", p[i]);
62 }
63
64 printf("\n");
65 }
66
67
68 #define BIT_DEPTH 8
69
70 void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
71 int16_t *src, ptrdiff_t srcstride, int width, int height) {
72 int x, y;
73 uint8_t *dst = (uint8_t*) _dst;
74 __m128i r0, r1, f0;
75
76 f0 = _mm_set1_epi16(32);
77
78
79 if(!(width & 15))
80 {
81 for (y = 0; y < height; y++) {
82 for (x = 0; x < width; x += 16) {
83 r0 = _mm_load_si128((__m128i *) (src+x));
84
85 r1 = _mm_load_si128((__m128i *) (src+x + 8));
86 r0 = _mm_adds_epi16(r0, f0);
87
88 r1 = _mm_adds_epi16(r1, f0);
89 r0 = _mm_srai_epi16(r0, 6);
90 r1 = _mm_srai_epi16(r1, 6);
91 r0 = _mm_packus_epi16(r0, r1);
92
93 _mm_storeu_si128((__m128i *) (dst+x), r0);
94 }
95 dst += dststride;
96 src += srcstride;
97 }
98 }else if(!(width & 7))
99 {
100 for (y = 0; y < height; y++) {
101 for (x = 0; x < width; x += 8) {
102 r0 = _mm_load_si128((__m128i *) (src+x));
103
104 r0 = _mm_adds_epi16(r0, f0);
105
106 r0 = _mm_srai_epi16(r0, 6);
107 r0 = _mm_packus_epi16(r0, r0);
108
109 _mm_storel_epi64((__m128i *) (dst+x), r0);
110 }
111 dst += dststride;
112 src += srcstride;
113 }
114 }else if(!(width & 3)){
115 for (y = 0; y < height; y++) {
116 for(x = 0;x < width; x+=4){
117 r0 = _mm_loadl_epi64((__m128i *) (src+x));
118 r0 = _mm_adds_epi16(r0, f0);
119
120 r0 = _mm_srai_epi16(r0, 6);
121 r0 = _mm_packus_epi16(r0, r0);
122 #if MASKMOVE
123 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
124 #else
125 //r0 = _mm_shuffle_epi32 (r0, 0x00);
126 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
127 #endif
128 }
129 dst += dststride;
130 src += srcstride;
131 }
132 }else{
133 for (y = 0; y < height; y++) {
134 for(x = 0;x < width; x+=2){
135 r0 = _mm_loadl_epi64((__m128i *) (src+x));
136 r0 = _mm_adds_epi16(r0, f0);
137
138 r0 = _mm_srai_epi16(r0, 6);
139 r0 = _mm_packus_epi16(r0, r0);
140 #if MASKMOVE
141 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
142 #else
143 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
144 #endif
145 }
146 dst += dststride;
147 src += srcstride;
148 }
149 }
150
151 }
152
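/* For reference, a plain-C sketch of what the vector paths above compute
   (the helper name is illustrative, not upstream API): add the rounding
   offset 32 = 1 << (shift - 1) with shift = 14 - 8 = 6, shift right, and
   saturate to the 8-bit range exactly as _mm_packus_epi16 does. */
static inline void put_unweighted_pred_8_scalar(uint8_t *dst, ptrdiff_t dststride,
                                                const int16_t *src, ptrdiff_t srcstride,
                                                int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = (src[x] + 32) >> 6;                        /* round and rescale */
            dst[x] = v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); /* clip like packus */
        }
        dst += dststride;
        src += srcstride;
    }
}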
153 void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
154 int16_t *src, ptrdiff_t srcstride, int width, int height) {
155 int x, y;
156 uint8_t *dst = (uint8_t*) _dst;
157 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
158 __m128i r0, r1, f0;
159 int shift = 14 - BIT_DEPTH;
160 #if BIT_DEPTH < 14
161 int16_t offset = 1 << (shift - 1);
162 #else
163 int16_t offset = 0;
164
165 #endif
166 f0 = _mm_set1_epi16(offset);
167
168 for (y = 0; y < height; y++) {
169 for (x = 0; x < width; x += 16) {
170 r0 = _mm_load_si128((__m128i *) &src[x]);
171
172 r1 = _mm_load_si128((__m128i *) &src[x + 8]);
173 r0 = _mm_adds_epi16(r0, f0);
174
175 r1 = _mm_adds_epi16(r1, f0);
176 r0 = _mm_srai_epi16(r0, shift);
177 r1 = _mm_srai_epi16(r1, shift);
178 r0 = _mm_packus_epi16(r0, r1);
179
180 _mm_storeu_si128((__m128i *) &dst[x], r0);
181 }
182 dst += dststride;
183 src += srcstride;
184 }
185 }
186
187 void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
188 int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
189 int height) {
190 int x, y;
191 uint8_t *dst = (uint8_t*) _dst;
192 __m128i r0, r1, f0, r2, r3;
193
194 f0 = _mm_set1_epi16(64);
195 if(!(width & 15)){
196 for (y = 0; y < height; y++) {
197
198 for (x = 0; x < width; x += 16) {
199 r0 = _mm_load_si128((__m128i *) &src1[x]);
200 r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
201 r2 = _mm_load_si128((__m128i *) &src2[x]);
202 r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
203
204 r0 = _mm_adds_epi16(r0, f0);
205 r1 = _mm_adds_epi16(r1, f0);
206 r0 = _mm_adds_epi16(r0, r2);
207 r1 = _mm_adds_epi16(r1, r3);
208 r0 = _mm_srai_epi16(r0, 7);
209 r1 = _mm_srai_epi16(r1, 7);
210 r0 = _mm_packus_epi16(r0, r1);
211
212 _mm_storeu_si128((__m128i *) (dst + x), r0);
213 }
214 dst += dststride;
215 src1 += srcstride;
216 src2 += srcstride;
217 }
218 }else if(!(width & 7)){
219 for (y = 0; y < height; y++) {
220 for(x=0;x<width;x+=8){
221 r0 = _mm_load_si128((__m128i *) (src1+x));
222 r2 = _mm_load_si128((__m128i *) (src2+x));
223
224 r0 = _mm_adds_epi16(r0, f0);
225 r0 = _mm_adds_epi16(r0, r2);
226 r0 = _mm_srai_epi16(r0, 7);
227 r0 = _mm_packus_epi16(r0, r0);
228
229 _mm_storel_epi64((__m128i *) (dst+x), r0);
230 }
231 dst += dststride;
232 src1 += srcstride;
233 src2 += srcstride;
234 }
235 }else if(!(width & 3)){
236 #if MASKMOVE
237 r1= _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
238 #endif
239 for (y = 0; y < height; y++) {
240
241 for(x=0;x<width;x+=4)
242 {
243 r0 = _mm_loadl_epi64((__m128i *) (src1+x));
244 r2 = _mm_loadl_epi64((__m128i *) (src2+x));
245
246 r0 = _mm_adds_epi16(r0, f0);
247 r0 = _mm_adds_epi16(r0, r2);
248 r0 = _mm_srai_epi16(r0, 7);
249 r0 = _mm_packus_epi16(r0, r0);
250
251 #if MASKMOVE
252 _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
253 #else
254 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
255 #endif
256 }
257 dst += dststride;
258 src1 += srcstride;
259 src2 += srcstride;
260 }
261 }else{
262 #if MASKMOVE
263 r1= _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1);
264 #endif
265 for (y = 0; y < height; y++) {
266 for(x=0;x<width;x+=2)
267 {
268 r0 = _mm_loadl_epi64((__m128i *) (src1+x));
269 r2 = _mm_loadl_epi64((__m128i *) (src2+x));
270
271 r0 = _mm_adds_epi16(r0, f0);
272 r0 = _mm_adds_epi16(r0, r2);
273 r0 = _mm_srai_epi16(r0, 7);
274 r0 = _mm_packus_epi16(r0, r0);
275
276 #if MASKMOVE
277 _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
278 #else
279 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
280 #endif
281 }
282 dst += dststride;
283 src1 += srcstride;
284 src2 += srcstride;
285 }
286 }
287
288
289 }
290
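/* Per sample, the bi-prediction average above is roughly
     dst[x] = clip_u8((src1[x] + src2[x] + 64) >> 7)
   with 64 = 1 << (shift - 1) and shift = 14 + 1 - BIT_DEPTH = 7 for 8-bit
   output; the four branches only differ in how many samples are handled per
   vector (16, 8, 4 or 2). */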
291 void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
292 int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
293 int height) {
294 int x, y;
295 uint8_t *dst = (uint8_t*) _dst;
296 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
297 __m128i r0, r1, f0, r2, r3;
298 int shift = 14 + 1 - BIT_DEPTH;
299 #if BIT_DEPTH < 14
300 int offset = 1 << (shift - 1);
301 #else
302 int offset = 0;
303 #endif
304 f0 = _mm_set1_epi16(offset);
305 for (y = 0; y < height; y++) {
306
307 for (x = 0; x < width; x += 16) {
308 r0 = _mm_load_si128((__m128i *) &src1[x]);
309 r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
310 r2 = _mm_load_si128((__m128i *) &src2[x]);
311 r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
312
313 r0 = _mm_adds_epi16(r0, f0);
314 r1 = _mm_adds_epi16(r1, f0);
315 r0 = _mm_adds_epi16(r0, r2);
316 r1 = _mm_adds_epi16(r1, r3);
317 r0 = _mm_srai_epi16(r0, shift);
318 r1 = _mm_srai_epi16(r1, shift);
319 r0 = _mm_packus_epi16(r0, r1);
320
321 _mm_storeu_si128((__m128i *) (dst + x), r0);
322 }
323 dst += dststride;
324 src1 += srcstride;
325 src2 += srcstride;
326 }
327 }
328
329 #if 0
330 void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
331 uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
332 int width, int height) {
333
334 int log2Wd;
335 int x, y;
336
337 uint8_t *dst = (uint8_t*) _dst;
338 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
339 __m128i x0, x1, x2, x3, c0, add, add2;
340
341 log2Wd = denom + 14 - BIT_DEPTH;
342
343 add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
344 add2 = _mm_set1_epi32(1 << (log2Wd - 1));
345 c0 = _mm_set1_epi16(wlxFlag);
346 if (log2Wd >= 1){
347 if(!(width & 15)){
348 for (y = 0; y < height; y++) {
349 for (x = 0; x < width; x += 16) {
350 x0 = _mm_load_si128((__m128i *) &src[x]);
351 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
352 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
353 _mm_mulhi_epi16(x0, c0));
354 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
355 _mm_mulhi_epi16(x2, c0));
356 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
357 _mm_mulhi_epi16(x0, c0));
358 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
359 _mm_mulhi_epi16(x2, c0));
360 x0 = _mm_add_epi32(x0, add2);
361 x1 = _mm_add_epi32(x1, add2);
362 x2 = _mm_add_epi32(x2, add2);
363 x3 = _mm_add_epi32(x3, add2);
364 x0 = _mm_srai_epi32(x0, log2Wd);
365 x1 = _mm_srai_epi32(x1, log2Wd);
366 x2 = _mm_srai_epi32(x2, log2Wd);
367 x3 = _mm_srai_epi32(x3, log2Wd);
368 x0 = _mm_add_epi32(x0, add);
369 x1 = _mm_add_epi32(x1, add);
370 x2 = _mm_add_epi32(x2, add);
371 x3 = _mm_add_epi32(x3, add);
372 x0 = _mm_packus_epi32(x0, x1);
373 x2 = _mm_packus_epi32(x2, x3);
374 x0 = _mm_packus_epi16(x0, x2);
375
376 _mm_storeu_si128((__m128i *) (dst + x), x0);
377
378 }
379 dst += dststride;
380 src += srcstride;
381 }
382 }else if(!(width & 7)){
383 for (y = 0; y < height; y++) {
384 for(x=0;x<width;x+=8){
385 x0 = _mm_load_si128((__m128i *) (src+x));
386 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
387 _mm_mulhi_epi16(x0, c0));
388
389 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
390 _mm_mulhi_epi16(x0, c0));
391
392 x0 = _mm_add_epi32(x0, add2);
393 x1 = _mm_add_epi32(x1, add2);
394
395 x0 = _mm_srai_epi32(x0, log2Wd);
396 x1 = _mm_srai_epi32(x1, log2Wd);
397
398 x0 = _mm_add_epi32(x0, add);
399 x1 = _mm_add_epi32(x1, add);
400
401 x0 = _mm_packus_epi32(x0, x1);
402 x0 = _mm_packus_epi16(x0, x0);
403
404 _mm_storel_epi64((__m128i *) (dst+x), x0);
405
406 }
407 dst += dststride;
408 src += srcstride;
409 }
410 }else if(!(width & 3)){
411 for (y = 0; y < height; y++) {
412 for(x=0;x<width;x+=4){
413 x0 = _mm_loadl_epi64((__m128i *)(src+x));
414 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
415 _mm_mulhi_epi16(x0, c0));
416 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
417 _mm_mulhi_epi16(x0, c0));
418
419 x0 = _mm_add_epi32(x0, add2);
420 x1 = _mm_add_epi32(x1, add2);
421 x0 = _mm_srai_epi32(x0, log2Wd);
422 x1 = _mm_srai_epi32(x1, log2Wd);
423 x0 = _mm_add_epi32(x0, add);
424 x1 = _mm_add_epi32(x1, add);
425 x0 = _mm_packus_epi32(x0, x1);
426 x0 = _mm_packus_epi16(x0, x0);
427
428 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
429 // _mm_storeu_si128((__m128i *) (dst + x), x0);
430 }
431 dst += dststride;
432 src += srcstride;
433 }
434 }else{
435 for (y = 0; y < height; y++) {
436 for(x=0;x<width;x+=2){
437 x0 = _mm_loadl_epi64((__m128i *)(src+x));
438 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
439 _mm_mulhi_epi16(x0, c0));
440 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
441 _mm_mulhi_epi16(x0, c0));
442
443 x0 = _mm_add_epi32(x0, add2);
444 x1 = _mm_add_epi32(x1, add2);
445 x0 = _mm_srai_epi32(x0, log2Wd);
446 x1 = _mm_srai_epi32(x1, log2Wd);
447 x0 = _mm_add_epi32(x0, add);
448 x1 = _mm_add_epi32(x1, add);
449 x0 = _mm_packus_epi32(x0, x1);
450 x0 = _mm_packus_epi16(x0, x0);
451
452 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
453 // _mm_storeu_si128((__m128i *) (dst + x), x0);
454 }
455 dst += dststride;
456 src += srcstride;
457 }
458 }
459 }else{
460 if(!(width & 15)){
461 for (y = 0; y < height; y++) {
462 for (x = 0; x < width; x += 16) {
463
464 x0 = _mm_load_si128((__m128i *) &src[x]);
465 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
466 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
467 _mm_mulhi_epi16(x0, c0));
468 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
469 _mm_mulhi_epi16(x2, c0));
470 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
471 _mm_mulhi_epi16(x0, c0));
472 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
473 _mm_mulhi_epi16(x2, c0));
474
475 x0 = _mm_add_epi32(x0, add2);
476 x1 = _mm_add_epi32(x1, add2);
477 x2 = _mm_add_epi32(x2, add2);
478 x3 = _mm_add_epi32(x3, add2);
479
480 x0 = _mm_packus_epi32(x0, x1);
481 x2 = _mm_packus_epi32(x2, x3);
482 x0 = _mm_packus_epi16(x0, x2);
483
484 _mm_storeu_si128((__m128i *) (dst + x), x0);
485
486 }
487 dst += dststride;
488 src += srcstride;
489 }
490 }else if(!(width & 7)){
491 for (y = 0; y < height; y++) {
492 for(x=0;x<width;x+=8){
493 x0 = _mm_load_si128((__m128i *) (src+x));
494 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
495 _mm_mulhi_epi16(x0, c0));
496
497 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
498 _mm_mulhi_epi16(x0, c0));
499
500
501 x0 = _mm_add_epi32(x0, add2);
502 x1 = _mm_add_epi32(x1, add2);
503
504 x0 = _mm_packus_epi32(x0, x1);
505 x0 = _mm_packus_epi16(x0, x0);
506
507 _mm_storeu_si128((__m128i *) (dst+x), x0);
508 }
509
510 dst += dststride;
511 src += srcstride;
512 }
513 }else if(!(width & 3)){
514 for (y = 0; y < height; y++) {
515 for(x=0;x<width;x+=4){
516 x0 = _mm_loadl_epi64((__m128i *) (src+x));
517 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
518 _mm_mulhi_epi16(x0, c0));
519
520 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
521 _mm_mulhi_epi16(x0, c0));
522
523
524 x0 = _mm_add_epi32(x0, add2);
525 x1 = _mm_add_epi32(x1, add2);
526
527
528 x0 = _mm_packus_epi32(x0, x1);
529 x0 = _mm_packus_epi16(x0, x0);
530
531
532 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
533 }
534 dst += dststride;
535 src += srcstride;
536 }
537 }else{
538 for (y = 0; y < height; y++) {
539 for(x=0;x<width;x+=2){
540 x0 = _mm_loadl_epi64((__m128i *) (src+x));
541 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
542 _mm_mulhi_epi16(x0, c0));
543
544 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
545 _mm_mulhi_epi16(x0, c0));
546
547
548 x0 = _mm_add_epi32(x0, add2);
549 x1 = _mm_add_epi32(x1, add2);
550
551
552 x0 = _mm_packus_epi32(x0, x1);
553 x0 = _mm_packus_epi16(x0, x0);
554
555
556 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
557 }
558 dst += dststride;
559 src += srcstride;
560 }
561
562 }
563
564 }
565
566 }
567 #endif
568
569
570 #if 0
571 void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
572 uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
573 int width, int height) {
574
575 int log2Wd;
576 int x, y;
577
578 uint8_t *dst = (uint8_t*) _dst;
579 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
580 __m128i x0, x1, x2, x3, c0, add, add2;
581
582 log2Wd = denom + 14 - BIT_DEPTH;
583
584 add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
585 add2 = _mm_set1_epi32(1 << (log2Wd - 1));
586 c0 = _mm_set1_epi16(wlxFlag);
587 if (log2Wd >= 1)
588 for (y = 0; y < height; y++) {
589 for (x = 0; x < width; x += 16) {
590 x0 = _mm_load_si128((__m128i *) &src[x]);
591 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
592 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
593 _mm_mulhi_epi16(x0, c0));
594 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
595 _mm_mulhi_epi16(x2, c0));
596 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
597 _mm_mulhi_epi16(x0, c0));
598 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
599 _mm_mulhi_epi16(x2, c0));
600 x0 = _mm_add_epi32(x0, add2);
601 x1 = _mm_add_epi32(x1, add2);
602 x2 = _mm_add_epi32(x2, add2);
603 x3 = _mm_add_epi32(x3, add2);
604 x0 = _mm_srai_epi32(x0, log2Wd);
605 x1 = _mm_srai_epi32(x1, log2Wd);
606 x2 = _mm_srai_epi32(x2, log2Wd);
607 x3 = _mm_srai_epi32(x3, log2Wd);
608 x0 = _mm_add_epi32(x0, add);
609 x1 = _mm_add_epi32(x1, add);
610 x2 = _mm_add_epi32(x2, add);
611 x3 = _mm_add_epi32(x3, add);
612 x0 = _mm_packus_epi32(x0, x1);
613 x2 = _mm_packus_epi32(x2, x3);
614 x0 = _mm_packus_epi16(x0, x2);
615
616 _mm_storeu_si128((__m128i *) (dst + x), x0);
617
618 }
619 dst += dststride;
620 src += srcstride;
621 }
622 else
623 for (y = 0; y < height; y++) {
624 for (x = 0; x < width; x += 16) {
625
626 x0 = _mm_load_si128((__m128i *) &src[x]);
627 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
628 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
629 _mm_mulhi_epi16(x0, c0));
630 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
631 _mm_mulhi_epi16(x2, c0));
632 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
633 _mm_mulhi_epi16(x0, c0));
634 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
635 _mm_mulhi_epi16(x2, c0));
636
637 x0 = _mm_add_epi32(x0, add2);
638 x1 = _mm_add_epi32(x1, add2);
639 x2 = _mm_add_epi32(x2, add2);
640 x3 = _mm_add_epi32(x3, add2);
641
642 x0 = _mm_packus_epi32(x0, x1);
643 x2 = _mm_packus_epi32(x2, x3);
644 x0 = _mm_packus_epi16(x0, x2);
645
646 _mm_storeu_si128((__m128i *) (dst + x), x0);
647
648 }
649 dst += dststride;
650 src += srcstride;
651 }
652 }
653 #endif
654
655 #if HAVE_SSE4_1
656 void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
657 int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
658 ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
659 int width, int height) {
660 int shift, shift2;
661 int log2Wd;
662 int o0;
663 int o1;
664 int x, y;
665 uint8_t *dst = (uint8_t*) _dst;
666 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
667 __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
668 shift = 14 - BIT_DEPTH;
669 log2Wd = denom + shift;
670
671 o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
672 o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
673 shift2 = (log2Wd + 1);
674 c0 = _mm_set1_epi16(wl0Flag);
675 c1 = _mm_set1_epi16(wl1Flag);
676 c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd);
677
678 if(!(width & 15)){
679 for (y = 0; y < height; y++) {
680 for (x = 0; x < width; x += 16) {
681 x0 = _mm_load_si128((__m128i *) &src1[x]);
682 x1 = _mm_load_si128((__m128i *) &src1[x + 8]);
683 x2 = _mm_load_si128((__m128i *) &src2[x]);
684 x3 = _mm_load_si128((__m128i *) &src2[x + 8]);
685
686 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
687 _mm_mulhi_epi16(x0, c0));
688 r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0),
689 _mm_mulhi_epi16(x1, c0));
690 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
691 _mm_mulhi_epi16(x2, c1));
692 r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1),
693 _mm_mulhi_epi16(x3, c1));
694 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
695 _mm_mulhi_epi16(x0, c0));
696 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0),
697 _mm_mulhi_epi16(x1, c0));
698 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
699 _mm_mulhi_epi16(x2, c1));
700 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1),
701 _mm_mulhi_epi16(x3, c1));
702 r0 = _mm_add_epi32(r0, r2);
703 r1 = _mm_add_epi32(r1, r3);
704 r2 = _mm_add_epi32(x0, x2);
705 r3 = _mm_add_epi32(x1, x3);
706
707 r0 = _mm_add_epi32(r0, c2);
708 r1 = _mm_add_epi32(r1, c2);
709 r2 = _mm_add_epi32(r2, c2);
710 r3 = _mm_add_epi32(r3, c2);
711
712 r0 = _mm_srai_epi32(r0, shift2);
713 r1 = _mm_srai_epi32(r1, shift2);
714 r2 = _mm_srai_epi32(r2, shift2);
715 r3 = _mm_srai_epi32(r3, shift2);
716
717 r0 = _mm_packus_epi32(r0, r2);
718 r1 = _mm_packus_epi32(r1, r3);
719 r0 = _mm_packus_epi16(r0, r1);
720
721 _mm_storeu_si128((__m128i *) (dst + x), r0);
722
723 }
724 dst += dststride;
725 src1 += srcstride;
726 src2 += srcstride;
727 }
728 }else if(!(width & 7)){
729 for (y = 0; y < height; y++) {
730 for(x=0;x<width;x+=8){
731 x0 = _mm_load_si128((__m128i *) (src1+x));
732 x2 = _mm_load_si128((__m128i *) (src2+x));
733
734 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
735 _mm_mulhi_epi16(x0, c0));
736
737 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
738 _mm_mulhi_epi16(x2, c1));
739
740 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
741 _mm_mulhi_epi16(x0, c0));
742
743 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
744 _mm_mulhi_epi16(x2, c1));
745
746 r0 = _mm_add_epi32(r0, r2);
747 r2 = _mm_add_epi32(x0, x2);
748
749
750 r0 = _mm_add_epi32(r0, c2);
751 r2 = _mm_add_epi32(r2, c2);
752
753 r0 = _mm_srai_epi32(r0, shift2);
754 r2 = _mm_srai_epi32(r2, shift2);
755
756 r0 = _mm_packus_epi32(r0, r2);
757 r0 = _mm_packus_epi16(r0, r0);
758
759 _mm_storel_epi64((__m128i *) (dst+x), r0);
760 }
761
762 dst += dststride;
763 src1 += srcstride;
764 src2 += srcstride;
765 }
766 }else if(!(width & 3)){
767 for (y = 0; y < height; y++) {
768 for(x=0;x<width;x+=4){
769 x0 = _mm_loadl_epi64((__m128i *) (src1+x));
770 x2 = _mm_loadl_epi64((__m128i *) (src2+x));
771
772 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
773 _mm_mulhi_epi16(x0, c0));
774
775 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
776 _mm_mulhi_epi16(x2, c1));
777
778 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
779 _mm_mulhi_epi16(x0, c0));
780
781 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
782 _mm_mulhi_epi16(x2, c1));
783
784 r0 = _mm_add_epi32(r0, r2);
785 r2 = _mm_add_epi32(x0, x2);
786
787 r0 = _mm_add_epi32(r0, c2);
788 r2 = _mm_add_epi32(r2, c2);
789
790 r0 = _mm_srai_epi32(r0, shift2);
791 r2 = _mm_srai_epi32(r2, shift2);
792
793 r0 = _mm_packus_epi32(r0, r2);
794 r0 = _mm_packus_epi16(r0, r0);
795
796 #if MASKMOVE
797 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
798 #else
799 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
800 #endif
801 }
802 dst += dststride;
803 src1 += srcstride;
804 src2 += srcstride;
805 }
806 }else{
807 for (y = 0; y < height; y++) {
808 for(x=0;x<width;x+=2){
809 x0 = _mm_loadl_epi64((__m128i *) (src1+x));
810 x2 = _mm_loadl_epi64((__m128i *) (src2+x));
811
812 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
813 _mm_mulhi_epi16(x0, c0));
814
815 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
816 _mm_mulhi_epi16(x2, c1));
817
818 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
819 _mm_mulhi_epi16(x0, c0));
820
821 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
822 _mm_mulhi_epi16(x2, c1));
823
824 r0 = _mm_add_epi32(r0, r2);
825 r2 = _mm_add_epi32(x0, x2);
826
827 r0 = _mm_add_epi32(r0, c2);
828 r2 = _mm_add_epi32(r2, c2);
829
830 r0 = _mm_srai_epi32(r0, shift2);
831 r2 = _mm_srai_epi32(r2, shift2);
832
833 r0 = _mm_packus_epi32(r0, r2);
834 r0 = _mm_packus_epi16(r0, r0);
835
836 #if MASKMOVE
837 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
838 #else
839 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
840 #endif
841 }
842 dst += dststride;
843 src1 += srcstride;
844 src2 += srcstride;
845 }
846 }
847 }
848 #endif
849
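/* A sketch of the explicit weighted bi-prediction computed above, assuming
   the usual HEVC weighted-prediction definitions: per sample, roughly
     dst[x] = clip_u8((src1[x]*wl0Flag + src2[x]*wl1Flag
                       + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1))
   with log2Wd = denom + 14 - BIT_DEPTH and o0/o1 the offsets scaled by
   1 << (BIT_DEPTH - 8).  The mullo/mulhi + unpack pairs just widen the
   16x16-bit products to 32 bit before the add and shift. */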
850
851 #if 0
852 void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
853 int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
854 ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
855 int width, int height) {
856 int shift, shift2;
857 int log2Wd;
858 int o0;
859 int o1;
860 int x, y;
861 uint8_t *dst = (uint8_t*) _dst;
862 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
863 __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
864 shift = 14 - BIT_DEPTH;
865 log2Wd = denom + shift;
866
867 o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
868 o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
869 shift2 = (log2Wd + 1);
870 c0 = _mm_set1_epi16(wl0Flag);
871 c1 = _mm_set1_epi16(wl1Flag);
872 c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd);
873
874 for (y = 0; y < height; y++) {
875 for (x = 0; x < width; x += 16) {
876 x0 = _mm_load_si128((__m128i *) &src1[x]);
877 x1 = _mm_load_si128((__m128i *) &src1[x + 8]);
878 x2 = _mm_load_si128((__m128i *) &src2[x]);
879 x3 = _mm_load_si128((__m128i *) &src2[x + 8]);
880
881 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
882 _mm_mulhi_epi16(x0, c0));
883 r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0),
884 _mm_mulhi_epi16(x1, c0));
885 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
886 _mm_mulhi_epi16(x2, c1));
887 r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1),
888 _mm_mulhi_epi16(x3, c1));
889 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
890 _mm_mulhi_epi16(x0, c0));
891 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0),
892 _mm_mulhi_epi16(x1, c0));
893 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
894 _mm_mulhi_epi16(x2, c1));
895 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1),
896 _mm_mulhi_epi16(x3, c1));
897 r0 = _mm_add_epi32(r0, r2);
898 r1 = _mm_add_epi32(r1, r3);
899 r2 = _mm_add_epi32(x0, x2);
900 r3 = _mm_add_epi32(x1, x3);
901
902 r0 = _mm_add_epi32(r0, c2);
903 r1 = _mm_add_epi32(r1, c2);
904 r2 = _mm_add_epi32(r2, c2);
905 r3 = _mm_add_epi32(r3, c2);
906
907 r0 = _mm_srai_epi32(r0, shift2);
908 r1 = _mm_srai_epi32(r1, shift2);
909 r2 = _mm_srai_epi32(r2, shift2);
910 r3 = _mm_srai_epi32(r3, shift2);
911
912 r0 = _mm_packus_epi32(r0, r2);
913 r1 = _mm_packus_epi32(r1, r3);
914 r0 = _mm_packus_epi16(r0, r1);
915
916 _mm_storeu_si128((__m128i *) (dst + x), r0);
917
918 }
919 dst += dststride;
920 src1 += srcstride;
921 src2 += srcstride;
922 }
923 }
924 #endif
925
926
927 void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
928 uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx,
929 int my, int16_t* mcbuffer) {
930 int x, y;
931 __m128i x1, x2,x3;
932 uint8_t *src = (uint8_t*) _src;
933 if(!(width & 15)){
934 x3= _mm_setzero_si128();
935 for (y = 0; y < height; y++) {
936 for (x = 0; x < width; x += 16) {
937
938 x1 = _mm_loadu_si128((__m128i *) &src[x]);
939 x2 = _mm_unpacklo_epi8(x1, x3);
940
941 x1 = _mm_unpackhi_epi8(x1, x3);
942
943 x2 = _mm_slli_epi16(x2, 6);
944 x1 = _mm_slli_epi16(x1, 6);
945 _mm_store_si128((__m128i *) &dst[x], x2);
946 _mm_store_si128((__m128i *) &dst[x + 8], x1);
947
948 }
949 src += srcstride;
950 dst += dststride;
951 }
952 }else if(!(width & 7)){
953 x1= _mm_setzero_si128();
954 for (y = 0; y < height; y++) {
955 for (x = 0; x < width; x += 8) {
956
957 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
958 x2 = _mm_unpacklo_epi8(x2, x1);
959 x2 = _mm_slli_epi16(x2, 6);
960 _mm_store_si128((__m128i *) &dst[x], x2);
961
962 }
963 src += srcstride;
964 dst += dststride;
965 }
966 }else if(!(width & 3)){
967 x1= _mm_setzero_si128();
968 for (y = 0; y < height; y++) {
969 for (x = 0; x < width; x += 4) {
970
971 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
972 x2 = _mm_unpacklo_epi8(x2,x1);
973
974 x2 = _mm_slli_epi16(x2, 6);
975
976 _mm_storel_epi64((__m128i *) &dst[x], x2);
977
978 }
979 src += srcstride;
980 dst += dststride;
981 }
982 }else{
983 x1= _mm_setzero_si128();
984 for (y = 0; y < height; y++) {
985 for (x = 0; x < width; x += 2) {
986
987 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
988 x2 = _mm_unpacklo_epi8(x2, x1);
989 x2 = _mm_slli_epi16(x2, 6);
990 #if MASKMOVE
991 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
992 #else
993 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
994 #endif
995 }
996 src += srcstride;
997 dst += dststride;
998 }
999 }
1000
1001 }
1002
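/* In scalar terms the full-pel ("pixels") case above is just a widening copy
   into the 16-bit intermediate buffer, roughly dst[x] = src[x] << 6 for
   8-bit input (and << 4 in the 10-bit variant below); no filtering is
   applied. */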
1003 #ifndef __native_client__
1004 void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
1005 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1006 int my, int16_t* mcbuffer) {
1007 int x, y;
1008 __m128i x2;
1009 uint16_t *src = (uint16_t*) _src;
1010 ptrdiff_t srcstride = _srcstride>>1;
1011 if(!(width & 7)){
1012 //x1= _mm_setzero_si128();
1013 for (y = 0; y < height; y++) {
1014 for (x = 0; x < width; x += 8) {
1015
1016 x2 = _mm_loadu_si128((__m128i *) &src[x]);
1017                 x2 = _mm_slli_epi16(x2, 4); // shift by 14 - BIT_DEPTH (= 4 for 10-bit input)
1018 _mm_store_si128((__m128i *) &dst[x], x2);
1019
1020 }
1021 src += srcstride;
1022 dst += dststride;
1023 }
1024 }else if(!(width & 3)){
1025 //x1= _mm_setzero_si128();
1026 for (y = 0; y < height; y++) {
1027 for (x = 0; x < width; x += 4) {
1028
1029 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
1030                 x2 = _mm_slli_epi16(x2, 4); // shift by 14 - BIT_DEPTH (= 4 for 10-bit input)
1031
1032 _mm_storel_epi64((__m128i *) &dst[x], x2);
1033
1034 }
1035 src += srcstride;
1036 dst += dststride;
1037 }
1038 }else{
1039 //x1= _mm_setzero_si128();
1040 for (y = 0; y < height; y++) {
1041 for (x = 0; x < width; x += 2) {
1042
1043 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
1044                 x2 = _mm_slli_epi16(x2, 4); // shift by 14 - BIT_DEPTH (= 4 for 10-bit input)
1045 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1046 }
1047 src += srcstride;
1048 dst += dststride;
1049 }
1050 }
1051
1052 }
1053 #endif
1054
1055 void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
1056 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1057 int my, int16_t* mcbuffer) {
1058 int x, y;
1059 uint8_t *src = (uint8_t*) _src;
1060 ptrdiff_t srcstride = _srcstride;
1061 const int8_t *filter = epel_filters[mx - 1];
1062 __m128i r0, bshuffle1, bshuffle2, x1, x2, x3;
1063 int8_t filter_0 = filter[0];
1064 int8_t filter_1 = filter[1];
1065 int8_t filter_2 = filter[2];
1066 int8_t filter_3 = filter[3];
1067 r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1068 filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1069 filter_0, filter_3, filter_2, filter_1, filter_0);
1070 bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1071
1072
1073 /*
1074 printf("---IN---SSE\n");
1075
1076 int extra_top = 1;
1077 int extra_left = 1;
1078 int extra_right = 2;
1079 int extra_bottom = 2;
1080
1081 for (int y=-extra_top;y<height+extra_bottom;y++) {
1082 uint8_t* p = &_src[y*_srcstride -extra_left];
1083
1084 for (int x=-extra_left;x<width+extra_right;x++) {
1085 printf("%05d ",*p << 6);
1086 p++;
1087 }
1088 printf("\n");
1089 }
1090 */
1091
1092 if(!(width & 7)){
1093 bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1094 4);
1095 for (y = 0; y < height; y++) {
1096 for (x = 0; x < width; x += 8) {
1097
1098 x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1099 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1100 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1101
1102                 /* PMADDUBSW then PHADDW */
1103 x2 = _mm_maddubs_epi16(x2, r0);
1104 x3 = _mm_maddubs_epi16(x3, r0);
1105 x2 = _mm_hadd_epi16(x2, x3);
1106 _mm_store_si128((__m128i *) &dst[x], x2);
1107 }
1108 src += srcstride;
1109 dst += dststride;
1110 }
1111 }else if(!(width & 3)){
1112
1113 for (y = 0; y < height; y++) {
1114 for (x = 0; x < width; x += 4) {
1115 /* load data in register */
1116 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1117 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1118
1119                 /* PMADDUBSW then PHADDW */
1120 x2 = _mm_maddubs_epi16(x2, r0);
1121 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1122 /* give results back */
1123 _mm_storel_epi64((__m128i *) &dst[x], x2);
1124 }
1125 src += srcstride;
1126 dst += dststride;
1127 }
1128 }else{
1129 for (y = 0; y < height; y++) {
1130 for (x = 0; x < width; x += 2) {
1131 /* load data in register */
1132 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1133 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1134
1135                 /* PMADDUBSW then PHADDW */
1136 x2 = _mm_maddubs_epi16(x2, r0);
1137 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1138 /* give results back */
1139 #if MASKMOVE
1140 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1141 #else
1142 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1143 #endif
1144 }
1145 src += srcstride;
1146 dst += dststride;
1147 }
1148 }
1149 }
1150
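/* A plain-C sketch of one output sample of the horizontal filter above
   (the helper name is illustrative, not upstream API): the byte shuffles
   plus pmaddubsw/phaddw simply evaluate four or eight of these sums per
   vector. */
static inline int16_t epel_h_one_sample_8(const uint8_t *src /* points at sample x */,
                                           const int8_t *f   /* = epel_filters[mx - 1] */)
{
    return (int16_t)(f[0] * src[-1] + f[1] * src[0] + f[2] * src[1] + f[3] * src[2]);
}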
1151 #ifndef __native_client__
1152 void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
1153 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1154 int my, int16_t* mcbuffer) {
1155 int x, y;
1156 uint16_t *src = (uint16_t*) _src;
1157 ptrdiff_t srcstride = _srcstride>>1;
1158 const int8_t *filter = epel_filters[mx - 1];
1159 __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1;
1160 int8_t filter_0 = filter[0];
1161 int8_t filter_1 = filter[1];
1162 int8_t filter_2 = filter[2];
1163 int8_t filter_3 = filter[3];
1164 r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1165 filter_0, filter_3, filter_2, filter_1, filter_0);
1166 bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1167
1168 if(!(width & 3)){
1169 bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1170 for (y = 0; y < height; y++) {
1171 for (x = 0; x < width; x += 4) {
1172
1173 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1174 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1175 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1176
1177
1178 x2 = _mm_madd_epi16(x2, r0);
1179 x3 = _mm_madd_epi16(x3, r0);
1180 x2 = _mm_hadd_epi32(x2, x3);
1181 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1182
1183 x2 = _mm_packs_epi32(x2,r0);
1184 //give results back
1185 _mm_storel_epi64((__m128i *) &dst[x], x2);
1186 }
1187 src += srcstride;
1188 dst += dststride;
1189 }
1190 }else{
1191 r1= _mm_setzero_si128();
1192 for (y = 0; y < height; y++) {
1193 for (x = 0; x < width; x += 2) {
1194 /* load data in register */
1195 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1196 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1197
1198                 /* PMADDWD then PHADDD */
1199 x2 = _mm_madd_epi16(x2, r0);
1200 x2 = _mm_hadd_epi32(x2, r1);
1201 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1202 x2 = _mm_packs_epi32(x2, r1);
1203 /* give results back */
1204 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1205 }
1206 src += srcstride;
1207 dst += dststride;
1208 }
1209 }
1210 }
1211 #endif
1212
1213
1214 void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
1215 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1216 int my, int16_t* mcbuffer) {
1217 int x, y;
1218 __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1;
1219 uint8_t *src = (uint8_t*) _src;
1220 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
1221 const int8_t *filter = epel_filters[my - 1];
1222 int8_t filter_0 = filter[0];
1223 int8_t filter_1 = filter[1];
1224 int8_t filter_2 = filter[2];
1225 int8_t filter_3 = filter[3];
1226 f0 = _mm_set1_epi16(filter_0);
1227 f1 = _mm_set1_epi16(filter_1);
1228 f2 = _mm_set1_epi16(filter_2);
1229 f3 = _mm_set1_epi16(filter_3);
1230
1231 if(!(width & 15)){
1232 for (y = 0; y < height; y++) {
1233 for (x = 0; x < width; x += 16) {
1234                 /* load the four consecutive source rows used by the 4-tap vertical filter */
1235
1236 x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1237 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1238 x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1239 x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1240
1241 t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128());
1242 t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128());
1243 t2 = _mm_unpacklo_epi8(x2, _mm_setzero_si128());
1244 t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128());
1245
1246 x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128());
1247 x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128());
1248 x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128());
1249 x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128());
1250
1251                 /* multiply by the filter coefficients: */
1252 r0 = _mm_mullo_epi16(t0, f0);
1253 r1 = _mm_mullo_epi16(x0, f0);
1254 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1255 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1));
1256 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1257 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2));
1258 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1259 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3));
1260 /* give results back */
1261 _mm_store_si128((__m128i *) &dst[x], r0);
1262 _mm_storeu_si128((__m128i *) &dst[x + 8], r1);
1263 }
1264 src += srcstride;
1265 dst += dststride;
1266 }
1267 }else if(!(width & 7)){
1268 r1= _mm_setzero_si128();
1269 for (y = 0; y < height; y++) {
1270 for(x=0;x<width;x+=8){
1271 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1272 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1273 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1274 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1275
1276 t0 = _mm_unpacklo_epi8(x0, r1);
1277 t1 = _mm_unpacklo_epi8(x1, r1);
1278 t2 = _mm_unpacklo_epi8(x2, r1);
1279 t3 = _mm_unpacklo_epi8(x3, r1);
1280
1281
1282                 /* multiply by the filter coefficients: */
1283 r0 = _mm_mullo_epi16(t0, f0);
1284 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1285 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1286 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1287 /* give results back */
1288 _mm_storeu_si128((__m128i *) &dst[x], r0);
1289 }
1290 src += srcstride;
1291 dst += dststride;
1292 }
1293 }else if(!(width & 3)){
1294 r1= _mm_setzero_si128();
1295 for (y = 0; y < height; y++) {
1296 for(x=0;x<width;x+=4){
1297 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1298 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1299 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1300 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1301
1302 t0 = _mm_unpacklo_epi8(x0, r1);
1303 t1 = _mm_unpacklo_epi8(x1, r1);
1304 t2 = _mm_unpacklo_epi8(x2, r1);
1305 t3 = _mm_unpacklo_epi8(x3, r1);
1306
1307
1308                 /* multiply by the filter coefficients: */
1309 r0 = _mm_mullo_epi16(t0, f0);
1310 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1311 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1312 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1313 /* give results back */
1314 _mm_storel_epi64((__m128i *) &dst[x], r0);
1315 }
1316 src += srcstride;
1317 dst += dststride;
1318 }
1319 }else{
1320 r1= _mm_setzero_si128();
1321 for (y = 0; y < height; y++) {
1322 for(x=0;x<width;x+=2){
1323 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1324 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1325 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1326 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1327
1328 t0 = _mm_unpacklo_epi8(x0, r1);
1329 t1 = _mm_unpacklo_epi8(x1, r1);
1330 t2 = _mm_unpacklo_epi8(x2, r1);
1331 t3 = _mm_unpacklo_epi8(x3, r1);
1332
1333
1334                 /* multiply by the filter coefficients: */
1335 r0 = _mm_mullo_epi16(t0, f0);
1336 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1337 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1338 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1339 /* give results back */
1340 #if MASKMOVE
1341 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1342 #else
1343 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1344 #endif
1345 }
1346 src += srcstride;
1347 dst += dststride;
1348 }
1349 }
1350 }
1351
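/* The vertical path above computes the same 4-tap sum as the horizontal one,
   only along the stride: roughly
     dst[x] = f[0]*src[x - srcstride] + f[1]*src[x]
            + f[2]*src[x + srcstride] + f[3]*src[x + 2*srcstride]
   with f = epel_filters[my - 1]; the unpacklo/unpackhi against zero widen
   the bytes to 16 bit so the per-tap multiplies can be done with pmullw. */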
1352 #ifndef __native_client__
1353 void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
1354 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1355 int my, int16_t* mcbuffer) {
1356 int x, y;
1357 __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1, r2, r3;
1358 uint16_t *src = (uint16_t*) _src;
1359 ptrdiff_t srcstride = _srcstride >>1;
1360 const int8_t *filter = epel_filters[my - 1];
1361 int8_t filter_0 = filter[0];
1362 int8_t filter_1 = filter[1];
1363 int8_t filter_2 = filter[2];
1364 int8_t filter_3 = filter[3];
1365 f0 = _mm_set1_epi16(filter_0);
1366 f1 = _mm_set1_epi16(filter_1);
1367 f2 = _mm_set1_epi16(filter_2);
1368 f3 = _mm_set1_epi16(filter_3);
1369
1370 if(!(width & 7)){
1371 r1= _mm_setzero_si128();
1372 for (y = 0; y < height; y++) {
1373 for(x=0;x<width;x+=8){
1374 x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1375 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1376 x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1377 x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1378
1379 // multiply by correct value :
1380 r0 = _mm_mullo_epi16(x0, f0);
1381 t0 = _mm_mulhi_epi16(x0, f0);
1382
1383 x0= _mm_unpacklo_epi16(r0,t0);
1384 t0= _mm_unpackhi_epi16(r0,t0);
1385
1386 r1 = _mm_mullo_epi16(x1, f1);
1387 t1 = _mm_mulhi_epi16(x1, f1);
1388
1389 x1= _mm_unpacklo_epi16(r1,t1);
1390 t1= _mm_unpackhi_epi16(r1,t1);
1391
1392
1393 r2 = _mm_mullo_epi16(x2, f2);
1394 t2 = _mm_mulhi_epi16(x2, f2);
1395
1396 x2= _mm_unpacklo_epi16(r2,t2);
1397 t2= _mm_unpackhi_epi16(r2,t2);
1398
1399
1400 r3 = _mm_mullo_epi16(x3, f3);
1401 t3 = _mm_mulhi_epi16(x3, f3);
1402
1403 x3= _mm_unpacklo_epi16(r3,t3);
1404 t3= _mm_unpackhi_epi16(r3,t3);
1405
1406
1407 r0= _mm_add_epi32(x0,x1);
1408 r1= _mm_add_epi32(x2,x3);
1409
1410 t0= _mm_add_epi32(t0,t1);
1411 t1= _mm_add_epi32(t2,t3);
1412
1413 r0= _mm_add_epi32(r0,r1);
1414 t0= _mm_add_epi32(t0,t1);
1415
1416 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1417 t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8)
1418
1419 r0= _mm_packs_epi32(r0, t0);
1420 // give results back
1421 _mm_storeu_si128((__m128i *) &dst[x], r0);
1422 }
1423 src += srcstride;
1424 dst += dststride;
1425 }
1426 }else if(!(width & 3)){
1427 r1= _mm_setzero_si128();
1428 for (y = 0; y < height; y++) {
1429 for(x=0;x<width;x+=4){
1430 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1431 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1432 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1433 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1434
1435 /* multiply by correct value : */
1436 r0 = _mm_mullo_epi16(x0, f0);
1437 t0 = _mm_mulhi_epi16(x0, f0);
1438
1439 x0= _mm_unpacklo_epi16(r0,t0);
1440
1441 r1 = _mm_mullo_epi16(x1, f1);
1442 t1 = _mm_mulhi_epi16(x1, f1);
1443
1444 x1= _mm_unpacklo_epi16(r1,t1);
1445
1446
1447 r2 = _mm_mullo_epi16(x2, f2);
1448 t2 = _mm_mulhi_epi16(x2, f2);
1449
1450 x2= _mm_unpacklo_epi16(r2,t2);
1451
1452
1453 r3 = _mm_mullo_epi16(x3, f3);
1454 t3 = _mm_mulhi_epi16(x3, f3);
1455
1456 x3= _mm_unpacklo_epi16(r3,t3);
1457
1458
1459 r0= _mm_add_epi32(x0,x1);
1460 r1= _mm_add_epi32(x2,x3);
1461 r0= _mm_add_epi32(r0,r1);
1462 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1463
1464 r0= _mm_packs_epi32(r0, r0);
1465
1466 // give results back
1467 _mm_storel_epi64((__m128i *) &dst[x], r0);
1468 }
1469 src += srcstride;
1470 dst += dststride;
1471 }
1472 }else{
1473 r1= _mm_setzero_si128();
1474 for (y = 0; y < height; y++) {
1475 for(x=0;x<width;x+=2){
1476 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1477 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1478 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1479 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1480
1481 /* multiply by correct value : */
1482 r0 = _mm_mullo_epi16(x0, f0);
1483 t0 = _mm_mulhi_epi16(x0, f0);
1484
1485 x0= _mm_unpacklo_epi16(r0,t0);
1486
1487 r1 = _mm_mullo_epi16(x1, f1);
1488 t1 = _mm_mulhi_epi16(x1, f1);
1489
1490 x1= _mm_unpacklo_epi16(r1,t1);
1491
1492 r2 = _mm_mullo_epi16(x2, f2);
1493 t2 = _mm_mulhi_epi16(x2, f2);
1494
1495 x2= _mm_unpacklo_epi16(r2,t2);
1496
1497 r3 = _mm_mullo_epi16(x3, f3);
1498 t3 = _mm_mulhi_epi16(x3, f3);
1499
1500 x3= _mm_unpacklo_epi16(r3,t3);
1501
1502 r0= _mm_add_epi32(x0,x1);
1503 r1= _mm_add_epi32(x2,x3);
1504 r0= _mm_add_epi32(r0,r1);
1505 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1506
1507 r0= _mm_packs_epi32(r0, r0);
1508
1509 /* give results back */
1510 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1511
1512 }
1513 src += srcstride;
1514 dst += dststride;
1515 }
1516 }
1517 }
1518 #endif
1519
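/* Combined horizontal+vertical chroma (EPEL) filter: a first pass filters horizontally
   into the 16-bit scratch buffer 'mcbuffer' (including epel_extra rows above/below the
   block), then a second pass filters that buffer vertically with MAX_PB_SIZE stride. */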
1520 void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
1521 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1522 int my, int16_t* mcbuffer) {
1523 int x, y;
1524 uint8_t *src = (uint8_t*) _src;
1525 ptrdiff_t srcstride = _srcstride;
1526 const int8_t *filter_h = epel_filters[mx - 1];
1527 const int8_t *filter_v = epel_filters[my - 1];
1528 __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1529 f2, f3, r1, r2;
1530 int8_t filter_0 = filter_h[0];
1531 int8_t filter_1 = filter_h[1];
1532 int8_t filter_2 = filter_h[2];
1533 int8_t filter_3 = filter_h[3];
1534 int16_t *tmp = mcbuffer;
1535 r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1536 filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1537 filter_0, filter_3, filter_2, filter_1, filter_0);
1538 bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1539
1540 src -= epel_extra_before * srcstride;
1541
1542 f3 = _mm_set1_epi16(filter_v[3]);
1543 f1 = _mm_set1_epi16(filter_v[1]);
1544 f2 = _mm_set1_epi16(filter_v[2]);
1545 f0 = _mm_set1_epi16(filter_v[0]);
1546
1547 /* horizontal treatment */
1548 if(!(width & 7)){
1549 bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1550 4);
1551 for (y = 0; y < height + epel_extra; y++) {
1552 for (x = 0; x < width; x += 8) {
1553
1554 x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1555 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1556 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1557
1558 /* PMADDUBSW then PMADDW */
1559 x2 = _mm_maddubs_epi16(x2, r0);
1560 x3 = _mm_maddubs_epi16(x3, r0);
1561 x2 = _mm_hadd_epi16(x2, x3);
1562 _mm_store_si128((__m128i *) &tmp[x], x2);
1563 }
1564 src += srcstride;
1565 tmp += MAX_PB_SIZE;
1566 }
1567 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1568
1569 /* vertical treatment */
1570
1571 for (y = 0; y < height; y++) {
1572 for (x = 0; x < width; x += 8) {
1573 /* check if memory needs to be reloaded */
1574 x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]);
1575 x1 = _mm_load_si128((__m128i *) &tmp[x]);
1576 x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]);
1577 x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1578
1579 r0 = _mm_mullo_epi16(x0, f0);
1580 r1 = _mm_mulhi_epi16(x0, f0);
1581 r2 = _mm_mullo_epi16(x1, f1);
1582 t0 = _mm_unpacklo_epi16(r0, r1);
1583 x0 = _mm_unpackhi_epi16(r0, r1);
1584 r0 = _mm_mulhi_epi16(x1, f1);
1585 r1 = _mm_mullo_epi16(x2, f2);
1586 t1 = _mm_unpacklo_epi16(r2, r0);
1587 x1 = _mm_unpackhi_epi16(r2, r0);
1588 r2 = _mm_mulhi_epi16(x2, f2);
1589 r0 = _mm_mullo_epi16(x3, f3);
1590 t2 = _mm_unpacklo_epi16(r1, r2);
1591 x2 = _mm_unpackhi_epi16(r1, r2);
1592 r1 = _mm_mulhi_epi16(x3, f3);
1593 t3 = _mm_unpacklo_epi16(r0, r1);
1594 x3 = _mm_unpackhi_epi16(r0, r1);
1595
1596 /* multiply by correct value : */
1597 r0 = _mm_add_epi32(t0, t1);
1598 r1 = _mm_add_epi32(x0, x1);
1599 r0 = _mm_add_epi32(r0, t2);
1600 r1 = _mm_add_epi32(r1, x2);
1601 r0 = _mm_add_epi32(r0, t3);
1602 r1 = _mm_add_epi32(r1, x3);
1603 r0 = _mm_srai_epi32(r0, 6);
1604 r1 = _mm_srai_epi32(r1, 6);
1605
1606 /* give results back */
1607 r0 = _mm_packs_epi32(r0, r1);
1608 _mm_store_si128((__m128i *) &dst[x], r0);
1609 }
1610 tmp += MAX_PB_SIZE;
1611 dst += dststride;
1612 }
1613 }else if(!(width & 3)){
1614 for (y = 0; y < height + epel_extra; y ++) {
1615 for(x=0;x<width;x+=4){
1616 /* load data in register */
1617 x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1618
1619 x1 = _mm_shuffle_epi8(x1, bshuffle1);
1620
1621 /* PMADDUBSW then PMADDW */
1622 x1 = _mm_maddubs_epi16(x1, r0);
1623 x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1624
1625 /* give results back */
1626 _mm_storel_epi64((__m128i *) &tmp[x], x1);
1627
1628 }
1629 src += srcstride;
1630 tmp += MAX_PB_SIZE;
1631 }
1632 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1633
1634 /* vertical treatment */
1635
1636
1637 for (y = 0; y < height; y++) {
1638 for (x = 0; x < width; x += 4) {
1639 /* check if memory needs to be reloaded */
1640 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1641 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1642 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1643 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1644
1645 r0 = _mm_mullo_epi16(x0, f0);
1646 r1 = _mm_mulhi_epi16(x0, f0);
1647 r2 = _mm_mullo_epi16(x1, f1);
1648 t0 = _mm_unpacklo_epi16(r0, r1);
1649
1650 r0 = _mm_mulhi_epi16(x1, f1);
1651 r1 = _mm_mullo_epi16(x2, f2);
1652 t1 = _mm_unpacklo_epi16(r2, r0);
1653
1654 r2 = _mm_mulhi_epi16(x2, f2);
1655 r0 = _mm_mullo_epi16(x3, f3);
1656 t2 = _mm_unpacklo_epi16(r1, r2);
1657
1658 r1 = _mm_mulhi_epi16(x3, f3);
1659 t3 = _mm_unpacklo_epi16(r0, r1);
1660
1661
1662 /* multiply by correct value : */
1663 r0 = _mm_add_epi32(t0, t1);
1664 r0 = _mm_add_epi32(r0, t2);
1665 r0 = _mm_add_epi32(r0, t3);
1666 r0 = _mm_srai_epi32(r0, 6);
1667
1668 /* give results back */
1669 r0 = _mm_packs_epi32(r0, r0);
1670 _mm_storel_epi64((__m128i *) &dst[x], r0);
1671 }
1672 tmp += MAX_PB_SIZE;
1673 dst += dststride;
1674 }
1675 }else{
1676 #if MASKMOVE
1677 bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1678 #endif
1679 for (y = 0; y < height + epel_extra; y ++) {
1680 for(x=0;x<width;x+=2){
1681 /* load data in register */
1682 x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1683 x1 = _mm_shuffle_epi8(x1, bshuffle1);
1684
1685 /* PMADDUBSW then PMADDW */
1686 x1 = _mm_maddubs_epi16(x1, r0);
1687 x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1688
1689 /* give results back */
1690 #if MASKMOVE
1691 _mm_maskmoveu_si128(x1,bshuffle2,(char *) (tmp+x));
1692 #else
1693 *((uint32_t*)(tmp+x)) = _mm_cvtsi128_si32(x1);
1694 #endif
1695 }
1696 src += srcstride;
1697 tmp += MAX_PB_SIZE;
1698 }
1699
1700 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1701
1702 /* vertical treatment */
1703
1704 for (y = 0; y < height; y++) {
1705 for (x = 0; x < width; x += 2) {
1706 /* check if memory needs to be reloaded */
1707 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1708 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1709 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1710 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1711
1712 r0 = _mm_mullo_epi16(x0, f0);
1713 r1 = _mm_mulhi_epi16(x0, f0);
1714 r2 = _mm_mullo_epi16(x1, f1);
1715 t0 = _mm_unpacklo_epi16(r0, r1);
1716 r0 = _mm_mulhi_epi16(x1, f1);
1717 r1 = _mm_mullo_epi16(x2, f2);
1718 t1 = _mm_unpacklo_epi16(r2, r0);
1719 r2 = _mm_mulhi_epi16(x2, f2);
1720 r0 = _mm_mullo_epi16(x3, f3);
1721 t2 = _mm_unpacklo_epi16(r1, r2);
1722 r1 = _mm_mulhi_epi16(x3, f3);
1723 t3 = _mm_unpacklo_epi16(r0, r1);
1724
1725 /* multiply by correct value : */
1726 r0 = _mm_add_epi32(t0, t1);
1727 r0 = _mm_add_epi32(r0, t2);
1728 r0 = _mm_add_epi32(r0, t3);
1729 r0 = _mm_srai_epi32(r0, 6);
1730 /* give results back */
1731 r0 = _mm_packs_epi32(r0, r0);
1732 #if MASKMOVE
1733 _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1734 #else
1735 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1736 #endif
1737 }
1738 tmp += MAX_PB_SIZE;
1739 dst += dststride;
1740 }
1741 }
1742
1743 }
1744
1745
1746 #ifndef __native_client__
1747 void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
1748 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1749 int my, int16_t* mcbuffer) {
1750 int x, y;
1751 uint16_t *src = (uint16_t*) _src;
1752 ptrdiff_t srcstride = _srcstride>>1;
1753 const int8_t *filter_h = epel_filters[mx - 1];
1754 const int8_t *filter_v = epel_filters[my - 1];
1755 __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1756 f2, f3, r1, r2, r3;
1757 int8_t filter_0 = filter_h[0];
1758 int8_t filter_1 = filter_h[1];
1759 int8_t filter_2 = filter_h[2];
1760 int8_t filter_3 = filter_h[3];
1761 int16_t *tmp = mcbuffer;
1762
1763 r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1764 filter_0, filter_3, filter_2, filter_1, filter_0);
1765 bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1766
1767 src -= epel_extra_before * srcstride;
1768
1769 f0 = _mm_set1_epi16(filter_v[0]);
1770 f1 = _mm_set1_epi16(filter_v[1]);
1771 f2 = _mm_set1_epi16(filter_v[2]);
1772 f3 = _mm_set1_epi16(filter_v[3]);
1773
1774
1775 /* horizontal treatment */
1776 if(!(width & 3)){
1777 bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1778 for (y = 0; y < height + epel_extra; y ++) {
1779 for(x=0;x<width;x+=4){
1780
1781 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1782 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1783 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1784
1785
1786 x2 = _mm_madd_epi16(x2, r0);
1787 x3 = _mm_madd_epi16(x3, r0);
1788 x2 = _mm_hadd_epi32(x2, x3);
1789 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1790
1791 x2 = _mm_packs_epi32(x2,r0);
1792 //give results back
1793 _mm_storel_epi64((__m128i *) &tmp[x], x2);
1794
1795 }
1796 src += srcstride;
1797 tmp += MAX_PB_SIZE;
1798 }
1799 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1800
1801 // vertical treatment
1802
1803
1804 for (y = 0; y < height; y++) {
1805 for (x = 0; x < width; x += 4) {
1806 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1807 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1808 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1809 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1810
1811 r0 = _mm_mullo_epi16(x0, f0);
1812 r1 = _mm_mulhi_epi16(x0, f0);
1813 r2 = _mm_mullo_epi16(x1, f1);
1814 t0 = _mm_unpacklo_epi16(r0, r1);
1815
1816 r0 = _mm_mulhi_epi16(x1, f1);
1817 r1 = _mm_mullo_epi16(x2, f2);
1818 t1 = _mm_unpacklo_epi16(r2, r0);
1819
1820 r2 = _mm_mulhi_epi16(x2, f2);
1821 r0 = _mm_mullo_epi16(x3, f3);
1822 t2 = _mm_unpacklo_epi16(r1, r2);
1823
1824 r1 = _mm_mulhi_epi16(x3, f3);
1825 t3 = _mm_unpacklo_epi16(r0, r1);
1826
1827
1828
1829 r0 = _mm_add_epi32(t0, t1);
1830 r0 = _mm_add_epi32(r0, t2);
1831 r0 = _mm_add_epi32(r0, t3);
1832 r0 = _mm_srai_epi32(r0, 6);
1833
1834 // give results back
1835 r0 = _mm_packs_epi32(r0, r0);
1836 _mm_storel_epi64((__m128i *) &dst[x], r0);
1837 }
1838 tmp += MAX_PB_SIZE;
1839 dst += dststride;
1840 }
1841 }else{
1842 bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1843 r1= _mm_setzero_si128();
1844 for (y = 0; y < height + epel_extra; y ++) {
1845 for(x=0;x<width;x+=2){
1846 /* load data in register */
1847 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1848 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1849
1850 /* PMADDUBSW then PMADDW */
1851 x2 = _mm_madd_epi16(x2, r0);
1852 x2 = _mm_hadd_epi32(x2, r1);
1853 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1854 x2 = _mm_packs_epi32(x2, r1);
1855 /* give results back */
1856 _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x));
1857 }
1858 src += srcstride;
1859 tmp += MAX_PB_SIZE;
1860 }
1861
1862 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1863
1864 /* vertical treatment */
1865
1866 for (y = 0; y < height; y++) {
1867 for (x = 0; x < width; x += 2) {
1868 /* check if memory needs to be reloaded */
1869 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1870 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1871 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1872 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1873
1874 r0 = _mm_mullo_epi16(x0, f0);
1875 t0 = _mm_mulhi_epi16(x0, f0);
1876
1877 x0= _mm_unpacklo_epi16(r0,t0);
1878
1879 r1 = _mm_mullo_epi16(x1, f1);
1880 t1 = _mm_mulhi_epi16(x1, f1);
1881
1882 x1= _mm_unpacklo_epi16(r1,t1);
1883
1884 r2 = _mm_mullo_epi16(x2, f2);
1885 t2 = _mm_mulhi_epi16(x2, f2);
1886
1887 x2= _mm_unpacklo_epi16(r2,t2);
1888
1889 r3 = _mm_mullo_epi16(x3, f3);
1890 t3 = _mm_mulhi_epi16(x3, f3);
1891
1892 x3= _mm_unpacklo_epi16(r3,t3);
1893
1894 r0= _mm_add_epi32(x0,x1);
1895 r1= _mm_add_epi32(x2,x3);
1896 r0= _mm_add_epi32(r0,r1);
1897 r0 = _mm_srai_epi32(r0, 6);
1898 /* give results back */
1899 r0 = _mm_packs_epi32(r0, r0);
1900 _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1901 }
1902 tmp += MAX_PB_SIZE;
1903 dst += dststride;
1904 }
1905 }
1906 }
1907 #endif
1908
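/* Full-pel position: no filtering is needed; the 8-bit samples are only widened and
   shifted left by 6 into the 14-bit intermediate format consumed by the prediction
   (weighting) stage. */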
1909 void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
1910 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
1911 int16_t* mcbuffer) {
1912 int x, y;
1913 __m128i x1, x2, x3, x0, x4; /* x4 is only used in the MASKMOVE store path below */
1914 uint8_t *src = (uint8_t*) _src;
1915 ptrdiff_t srcstride = _srcstride;
1916 x0= _mm_setzero_si128();
1917 if(!(width & 15)){
1918 for (y = 0; y < height; y++) {
1919 for (x = 0; x < width; x += 16) {
1920
1921 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1922 x2 = _mm_unpacklo_epi8(x1, x0);
1923
1924 x3 = _mm_unpackhi_epi8(x1, x0);
1925
1926 x2 = _mm_slli_epi16(x2, 6);
1927 x3 = _mm_slli_epi16(x3, 6);
1928 _mm_storeu_si128((__m128i *) &dst[x], x2);
1929 _mm_storeu_si128((__m128i *) &dst[x + 8], x3);
1930
1931 }
1932 src += srcstride;
1933 dst += dststride;
1934 }
1935 }else if(!(width & 7)){
1936 for (y = 0; y < height; y++) {
1937 for (x = 0; x < width; x += 8) {
1938
1939 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1940 x2 = _mm_unpacklo_epi8(x1, x0);
1941 x2 = _mm_slli_epi16(x2, 6);
1942 _mm_storeu_si128((__m128i *) &dst[x], x2);
1943
1944 }
1945 src += srcstride;
1946 dst += dststride;
1947 }
1948 }else if(!(width & 3)){
1949 for (y = 0; y < height; y++) {
1950 for(x=0;x<width;x+=4){
1951 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1952 x2 = _mm_unpacklo_epi8(x1, x0);
1953 x2 = _mm_slli_epi16(x2, 6);
1954 _mm_storel_epi64((__m128i *) &dst[x], x2);
1955 }
1956 src += srcstride;
1957 dst += dststride;
1958 }
1959 }else{
1960 #if MASKMOVE
1961 x4= _mm_set_epi32(0,0,0,-1); //mask to store
1962 #endif
1963 for (y = 0; y < height; y++) {
1964 for(x=0;x<width;x+=2){
1965 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1966 x2 = _mm_unpacklo_epi8(x1, x0);
1967 x2 = _mm_slli_epi16(x2, 6);
1968 #if MASKMOVE
1969 _mm_maskmoveu_si128(x2,x4,(char *) (dst+x));
1970 #else
1971 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2); /* store both 16-bit results, matching the masked-store path */
1972 #endif
1973 }
1974 src += srcstride;
1975 dst += dststride;
1976 }
1977 }
1978
1979
1980 }
1981
1982 #ifndef __native_client__
1983 void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
1984 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
1985 int16_t* mcbuffer) {
1986 int x, y;
1987 __m128i x1, x2, x4;
1988 uint16_t *src = (uint16_t*) _src;
1989 ptrdiff_t srcstride = _srcstride>>1;
1990 if(!(width & 7)){
1991 for (y = 0; y < height; y++) {
1992 for (x = 0; x < width; x += 8) {
1993
1994 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1995 x2 = _mm_slli_epi16(x1, 4); //14-BIT DEPTH
1996 _mm_storeu_si128((__m128i *) &dst[x], x2);
1997
1998 }
1999 src += srcstride;
2000 dst += dststride;
2001 }
2002 }else if(!(width & 3)){
2003 for (y = 0; y < height; y++) {
2004 for(x=0;x<width;x+=4){
2005 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
2006 x2 = _mm_slli_epi16(x1, 4);//14-BIT DEPTH
2007 _mm_storel_epi64((__m128i *) &dst[x], x2);
2008 }
2009 src += srcstride;
2010 dst += dststride;
2011 }
2012 }else{
2013 x4= _mm_set_epi32(0,0,0,-1); //mask to store
2014 for (y = 0; y < height; y++) {
2015 for(x=0;x<width;x+=2){
2016 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
2017 x2 = _mm_slli_epi16(x1, 4);//14-BIT DEPTH
2018 _mm_maskmoveu_si128(x2,x4,(char *) (dst+x));
2019 }
2020 src += srcstride;
2021 dst += dststride;
2022 }
2023 }
2024
2025
2026 }
2027 #endif
2028
2029
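/* Horizontal luma (QPEL) filter, fractional phase 1: the byte coefficients in r0 are the
   HEVC 1/4-pel filter {-1, 4, -10, 58, 17, -5, 1, 0}, replicated twice so that
   _mm_maddubs_epi16 plus horizontal adds can evaluate several output pixels per iteration. */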
2030 void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
2031 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2032 int16_t* mcbuffer) {
2033 int x, y;
2034 uint8_t *src = _src;
2035 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2036 __m128i x1, r0, x2, x3, x4, x5;
2037
2038 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
2039 -1);
2040
2041 if(!(width & 7)){
2042 for (y = 0; y < height; y++) {
2043 for (x = 0; x < width; x += 8) {
2044 /* load data in register */
2045 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2046 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2047 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2048 _mm_srli_si128(x1, 3));
2049 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2050 _mm_srli_si128(x1, 5));
2051 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2052 _mm_srli_si128(x1, 7));
2053
2054 /* PMADDUBSW then PMADDW */
2055 x2 = _mm_maddubs_epi16(x2, r0);
2056 x3 = _mm_maddubs_epi16(x3, r0);
2057 x4 = _mm_maddubs_epi16(x4, r0);
2058 x5 = _mm_maddubs_epi16(x5, r0);
2059 x2 = _mm_hadd_epi16(x2, x3);
2060 x4 = _mm_hadd_epi16(x4, x5);
2061 x2 = _mm_hadd_epi16(x2, x4);
2062 /* give results back */
2063 _mm_store_si128((__m128i *) &dst[x],x2);
2064
2065 }
2066 src += srcstride;
2067 dst += dststride;
2068 }
2069 }else if(!(width &3)){
2070
2071 for (y = 0; y < height; y ++) {
2072 for(x=0;x<width;x+=4){
2073 /* load data in register */
2074 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2075 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2076 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2077 _mm_srli_si128(x1, 3));
2078
2079 /* PMADDUBSW then PMADDW */
2080 x2 = _mm_maddubs_epi16(x2, r0);
2081 x3 = _mm_maddubs_epi16(x3, r0);
2082 x2 = _mm_hadd_epi16(x2, x3);
2083 x2 = _mm_hadd_epi16(x2, x2);
2084
2085 /* give results back */
2086 _mm_storel_epi64((__m128i *) &dst[x], x2);
2087 }
2088
2089 src += srcstride;
2090 dst += dststride;
2091 }
2092 }else{
2093 x5= _mm_setzero_si128();
2094 #if MASKMOVE
2095 x3= _mm_set_epi32(0,0,0,-1);
2096 #endif
2097 for (y = 0; y < height; y ++) {
2098 for(x=0;x<width;x+=4){
2099 /* load data in register */
2100 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2101 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2102
2103
2104
2105 /* PMADDUBSW then PMADDW */
2106 x2 = _mm_maddubs_epi16(x2, r0);
2107 x2 = _mm_hadd_epi16(x2,x5 );
2108 x2 = _mm_hadd_epi16(x2,x5 );
2109
2110 /* give results back */
2111 //_mm_storel_epi64((__m128i *) &dst[x], x2);
2112 #if MASKMOVE
2113 _mm_maskmoveu_si128(x2,x3,(char *) (dst+x));
2114 #else
2115 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2); /* both results, as in the MASKMOVE path */
2116 #endif
2117 }
2118
2119 src += srcstride;
2120 dst += dststride;
2121 }
2122 }
2123
2124 }
2125 #ifndef __native_client__
2126 /*
2127 * @TODO: profile (valgrind) whether this SSE version is worthwhile, or wait for an AVX2 implementation
2128 */
2129 void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
2130 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2131 int16_t* mcbuffer) {
2132 int x, y;
2133 uint16_t *src = (uint16_t*)_src;
2134 ptrdiff_t srcstride = _srcstride>>1;
2135 __m128i x0, x1, x2, x3, r0;
2136
2137 r0 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
2138 x0= _mm_setzero_si128();
2139 x3= _mm_set_epi32(0,0,0,-1);
2140 for (y = 0; y < height; y ++) {
2141 for(x=0;x<width;x+=2){
2142 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2143 x2 = _mm_srli_si128(x1,2); // the filter's last tap is 0, so one 128-bit load provides the taps for two outputs
2144
2145 x1 = _mm_madd_epi16(x1,r0);
2146 x2 = _mm_madd_epi16(x2,r0);
2147
2148 x1 = _mm_hadd_epi32(x1,x2);
2149 x1 = _mm_hadd_epi32(x1,x0);
2150 x1= _mm_srai_epi32(x1,2); //>>BIT_DEPTH-8
2151 x1= _mm_packs_epi32(x1,x0);
2152 // dst[x]= _mm_extract_epi16(x1,0);
2153 _mm_maskmoveu_si128(x1,x3,(char *) (dst+x));
2154 }
2155 src += srcstride;
2156 dst += dststride;
2157 }
2158
2159 }
2160 #endif
2161
2162
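/* Horizontal luma half-pel filter; the coefficients in r0 are the HEVC 2/4-pel filter
   {-1, 4, -11, 40, 40, -11, 4, -1}. */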
2163 void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
2164 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2165 int16_t* mcbuffer) {
2166 int x, y;
2167 uint8_t *src = _src;
2168 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2169 __m128i x1, r0, x2, x3, x4, x5;
2170
2171 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
2172 4, -1);
2173
2174 /* LOAD src from memory to registers to limit memory bandwidth */
2175 if(!(width & 7)){
2176 for (y = 0; y < height; y++) {
2177 for (x = 0; x < width; x += 8) {
2178 /* load data in register */
2179 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2180 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2181 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2182 _mm_srli_si128(x1, 3));
2183 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2184 _mm_srli_si128(x1, 5));
2185 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2186 _mm_srli_si128(x1, 7));
2187
2188 /* PMADDUBSW then PMADDW */
2189 x2 = _mm_maddubs_epi16(x2, r0);
2190 x3 = _mm_maddubs_epi16(x3, r0);
2191 x4 = _mm_maddubs_epi16(x4, r0);
2192 x5 = _mm_maddubs_epi16(x5, r0);
2193 x2 = _mm_hadd_epi16(x2, x3);
2194 x4 = _mm_hadd_epi16(x4, x5);
2195 x2 = _mm_hadd_epi16(x2, x4);
2196 /* give results back */
2197 _mm_store_si128((__m128i *) &dst[x],x2);
2198 }
2199 src += srcstride;
2200 dst += dststride;
2201 }
2202
2203 }else{
2204
2205 for (y = 0; y < height; y ++) {
2206 for(x=0;x<width;x+=4){
2207 /* load data in register */
2208 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2209
2210 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2211 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2212 _mm_srli_si128(x1, 3));
2213
2214
2215 /* PMADDUBSW then PMADDW */
2216 x2 = _mm_maddubs_epi16(x2, r0);
2217 x3 = _mm_maddubs_epi16(x3, r0);
2218 x2 = _mm_hadd_epi16(x2, x3);
2219 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2220
2221 /* give results back */
2222 _mm_storel_epi64((__m128i *) &dst[x], x2);
2223
2224 }
2225 src += srcstride;
2226 dst += dststride;
2227 }
2228 }
2229
2230 }
2231
2232 #if 0
2233 static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
2234 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2235 int16_t* mcbuffer) {
2236 int x, y;
2237 uint8_t *src = _src;
2238 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2239 __m128i x1, r0, x2, x3, x4, x5;
2240
2241 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
2242 4, -1);
2243
2244 /* LOAD src from memory to registers to limit memory bandwidth */
2245 if(!(width & 7)){
2246 for (y = 0; y < height; y++) {
2247 for (x = 0; x < width; x += 8) {
2248 /* load data in register */
2249 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2250 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2251 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2252 _mm_srli_si128(x1, 3));
2253 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2254 _mm_srli_si128(x1, 5));
2255 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2256 _mm_srli_si128(x1, 7));
2257
2258 /* PMADDUBSW then PMADDW */
2259 x2 = _mm_maddubs_epi16(x2, r0);
2260 x3 = _mm_maddubs_epi16(x3, r0);
2261 x4 = _mm_maddubs_epi16(x4, r0);
2262 x5 = _mm_maddubs_epi16(x5, r0);
2263 x2 = _mm_hadd_epi16(x2, x3);
2264 x4 = _mm_hadd_epi16(x4, x5);
2265 x2 = _mm_hadd_epi16(x2, x4);
2266 /* give results back */
2267 _mm_store_si128((__m128i *) &dst[x],x2);
2268 }
2269 src += srcstride;
2270 dst += dststride;
2271 }
2272
2273 }else{
2274
2275 for (y = 0; y < height; y ++) {
2276 for(x=0;x<width;x+=4){
2277 /* load data in register */
2278 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2279
2280 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2281 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2282 _mm_srli_si128(x1, 3));
2283
2284
2285 /* PMADDUBSW then PMADDW */
2286 x2 = _mm_maddubs_epi16(x2, r0);
2287 x3 = _mm_maddubs_epi16(x3, r0);
2288 x2 = _mm_hadd_epi16(x2, x3);
2289 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2290
2291 /* give results back */
2292 _mm_storel_epi64((__m128i *) &dst[x], x2);
2293
2294 }
2295 src += srcstride;
2296 dst += dststride;
2297 }
2298 }
2299
2300 }
2301 static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
2302 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2303 int16_t* mcbuffer) {
2304 int x, y;
2305 uint8_t *src = _src;
2306 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2307 __m128i x1, r0, x2, x3, x4, x5;
2308
2309 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2310 0);
2311
2312 if(!(width & 7)){
2313 for (y = 0; y < height; y++) {
2314 for (x = 0; x < width; x += 8) {
2315 /* load data in register */
2316 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2317 x1 = _mm_slli_si128(x1, 1);
2318 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2319 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2320 _mm_srli_si128(x1, 3));
2321 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2322 _mm_srli_si128(x1, 5));
2323 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2324 _mm_srli_si128(x1, 7));
2325
2326 /* PMADDUBSW then PMADDW */
2327 x2 = _mm_maddubs_epi16(x2, r0);
2328 x3 = _mm_maddubs_epi16(x3, r0);
2329 x4 = _mm_maddubs_epi16(x4, r0);
2330 x5 = _mm_maddubs_epi16(x5, r0);
2331 x2 = _mm_hadd_epi16(x2, x3);
2332 x4 = _mm_hadd_epi16(x4, x5);
2333 x2 = _mm_hadd_epi16(x2, x4);
2334 /* give results back */
2335 _mm_store_si128((__m128i *) &dst[x],
2336 _mm_srli_si128(x2, BIT_DEPTH - 8));
2337 }
2338 src += srcstride;
2339 dst += dststride;
2340 }
2341 }else{
2342 for (y = 0; y < height; y ++) {
2343 for(x=0;x<width;x+=4){
2344 /* load data in register */
2345 x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2346 x1 = _mm_slli_si128(x1, 1);
2347 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2348 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2349 _mm_srli_si128(x1, 3));
2350
2351 /* PMADDUBSW then PMADDW */
2352 x2 = _mm_maddubs_epi16(x2, r0);
2353 x3 = _mm_maddubs_epi16(x3, r0);
2354 x2 = _mm_hadd_epi16(x2, x3);
2355 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2356 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
2357 /* give results back */
2358 _mm_storel_epi64((__m128i *) &dst[x], x2);
2359
2360 }
2361 src += srcstride;
2362 dst += dststride;
2363 }
2364 }
2365 }
2366 #endif
2367
2368 void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
2369 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2370 int16_t* mcbuffer) {
2371 int x, y;
2372 uint8_t *src = _src;
2373 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2374 __m128i x1, r0, x2, x3, x4, x5;
2375
2376 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2377 0);
2378
2379 if(!(width & 7)){
2380 for (y = 0; y < height; y++) {
2381 for (x = 0; x < width; x += 8) {
2382 /* load data in register */
2383 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2384 x1 = _mm_slli_si128(x1, 1);
2385 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2386 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2387 _mm_srli_si128(x1, 3));
2388 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2389 _mm_srli_si128(x1, 5));
2390 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2391 _mm_srli_si128(x1, 7));
2392
2393 /* PMADDUBSW then PMADDW */
2394 x2 = _mm_maddubs_epi16(x2, r0);
2395 x3 = _mm_maddubs_epi16(x3, r0);
2396 x4 = _mm_maddubs_epi16(x4, r0);
2397 x5 = _mm_maddubs_epi16(x5, r0);
2398 x2 = _mm_hadd_epi16(x2, x3);
2399 x4 = _mm_hadd_epi16(x4, x5);
2400 x2 = _mm_hadd_epi16(x2, x4);
2401 /* give results back */
2402 _mm_store_si128((__m128i *) &dst[x],x2);
2403 }
2404 src += srcstride;
2405 dst += dststride;
2406 }
2407 }else{
2408 for (y = 0; y < height; y ++) {
2409 for(x=0;x<width;x+=4){
2410 /* load data in register */
2411 x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2412 x1 = _mm_slli_si128(x1, 1);
2413 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2414 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2415 _mm_srli_si128(x1, 3));
2416
2417 /* PMADDUBSW then PMADDW */
2418 x2 = _mm_maddubs_epi16(x2, r0);
2419 x3 = _mm_maddubs_epi16(x3, r0);
2420 x2 = _mm_hadd_epi16(x2, x3);
2421 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2422 /* give results back */
2423 _mm_storel_epi64((__m128i *) &dst[x], x2);
2424
2425 }
2426 src += srcstride;
2427 dst += dststride;
2428 }
2429 }
2430 }
2431 /**
2432 For the vertical (column) MC filtering, 8 pixels are computed at a time by
2433 multiplying each source row by its filter tap and accumulating the products;
2434 a scalar reference sketch follows the function below.
2435 */
2436 void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
2437 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2438 int16_t* mcbuffer) {
2439 int x, y;
2440 uint8_t *src = (uint8_t*) _src;
2441 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2442 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2443 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2444 r1 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
2445
2446 if(!(width & 15)){
2447 x8 = _mm_setzero_si128();
2448 for (y = 0; y < height; y++) {
2449 for (x = 0; x < width; x += 16) {
2450 /* check if memory needs to be reloaded */
2451 x1 = _mm_loadu_si128((__m128i *) &src[x - 3 * srcstride]);
2452 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2453 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2454 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2455 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2456 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2457 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2458
2459 t1 = _mm_unpacklo_epi8(x1,x8);
2460 t2 = _mm_unpacklo_epi8(x2, x8);
2461 t3 = _mm_unpacklo_epi8(x3, x8);
2462 t4 = _mm_unpacklo_epi8(x4, x8);
2463 t5 = _mm_unpacklo_epi8(x5, x8);
2464 t6 = _mm_unpacklo_epi8(x6, x8);
2465 t7 = _mm_unpacklo_epi8(x7, x8);
2466
2467 x1 = _mm_unpackhi_epi8(x1,x8);
2468 x2 = _mm_unpackhi_epi8(x2, x8);
2469 x3 = _mm_unpackhi_epi8(x3, x8);
2470 x4 = _mm_unpackhi_epi8(x4, x8);
2471 x5 = _mm_unpackhi_epi8(x5, x8);
2472 x6 = _mm_unpackhi_epi8(x6, x8);
2473 x7 = _mm_unpackhi_epi8(x7, x8);
2474
2475 /* multiply by correct value : */
2476 r0 = _mm_mullo_epi16(t1,
2477 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2478 r2 = _mm_mullo_epi16(x1,
2479 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2480 r0 = _mm_adds_epi16(r0,
2481 _mm_mullo_epi16(t2,
2482 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2483 r2 = _mm_adds_epi16(r2,
2484 _mm_mullo_epi16(x2,
2485 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2486 r0 = _mm_adds_epi16(r0,
2487 _mm_mullo_epi16(t3,
2488 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2489 r2 = _mm_adds_epi16(r2,
2490 _mm_mullo_epi16(x3,
2491 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2492
2493 r0 = _mm_adds_epi16(r0,
2494 _mm_mullo_epi16(t4,
2495 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2496 r2 = _mm_adds_epi16(r2,
2497 _mm_mullo_epi16(x4,
2498 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2499
2500 r0 = _mm_adds_epi16(r0,
2501 _mm_mullo_epi16(t5,
2502 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2503 r2 = _mm_adds_epi16(r2,
2504 _mm_mullo_epi16(x5,
2505 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2506
2507 r0 = _mm_adds_epi16(r0,
2508 _mm_mullo_epi16(t6,
2509 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2510 r2 = _mm_adds_epi16(r2,
2511 _mm_mullo_epi16(x6,
2512 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2513
2514 r0 = _mm_adds_epi16(r0,
2515 _mm_mullo_epi16(t7,
2516 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2517 r2 = _mm_adds_epi16(r2,
2518 _mm_mullo_epi16(x7,
2519 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2520
2521
2522 /* give results back */
2523 _mm_store_si128((__m128i *) &dst[x],r0);
2524 _mm_store_si128((__m128i *) &dst[x + 8],r2);
2525 }
2526 src += srcstride;
2527 dst += dststride;
2528 }
2529
2530 }else{
2531 x = 0;
2532 x8 = _mm_setzero_si128();
2533 t8 = _mm_setzero_si128();
2534 for (y = 0; y < height; y ++) {
2535 for(x=0;x<width;x+=4){
2536 /* load data in register */
2537 x1 = _mm_loadl_epi64((__m128i *) &src[x-(3 * srcstride)]);
2538 x2 = _mm_loadl_epi64((__m128i *) &src[x-(2 * srcstride)]);
2539 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2540 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2541 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2542 x6 = _mm_loadl_epi64((__m128i *) &src[x+(2 * srcstride)]);
2543 x7 = _mm_loadl_epi64((__m128i *) &src[x+(3 * srcstride)]);
2544
2545
2546
2547 x1 = _mm_unpacklo_epi8(x1, t8);
2548 x2 = _mm_unpacklo_epi8(x2, t8);
2549 x3 = _mm_unpacklo_epi8(x3, t8);
2550 x4 = _mm_unpacklo_epi8(x4, t8);
2551 x5 = _mm_unpacklo_epi8(x5, t8);
2552 x6 = _mm_unpacklo_epi8(x6, t8);
2553 x7 = _mm_unpacklo_epi8(x7, t8);
2554
2555
2556 r0 = _mm_mullo_epi16(x1, _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2557
2558 r0 = _mm_adds_epi16(r0,
2559 _mm_mullo_epi16(x2,
2560 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2561
2562
2563 r0 = _mm_adds_epi16(r0,
2564 _mm_mullo_epi16(x3,
2565 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2566
2567 r0 = _mm_adds_epi16(r0,
2568 _mm_mullo_epi16(x4,
2569 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2570
2571 r0 = _mm_adds_epi16(r0,
2572 _mm_mullo_epi16(x5,
2573 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2574
2575
2576 r0 = _mm_adds_epi16(r0,
2577 _mm_mullo_epi16(x6,
2578 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2579
2580
2581 r0 = _mm_adds_epi16(r0,
2582 _mm_mullo_epi16(x7,
2583 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2584
2585 /* give results back */
2586 _mm_storel_epi64((__m128i *) &dst[x], r0);
2587 }
2588 src += srcstride;
2589 dst += dststride;
2590 }
2591 }
2592 }
2593
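For reference, a minimal scalar sketch of what the vectorized vertical filter above computes; the function name and the explicit tap table are illustrative only and do not appear in the source:

/* Scalar reference for the phase-1 vertical luma filter (illustration only).
 * Like the SSE path, it writes the unshifted 16-bit intermediate values. */
static void qpel_v_phase1_scalar_ref(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *src, ptrdiff_t srcstride,
                                     int width, int height)
{
    static const int taps[7] = { -1, 4, -10, 58, 17, -5, 1 }; /* 8th tap of the 1/4-pel filter is 0 */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int k = 0; k < 7; k++)
                sum += taps[k] * src[x + (k - 3) * srcstride]; /* rows src-3*stride .. src+3*stride */
            dst[x] = (int16_t)sum; /* fits in int16_t for 8-bit input, so no shift is applied */
        }
        src += srcstride;
        dst += dststride;
    }
}

The SSE version computes the same sums for 16 (or 4) pixels at once by widening the bytes to 16-bit lanes and accumulating with saturating adds.
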
2594 #if 0
2595 void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
2596 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2597 int16_t* mcbuffer) {
2598 int x, y;
2599 uint16_t *src = (uint16_t*) _src;
2600 ptrdiff_t srcstride = _srcstride >> 1;
2601 __m128i x1, x2, x3, x4, x5, x6, x7, r1;
2602 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2603
2604 t7= _mm_set1_epi32(1);
2605 t6= _mm_set1_epi32(-5);
2606 t5= _mm_set1_epi32(17);
2607 t4= _mm_set1_epi32(58);
2608 t3= _mm_set1_epi32(-10);
2609 t2= _mm_set1_epi32(4);
2610 t1= _mm_set1_epi32(-1);
2611 t8= _mm_setzero_si128();
2612
2613 for (y = 0; y < height; y ++) {
2614 for(x=0;x<width;x+=4){
2615 /* load data in register */
2616 x1 = _mm_loadl_epi64((__m128i *) &src[x-(3 * srcstride)]);
2617 x2 = _mm_loadl_epi64((__m128i *) &src[x-(2 * srcstride)]);
2618 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2619 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2620 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2621 x6 = _mm_loadl_epi64((__m128i *) &src[x+(2 * srcstride)]);
2622 x7 = _mm_loadl_epi64((__m128i *) &src[x+(3 * srcstride)]);
2623
2624
2625 x1 = _mm_unpacklo_epi16(x1, t8);
2626 x2 = _mm_unpacklo_epi16(x2, t8);
2627 x3 = _mm_unpacklo_epi16(x3, t8);
2628 x4 = _mm_unpacklo_epi16(x4, t8);
2629 x5 = _mm_unpacklo_epi16(x5, t8);
2630 x6 = _mm_unpacklo_epi16(x6, t8);
2631 x7 = _mm_unpacklo_epi16(x7, t8);
2632
2633
2634 r1 = _mm_mullo_epi32(x1,t1);
2635
2636 r1 = _mm_add_epi32(r1,
2637 _mm_mullo_epi32(x2,t2));
2638
2639
2640 r1 = _mm_add_epi32(r1,
2641 _mm_mullo_epi32(x3,t3));
2642
2643 r1 = _mm_add_epi32(r1,
2644 _mm_mullo_epi32(x4,t4));
2645
2646 r1 = _mm_add_epi32(r1,
2647 _mm_mullo_epi32(x5,t5));
2648
2649
2650 r1 = _mm_add_epi32(r1,
2651 _mm_mullo_epi32(x6,t6));
2652
2653
2654 r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x7,t7));
2655 r1 = _mm_srai_epi32(r1,2); //bit depth - 8
2656
2657
2658 r1 = _mm_packs_epi32(r1,t8);
2659
2660 // give results back
2661 _mm_storel_epi64((__m128i *) (dst + x), r1);
2662 }
2663 src += srcstride;
2664 dst += dststride;
2665 }
2666
2667 }
2668 #endif
2669
2670
2671
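/* Vertical luma half-pel filter: each of the eight taps {-1, 4, -11, 40, 40, -11, 4, -1}
   is broadcast with _mm_set1_epi16(_mm_extract_epi16(r1, k)) and applied to one source row. */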
2672 void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
2673 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2674 int16_t* mcbuffer) {
2675 int x, y;
2676 uint8_t *src = (uint8_t*) _src;
2677 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2678 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2679 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2680 r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
2681
2682 if(!(width & 15)){
2683 for (y = 0; y < height; y++) {
2684 for (x = 0; x < width; x += 16) {
2685 r0 = _mm_setzero_si128();
2686 /* check if memory needs to be reloaded */
2687 x1 = _mm_loadu_si128((__m128i *) &src[x - 3 * srcstride]);
2688 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2689 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2690 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2691 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2692 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2693 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2694 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2695
2696 t1 = _mm_unpacklo_epi8(x1, r0);
2697 t2 = _mm_unpacklo_epi8(x2, r0);
2698 t3 = _mm_unpacklo_epi8(x3, r0);
2699 t4 = _mm_unpacklo_epi8(x4, r0);
2700 t5 = _mm_unpacklo_epi8(x5, r0);
2701 t6 = _mm_unpacklo_epi8(x6, r0);
2702 t7 = _mm_unpacklo_epi8(x7, r0);
2703 t8 = _mm_unpacklo_epi8(x8, r0);
2704
2705 x1 = _mm_unpackhi_epi8(x1, r0);
2706 x2 = _mm_unpackhi_epi8(x2, r0);
2707 x3 = _mm_unpackhi_epi8(x3, r0);
2708 x4 = _mm_unpackhi_epi8(x4, r0);
2709 x5 = _mm_unpackhi_epi8(x5, r0);
2710 x6 = _mm_unpackhi_epi8(x6, r0);
2711 x7 = _mm_unpackhi_epi8(x7, r0);
2712 x8 = _mm_unpackhi_epi8(x8, r0);
2713
2714 /* multiply by correct value : */
2715 r0 = _mm_mullo_epi16(t1,
2716 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2717 r2 = _mm_mullo_epi16(x1,
2718 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2719 r0 = _mm_adds_epi16(r0,
2720 _mm_mullo_epi16(t2,
2721 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2722 r2 = _mm_adds_epi16(r2,
2723 _mm_mullo_epi16(x2,
2724 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2725 r0 = _mm_adds_epi16(r0,
2726 _mm_mullo_epi16(t3,
2727 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2728 r2 = _mm_adds_epi16(r2,
2729 _mm_mullo_epi16(x3,
2730 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2731
2732 r0 = _mm_adds_epi16(r0,
2733 _mm_mullo_epi16(t4,
2734 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2735 r2 = _mm_adds_epi16(r2,
2736 _mm_mullo_epi16(x4,
2737 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2738
2739 r0 = _mm_adds_epi16(r0,
2740 _mm_mullo_epi16(t5,
2741 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2742 r2 = _mm_adds_epi16(r2,
2743 _mm_mullo_epi16(x5,
2744 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2745
2746 r0 = _mm_adds_epi16(r0,
2747 _mm_mullo_epi16(t6,
2748 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2749 r2 = _mm_adds_epi16(r2,
2750 _mm_mullo_epi16(x6,
2751 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2752
2753 r0 = _mm_adds_epi16(r0,
2754 _mm_mullo_epi16(t7,
2755 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2756 r2 = _mm_adds_epi16(r2,
2757 _mm_mullo_epi16(x7,
2758 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2759
2760 r0 = _mm_adds_epi16(r0,
2761 _mm_mullo_epi16(t8,
2762 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2763 r2 = _mm_adds_epi16(r2,
2764 _mm_mullo_epi16(x8,
2765 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2766
2767 /* give results back */
2768 _mm_store_si128((__m128i *) &dst[x],r0);
2769 _mm_store_si128((__m128i *) &dst[x + 8],r2);
2770 }
2771 src += srcstride;
2772 dst += dststride;
2773 }
2774 }else{
2775 x = 0;
2776 for (y = 0; y < height; y ++) {
2777 for(x=0;x<width;x+=4){
2778 r0 = _mm_setzero_si128();
2779 /* load data in register */
2780 x1 = _mm_loadl_epi64((__m128i *) &src[x - 3 * srcstride]);
2781 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
2782 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2783 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2784 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2785 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
2786 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
2787 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
2788
2789 x1 = _mm_unpacklo_epi8(x1,r0);
2790 x2 = _mm_unpacklo_epi8(x2, r0);
2791 x3 = _mm_unpacklo_epi8(x3, r0);
2792 x4 = _mm_unpacklo_epi8(x4, r0);
2793 x5 = _mm_unpacklo_epi8(x5, r0);
2794 x6 = _mm_unpacklo_epi8(x6, r0);
2795 x7 = _mm_unpacklo_epi8(x7, r0);
2796 x8 = _mm_unpacklo_epi8(x8, r0);
2797
2798
2799 r0 = _mm_mullo_epi16(x1, _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2800
2801 r0 = _mm_adds_epi16(r0,
2802 _mm_mullo_epi16(x2,
2803 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2804
2805
2806 r0 = _mm_adds_epi16(r0,
2807 _mm_mullo_epi16(x3,
2808 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2809
2810
2811 r0 = _mm_adds_epi16(r0,
2812 _mm_mullo_epi16(x4,
2813 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2814
2815
2816 r0 = _mm_adds_epi16(r0,
2817 _mm_mullo_epi16(x5,
2818 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2819
2820
2821 r0 = _mm_adds_epi16(r0,
2822 _mm_mullo_epi16(x6,
2823 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2824
2825
2826 r0 = _mm_adds_epi16(r0,
2827 _mm_mullo_epi16(x7,
2828 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2829
2830
2831 r0 = _mm_adds_epi16(r0,
2832 _mm_mullo_epi16(x8,
2833 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2834
2835
2836 /* give results back */
2837 _mm_storel_epi64((__m128i *) &dst[x], r0);
2838
2839 }
2840 src += srcstride;
2841 dst += dststride;
2842 }
2843 }
2844 }
2845
2846 #if 0
2847 void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
2848 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2849 int16_t* mcbuffer) {
2850 int x, y;
2851 uint16_t *src = (uint16_t*) _src;
2852 ptrdiff_t srcstride = _srcstride >> 1;
2853 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2854 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2855 r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
2856
2857 t1= _mm_set1_epi32(-1);
2858 t2= _mm_set1_epi32(4);
2859 t3= _mm_set1_epi32(-11);
2860 t4= _mm_set1_epi32(40);
2861 t5= _mm_set1_epi32(40);
2862 t6= _mm_set1_epi32(-11);
2863 t7= _mm_set1_epi32(4);
2864 t8= _mm_set1_epi32(-1);
2865
2866 {
2867 x = 0;
2868 r0 = _mm_setzero_si128();
2869 for (y = 0; y < height; y ++) {
2870 for(x=0;x<width;x+=4){
2871
2872 /* load data in register */
2873 x1 = _mm_loadl_epi64((__m128i *) &src[x - 3 * srcstride]);
2874 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
2875 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2876 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2877 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2878 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
2879 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
2880 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
2881
2882 x1 = _mm_unpacklo_epi16(x1, r0);
2883 x2 = _mm_unpacklo_epi16(x2, r0);
2884 x3 = _mm_unpacklo_epi16(x3, r0);
2885 x4 = _mm_unpacklo_epi16(x4, r0);
2886 x5 = _mm_unpacklo_epi16(x5, r0);
2887 x6 = _mm_unpacklo_epi16(x6, r0);
2888 x7 = _mm_unpacklo_epi16(x7, r0);
2889 x8 = _mm_unpacklo_epi16(x8, r0);
2890
2891
2892 r1 = _mm_mullo_epi32(x1, t1);
2893
2894 r1 = _mm_add_epi32(r1,
2895 _mm_mullo_epi32(x2,t2));
2896
2897
2898 r1 = _mm_add_epi32(r1,
2899 _mm_mullo_epi32(x3,t3));
2900
2901
2902 r1 = _mm_add_epi32(r1,
2903 _mm_mullo_epi32(x4,t4));
2904
2905
2906 r1 = _mm_add_epi32(r1,
2907 _mm_mullo_epi32(x5,t5));
2908
2909
2910 r1 = _mm_add_epi32(r1,
2911 _mm_mullo_epi32(x6,t6));
2912
2913
2914 r1 = _mm_add_epi32(r1,
2915 _mm_mullo_epi32(x7,t7));
2916
2917
2918 r1 = _mm_add_epi32(r1,
2919 _mm_mullo_epi32(x8,t8));
2920
2921
2922 r1= _mm_srai_epi32(r1,2); //bit depth - 8
2923
2924 r1= _mm_packs_epi32(r1,t8);
2925
2926 /* give results back */
2927 _mm_storel_epi64((__m128i *) (dst+x), r1);
2928
2929 }
2930 src += srcstride;
2931 dst += dststride;
2932 }
2933 }
2934 }
2935 #endif
2936
2937 #if 0
2938 static void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
2939 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2940 int16_t* mcbuffer) {
2941 int x, y;
2942 uint8_t *src = (uint8_t*) _src;
2943 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2944 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2945 __m128i t2, t3, t4, t5, t6, t7, t8;
2946 r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
2947
2948 if(!(width & 15)){
2949 for (y = 0; y < height; y++) {
2950 for (x = 0; x < width; x += 16) {
2951 /* check if memory needs to be reloaded */
2952 x1 = _mm_setzero_si128();
2953 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2954 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2955 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2956 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2957 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2958 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2959 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2960
2961 t2 = _mm_unpacklo_epi8(x2, x1);
2962 t3 = _mm_unpacklo_epi8(x3, x1);
2963 t4 = _mm_unpacklo_epi8(x4, x1);
2964 t5 = _mm_unpacklo_epi8(x5, x1);
2965 t6 = _mm_unpacklo_epi8(x6, x1);
2966 t7 = _mm_unpacklo_epi8(x7, x1);
2967 t8 = _mm_unpacklo_epi8(x8, x1);
2968
2969 x2 = _mm_unpackhi_epi8(x2, x1);
2970 x3 = _mm_unpackhi_epi8(x3, x1);
2971 x4 = _mm_unpackhi_epi8(x4, x1);
2972 x5 = _mm_unpackhi_epi8(x5, x1);
2973 x6 = _mm_unpackhi_epi8(x6, x1);
2974 x7 = _mm_unpackhi_epi8(x7, x1);
2975 x8 = _mm_unpackhi_epi8(x8, x1);
2976
2977 /* multiply by correct value : */
2978 r0 = _mm_mullo_epi16(t2,
2979 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
2980 r2 = _mm_mullo_epi16(x2,
2981 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
2982
2983 r0 = _mm_adds_epi16(r0,
2984 _mm_mullo_epi16(t3,
2985 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2986 r2 = _mm_adds_epi16(r2,
2987 _mm_mullo_epi16(x3,
2988 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2989
2990 r0 = _mm_adds_epi16(r0,
2991 _mm_mullo_epi16(t4,
2992 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2993 r2 = _mm_adds_epi16(r2,
2994 _mm_mullo_epi16(x4,
2995 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2996
2997 r0 = _mm_adds_epi16(r0,
2998 _mm_mullo_epi16(t5,
2999 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3000 r2 = _mm_adds_epi16(r2,
3001 _mm_mullo_epi16(x5,
3002 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3003
3004 r0 = _mm_adds_epi16(r0,
3005 _mm_mullo_epi16(t6,
3006 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3007 r2 = _mm_adds_epi16(r2,
3008 _mm_mullo_epi16(x6,
3009 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3010
3011 r0 = _mm_adds_epi16(r0,
3012 _mm_mullo_epi16(t7,
3013 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3014 r2 = _mm_adds_epi16(r2,
3015 _mm_mullo_epi16(x7,
3016 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3017
3018 r0 = _mm_adds_epi16(r0,
3019 _mm_mullo_epi16(t8,
3020 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3021 r2 = _mm_adds_epi16(r2,
3022 _mm_mullo_epi16(x8,
3023 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3024
3025 /* give results back */
3026 _mm_store_si128((__m128i *) &dst[x],
3027 _mm_srli_epi16(r0, BIT_DEPTH - 8));
3028 _mm_store_si128((__m128i *) &dst[x + 8],
3029 _mm_srli_epi16(r2, BIT_DEPTH - 8));
3030 }
3031 src += srcstride;
3032 dst += dststride;
3033 }
3034 }else{
3035 x = 0;
3036 for (y = 0; y < height; y ++) {
3037 for(x=0;x<width;x+=4){
3038 r0 = _mm_set1_epi16(0);
3039 /* load data in register */
3040 //x1 = _mm_setzero_si128();
3041 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3042 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3043 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3044 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3045 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3046 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3047 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3048
3049 x1 = _mm_unpacklo_epi8(x1,r0);
3050 x2 = _mm_unpacklo_epi8(x2, r0);
3051 x3 = _mm_unpacklo_epi8(x3, r0);
3052 x4 = _mm_unpacklo_epi8(x4, r0);
3053 x5 = _mm_unpacklo_epi8(x5, r0);
3054 x6 = _mm_unpacklo_epi8(x6, r0);
3055 x7 = _mm_unpacklo_epi8(x7, r0);
3056 x8 = _mm_unpacklo_epi8(x8, r0);
3057
3058
3059 r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3060
3061
3062 r0 = _mm_adds_epi16(r0,
3063 _mm_mullo_epi16(x3,
3064 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3065
3066
3067 r0 = _mm_adds_epi16(r0,
3068 _mm_mullo_epi16(x4,
3069 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3070
3071
3072 r0 = _mm_adds_epi16(r0,
3073 _mm_mullo_epi16(x5,
3074 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3075
3076
3077 r0 = _mm_adds_epi16(r0,
3078 _mm_mullo_epi16(x6,
3079 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3080
3081
3082 r0 = _mm_adds_epi16(r0,
3083 _mm_mullo_epi16(x7,
3084 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3085
3086
3087 r0 = _mm_adds_epi16(r0,
3088 _mm_mullo_epi16(x8,
3089 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3090
3091
3092 r0 = _mm_srli_epi16(r0, BIT_DEPTH - 8);
3093 /* give results back */
3094 _mm_storel_epi64((__m128i *) &dst[x], r0);
3095
3096 }
3097 src += srcstride;
3098 dst += dststride;
3099 }
3100 }
3101
3102 }
3103 #endif
3104
3105 void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
3106 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3107 int16_t* mcbuffer) {
3108 int x, y;
3109 uint8_t *src = (uint8_t*) _src;
3110 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3111 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
3112 __m128i t2, t3, t4, t5, t6, t7, t8;
3113 r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3114
3115 if(!(width & 15)){
3116 for (y = 0; y < height; y++) {
3117 for (x = 0; x < width; x += 16) {
3118 /* check if memory needs to be reloaded */
3119 x1 = _mm_setzero_si128();
3120 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
3121 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
3122 x4 = _mm_loadu_si128((__m128i *) &src[x]);
3123 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
3124 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
3125 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
3126 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
3127
3128 t2 = _mm_unpacklo_epi8(x2, x1);
3129 t3 = _mm_unpacklo_epi8(x3, x1);
3130 t4 = _mm_unpacklo_epi8(x4, x1);
3131 t5 = _mm_unpacklo_epi8(x5, x1);
3132 t6 = _mm_unpacklo_epi8(x6, x1);
3133 t7 = _mm_unpacklo_epi8(x7, x1);
3134 t8 = _mm_unpacklo_epi8(x8, x1);
3135
3136 x2 = _mm_unpackhi_epi8(x2, x1);
3137 x3 = _mm_unpackhi_epi8(x3, x1);
3138 x4 = _mm_unpackhi_epi8(x4, x1);
3139 x5 = _mm_unpackhi_epi8(x5, x1);
3140 x6 = _mm_unpackhi_epi8(x6, x1);
3141 x7 = _mm_unpackhi_epi8(x7, x1);
3142 x8 = _mm_unpackhi_epi8(x8, x1);
3143
3144 /* multiply by correct value : */
3145 r0 = _mm_mullo_epi16(t2,
3146 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3147 r2 = _mm_mullo_epi16(x2,
3148 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3149
3150 r0 = _mm_adds_epi16(r0,
3151 _mm_mullo_epi16(t3,
3152 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3153 r2 = _mm_adds_epi16(r2,
3154 _mm_mullo_epi16(x3,
3155 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3156
3157 r0 = _mm_adds_epi16(r0,
3158 _mm_mullo_epi16(t4,
3159 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3160 r2 = _mm_adds_epi16(r2,
3161 _mm_mullo_epi16(x4,
3162 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3163
3164 r0 = _mm_adds_epi16(r0,
3165 _mm_mullo_epi16(t5,
3166 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3167 r2 = _mm_adds_epi16(r2,
3168 _mm_mullo_epi16(x5,
3169 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3170
3171 r0 = _mm_adds_epi16(r0,
3172 _mm_mullo_epi16(t6,
3173 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3174 r2 = _mm_adds_epi16(r2,
3175 _mm_mullo_epi16(x6,
3176 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3177
3178 r0 = _mm_adds_epi16(r0,
3179 _mm_mullo_epi16(t7,
3180 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3181 r2 = _mm_adds_epi16(r2,
3182 _mm_mullo_epi16(x7,
3183 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3184
3185 r0 = _mm_adds_epi16(r0,
3186 _mm_mullo_epi16(t8,
3187 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3188 r2 = _mm_adds_epi16(r2,
3189 _mm_mullo_epi16(x8,
3190 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3191
3192 /* give results back */
3193 _mm_store_si128((__m128i *) &dst[x],r0);
3194 _mm_store_si128((__m128i *) &dst[x + 8],r2);
3195 }
3196 src += srcstride;
3197 dst += dststride;
3198 }
3199 }else{
3200 x = 0;
3201 for (y = 0; y < height; y ++) {
3202 for(x=0;x<width;x+=4){
3203 r0 = _mm_set1_epi16(0);
3204 /* load data in register */
3205 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3206 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3207 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3208 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3209 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3210 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3211 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3212
3213 x2 = _mm_unpacklo_epi8(x2, r0);
3214 x3 = _mm_unpacklo_epi8(x3, r0);
3215 x4 = _mm_unpacklo_epi8(x4, r0);
3216 x5 = _mm_unpacklo_epi8(x5, r0);
3217 x6 = _mm_unpacklo_epi8(x6, r0);
3218 x7 = _mm_unpacklo_epi8(x7, r0);
3219 x8 = _mm_unpacklo_epi8(x8, r0);
3220
3221 r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3222
3223 r0 = _mm_adds_epi16(r0,
3224 _mm_mullo_epi16(x3,
3225 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3226
3227 r0 = _mm_adds_epi16(r0,
3228 _mm_mullo_epi16(x4,
3229 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3230
3231 r0 = _mm_adds_epi16(r0,
3232 _mm_mullo_epi16(x5,
3233 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3234
3235 r0 = _mm_adds_epi16(r0,
3236 _mm_mullo_epi16(x6,
3237 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3238
3239 r0 = _mm_adds_epi16(r0,
3240 _mm_mullo_epi16(x7,
3241 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3242
3243 r0 = _mm_adds_epi16(r0,
3244 _mm_mullo_epi16(x8,
3245 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3246
3247 /* give results back */
3248 _mm_storel_epi64((__m128i *) &dst[x], r0);
3249
3250 }
3251 src += srcstride;
3252 dst += dststride;
3253 }
3254 }
3255
3256 }
3257
3258
3259 #if 0
3260 void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
3261 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3262 int16_t* mcbuffer) {
3263 int x, y;
3264 uint16_t *src = (uint16_t*) _src;
3265 ptrdiff_t srcstride = _srcstride >> 1;
3266 __m128i x1, x2, x3, x4, x5, x6, x7, r0;
3267 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3268
3269 t7 = _mm_set1_epi32(-1);
3270 t6 = _mm_set1_epi32(4);
3271 t5 = _mm_set1_epi32(-10);
3272 t4 = _mm_set1_epi32(58);
3273 t3 = _mm_set1_epi32(17);
3274 t2 = _mm_set1_epi32(-5);
3275 t1 = _mm_set1_epi32(1);
3276 t8= _mm_setzero_si128();
3277 {
3278
3279 for (y = 0; y < height; y ++) {
3280 for(x=0;x<width;x+=4){
3281 /* load data in register */
3282 x1 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3283 x2 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3284 x3 = _mm_loadl_epi64((__m128i *) &src[x]);
3285 x4 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3286 x5 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3287 x6 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3288 x7 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3289
3290 x1 = _mm_unpacklo_epi16(x1, t8);
3291 x2 = _mm_unpacklo_epi16(x2, t8);
3292 x3 = _mm_unpacklo_epi16(x3, t8);
3293 x4 = _mm_unpacklo_epi16(x4, t8);
3294 x5 = _mm_unpacklo_epi16(x5, t8);
3295 x6 = _mm_unpacklo_epi16(x6, t8);
3296 x7 = _mm_unpacklo_epi16(x7, t8);
3297
3298 r0 = _mm_mullo_epi32(x1, t1);
3299
3300 r0 = _mm_add_epi32(r0,
3301 _mm_mullo_epi32(x2,t2));
3302
3303 r0 = _mm_add_epi32(r0,
3304 _mm_mullo_epi32(x3,t3));
3305
3306 r0 = _mm_add_epi32(r0,
3307 _mm_mullo_epi32(x4,t4));
3308
3309 r0 = _mm_add_epi32(r0,
3310 _mm_mullo_epi32(x5,t5));
3311
3312 r0 = _mm_add_epi32(r0,
3313 _mm_mullo_epi32(x6,t6));
3314
3315 r0 = _mm_add_epi32(r0,
3316 _mm_mullo_epi32(x7,t7));
3317
3318 r0= _mm_srai_epi32(r0,2);
3319
3320 r0= _mm_packs_epi32(r0,t8);
3321
3322                /* store the results */
3323 _mm_storel_epi64((__m128i *) &dst[x], r0);
3324
3325 }
3326 src += srcstride;
3327 dst += dststride;
3328 }
3329 }
3330
3331 }
3332 #endif
3333
3334
3335
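/*
 * Luma quarter-pel interpolation, horizontal phase 1 / vertical phase 1.
 * Pass 1: the 8-tap filter (-1, 4, -10, 58, 17, -5, 1, 0) is applied
 * horizontally with PMADDUBSW/PHADDW and the 16-bit intermediate rows are
 * written to mcbuffer (stride MAX_PB_SIZE); the width==4 path handles two
 * rows per iteration. Pass 2: the same filter is applied vertically on the
 * intermediates, widening the products to 32 bits via PMULLW/PMULHW +
 * unpack, then shifting the sums right by 6 and repacking to 16 bits.
 */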
3336 void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3337 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3338 int16_t* mcbuffer) {
3339 int x, y;
3340 uint8_t* src = (uint8_t*) _src;
3341 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3342 int16_t *tmp = mcbuffer;
3343 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3344 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3345
3346 src -= qpel_extra_before[1] * srcstride;
3347 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3348 -1);
3349
3350 /* LOAD src from memory to registers to limit memory bandwidth */
3351 if (width == 4) {
3352
3353 for (y = 0; y < height + qpel_extra[1]; y += 2) {
3354 /* load data in register */
3355 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3356 src += srcstride;
3357 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3358 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3359 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3360 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3361 _mm_srli_si128(x1, 3));
3362 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3363 _mm_srli_si128(t1, 3));
3364
3365 /* PMADDUBSW then PMADDW */
3366 x2 = _mm_maddubs_epi16(x2, r0);
3367 t2 = _mm_maddubs_epi16(t2, r0);
3368 x3 = _mm_maddubs_epi16(x3, r0);
3369 t3 = _mm_maddubs_epi16(t3, r0);
3370 x2 = _mm_hadd_epi16(x2, x3);
3371 t2 = _mm_hadd_epi16(t2, t3);
3372 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3373 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3374 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3375 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3376            /* store the results */
3377 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3378
3379 tmp += MAX_PB_SIZE;
3380 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3381
3382 src += srcstride;
3383 tmp += MAX_PB_SIZE;
3384 }
3385 } else
3386 for (y = 0; y < height + qpel_extra[1]; y++) {
3387 for (x = 0; x < width; x += 8) {
3388 /* load data in register */
3389 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3390 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3391 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3392 _mm_srli_si128(x1, 3));
3393 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3394 _mm_srli_si128(x1, 5));
3395 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3396 _mm_srli_si128(x1, 7));
3397
3398 /* PMADDUBSW then PMADDW */
3399 x2 = _mm_maddubs_epi16(x2, r0);
3400 x3 = _mm_maddubs_epi16(x3, r0);
3401 x4 = _mm_maddubs_epi16(x4, r0);
3402 x5 = _mm_maddubs_epi16(x5, r0);
3403 x2 = _mm_hadd_epi16(x2, x3);
3404 x4 = _mm_hadd_epi16(x4, x5);
3405 x2 = _mm_hadd_epi16(x2, x4);
3406 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3407
3408                /* store the results */
3409 _mm_store_si128((__m128i *) &tmp[x], x2);
3410
3411 }
3412 src += srcstride;
3413 tmp += MAX_PB_SIZE;
3414 }
3415
3416 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3417 srcstride = MAX_PB_SIZE;
3418
3419    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
3420       accumulation must be widened to 32-bit integers */
3421 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3422 for (y = 0; y < height; y++) {
3423 for (x = 0; x < width; x += 8) {
3424
3425 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3426 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3427 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3428 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3429 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3430 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3431 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3432
3433 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3434 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3435 t8 = _mm_mullo_epi16(x1, r0);
3436 rBuffer = _mm_mulhi_epi16(x1, r0);
3437 t7 = _mm_mullo_epi16(x2, r1);
3438 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3439 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3440
3441 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3442 rBuffer = _mm_mulhi_epi16(x2, r1);
3443 t8 = _mm_mullo_epi16(x3, r0);
3444 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3445 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3446
3447 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3448 rBuffer = _mm_mulhi_epi16(x3, r0);
3449 t7 = _mm_mullo_epi16(x4, r1);
3450 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3451 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3452
3453 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3454 rBuffer = _mm_mulhi_epi16(x4, r1);
3455 t8 = _mm_mullo_epi16(x5, r0);
3456 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3457 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3458
3459 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3460 rBuffer = _mm_mulhi_epi16(x5, r0);
3461 t7 = _mm_mullo_epi16(x6, r1);
3462 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3463 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3464
3465 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3466 rBuffer = _mm_mulhi_epi16(x6, r1);
3467 t8 = _mm_mullo_epi16(x7, r0);
3468 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3469 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3470
3471 rBuffer = _mm_mulhi_epi16(x7, r0);
3472 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3473 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3474
3475
3476
3477            /* accumulate the per-tap products: */
3478
3479 r1 = _mm_add_epi32(x1, x2);
3480 x3 = _mm_add_epi32(x3, x4);
3481 x5 = _mm_add_epi32(x5, x6);
3482 r1 = _mm_add_epi32(r1, x3);
3483
3484 r1 = _mm_add_epi32(r1, x5);
3485
3486 r0 = _mm_add_epi32(t1, t2);
3487 t3 = _mm_add_epi32(t3, t4);
3488 t5 = _mm_add_epi32(t5, t6);
3489 r0 = _mm_add_epi32(r0, t3);
3490 r0 = _mm_add_epi32(r0, t5);
3491 r1 = _mm_add_epi32(r1, x7);
3492 r0 = _mm_add_epi32(r0, t7);
3493 r1 = _mm_srli_epi32(r1, 6);
3494 r0 = _mm_srli_epi32(r0, 6);
3495
3496 r1 = _mm_and_si128(r1,
3497 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3498 r0 = _mm_and_si128(r0,
3499 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3500 r0 = _mm_hadd_epi16(r0, r1);
3501 _mm_store_si128((__m128i *) &dst[x], r0);
3502
3503 }
3504 tmp += MAX_PB_SIZE;
3505 dst += dststride;
3506 }
3507 }
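/*
 * Same two-pass scheme as above: horizontal phase-1 filter
 * (-1, 4, -10, 58, 17, -5, 1, 0) into mcbuffer, then the vertical phase-2
 * (half-pel) filter (-1, 4, -11, 40, 40, -11, 4, -1) with 32-bit accumulation.
 */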
3508 void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
3509 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3510 int16_t* mcbuffer) {
3511 int x, y;
3512 uint8_t *src = (uint8_t*) _src;
3513 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3514 int16_t *tmp = mcbuffer;
3515 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3516 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3517
3518 src -= qpel_extra_before[2] * srcstride;
3519 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3520 -1);
3521
3522 /* LOAD src from memory to registers to limit memory bandwidth */
3523 if (width == 4) {
3524
3525 for (y = 0; y < height + qpel_extra[2]; y += 2) {
3526 /* load data in register */
3527 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3528 src += srcstride;
3529 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3530 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3531 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3532 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3533 _mm_srli_si128(x1, 3));
3534 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3535 _mm_srli_si128(t1, 3));
3536
3537 /* PMADDUBSW then PMADDW */
3538 x2 = _mm_maddubs_epi16(x2, r0);
3539 t2 = _mm_maddubs_epi16(t2, r0);
3540 x3 = _mm_maddubs_epi16(x3, r0);
3541 t3 = _mm_maddubs_epi16(t3, r0);
3542 x2 = _mm_hadd_epi16(x2, x3);
3543 t2 = _mm_hadd_epi16(t2, t3);
3544 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3545 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3546 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3547 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3548            /* store the results */
3549 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3550
3551 tmp += MAX_PB_SIZE;
3552 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3553
3554 src += srcstride;
3555 tmp += MAX_PB_SIZE;
3556 }
3557 } else
3558 for (y = 0; y < height + qpel_extra[2]; y++) {
3559 for (x = 0; x < width; x += 8) {
3560 /* load data in register */
3561 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3562 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3563 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3564 _mm_srli_si128(x1, 3));
3565 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3566 _mm_srli_si128(x1, 5));
3567 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3568 _mm_srli_si128(x1, 7));
3569
3570 /* PMADDUBSW then PMADDW */
3571 x2 = _mm_maddubs_epi16(x2, r0);
3572 x3 = _mm_maddubs_epi16(x3, r0);
3573 x4 = _mm_maddubs_epi16(x4, r0);
3574 x5 = _mm_maddubs_epi16(x5, r0);
3575 x2 = _mm_hadd_epi16(x2, x3);
3576 x4 = _mm_hadd_epi16(x4, x5);
3577 x2 = _mm_hadd_epi16(x2, x4);
3578 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3579
3580                /* store the results */
3581 _mm_store_si128((__m128i *) &tmp[x], x2);
3582
3583 }
3584 src += srcstride;
3585 tmp += MAX_PB_SIZE;
3586 }
3587
3588 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
3589 srcstride = MAX_PB_SIZE;
3590
3591    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
3592       accumulation must be widened to 32-bit integers */
3593 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
3594 for (y = 0; y < height; y++) {
3595 for (x = 0; x < width; x += 8) {
3596
3597 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3598 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3599 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3600 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3601 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3602 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3603 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3604 x8 = _mm_loadu_si128((__m128i *) &tmp[x + 4 * srcstride]);
3605
3606 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3607 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3608 t8 = _mm_mullo_epi16(x1, r0);
3609 rBuffer = _mm_mulhi_epi16(x1, r0);
3610 t7 = _mm_mullo_epi16(x2, r1);
3611 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3612 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3613
3614 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3615 rBuffer = _mm_mulhi_epi16(x2, r1);
3616 t8 = _mm_mullo_epi16(x3, r0);
3617 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3618 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3619
3620 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3621 rBuffer = _mm_mulhi_epi16(x3, r0);
3622 t7 = _mm_mullo_epi16(x4, r1);
3623 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3624 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3625
3626 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3627 rBuffer = _mm_mulhi_epi16(x4, r1);
3628 t8 = _mm_mullo_epi16(x5, r0);
3629 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3630 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3631
3632 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3633 rBuffer = _mm_mulhi_epi16(x5, r0);
3634 t7 = _mm_mullo_epi16(x6, r1);
3635 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3636 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3637
3638 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3639 rBuffer = _mm_mulhi_epi16(x6, r1);
3640 t8 = _mm_mullo_epi16(x7, r0);
3641 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3642 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3643
3644 rBuffer = _mm_mulhi_epi16(x7, r0);
3645 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3646 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3647
3648 t8 = _mm_unpacklo_epi16(
3649 _mm_mullo_epi16(x8,
3650 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3651 _mm_mulhi_epi16(x8,
3652 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3653 x8 = _mm_unpackhi_epi16(
3654 _mm_mullo_epi16(x8,
3655 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3656 _mm_mulhi_epi16(x8,
3657 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3658
3659            /* accumulate the per-tap products: */
3660
3661 r1 = _mm_add_epi32(x1, x2);
3662 x3 = _mm_add_epi32(x3, x4);
3663 x5 = _mm_add_epi32(x5, x6);
3664 r1 = _mm_add_epi32(r1, x3);
3665 x7 = _mm_add_epi32(x7, x8);
3666 r1 = _mm_add_epi32(r1, x5);
3667
3668 r0 = _mm_add_epi32(t1, t2);
3669 t3 = _mm_add_epi32(t3, t4);
3670 t5 = _mm_add_epi32(t5, t6);
3671 r0 = _mm_add_epi32(r0, t3);
3672 t7 = _mm_add_epi32(t7, t8);
3673 r0 = _mm_add_epi32(r0, t5);
3674 r1 = _mm_add_epi32(r1, x7);
3675 r0 = _mm_add_epi32(r0, t7);
3676 r1 = _mm_srli_epi32(r1, 6);
3677 r0 = _mm_srli_epi32(r0, 6);
3678
3679 r1 = _mm_and_si128(r1,
3680 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3681 r0 = _mm_and_si128(r0,
3682 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3683 r0 = _mm_hadd_epi16(r0, r1);
3684 _mm_store_si128((__m128i *) &dst[x], r0);
3685
3686 }
3687 tmp += MAX_PB_SIZE;
3688 dst += dststride;
3689 }
3690 }
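/*
 * Horizontal phase 1 followed by the vertical phase-3 filter
 * (0, 1, -5, 17, 58, -10, 4, -1); because the first tap is zero, the
 * topmost intermediate row is skipped entirely in the vertical pass.
 */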
3691 void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
3692 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3693 int16_t* mcbuffer) {
3694 int x, y;
3695 uint8_t *src = (uint8_t*) _src;
3696 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3697 int16_t *tmp = mcbuffer;
3698 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3699 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3700
3701 src -= qpel_extra_before[3] * srcstride;
3702 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3703 -1);
3704
3705 /* LOAD src from memory to registers to limit memory bandwidth */
3706 if (width == 4) {
3707
3708 for (y = 0; y < height + qpel_extra[3]; y += 2) {
3709 /* load data in register */
3710 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3711 src += srcstride;
3712 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3713 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3714 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3715 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3716 _mm_srli_si128(x1, 3));
3717 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3718 _mm_srli_si128(t1, 3));
3719
3720 /* PMADDUBSW then PMADDW */
3721 x2 = _mm_maddubs_epi16(x2, r0);
3722 t2 = _mm_maddubs_epi16(t2, r0);
3723 x3 = _mm_maddubs_epi16(x3, r0);
3724 t3 = _mm_maddubs_epi16(t3, r0);
3725 x2 = _mm_hadd_epi16(x2, x3);
3726 t2 = _mm_hadd_epi16(t2, t3);
3727 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3728 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3729 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3730 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3731            /* store the results */
3732 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3733
3734 tmp += MAX_PB_SIZE;
3735 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3736
3737 src += srcstride;
3738 tmp += MAX_PB_SIZE;
3739 }
3740 } else
3741 for (y = 0; y < height + qpel_extra[3]; y++) {
3742 for (x = 0; x < width; x += 8) {
3743 /* load data in register */
3744 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3745 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3746 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3747 _mm_srli_si128(x1, 3));
3748 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3749 _mm_srli_si128(x1, 5));
3750 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3751 _mm_srli_si128(x1, 7));
3752
3753 /* PMADDUBSW then PMADDW */
3754 x2 = _mm_maddubs_epi16(x2, r0);
3755 x3 = _mm_maddubs_epi16(x3, r0);
3756 x4 = _mm_maddubs_epi16(x4, r0);
3757 x5 = _mm_maddubs_epi16(x5, r0);
3758 x2 = _mm_hadd_epi16(x2, x3);
3759 x4 = _mm_hadd_epi16(x4, x5);
3760 x2 = _mm_hadd_epi16(x2, x4);
3761 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3762
3763                /* store the results */
3764 _mm_store_si128((__m128i *) &tmp[x], x2);
3765
3766 }
3767 src += srcstride;
3768 tmp += MAX_PB_SIZE;
3769 }
3770
3771 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
3772 srcstride = MAX_PB_SIZE;
3773
3774    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
3775       accumulation must be widened to 32-bit integers */
3776 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3777 for (y = 0; y < height; y++) {
3778 for (x = 0; x < width; x += 8) {
3779
3780 x1 = _mm_setzero_si128();
3781 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3782 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3783 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3784 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3785 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3786 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3787 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
3788
3789
3790 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3791
3792 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3793 t7 = _mm_mullo_epi16(x2, r1);
3794 rBuffer = _mm_mulhi_epi16(x2, r1);
3795 t8 = _mm_mullo_epi16(x3, r0);
3796 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3797 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3798
3799 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3800 rBuffer = _mm_mulhi_epi16(x3, r0);
3801 t7 = _mm_mullo_epi16(x4, r1);
3802 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3803 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3804
3805 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3806 rBuffer = _mm_mulhi_epi16(x4, r1);
3807 t8 = _mm_mullo_epi16(x5, r0);
3808 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3809 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3810
3811 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3812 rBuffer = _mm_mulhi_epi16(x5, r0);
3813 t7 = _mm_mullo_epi16(x6, r1);
3814 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3815 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3816
3817 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3818 rBuffer = _mm_mulhi_epi16(x6, r1);
3819 t8 = _mm_mullo_epi16(x7, r0);
3820 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3821 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3822
3823 rBuffer = _mm_mulhi_epi16(x7, r0);
3824 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3825 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3826
3827 t8 = _mm_unpacklo_epi16(
3828 _mm_mullo_epi16(x8,
3829 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3830 _mm_mulhi_epi16(x8,
3831 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3832 x8 = _mm_unpackhi_epi16(
3833 _mm_mullo_epi16(x8,
3834 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3835 _mm_mulhi_epi16(x8,
3836 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3837
3838            /* accumulate the per-tap products: */
3839
3840 x3 = _mm_add_epi32(x3, x4);
3841 x5 = _mm_add_epi32(x5, x6);
3842 r1 = _mm_add_epi32(x2, x3);
3843 x7 = _mm_add_epi32(x7, x8);
3844 r1 = _mm_add_epi32(r1, x5);
3845
3846 t3 = _mm_add_epi32(t3, t4);
3847 t5 = _mm_add_epi32(t5, t6);
3848 r0 = _mm_add_epi32(t2, t3);
3849 t7 = _mm_add_epi32(t7, t8);
3850 r0 = _mm_add_epi32(r0, t5);
3851 r1 = _mm_add_epi32(r1, x7);
3852 r0 = _mm_add_epi32(r0, t7);
3853 r1 = _mm_srli_epi32(r1, 6);
3854 r0 = _mm_srli_epi32(r0, 6);
3855
3856 r1 = _mm_and_si128(r1,
3857 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3858 r0 = _mm_and_si128(r0,
3859 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3860 r0 = _mm_hadd_epi16(r0, r1);
3861 _mm_store_si128((__m128i *) &dst[x], r0);
3862
3863 }
3864 tmp += MAX_PB_SIZE;
3865 dst += dststride;
3866 }
3867 }
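/*
 * Horizontal phase-2 (half-pel) filter (-1, 4, -11, 40, 40, -11, 4, -1)
 * followed by the vertical phase-1 filter (-1, 4, -10, 58, 17, -5, 1, 0).
 */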
3868 void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3869 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3870 int16_t* mcbuffer) {
3871 int x, y;
3872 uint8_t *src = (uint8_t*) _src;
3873 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3874 int16_t *tmp = mcbuffer;
3875 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3876 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3877
3878 src -= qpel_extra_before[1] * srcstride;
3879 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
3880 4, -1);
3881
3882 /* LOAD src from memory to registers to limit memory bandwidth */
3883 if (width == 4) {
3884
3885 for (y = 0; y < height + qpel_extra[1]; y += 2) {
3886 /* load data in register */
3887 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3888 src += srcstride;
3889 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3890 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3891 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3892 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3893 _mm_srli_si128(x1, 3));
3894 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3895 _mm_srli_si128(t1, 3));
3896
3897 /* PMADDUBSW then PMADDW */
3898 x2 = _mm_maddubs_epi16(x2, r0);
3899 t2 = _mm_maddubs_epi16(t2, r0);
3900 x3 = _mm_maddubs_epi16(x3, r0);
3901 t3 = _mm_maddubs_epi16(t3, r0);
3902 x2 = _mm_hadd_epi16(x2, x3);
3903 t2 = _mm_hadd_epi16(t2, t3);
3904 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3905 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3906 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3907 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3908            /* store the results */
3909 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3910
3911 tmp += MAX_PB_SIZE;
3912 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3913
3914 src += srcstride;
3915 tmp += MAX_PB_SIZE;
3916 }
3917 } else
3918 for (y = 0; y < height + qpel_extra[1]; y++) {
3919 for (x = 0; x < width; x += 8) {
3920 /* load data in register */
3921 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3922 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3923 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3924 _mm_srli_si128(x1, 3));
3925 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3926 _mm_srli_si128(x1, 5));
3927 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3928 _mm_srli_si128(x1, 7));
3929
3930 /* PMADDUBSW then PMADDW */
3931 x2 = _mm_maddubs_epi16(x2, r0);
3932 x3 = _mm_maddubs_epi16(x3, r0);
3933 x4 = _mm_maddubs_epi16(x4, r0);
3934 x5 = _mm_maddubs_epi16(x5, r0);
3935 x2 = _mm_hadd_epi16(x2, x3);
3936 x4 = _mm_hadd_epi16(x4, x5);
3937 x2 = _mm_hadd_epi16(x2, x4);
3938 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3939
3940                /* store the results */
3941 _mm_store_si128((__m128i *) &tmp[x], x2);
3942
3943 }
3944 src += srcstride;
3945 tmp += MAX_PB_SIZE;
3946 }
3947
3948 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3949 srcstride = MAX_PB_SIZE;
3950
3951    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
3952       accumulation must be widened to 32-bit integers */
3953 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3954 for (y = 0; y < height; y++) {
3955 for (x = 0; x < width; x += 8) {
3956
3957 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3958 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3959 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3960 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3961 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3962 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3963 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3964
3965 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3966 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3967 t8 = _mm_mullo_epi16(x1, r0);
3968 rBuffer = _mm_mulhi_epi16(x1, r0);
3969 t7 = _mm_mullo_epi16(x2, r1);
3970 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3971 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3972
3973 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3974 rBuffer = _mm_mulhi_epi16(x2, r1);
3975 t8 = _mm_mullo_epi16(x3, r0);
3976 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3977 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3978
3979 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3980 rBuffer = _mm_mulhi_epi16(x3, r0);
3981 t7 = _mm_mullo_epi16(x4, r1);
3982 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3983 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3984
3985 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3986 rBuffer = _mm_mulhi_epi16(x4, r1);
3987 t8 = _mm_mullo_epi16(x5, r0);
3988 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3989 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3990
3991 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3992 rBuffer = _mm_mulhi_epi16(x5, r0);
3993 t7 = _mm_mullo_epi16(x6, r1);
3994 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3995 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3996
3997 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3998 rBuffer = _mm_mulhi_epi16(x6, r1);
3999 t8 = _mm_mullo_epi16(x7, r0);
4000 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4001 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4002
4003 rBuffer = _mm_mulhi_epi16(x7, r0);
4004 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4005 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4006
4007
4008
4009            /* accumulate the per-tap products: */
4010
4011 r1 = _mm_add_epi32(x1, x2);
4012 x3 = _mm_add_epi32(x3, x4);
4013 x5 = _mm_add_epi32(x5, x6);
4014 r1 = _mm_add_epi32(r1, x3);
4015 r1 = _mm_add_epi32(r1, x5);
4016
4017 r0 = _mm_add_epi32(t1, t2);
4018 t3 = _mm_add_epi32(t3, t4);
4019 t5 = _mm_add_epi32(t5, t6);
4020 r0 = _mm_add_epi32(r0, t3);
4021 r0 = _mm_add_epi32(r0, t5);
4022 r1 = _mm_add_epi32(r1, x7);
4023 r0 = _mm_add_epi32(r0, t7);
4024 r1 = _mm_srli_epi32(r1, 6);
4025 r0 = _mm_srli_epi32(r0, 6);
4026
4027 r1 = _mm_and_si128(r1,
4028 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4029 r0 = _mm_and_si128(r0,
4030 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4031 r0 = _mm_hadd_epi16(r0, r1);
4032 _mm_store_si128((__m128i *) &dst[x], r0);
4033
4034 }
4035 tmp += MAX_PB_SIZE;
4036 dst += dststride;
4037 }
4038 }
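/*
 * Horizontal and vertical phase-2 (half-pel) filtering, both passes using
 * (-1, 4, -11, 40, 40, -11, 4, -1).
 */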
4039 void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4040 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4041 int16_t* mcbuffer) {
4042 int x, y;
4043 uint8_t *src = (uint8_t*) _src;
4044 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4045 int16_t *tmp = mcbuffer;
4046 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4047 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4048
4049 src -= qpel_extra_before[2] * srcstride;
4050 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4051 4, -1);
4052
4053 /* LOAD src from memory to registers to limit memory bandwidth */
4054 if (width == 4) {
4055
4056 for (y = 0; y < height + qpel_extra[2]; y += 2) {
4057 /* load data in register */
4058 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4059 src += srcstride;
4060 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4061 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4062 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4063 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4064 _mm_srli_si128(x1, 3));
4065 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4066 _mm_srli_si128(t1, 3));
4067
4068 /* PMADDUBSW then PMADDW */
4069 x2 = _mm_maddubs_epi16(x2, r0);
4070 t2 = _mm_maddubs_epi16(t2, r0);
4071 x3 = _mm_maddubs_epi16(x3, r0);
4072 t3 = _mm_maddubs_epi16(t3, r0);
4073 x2 = _mm_hadd_epi16(x2, x3);
4074 t2 = _mm_hadd_epi16(t2, t3);
4075 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4076 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4077 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4078 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4079            /* store the results */
4080 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4081
4082 tmp += MAX_PB_SIZE;
4083 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4084
4085 src += srcstride;
4086 tmp += MAX_PB_SIZE;
4087 }
4088 } else
4089 for (y = 0; y < height + qpel_extra[2]; y++) {
4090 for (x = 0; x < width; x += 8) {
4091 /* load data in register */
4092 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4093 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4094 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4095 _mm_srli_si128(x1, 3));
4096 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4097 _mm_srli_si128(x1, 5));
4098 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4099 _mm_srli_si128(x1, 7));
4100
4101 /* PMADDUBSW then PMADDW */
4102 x2 = _mm_maddubs_epi16(x2, r0);
4103 x3 = _mm_maddubs_epi16(x3, r0);
4104 x4 = _mm_maddubs_epi16(x4, r0);
4105 x5 = _mm_maddubs_epi16(x5, r0);
4106 x2 = _mm_hadd_epi16(x2, x3);
4107 x4 = _mm_hadd_epi16(x4, x5);
4108 x2 = _mm_hadd_epi16(x2, x4);
4109 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4110
4111                /* store the results */
4112 _mm_store_si128((__m128i *) &tmp[x], x2);
4113
4114 }
4115 src += srcstride;
4116 tmp += MAX_PB_SIZE;
4117 }
4118
4119 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4120 srcstride = MAX_PB_SIZE;
4121
4122    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
4123       accumulation must be widened to 32-bit integers */
4124 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4125 for (y = 0; y < height; y++) {
4126 for (x = 0; x < width; x += 8) {
4127
4128 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4129 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4130 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4131 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4132 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4133 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4134 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4135 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4136
4137 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4138 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4139 t8 = _mm_mullo_epi16(x1, r0);
4140 rBuffer = _mm_mulhi_epi16(x1, r0);
4141 t7 = _mm_mullo_epi16(x2, r1);
4142 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4143 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4144
4145 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4146 rBuffer = _mm_mulhi_epi16(x2, r1);
4147 t8 = _mm_mullo_epi16(x3, r0);
4148 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4149 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4150
4151 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4152 rBuffer = _mm_mulhi_epi16(x3, r0);
4153 t7 = _mm_mullo_epi16(x4, r1);
4154 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4155 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4156
4157 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4158 rBuffer = _mm_mulhi_epi16(x4, r1);
4159 t8 = _mm_mullo_epi16(x5, r0);
4160 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4161 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4162
4163 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4164 rBuffer = _mm_mulhi_epi16(x5, r0);
4165 t7 = _mm_mullo_epi16(x6, r1);
4166 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4167 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4168
4169 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4170 rBuffer = _mm_mulhi_epi16(x6, r1);
4171 t8 = _mm_mullo_epi16(x7, r0);
4172 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4173 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4174
4175 rBuffer = _mm_mulhi_epi16(x7, r0);
4176 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4177 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4178
4179 t8 = _mm_unpacklo_epi16(
4180 _mm_mullo_epi16(x8,
4181 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4182 _mm_mulhi_epi16(x8,
4183 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4184 x8 = _mm_unpackhi_epi16(
4185 _mm_mullo_epi16(x8,
4186 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4187 _mm_mulhi_epi16(x8,
4188 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4189
4190            /* accumulate the per-tap products: */
4191
4192 r1 = _mm_add_epi32(x1, x2);
4193 x3 = _mm_add_epi32(x3, x4);
4194 x5 = _mm_add_epi32(x5, x6);
4195 r1 = _mm_add_epi32(r1, x3);
4196 x7 = _mm_add_epi32(x7, x8);
4197 r1 = _mm_add_epi32(r1, x5);
4198
4199 r0 = _mm_add_epi32(t1, t2);
4200 t3 = _mm_add_epi32(t3, t4);
4201 t5 = _mm_add_epi32(t5, t6);
4202 r0 = _mm_add_epi32(r0, t3);
4203 t7 = _mm_add_epi32(t7, t8);
4204 r0 = _mm_add_epi32(r0, t5);
4205 r1 = _mm_add_epi32(r1, x7);
4206 r0 = _mm_add_epi32(r0, t7);
4207 r1 = _mm_srli_epi32(r1, 6);
4208 r0 = _mm_srli_epi32(r0, 6);
4209
4210 r1 = _mm_and_si128(r1,
4211 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4212 r0 = _mm_and_si128(r0,
4213 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4214 r0 = _mm_hadd_epi16(r0, r1);
4215 _mm_store_si128((__m128i *) &dst[x], r0);
4216
4217 }
4218 tmp += MAX_PB_SIZE;
4219 dst += dststride;
4220 }
4221 }
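/*
 * Horizontal phase-2 (half-pel) filter followed by the vertical phase-3
 * filter (0, 1, -5, 17, 58, -10, 4, -1); the zero-coefficient row is
 * skipped in the vertical pass.
 */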
4222 void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4223 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4224 int16_t* mcbuffer) {
4225 int x, y;
4226 uint8_t *src = (uint8_t*) _src;
4227 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4228 int16_t *tmp = mcbuffer;
4229 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4230 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4231
4232 src -= qpel_extra_before[3] * srcstride;
4233 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4234 4, -1);
4235
4236 /* LOAD src from memory to registers to limit memory bandwidth */
4237 if (width == 4) {
4238
4239 for (y = 0; y < height + qpel_extra[3]; y += 2) {
4240 /* load data in register */
4241 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4242 src += srcstride;
4243 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4244 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4245 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4246 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4247 _mm_srli_si128(x1, 3));
4248 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4249 _mm_srli_si128(t1, 3));
4250
4251 /* PMADDUBSW then PMADDW */
4252 x2 = _mm_maddubs_epi16(x2, r0);
4253 t2 = _mm_maddubs_epi16(t2, r0);
4254 x3 = _mm_maddubs_epi16(x3, r0);
4255 t3 = _mm_maddubs_epi16(t3, r0);
4256 x2 = _mm_hadd_epi16(x2, x3);
4257 t2 = _mm_hadd_epi16(t2, t3);
4258 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4259 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4260 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4261 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4262            /* store the results */
4263 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4264
4265 tmp += MAX_PB_SIZE;
4266 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4267
4268 src += srcstride;
4269 tmp += MAX_PB_SIZE;
4270 }
4271 } else
4272 for (y = 0; y < height + qpel_extra[3]; y++) {
4273 for (x = 0; x < width; x += 8) {
4274 /* load data in register */
4275 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4276 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4277 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4278 _mm_srli_si128(x1, 3));
4279 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4280 _mm_srli_si128(x1, 5));
4281 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4282 _mm_srli_si128(x1, 7));
4283
4284 /* PMADDUBSW then PMADDW */
4285 x2 = _mm_maddubs_epi16(x2, r0);
4286 x3 = _mm_maddubs_epi16(x3, r0);
4287 x4 = _mm_maddubs_epi16(x4, r0);
4288 x5 = _mm_maddubs_epi16(x5, r0);
4289 x2 = _mm_hadd_epi16(x2, x3);
4290 x4 = _mm_hadd_epi16(x4, x5);
4291 x2 = _mm_hadd_epi16(x2, x4);
4292 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4293
4294                /* store the results */
4295 _mm_store_si128((__m128i *) &tmp[x], x2);
4296
4297 }
4298 src += srcstride;
4299 tmp += MAX_PB_SIZE;
4300 }
4301
4302 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4303 srcstride = MAX_PB_SIZE;
4304
4305    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
4306       accumulation must be widened to 32-bit integers */
4307 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4308 for (y = 0; y < height; y++) {
4309 for (x = 0; x < width; x += 8) {
4310
4311 x1 = _mm_setzero_si128();
4312 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4313 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4314 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4315 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4316 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4317 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4318 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4319
4320 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4321
4322 t7 = _mm_mullo_epi16(x2, r1);
4323
4324
4325 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4326 rBuffer = _mm_mulhi_epi16(x2, r1);
4327 t8 = _mm_mullo_epi16(x3, r0);
4328 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4329 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4330
4331 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4332 rBuffer = _mm_mulhi_epi16(x3, r0);
4333 t7 = _mm_mullo_epi16(x4, r1);
4334 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4335 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4336
4337 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4338 rBuffer = _mm_mulhi_epi16(x4, r1);
4339 t8 = _mm_mullo_epi16(x5, r0);
4340 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4341 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4342
4343 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4344 rBuffer = _mm_mulhi_epi16(x5, r0);
4345 t7 = _mm_mullo_epi16(x6, r1);
4346 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4347 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4348
4349 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4350 rBuffer = _mm_mulhi_epi16(x6, r1);
4351 t8 = _mm_mullo_epi16(x7, r0);
4352 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4353 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4354
4355 rBuffer = _mm_mulhi_epi16(x7, r0);
4356 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4357 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4358
4359 t8 = _mm_unpacklo_epi16(
4360 _mm_mullo_epi16(x8,
4361 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4362 _mm_mulhi_epi16(x8,
4363 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4364 x8 = _mm_unpackhi_epi16(
4365 _mm_mullo_epi16(x8,
4366 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4367 _mm_mulhi_epi16(x8,
4368 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4369
4370            /* accumulate the per-tap products: */
4371
4372 x3 = _mm_add_epi32(x3, x4);
4373 x5 = _mm_add_epi32(x5, x6);
4374 r1 = _mm_add_epi32(x2, x3);
4375 x7 = _mm_add_epi32(x7, x8);
4376 r1 = _mm_add_epi32(r1, x5);
4377
4378 t3 = _mm_add_epi32(t3, t4);
4379 t5 = _mm_add_epi32(t5, t6);
4380 r0 = _mm_add_epi32(t2, t3);
4381 t7 = _mm_add_epi32(t7, t8);
4382 r0 = _mm_add_epi32(r0, t5);
4383 r1 = _mm_add_epi32(r1, x7);
4384 r0 = _mm_add_epi32(r0, t7);
4385 r1 = _mm_srli_epi32(r1, 6);
4386 r0 = _mm_srli_epi32(r0, 6);
4387
4388 r1 = _mm_and_si128(r1,
4389 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4390 r0 = _mm_and_si128(r0,
4391 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4392 r0 = _mm_hadd_epi16(r0, r1);
4393 _mm_store_si128((__m128i *) &dst[x], r0);
4394
4395 }
4396 tmp += MAX_PB_SIZE;
4397 dst += dststride;
4398 }
4399 }
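/*
 * Horizontal phase-3 filter (0, 1, -5, 17, 58, -10, 4, -1) followed by the
 * vertical phase-1 filter; the horizontal pass loads from src[-2] and shifts
 * the register left by one byte so the inserted zero byte lines up with the
 * filter's zero tap.
 */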
4400 void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
4401 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4402 int16_t* mcbuffer) {
4403 int x, y;
4404 uint8_t *src = (uint8_t*) _src;
4405 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4406 int16_t *tmp = mcbuffer;
4407 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
4408 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4409
4410 src -= qpel_extra_before[1] * srcstride;
4411 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4412 0);
4413
4414 /* LOAD src from memory to registers to limit memory bandwidth */
4415 if (width == 4) {
4416
4417 for (y = 0; y < height + qpel_extra[1]; y += 2) {
4418 /* load data in register */
4419 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4420 x1 = _mm_slli_si128(x1, 1);
4421 src += srcstride;
4422 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4423 t1 = _mm_slli_si128(t1, 1);
4424 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4425 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4426 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4427 _mm_srli_si128(x1, 3));
4428 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4429 _mm_srli_si128(t1, 3));
4430
4431 /* PMADDUBSW then PMADDW */
4432 x2 = _mm_maddubs_epi16(x2, r0);
4433 t2 = _mm_maddubs_epi16(t2, r0);
4434 x3 = _mm_maddubs_epi16(x3, r0);
4435 t3 = _mm_maddubs_epi16(t3, r0);
4436 x2 = _mm_hadd_epi16(x2, x3);
4437 t2 = _mm_hadd_epi16(t2, t3);
4438 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4439 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4440 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4441 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4442            /* store the results */
4443 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4444
4445 tmp += MAX_PB_SIZE;
4446 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4447
4448 src += srcstride;
4449 tmp += MAX_PB_SIZE;
4450 }
4451 } else
4452 for (y = 0; y < height + qpel_extra[1]; y++) {
4453 for (x = 0; x < width; x += 8) {
4454 /* load data in register */
4455 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4456 x1 = _mm_slli_si128(x1, 1);
4457 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4458 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4459 _mm_srli_si128(x1, 3));
4460 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4461 _mm_srli_si128(x1, 5));
4462 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4463 _mm_srli_si128(x1, 7));
4464
4465 /* PMADDUBSW then PMADDW */
4466 x2 = _mm_maddubs_epi16(x2, r0);
4467 x3 = _mm_maddubs_epi16(x3, r0);
4468 x4 = _mm_maddubs_epi16(x4, r0);
4469 x5 = _mm_maddubs_epi16(x5, r0);
4470 x2 = _mm_hadd_epi16(x2, x3);
4471 x4 = _mm_hadd_epi16(x4, x5);
4472 x2 = _mm_hadd_epi16(x2, x4);
4473 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4474
4475                /* store the results */
4476 _mm_store_si128((__m128i *) &tmp[x], x2);
4477
4478 }
4479 src += srcstride;
4480 tmp += MAX_PB_SIZE;
4481 }
4482
4483 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
4484 srcstride = MAX_PB_SIZE;
4485
4486    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
4487       accumulation must be widened to 32-bit integers */
4488 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
4489 for (y = 0; y < height; y++) {
4490 for (x = 0; x < width; x += 8) {
4491
4492 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4493 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4494 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4495 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4496 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4497 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4498 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4499
4500 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4501 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4502 t8 = _mm_mullo_epi16(x1, r0);
4503 rBuffer = _mm_mulhi_epi16(x1, r0);
4504 t7 = _mm_mullo_epi16(x2, r1);
4505 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4506 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4507
4508 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4509 rBuffer = _mm_mulhi_epi16(x2, r1);
4510 t8 = _mm_mullo_epi16(x3, r0);
4511 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4512 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4513
4514 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4515 rBuffer = _mm_mulhi_epi16(x3, r0);
4516 t7 = _mm_mullo_epi16(x4, r1);
4517 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4518 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4519
4520 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4521 rBuffer = _mm_mulhi_epi16(x4, r1);
4522 t8 = _mm_mullo_epi16(x5, r0);
4523 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4524 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4525
4526 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4527 rBuffer = _mm_mulhi_epi16(x5, r0);
4528 t7 = _mm_mullo_epi16(x6, r1);
4529 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4530 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4531
4532 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4533 rBuffer = _mm_mulhi_epi16(x6, r1);
4534 t8 = _mm_mullo_epi16(x7, r0);
4535 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4536 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4537
4538 rBuffer = _mm_mulhi_epi16(x7, r0);
4539 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4540 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4541
4542
4543            /* accumulate the per-tap products: */
4544
4545 r1 = _mm_add_epi32(x1, x2);
4546 x3 = _mm_add_epi32(x3, x4);
4547 x5 = _mm_add_epi32(x5, x6);
4548 r1 = _mm_add_epi32(r1, x3);
4549 r1 = _mm_add_epi32(r1, x5);
4550
4551 r0 = _mm_add_epi32(t1, t2);
4552 t3 = _mm_add_epi32(t3, t4);
4553 t5 = _mm_add_epi32(t5, t6);
4554 r0 = _mm_add_epi32(r0, t3);
4555 r0 = _mm_add_epi32(r0, t5);
4556 r1 = _mm_add_epi32(r1, x7);
4557 r0 = _mm_add_epi32(r0, t7);
4558 r1 = _mm_srli_epi32(r1, 6);
4559 r0 = _mm_srli_epi32(r0, 6);
4560
4561 r1 = _mm_and_si128(r1,
4562 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4563 r0 = _mm_and_si128(r0,
4564 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4565 r0 = _mm_hadd_epi16(r0, r1);
4566 _mm_store_si128((__m128i *) &dst[x], r0);
4567
4568 }
4569 tmp += MAX_PB_SIZE;
4570 dst += dststride;
4571 }
4572 }
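/*
 * Horizontal phase-3 filter followed by the vertical phase-2 (half-pel)
 * filter (-1, 4, -11, 40, 40, -11, 4, -1).
 */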
4573 void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4574 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4575 int16_t* mcbuffer) {
4576 int x, y;
4577 uint8_t *src = (uint8_t*) _src;
4578 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4579 int16_t *tmp = mcbuffer;
4580 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4581 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4582
4583 src -= qpel_extra_before[2] * srcstride;
4584 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4585 0);
4586
4587 /* LOAD src from memory to registers to limit memory bandwidth */
4588 if (width == 4) {
4589
4590 for (y = 0; y < height + qpel_extra[2]; y += 2) {
4591 /* load data in register */
4592 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4593 x1 = _mm_slli_si128(x1, 1);
4594 src += srcstride;
4595 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4596 t1 = _mm_slli_si128(t1, 1);
4597 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4598 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4599 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4600 _mm_srli_si128(x1, 3));
4601 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4602 _mm_srli_si128(t1, 3));
4603
4604 /* PMADDUBSW then PMADDW */
4605 x2 = _mm_maddubs_epi16(x2, r0);
4606 t2 = _mm_maddubs_epi16(t2, r0);
4607 x3 = _mm_maddubs_epi16(x3, r0);
4608 t3 = _mm_maddubs_epi16(t3, r0);
4609 x2 = _mm_hadd_epi16(x2, x3);
4610 t2 = _mm_hadd_epi16(t2, t3);
4611 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4612 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4613 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4614 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4615            /* store the results */
4616 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4617
4618 tmp += MAX_PB_SIZE;
4619 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4620
4621 src += srcstride;
4622 tmp += MAX_PB_SIZE;
4623 }
4624 } else
4625 for (y = 0; y < height + qpel_extra[2]; y++) {
4626 for (x = 0; x < width; x += 8) {
4627 /* load data in register */
4628 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4629 x1 = _mm_slli_si128(x1, 1);
4630 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4631 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4632 _mm_srli_si128(x1, 3));
4633 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4634 _mm_srli_si128(x1, 5));
4635 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4636 _mm_srli_si128(x1, 7));
4637
4638 /* PMADDUBSW then PMADDW */
4639 x2 = _mm_maddubs_epi16(x2, r0);
4640 x3 = _mm_maddubs_epi16(x3, r0);
4641 x4 = _mm_maddubs_epi16(x4, r0);
4642 x5 = _mm_maddubs_epi16(x5, r0);
4643 x2 = _mm_hadd_epi16(x2, x3);
4644 x4 = _mm_hadd_epi16(x4, x5);
4645 x2 = _mm_hadd_epi16(x2, x4);
4646 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4647
4648                /* store the results */
4649 _mm_store_si128((__m128i *) &tmp[x], x2);
4650
4651 }
4652 src += srcstride;
4653 tmp += MAX_PB_SIZE;
4654 }
4655
4656 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4657 srcstride = MAX_PB_SIZE;
4658
4659    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
4660       accumulation must be widened to 32-bit integers */
4661 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4662 for (y = 0; y < height; y++) {
4663 for (x = 0; x < width; x += 8) {
4664
4665 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4666 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4667 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4668 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4669 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4670 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4671 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4672 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4673
4674 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4675 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4676 t8 = _mm_mullo_epi16(x1, r0);
4677 rBuffer = _mm_mulhi_epi16(x1, r0);
4678 t7 = _mm_mullo_epi16(x2, r1);
4679 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4680 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4681
4682 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4683 rBuffer = _mm_mulhi_epi16(x2, r1);
4684 t8 = _mm_mullo_epi16(x3, r0);
4685 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4686 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4687
4688 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4689 rBuffer = _mm_mulhi_epi16(x3, r0);
4690 t7 = _mm_mullo_epi16(x4, r1);
4691 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4692 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4693
4694 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4695 rBuffer = _mm_mulhi_epi16(x4, r1);
4696 t8 = _mm_mullo_epi16(x5, r0);
4697 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4698 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4699
4700 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4701 rBuffer = _mm_mulhi_epi16(x5, r0);
4702 t7 = _mm_mullo_epi16(x6, r1);
4703 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4704 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4705
4706 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4707 rBuffer = _mm_mulhi_epi16(x6, r1);
4708 t8 = _mm_mullo_epi16(x7, r0);
4709 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4710 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4711
4712 rBuffer = _mm_mulhi_epi16(x7, r0);
4713 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4714 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4715
4716 t8 = _mm_unpacklo_epi16(
4717 _mm_mullo_epi16(x8,
4718 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4719 _mm_mulhi_epi16(x8,
4720 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4721 x8 = _mm_unpackhi_epi16(
4722 _mm_mullo_epi16(x8,
4723 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4724 _mm_mulhi_epi16(x8,
4725 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4726
4727 /* add calculus by correct value : */
4728
4729 r1 = _mm_add_epi32(x1, x2);
4730 x3 = _mm_add_epi32(x3, x4);
4731 x5 = _mm_add_epi32(x5, x6);
4732 r1 = _mm_add_epi32(r1, x3);
4733 x7 = _mm_add_epi32(x7, x8);
4734 r1 = _mm_add_epi32(r1, x5);
4735
4736 r0 = _mm_add_epi32(t1, t2);
4737 t3 = _mm_add_epi32(t3, t4);
4738 t5 = _mm_add_epi32(t5, t6);
4739 r0 = _mm_add_epi32(r0, t3);
4740 t7 = _mm_add_epi32(t7, t8);
4741 r0 = _mm_add_epi32(r0, t5);
4742 r1 = _mm_add_epi32(r1, x7);
4743 r0 = _mm_add_epi32(r0, t7);
4744 r1 = _mm_srli_epi32(r1, 6);
4745 r0 = _mm_srli_epi32(r0, 6);
4746
4747 r1 = _mm_and_si128(r1,
4748 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4749 r0 = _mm_and_si128(r0,
4750 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4751 r0 = _mm_hadd_epi16(r0, r1);
4752 _mm_store_si128((__m128i *) &dst[x], r0);
4753
4754 }
4755 tmp += MAX_PB_SIZE;
4756 dst += dststride;
4757 }
4758 }
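/*
 * Horizontal and vertical phase-3 filtering, both passes using
 * (0, 1, -5, 17, 58, -10, 4, -1); as in the other *_v_3 variants, the
 * zero-coefficient row is skipped in the vertical pass.
 */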
4759 void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4760 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4761 int16_t* mcbuffer) {
4762 int x, y;
4763 uint8_t *src = (uint8_t*) _src;
4764 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4765 int16_t *tmp = mcbuffer;
4766 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4767 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4768
4769 src -= qpel_extra_before[3] * srcstride;
4770 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4771 0);
4772
4773 /* LOAD src from memory to registers to limit memory bandwidth */
4774 if (width == 4) {
4775
4776 for (y = 0; y < height + qpel_extra[3]; y += 2) {
4777 /* load data in register */
4778 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4779 x1 = _mm_slli_si128(x1, 1);
4780 src += srcstride;
4781 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4782 t1 = _mm_slli_si128(t1, 1);
4783 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4784 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4785 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4786 _mm_srli_si128(x1, 3));
4787 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4788 _mm_srli_si128(t1, 3));
4789
4790 /* PMADDUBSW then PMADDW */
4791 x2 = _mm_maddubs_epi16(x2, r0);
4792 t2 = _mm_maddubs_epi16(t2, r0);
4793 x3 = _mm_maddubs_epi16(x3, r0);
4794 t3 = _mm_maddubs_epi16(t3, r0);
4795 x2 = _mm_hadd_epi16(x2, x3);
4796 t2 = _mm_hadd_epi16(t2, t3);
4797 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4798 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4799 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4800 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4801            /* store the results */
4802 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4803
4804 tmp += MAX_PB_SIZE;
4805 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4806
4807 src += srcstride;
4808 tmp += MAX_PB_SIZE;
4809 }
4810 } else
4811 for (y = 0; y < height + qpel_extra[3]; y++) {
4812 for (x = 0; x < width; x += 8) {
4813 /* load data in register */
4814 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4815 x1 = _mm_slli_si128(x1, 1);
4816 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4817 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4818 _mm_srli_si128(x1, 3));
4819 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4820 _mm_srli_si128(x1, 5));
4821 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4822 _mm_srli_si128(x1, 7));
4823
4824 /* PMADDUBSW then PMADDW */
4825 x2 = _mm_maddubs_epi16(x2, r0);
4826 x3 = _mm_maddubs_epi16(x3, r0);
4827 x4 = _mm_maddubs_epi16(x4, r0);
4828 x5 = _mm_maddubs_epi16(x5, r0);
4829 x2 = _mm_hadd_epi16(x2, x3);
4830 x4 = _mm_hadd_epi16(x4, x5);
4831 x2 = _mm_hadd_epi16(x2, x4);
4832 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4833
4834                /* store the results */
4835 _mm_store_si128((__m128i *) &tmp[x], x2);
4836
4837 }
4838 src += srcstride;
4839 tmp += MAX_PB_SIZE;
4840 }
4841
4842 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4843 srcstride = MAX_PB_SIZE;
4844
4845    /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the
4846       accumulation must be widened to 32-bit integers */
4847 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4848 for (y = 0; y < height; y++) {
4849 for (x = 0; x < width; x += 8) {
4850
4851 x1 = _mm_setzero_si128();
4852 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4853 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4854 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4855 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4856 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4857 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4858 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4859
4860 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4861 t7 = _mm_mullo_epi16(x2, r1);
4862
4863
4864 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4865 rBuffer = _mm_mulhi_epi16(x2, r1);
4866 t8 = _mm_mullo_epi16(x3, r0);
4867 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4868 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4869
4870 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4871 rBuffer = _mm_mulhi_epi16(x3, r0);
4872 t7 = _mm_mullo_epi16(x4, r1);
4873 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4874 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4875
4876 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4877 rBuffer = _mm_mulhi_epi16(x4, r1);
4878 t8 = _mm_mullo_epi16(x5, r0);
4879 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4880 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4881
4882 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4883 rBuffer = _mm_mulhi_epi16(x5, r0);
4884 t7 = _mm_mullo_epi16(x6, r1);
4885 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4886 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4887
4888 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4889 rBuffer = _mm_mulhi_epi16(x6, r1);
4890 t8 = _mm_mullo_epi16(x7, r0);
4891 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4892 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4893
4894 rBuffer = _mm_mulhi_epi16(x7, r0);
4895 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4896 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4897
4898 t8 = _mm_unpacklo_epi16(
4899 _mm_mullo_epi16(x8,
4900 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4901 _mm_mulhi_epi16(x8,
4902 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4903 x8 = _mm_unpackhi_epi16(
4904 _mm_mullo_epi16(x8,
4905 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4906 _mm_mulhi_epi16(x8,
4907 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4908
4909 /* accumulate the weighted partial sums : */
4910
4911 x3 = _mm_add_epi32(x3, x4);
4912 x5 = _mm_add_epi32(x5, x6);
4913 r1 = _mm_add_epi32(x2, x3);
4914 x7 = _mm_add_epi32(x7, x8);
4915 r1 = _mm_add_epi32(r1, x5);
4916
4917 t3 = _mm_add_epi32(t3, t4);
4918 t5 = _mm_add_epi32(t5, t6);
4919 r0 = _mm_add_epi32(t2, t3);
4920 t7 = _mm_add_epi32(t7, t8);
4921 r0 = _mm_add_epi32(r0, t5);
4922 r1 = _mm_add_epi32(r1, x7);
4923 r0 = _mm_add_epi32(r0, t7);
4924 r1 = _mm_srli_epi32(r1, 6);
4925 r0 = _mm_srli_epi32(r0, 6);
4926
4927 r1 = _mm_and_si128(r1,
4928 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4929 r0 = _mm_and_si128(r0,
4930 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4931 r0 = _mm_hadd_epi16(r0, r1);
4932 _mm_store_si128((__m128i *) &dst[x], r0);
4933
4934 }
4935 tmp += MAX_PB_SIZE;
4936 dst += dststride;
4937 }
4938 }
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 openHEVC contributors
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <stdio.h>
22 #include <emmintrin.h>
23 #include <tmmintrin.h> // SSSE3
24 #if HAVE_SSE4_1
25 #include <smmintrin.h>
26 #endif
27
28 #include "sse-motion.h"
29 #include "libde265/util.h"
30
31
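/* 4-tap HEVC chroma (epel) interpolation coefficients for the fractional positions 1/8 .. 7/8;
   each row repeats its four taps four times so a single 128-bit constant feeds PMADDUBSW for
   four output pixels at once. */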
32 ALIGNED_16(const int8_t) epel_filters[7][16] = {
33 { -2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2 },
34 { -4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2 },
35 { -6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4 },
36 { -4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4 },
37 { -4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6 },
38 { -2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4 },
39 { -2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2 },
40 };
41
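/* Extra source rows/columns needed around a block by the 8-tap luma (qpel) filters, indexed
   by fractional position; index 0 is the integer-pel case and needs no margin. */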
42 static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 };
43 static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 };
44 static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 };
45
46 static const int epel_extra_before = 1;
47 static const int epel_extra_after = 2;
48 static const int epel_extra = 3;
49
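/* Largest HEVC prediction block dimension; also used as the row stride of mcbuffer, the
   intermediate buffer shared by the horizontal and vertical passes of the separable filters. */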
50 #define MAX_PB_SIZE 64
51
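/* When non-zero, the narrow-width tails store through _mm_maskmoveu_si128 (a byte-masked store
   with a non-temporal hint); it is kept disabled here, presumably because plain 16/32-bit
   scalar stores are cheaper. */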
52 #define MASKMOVE 0
53
54 void print128(const char* prefix, __m128i r)
55 {
56 unsigned char buf[16];
57
58 _mm_storeu_si128((__m128i*)buf, r); /* unaligned-safe copy of the register into buf */
59
60 printf("%s ",prefix);
61 for (int i=0;i<16;i++)
62 {
63 if (i>0) { printf(":"); }
64 printf("%02x", buf[i]);
65 }
66
67 printf("\n");
68 }
69
70
71 void printm32(const char* prefix, unsigned char* p)
72 {
73 printf("%s ",prefix);
74
75 for (int i=0;i<4;i++)
76 {
77 if (i>0) { printf(":"); }
78 printf("%02x", p[i]);
79 }
80
81 printf("\n");
82 }
83
84
85 #define BIT_DEPTH 8
86
87 void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
88 int16_t *src, ptrdiff_t srcstride, int width, int height) {
89 int x, y;
90 uint8_t *dst = (uint8_t*) _dst;
91 __m128i r0, r1, f0;
92
93 f0 = _mm_set1_epi16(32);
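    /* 32 is the rounding offset for the final 6-bit shift (6 = 14 - bit depth):
       dst[x] = clip_uint8((src[x] + 32) >> 6), the clipping being done by packus below. */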
94
95
96 if(!(width & 15))
97 {
98 for (y = 0; y < height; y++) {
99 for (x = 0; x < width; x += 16) {
100 r0 = _mm_load_si128((__m128i *) (src+x));
101
102 r1 = _mm_load_si128((__m128i *) (src+x + 8));
103 r0 = _mm_adds_epi16(r0, f0);
104
105 r1 = _mm_adds_epi16(r1, f0);
106 r0 = _mm_srai_epi16(r0, 6);
107 r1 = _mm_srai_epi16(r1, 6);
108 r0 = _mm_packus_epi16(r0, r1);
109
110 _mm_storeu_si128((__m128i *) (dst+x), r0);
111 }
112 dst += dststride;
113 src += srcstride;
114 }
115 }else if(!(width & 7))
116 {
117 for (y = 0; y < height; y++) {
118 for (x = 0; x < width; x += 8) {
119 r0 = _mm_load_si128((__m128i *) (src+x));
120
121 r0 = _mm_adds_epi16(r0, f0);
122
123 r0 = _mm_srai_epi16(r0, 6);
124 r0 = _mm_packus_epi16(r0, r0);
125
126 _mm_storel_epi64((__m128i *) (dst+x), r0);
127 }
128 dst += dststride;
129 src += srcstride;
130 }
131 }else if(!(width & 3)){
132 for (y = 0; y < height; y++) {
133 for(x = 0;x < width; x+=4){
134 r0 = _mm_loadl_epi64((__m128i *) (src+x));
135 r0 = _mm_adds_epi16(r0, f0);
136
137 r0 = _mm_srai_epi16(r0, 6);
138 r0 = _mm_packus_epi16(r0, r0);
139 #if MASKMOVE
140 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
141 #else
142 //r0 = _mm_shuffle_epi32 (r0, 0x00);
143 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
144 #endif
145 }
146 dst += dststride;
147 src += srcstride;
148 }
149 }else{
150 for (y = 0; y < height; y++) {
151 for(x = 0;x < width; x+=2){
152 r0 = _mm_loadl_epi64((__m128i *) (src+x));
153 r0 = _mm_adds_epi16(r0, f0);
154
155 r0 = _mm_srai_epi16(r0, 6);
156 r0 = _mm_packus_epi16(r0, r0);
157 #if MASKMOVE
158 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
159 #else
160 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
161 #endif
162 }
163 dst += dststride;
164 src += srcstride;
165 }
166 }
167
168 }
169
170 void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
171 int16_t *src, ptrdiff_t srcstride, int width, int height) {
172 int x, y;
173 uint8_t *dst = (uint8_t*) _dst;
174 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
175 __m128i r0, r1, f0;
176 int shift = 14 - BIT_DEPTH;
177 #if BIT_DEPTH < 14
178 int16_t offset = 1 << (shift - 1);
179 #else
180 int16_t offset = 0;
181
182 #endif
183 f0 = _mm_set1_epi16(offset);
184
185 for (y = 0; y < height; y++) {
186 for (x = 0; x < width; x += 16) {
187 r0 = _mm_load_si128((__m128i *) &src[x]);
188
189 r1 = _mm_load_si128((__m128i *) &src[x + 8]);
190 r0 = _mm_adds_epi16(r0, f0);
191
192 r1 = _mm_adds_epi16(r1, f0);
193 r0 = _mm_srai_epi16(r0, shift);
194 r1 = _mm_srai_epi16(r1, shift);
195 r0 = _mm_packus_epi16(r0, r1);
196
197 _mm_storeu_si128((__m128i *) &dst[x], r0);
198 }
199 dst += dststride;
200 src += srcstride;
201 }
202 }
203
204 void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
205 int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
206 int height) {
207 int x, y;
208 uint8_t *dst = (uint8_t*) _dst;
209 __m128i r0, r1, f0, r2, r3;
210
211 f0 = _mm_set1_epi16(64);
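    /* bi-prediction average for 8-bit output: dst[x] = clip_uint8((src1[x] + src2[x] + 64) >> 7) */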
212 if(!(width & 15)){
213 for (y = 0; y < height; y++) {
214
215 for (x = 0; x < width; x += 16) {
216 r0 = _mm_load_si128((__m128i *) &src1[x]);
217 r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
218 r2 = _mm_load_si128((__m128i *) &src2[x]);
219 r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
220
221 r0 = _mm_adds_epi16(r0, f0);
222 r1 = _mm_adds_epi16(r1, f0);
223 r0 = _mm_adds_epi16(r0, r2);
224 r1 = _mm_adds_epi16(r1, r3);
225 r0 = _mm_srai_epi16(r0, 7);
226 r1 = _mm_srai_epi16(r1, 7);
227 r0 = _mm_packus_epi16(r0, r1);
228
229 _mm_storeu_si128((__m128i *) (dst + x), r0);
230 }
231 dst += dststride;
232 src1 += srcstride;
233 src2 += srcstride;
234 }
235 }else if(!(width & 7)){
236 for (y = 0; y < height; y++) {
237 for(x=0;x<width;x+=8){
238 r0 = _mm_load_si128((__m128i *) (src1+x));
239 r2 = _mm_load_si128((__m128i *) (src2+x));
240
241 r0 = _mm_adds_epi16(r0, f0);
242 r0 = _mm_adds_epi16(r0, r2);
243 r0 = _mm_srai_epi16(r0, 7);
244 r0 = _mm_packus_epi16(r0, r0);
245
246 _mm_storel_epi64((__m128i *) (dst+x), r0);
247 }
248 dst += dststride;
249 src1 += srcstride;
250 src2 += srcstride;
251 }
252 }else if(!(width & 3)){
253 #if MASKMOVE
254 r1= _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
255 #endif
256 for (y = 0; y < height; y++) {
257
258 for(x=0;x<width;x+=4)
259 {
260 r0 = _mm_loadl_epi64((__m128i *) (src1+x));
261 r2 = _mm_loadl_epi64((__m128i *) (src2+x));
262
263 r0 = _mm_adds_epi16(r0, f0);
264 r0 = _mm_adds_epi16(r0, r2);
265 r0 = _mm_srai_epi16(r0, 7);
266 r0 = _mm_packus_epi16(r0, r0);
267
268 #if MASKMOVE
269 _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
270 #else
271 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
272 #endif
273 }
274 dst += dststride;
275 src1 += srcstride;
276 src2 += srcstride;
277 }
278 }else{
279 #if MASKMOVE
280 r1= _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1);
281 #endif
282 for (y = 0; y < height; y++) {
283 for(x=0;x<width;x+=2)
284 {
285 r0 = _mm_loadl_epi64((__m128i *) (src1+x));
286 r2 = _mm_loadl_epi64((__m128i *) (src2+x));
287
288 r0 = _mm_adds_epi16(r0, f0);
289 r0 = _mm_adds_epi16(r0, r2);
290 r0 = _mm_srai_epi16(r0, 7);
291 r0 = _mm_packus_epi16(r0, r0);
292
293 #if MASKMOVE
294 _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
295 #else
296 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
297 #endif
298 }
299 dst += dststride;
300 src1 += srcstride;
301 src2 += srcstride;
302 }
303 }
304
305
306 }
307
308 void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
309 int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
310 int height) {
311 int x, y;
312 uint8_t *dst = (uint8_t*) _dst;
313 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
314 __m128i r0, r1, f0, r2, r3;
315 int shift = 14 + 1 - BIT_DEPTH;
316 #if BIT_DEPTH < 14
317 int offset = 1 << (shift - 1);
318 #else
319 int offset = 0;
320 #endif
321 f0 = _mm_set1_epi16(offset);
322 for (y = 0; y < height; y++) {
323
324 for (x = 0; x < width; x += 16) {
325 r0 = _mm_load_si128((__m128i *) &src1[x]);
326 r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
327 r2 = _mm_load_si128((__m128i *) &src2[x]);
328 r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
329
330 r0 = _mm_adds_epi16(r0, f0);
331 r1 = _mm_adds_epi16(r1, f0);
332 r0 = _mm_adds_epi16(r0, r2);
333 r1 = _mm_adds_epi16(r1, r3);
334 r0 = _mm_srai_epi16(r0, shift);
335 r1 = _mm_srai_epi16(r1, shift);
336 r0 = _mm_packus_epi16(r0, r1);
337
338 _mm_storeu_si128((__m128i *) (dst + x), r0);
339 }
340 dst += dststride;
341 src1 += srcstride;
342 src2 += srcstride;
343 }
344 }
345
346 #if 0
347 void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
348 uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
349 int width, int height) {
350
351 int log2Wd;
352 int x, y;
353
354 uint8_t *dst = (uint8_t*) _dst;
355 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
356 __m128i x0, x1, x2, x3, c0, add, add2;
357
358 log2Wd = denom + 14 - BIT_DEPTH;
359
360 add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
361 add2 = _mm_set1_epi32(1 << (log2Wd - 1));
362 c0 = _mm_set1_epi16(wlxFlag);
363 if (log2Wd >= 1){
364 if(!(width & 15)){
365 for (y = 0; y < height; y++) {
366 for (x = 0; x < width; x += 16) {
367 x0 = _mm_load_si128((__m128i *) &src[x]);
368 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
369 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
370 _mm_mulhi_epi16(x0, c0));
371 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
372 _mm_mulhi_epi16(x2, c0));
373 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
374 _mm_mulhi_epi16(x0, c0));
375 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
376 _mm_mulhi_epi16(x2, c0));
377 x0 = _mm_add_epi32(x0, add2);
378 x1 = _mm_add_epi32(x1, add2);
379 x2 = _mm_add_epi32(x2, add2);
380 x3 = _mm_add_epi32(x3, add2);
381 x0 = _mm_srai_epi32(x0, log2Wd);
382 x1 = _mm_srai_epi32(x1, log2Wd);
383 x2 = _mm_srai_epi32(x2, log2Wd);
384 x3 = _mm_srai_epi32(x3, log2Wd);
385 x0 = _mm_add_epi32(x0, add);
386 x1 = _mm_add_epi32(x1, add);
387 x2 = _mm_add_epi32(x2, add);
388 x3 = _mm_add_epi32(x3, add);
389 x0 = _mm_packus_epi32(x0, x1);
390 x2 = _mm_packus_epi32(x2, x3);
391 x0 = _mm_packus_epi16(x0, x2);
392
393 _mm_storeu_si128((__m128i *) (dst + x), x0);
394
395 }
396 dst += dststride;
397 src += srcstride;
398 }
399 }else if(!(width & 7)){
400 for (y = 0; y < height; y++) {
401 for(x=0;x<width;x+=8){
402 x0 = _mm_load_si128((__m128i *) (src+x));
403 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
404 _mm_mulhi_epi16(x0, c0));
405
406 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
407 _mm_mulhi_epi16(x0, c0));
408
409 x0 = _mm_add_epi32(x0, add2);
410 x1 = _mm_add_epi32(x1, add2);
411
412 x0 = _mm_srai_epi32(x0, log2Wd);
413 x1 = _mm_srai_epi32(x1, log2Wd);
414
415 x0 = _mm_add_epi32(x0, add);
416 x1 = _mm_add_epi32(x1, add);
417
418 x0 = _mm_packus_epi32(x0, x1);
419 x0 = _mm_packus_epi16(x0, x0);
420
421 _mm_storel_epi64((__m128i *) (dst+x), x0);
422
423 }
424 dst += dststride;
425 src += srcstride;
426 }
427 }else if(!(width & 3)){
428 for (y = 0; y < height; y++) {
429 for(x=0;x<width;x+=4){
430 x0 = _mm_loadl_epi64((__m128i *)(src+x));
431 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
432 _mm_mulhi_epi16(x0, c0));
433 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
434 _mm_mulhi_epi16(x0, c0));
435
436 x0 = _mm_add_epi32(x0, add2);
437 x1 = _mm_add_epi32(x1, add2);
438 x0 = _mm_srai_epi32(x0, log2Wd);
439 x1 = _mm_srai_epi32(x1, log2Wd);
440 x0 = _mm_add_epi32(x0, add);
441 x1 = _mm_add_epi32(x1, add);
442 x0 = _mm_packus_epi32(x0, x1);
443 x0 = _mm_packus_epi16(x0, x0);
444
445 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
446 // _mm_storeu_si128((__m128i *) (dst + x), x0);
447 }
448 dst += dststride;
449 src += srcstride;
450 }
451 }else{
452 for (y = 0; y < height; y++) {
453 for(x=0;x<width;x+=2){
454 x0 = _mm_loadl_epi64((__m128i *)(src+x));
455 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
456 _mm_mulhi_epi16(x0, c0));
457 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
458 _mm_mulhi_epi16(x0, c0));
459
460 x0 = _mm_add_epi32(x0, add2);
461 x1 = _mm_add_epi32(x1, add2);
462 x0 = _mm_srai_epi32(x0, log2Wd);
463 x1 = _mm_srai_epi32(x1, log2Wd);
464 x0 = _mm_add_epi32(x0, add);
465 x1 = _mm_add_epi32(x1, add);
466 x0 = _mm_packus_epi32(x0, x1);
467 x0 = _mm_packus_epi16(x0, x0);
468
469 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
470 // _mm_storeu_si128((__m128i *) (dst + x), x0);
471 }
472 dst += dststride;
473 src += srcstride;
474 }
475 }
476 }else{
477 if(!(width & 15)){
478 for (y = 0; y < height; y++) {
479 for (x = 0; x < width; x += 16) {
480
481 x0 = _mm_load_si128((__m128i *) &src[x]);
482 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
483 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
484 _mm_mulhi_epi16(x0, c0));
485 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
486 _mm_mulhi_epi16(x2, c0));
487 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
488 _mm_mulhi_epi16(x0, c0));
489 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
490 _mm_mulhi_epi16(x2, c0));
491
492 x0 = _mm_add_epi32(x0, add2);
493 x1 = _mm_add_epi32(x1, add2);
494 x2 = _mm_add_epi32(x2, add2);
495 x3 = _mm_add_epi32(x3, add2);
496
497 x0 = _mm_packus_epi32(x0, x1);
498 x2 = _mm_packus_epi32(x2, x3);
499 x0 = _mm_packus_epi16(x0, x2);
500
501 _mm_storeu_si128((__m128i *) (dst + x), x0);
502
503 }
504 dst += dststride;
505 src += srcstride;
506 }
507 }else if(!(width & 7)){
508 for (y = 0; y < height; y++) {
509 for(x=0;x<width;x+=8){
510 x0 = _mm_load_si128((__m128i *) (src+x));
511 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
512 _mm_mulhi_epi16(x0, c0));
513
514 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
515 _mm_mulhi_epi16(x0, c0));
516
517
518 x0 = _mm_add_epi32(x0, add2);
519 x1 = _mm_add_epi32(x1, add2);
520
521 x0 = _mm_packus_epi32(x0, x1);
522 x0 = _mm_packus_epi16(x0, x0);
523
524 _mm_storeu_si128((__m128i *) (dst+x), x0);
525 }
526
527 dst += dststride;
528 src += srcstride;
529 }
530 }else if(!(width & 3)){
531 for (y = 0; y < height; y++) {
532 for(x=0;x<width;x+=4){
533 x0 = _mm_loadl_epi64((__m128i *) (src+x));
534 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
535 _mm_mulhi_epi16(x0, c0));
536
537 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
538 _mm_mulhi_epi16(x0, c0));
539
540
541 x0 = _mm_add_epi32(x0, add2);
542 x1 = _mm_add_epi32(x1, add2);
543
544
545 x0 = _mm_packus_epi32(x0, x1);
546 x0 = _mm_packus_epi16(x0, x0);
547
548
549 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
550 }
551 dst += dststride;
552 src += srcstride;
553 }
554 }else{
555 for (y = 0; y < height; y++) {
556 for(x=0;x<width;x+=2){
557 x0 = _mm_loadl_epi64((__m128i *) (src+x));
558 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
559 _mm_mulhi_epi16(x0, c0));
560
561 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
562 _mm_mulhi_epi16(x0, c0));
563
564
565 x0 = _mm_add_epi32(x0, add2);
566 x1 = _mm_add_epi32(x1, add2);
567
568
569 x0 = _mm_packus_epi32(x0, x1);
570 x0 = _mm_packus_epi16(x0, x0);
571
572
573 _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
574 }
575 dst += dststride;
576 src += srcstride;
577 }
578
579 }
580
581 }
582
583 }
584 #endif
585
586
587 #if 0
588 void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
589 uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
590 int width, int height) {
591
592 int log2Wd;
593 int x, y;
594
595 uint8_t *dst = (uint8_t*) _dst;
596 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
597 __m128i x0, x1, x2, x3, c0, add, add2;
598
599 log2Wd = denom + 14 - BIT_DEPTH;
600
601 add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
602 add2 = _mm_set1_epi32(1 << (log2Wd - 1));
603 c0 = _mm_set1_epi16(wlxFlag);
604 if (log2Wd >= 1)
605 for (y = 0; y < height; y++) {
606 for (x = 0; x < width; x += 16) {
607 x0 = _mm_load_si128((__m128i *) &src[x]);
608 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
609 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
610 _mm_mulhi_epi16(x0, c0));
611 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
612 _mm_mulhi_epi16(x2, c0));
613 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
614 _mm_mulhi_epi16(x0, c0));
615 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
616 _mm_mulhi_epi16(x2, c0));
617 x0 = _mm_add_epi32(x0, add2);
618 x1 = _mm_add_epi32(x1, add2);
619 x2 = _mm_add_epi32(x2, add2);
620 x3 = _mm_add_epi32(x3, add2);
621 x0 = _mm_srai_epi32(x0, log2Wd);
622 x1 = _mm_srai_epi32(x1, log2Wd);
623 x2 = _mm_srai_epi32(x2, log2Wd);
624 x3 = _mm_srai_epi32(x3, log2Wd);
625 x0 = _mm_add_epi32(x0, add);
626 x1 = _mm_add_epi32(x1, add);
627 x2 = _mm_add_epi32(x2, add);
628 x3 = _mm_add_epi32(x3, add);
629 x0 = _mm_packus_epi32(x0, x1);
630 x2 = _mm_packus_epi32(x2, x3);
631 x0 = _mm_packus_epi16(x0, x2);
632
633 _mm_storeu_si128((__m128i *) (dst + x), x0);
634
635 }
636 dst += dststride;
637 src += srcstride;
638 }
639 else
640 for (y = 0; y < height; y++) {
641 for (x = 0; x < width; x += 16) {
642
643 x0 = _mm_load_si128((__m128i *) &src[x]);
644 x2 = _mm_load_si128((__m128i *) &src[x + 8]);
645 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
646 _mm_mulhi_epi16(x0, c0));
647 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
648 _mm_mulhi_epi16(x2, c0));
649 x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
650 _mm_mulhi_epi16(x0, c0));
651 x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
652 _mm_mulhi_epi16(x2, c0));
653
654 x0 = _mm_add_epi32(x0, add2);
655 x1 = _mm_add_epi32(x1, add2);
656 x2 = _mm_add_epi32(x2, add2);
657 x3 = _mm_add_epi32(x3, add2);
658
659 x0 = _mm_packus_epi32(x0, x1);
660 x2 = _mm_packus_epi32(x2, x3);
661 x0 = _mm_packus_epi16(x0, x2);
662
663 _mm_storeu_si128((__m128i *) (dst + x), x0);
664
665 }
666 dst += dststride;
667 src += srcstride;
668 }
669 }
670 #endif
671
672 #if HAVE_SSE4_1
673 void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
674 int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
675 ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
676 int width, int height) {
677 int shift, shift2;
678 int log2Wd;
679 int o0;
680 int o1;
681 int x, y;
682 uint8_t *dst = (uint8_t*) _dst;
683 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
684 __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
685 shift = 14 - BIT_DEPTH;
686 log2Wd = denom + shift;
687
688 o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
689 o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
690 shift2 = (log2Wd + 1);
691 c0 = _mm_set1_epi16(wl0Flag);
692 c1 = _mm_set1_epi16(wl1Flag);
693 c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd);
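    /* explicit weighted bi-prediction:
       dst[x] = clip_uint8((src1[x]*wl0Flag + src2[x]*wl1Flag + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1)) */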
694
695 if(!(width & 15)){
696 for (y = 0; y < height; y++) {
697 for (x = 0; x < width; x += 16) {
698 x0 = _mm_load_si128((__m128i *) &src1[x]);
699 x1 = _mm_load_si128((__m128i *) &src1[x + 8]);
700 x2 = _mm_load_si128((__m128i *) &src2[x]);
701 x3 = _mm_load_si128((__m128i *) &src2[x + 8]);
702
703 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
704 _mm_mulhi_epi16(x0, c0));
705 r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0),
706 _mm_mulhi_epi16(x1, c0));
707 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
708 _mm_mulhi_epi16(x2, c1));
709 r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1),
710 _mm_mulhi_epi16(x3, c1));
711 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
712 _mm_mulhi_epi16(x0, c0));
713 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0),
714 _mm_mulhi_epi16(x1, c0));
715 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
716 _mm_mulhi_epi16(x2, c1));
717 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1),
718 _mm_mulhi_epi16(x3, c1));
719 r0 = _mm_add_epi32(r0, r2);
720 r1 = _mm_add_epi32(r1, r3);
721 r2 = _mm_add_epi32(x0, x2);
722 r3 = _mm_add_epi32(x1, x3);
723
724 r0 = _mm_add_epi32(r0, c2);
725 r1 = _mm_add_epi32(r1, c2);
726 r2 = _mm_add_epi32(r2, c2);
727 r3 = _mm_add_epi32(r3, c2);
728
729 r0 = _mm_srai_epi32(r0, shift2);
730 r1 = _mm_srai_epi32(r1, shift2);
731 r2 = _mm_srai_epi32(r2, shift2);
732 r3 = _mm_srai_epi32(r3, shift2);
733
734 r0 = _mm_packus_epi32(r0, r2);
735 r1 = _mm_packus_epi32(r1, r3);
736 r0 = _mm_packus_epi16(r0, r1);
737
738 _mm_storeu_si128((__m128i *) (dst + x), r0);
739
740 }
741 dst += dststride;
742 src1 += srcstride;
743 src2 += srcstride;
744 }
745 }else if(!(width & 7)){
746 for (y = 0; y < height; y++) {
747 for(x=0;x<width;x+=8){
748 x0 = _mm_load_si128((__m128i *) (src1+x));
749 x2 = _mm_load_si128((__m128i *) (src2+x));
750
751 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
752 _mm_mulhi_epi16(x0, c0));
753
754 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
755 _mm_mulhi_epi16(x2, c1));
756
757 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
758 _mm_mulhi_epi16(x0, c0));
759
760 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
761 _mm_mulhi_epi16(x2, c1));
762
763 r0 = _mm_add_epi32(r0, r2);
764 r2 = _mm_add_epi32(x0, x2);
765
766
767 r0 = _mm_add_epi32(r0, c2);
768 r2 = _mm_add_epi32(r2, c2);
769
770 r0 = _mm_srai_epi32(r0, shift2);
771 r2 = _mm_srai_epi32(r2, shift2);
772
773 r0 = _mm_packus_epi32(r0, r2);
774 r0 = _mm_packus_epi16(r0, r0);
775
776 _mm_storel_epi64((__m128i *) (dst+x), r0);
777 }
778
779 dst += dststride;
780 src1 += srcstride;
781 src2 += srcstride;
782 }
783 }else if(!(width & 3)){
784 for (y = 0; y < height; y++) {
785 for(x=0;x<width;x+=4){
786 x0 = _mm_loadl_epi64((__m128i *) (src1+x));
787 x2 = _mm_loadl_epi64((__m128i *) (src2+x));
788
789 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
790 _mm_mulhi_epi16(x0, c0));
791
792 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
793 _mm_mulhi_epi16(x2, c1));
794
795 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
796 _mm_mulhi_epi16(x0, c0));
797
798 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
799 _mm_mulhi_epi16(x2, c1));
800
801 r0 = _mm_add_epi32(r0, r2);
802 r2 = _mm_add_epi32(x0, x2);
803
804 r0 = _mm_add_epi32(r0, c2);
805 r2 = _mm_add_epi32(r2, c2);
806
807 r0 = _mm_srai_epi32(r0, shift2);
808 r2 = _mm_srai_epi32(r2, shift2);
809
810 r0 = _mm_packus_epi32(r0, r2);
811 r0 = _mm_packus_epi16(r0, r0);
812
813 #if MASKMOVE
814 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
815 #else
816 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
817 #endif
818 }
819 dst += dststride;
820 src1 += srcstride;
821 src2 += srcstride;
822 }
823 }else{
824 for (y = 0; y < height; y++) {
825 for(x=0;x<width;x+=2){
826 x0 = _mm_loadl_epi64((__m128i *) (src1+x));
827 x2 = _mm_loadl_epi64((__m128i *) (src2+x));
828
829 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
830 _mm_mulhi_epi16(x0, c0));
831
832 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
833 _mm_mulhi_epi16(x2, c1));
834
835 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
836 _mm_mulhi_epi16(x0, c0));
837
838 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
839 _mm_mulhi_epi16(x2, c1));
840
841 r0 = _mm_add_epi32(r0, r2);
842 r2 = _mm_add_epi32(x0, x2);
843
844 r0 = _mm_add_epi32(r0, c2);
845 r2 = _mm_add_epi32(r2, c2);
846
847 r0 = _mm_srai_epi32(r0, shift2);
848 r2 = _mm_srai_epi32(r2, shift2);
849
850 r0 = _mm_packus_epi32(r0, r2);
851 r0 = _mm_packus_epi16(r0, r0);
852
853 #if MASKMOVE
854 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
855 #else
856 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
857 #endif
858 }
859 dst += dststride;
860 src1 += srcstride;
861 src2 += srcstride;
862 }
863 }
864 }
865 #endif
866
867
868 #if 0
869 void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
870 int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
871 ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
872 int width, int height) {
873 int shift, shift2;
874 int log2Wd;
875 int o0;
876 int o1;
877 int x, y;
878 uint8_t *dst = (uint8_t*) _dst;
879 ptrdiff_t dststride = _dststride / sizeof(uint8_t);
880 __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
881 shift = 14 - BIT_DEPTH;
882 log2Wd = denom + shift;
883
884 o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
885 o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
886 shift2 = (log2Wd + 1);
887 c0 = _mm_set1_epi16(wl0Flag);
888 c1 = _mm_set1_epi16(wl1Flag);
889 c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd);
890
891 for (y = 0; y < height; y++) {
892 for (x = 0; x < width; x += 16) {
893 x0 = _mm_load_si128((__m128i *) &src1[x]);
894 x1 = _mm_load_si128((__m128i *) &src1[x + 8]);
895 x2 = _mm_load_si128((__m128i *) &src2[x]);
896 x3 = _mm_load_si128((__m128i *) &src2[x + 8]);
897
898 r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
899 _mm_mulhi_epi16(x0, c0));
900 r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0),
901 _mm_mulhi_epi16(x1, c0));
902 r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
903 _mm_mulhi_epi16(x2, c1));
904 r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1),
905 _mm_mulhi_epi16(x3, c1));
906 x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
907 _mm_mulhi_epi16(x0, c0));
908 x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0),
909 _mm_mulhi_epi16(x1, c0));
910 x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
911 _mm_mulhi_epi16(x2, c1));
912 x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1),
913 _mm_mulhi_epi16(x3, c1));
914 r0 = _mm_add_epi32(r0, r2);
915 r1 = _mm_add_epi32(r1, r3);
916 r2 = _mm_add_epi32(x0, x2);
917 r3 = _mm_add_epi32(x1, x3);
918
919 r0 = _mm_add_epi32(r0, c2);
920 r1 = _mm_add_epi32(r1, c2);
921 r2 = _mm_add_epi32(r2, c2);
922 r3 = _mm_add_epi32(r3, c2);
923
924 r0 = _mm_srai_epi32(r0, shift2);
925 r1 = _mm_srai_epi32(r1, shift2);
926 r2 = _mm_srai_epi32(r2, shift2);
927 r3 = _mm_srai_epi32(r3, shift2);
928
929 r0 = _mm_packus_epi32(r0, r2);
930 r1 = _mm_packus_epi32(r1, r3);
931 r0 = _mm_packus_epi16(r0, r1);
932
933 _mm_storeu_si128((__m128i *) (dst + x), r0);
934
935 }
936 dst += dststride;
937 src1 += srcstride;
938 src2 += srcstride;
939 }
940 }
941 #endif
942
943
944 void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
945 uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx,
946 int my, int16_t* mcbuffer) {
947 int x, y;
948 __m128i x1, x2,x3;
949 uint8_t *src = (uint8_t*) _src;
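    /* integer-pel chroma copy: widen the 8-bit samples to 16 bit and scale by 64
       (<< (14 - BIT_DEPTH)) into the intermediate buffer; the branches below only differ
       in how many pixels are handled per store (16/8/4/2), chosen from the block width. */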
950 if(!(width & 15)){
951 x3= _mm_setzero_si128();
952 for (y = 0; y < height; y++) {
953 for (x = 0; x < width; x += 16) {
954
955 x1 = _mm_loadu_si128((__m128i *) &src[x]);
956 x2 = _mm_unpacklo_epi8(x1, x3);
957
958 x1 = _mm_unpackhi_epi8(x1, x3);
959
960 x2 = _mm_slli_epi16(x2, 6);
961 x1 = _mm_slli_epi16(x1, 6);
962 _mm_store_si128((__m128i *) &dst[x], x2);
963 _mm_store_si128((__m128i *) &dst[x + 8], x1);
964
965 }
966 src += srcstride;
967 dst += dststride;
968 }
969 }else if(!(width & 7)){
970 x1= _mm_setzero_si128();
971 for (y = 0; y < height; y++) {
972 for (x = 0; x < width; x += 8) {
973
974 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
975 x2 = _mm_unpacklo_epi8(x2, x1);
976 x2 = _mm_slli_epi16(x2, 6);
977 _mm_store_si128((__m128i *) &dst[x], x2);
978
979 }
980 src += srcstride;
981 dst += dststride;
982 }
983 }else if(!(width & 3)){
984 x1= _mm_setzero_si128();
985 for (y = 0; y < height; y++) {
986 for (x = 0; x < width; x += 4) {
987
988 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
989 x2 = _mm_unpacklo_epi8(x2,x1);
990
991 x2 = _mm_slli_epi16(x2, 6);
992
993 _mm_storel_epi64((__m128i *) &dst[x], x2);
994
995 }
996 src += srcstride;
997 dst += dststride;
998 }
999 }else{
1000 x1= _mm_setzero_si128();
1001 for (y = 0; y < height; y++) {
1002 for (x = 0; x < width; x += 2) {
1003
1004 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
1005 x2 = _mm_unpacklo_epi8(x2, x1);
1006 x2 = _mm_slli_epi16(x2, 6);
1007 #if MASKMOVE
1008 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1009 #else
1010 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1011 #endif
1012 }
1013 src += srcstride;
1014 dst += dststride;
1015 }
1016 }
1017
1018 }
1019
1020 #ifndef __native_client__
1021 void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
1022 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1023 int my, int16_t* mcbuffer) {
1024 int x, y;
1025 __m128i x2;
1026 uint16_t *src = (uint16_t*) _src;
1027 ptrdiff_t srcstride = _srcstride>>1;
1028 if(!(width & 7)){
1029 //x1= _mm_setzero_si128();
1030 for (y = 0; y < height; y++) {
1031 for (x = 0; x < width; x += 8) {
1032
1033 x2 = _mm_loadu_si128((__m128i *) &src[x]);
1034 x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH
1035 _mm_store_si128((__m128i *) &dst[x], x2);
1036
1037 }
1038 src += srcstride;
1039 dst += dststride;
1040 }
1041 }else if(!(width & 3)){
1042 //x1= _mm_setzero_si128();
1043 for (y = 0; y < height; y++) {
1044 for (x = 0; x < width; x += 4) {
1045
1046 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
1047 x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH
1048
1049 _mm_storel_epi64((__m128i *) &dst[x], x2);
1050
1051 }
1052 src += srcstride;
1053 dst += dststride;
1054 }
1055 }else{
1056 //x1= _mm_setzero_si128();
1057 for (y = 0; y < height; y++) {
1058 for (x = 0; x < width; x += 2) {
1059
1060 x2 = _mm_loadl_epi64((__m128i *) &src[x]);
1061 x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH
1062 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1063 }
1064 src += srcstride;
1065 dst += dststride;
1066 }
1067 }
1068
1069 }
1070 #endif
1071
1072 void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
1073 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1074 int my, int16_t* mcbuffer) {
1075 int x, y;
1076 uint8_t *src = (uint8_t*) _src;
1077 ptrdiff_t srcstride = _srcstride;
1078 const int8_t *filter = epel_filters[mx - 1];
1079 __m128i r0, bshuffle1, bshuffle2, x1, x2, x3;
1080 int8_t filter_0 = filter[0];
1081 int8_t filter_1 = filter[1];
1082 int8_t filter_2 = filter[2];
1083 int8_t filter_3 = filter[3];
1084 r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1085 filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1086 filter_0, filter_3, filter_2, filter_1, filter_0);
1087 bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
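    /* bshuffle1 gathers, for each of four output pixels, its four source neighbours
       src[x-1+i] .. src[x+2+i] into one 32-bit lane, so that maddubs + hadd below evaluate
       the 4-tap filter for four pixels at once (bshuffle2 does the same for pixels 4..7 in
       the 8-wide path). */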
1088
1089
1090 /*
1091 printf("---IN---SSE\n");
1092
1093 int extra_top = 1;
1094 int extra_left = 1;
1095 int extra_right = 2;
1096 int extra_bottom = 2;
1097
1098 for (int y=-extra_top;y<height+extra_bottom;y++) {
1099 uint8_t* p = &_src[y*_srcstride -extra_left];
1100
1101 for (int x=-extra_left;x<width+extra_right;x++) {
1102 printf("%05d ",*p << 6);
1103 p++;
1104 }
1105 printf("\n");
1106 }
1107 */
1108
1109 if(!(width & 7)){
1110 bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1111 4);
1112 for (y = 0; y < height; y++) {
1113 for (x = 0; x < width; x += 8) {
1114
1115 x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1116 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1117 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1118
1119 /* PMADDUBSW then PMADDW */
1120 x2 = _mm_maddubs_epi16(x2, r0);
1121 x3 = _mm_maddubs_epi16(x3, r0);
1122 x2 = _mm_hadd_epi16(x2, x3);
1123 _mm_store_si128((__m128i *) &dst[x], x2);
1124 }
1125 src += srcstride;
1126 dst += dststride;
1127 }
1128 }else if(!(width & 3)){
1129
1130 for (y = 0; y < height; y++) {
1131 for (x = 0; x < width; x += 4) {
1132 /* load data in register */
1133 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1134 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1135
1136 /* PMADDUBSW then PMADDW */
1137 x2 = _mm_maddubs_epi16(x2, r0);
1138 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1139 /* give results back */
1140 _mm_storel_epi64((__m128i *) &dst[x], x2);
1141 }
1142 src += srcstride;
1143 dst += dststride;
1144 }
1145 }else{
1146 for (y = 0; y < height; y++) {
1147 for (x = 0; x < width; x += 2) {
1148 /* load data in register */
1149 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1150 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1151
1152 /* PMADDUBSW then PMADDW */
1153 x2 = _mm_maddubs_epi16(x2, r0);
1154 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1155 /* give results back */
1156 #if MASKMOVE
1157 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1158 #else
1159 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1160 #endif
1161 }
1162 src += srcstride;
1163 dst += dststride;
1164 }
1165 }
1166 }
1167
1168 #ifndef __native_client__
1169 void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
1170 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1171 int my, int16_t* mcbuffer) {
1172 int x, y;
1173 uint16_t *src = (uint16_t*) _src;
1174 ptrdiff_t srcstride = _srcstride>>1;
1175 const int8_t *filter = epel_filters[mx - 1];
1176 __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1;
1177 int8_t filter_0 = filter[0];
1178 int8_t filter_1 = filter[1];
1179 int8_t filter_2 = filter[2];
1180 int8_t filter_3 = filter[3];
1181 r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1182 filter_0, filter_3, filter_2, filter_1, filter_0);
1183 bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1184
1185 if(!(width & 3)){
1186 bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1187 for (y = 0; y < height; y++) {
1188 for (x = 0; x < width; x += 4) {
1189
1190 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1191 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1192 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1193
1194
1195 x2 = _mm_madd_epi16(x2, r0);
1196 x3 = _mm_madd_epi16(x3, r0);
1197 x2 = _mm_hadd_epi32(x2, x3);
1198 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1199
1200 x2 = _mm_packs_epi32(x2,r0);
1201 //give results back
1202 _mm_storel_epi64((__m128i *) &dst[x], x2);
1203 }
1204 src += srcstride;
1205 dst += dststride;
1206 }
1207 }else{
1208 r1= _mm_setzero_si128();
1209 for (y = 0; y < height; y++) {
1210 for (x = 0; x < width; x += 2) {
1211 /* load data in register */
1212 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1213 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1214
1215 /* PMADDUBSW then PMADDW */
1216 x2 = _mm_madd_epi16(x2, r0);
1217 x2 = _mm_hadd_epi32(x2, r1);
1218 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1219 x2 = _mm_packs_epi32(x2, r1);
1220 /* give results back */
1221 _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1222 }
1223 src += srcstride;
1224 dst += dststride;
1225 }
1226 }
1227 }
1228 #endif
1229
1230
1231 void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
1232 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1233 int my, int16_t* mcbuffer) {
1234 int x, y;
1235 __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1;
1236 uint8_t *src = (uint8_t*) _src;
1237 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
1238 const int8_t *filter = epel_filters[my - 1];
1239 int8_t filter_0 = filter[0];
1240 int8_t filter_1 = filter[1];
1241 int8_t filter_2 = filter[2];
1242 int8_t filter_3 = filter[3];
1243 f0 = _mm_set1_epi16(filter_0);
1244 f1 = _mm_set1_epi16(filter_1);
1245 f2 = _mm_set1_epi16(filter_2);
1246 f3 = _mm_set1_epi16(filter_3);
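    /* vertical 4-tap chroma filter: the coefficients are broadcast into f0..f3 and applied
       to the rows at src[x - srcstride] .. src[x + 2*srcstride] with 16-bit multiplies and
       saturating adds. */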
1247
1248 if(!(width & 15)){
1249 for (y = 0; y < height; y++) {
1250 for (x = 0; x < width; x += 16) {
1251 /* check if memory needs to be reloaded */
1252
1253 x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1254 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1255 x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1256 x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1257
1258 t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128());
1259 t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128());
1260 t2 = _mm_unpacklo_epi8(x2, _mm_setzero_si128());
1261 t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128());
1262
1263 x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128());
1264 x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128());
1265 x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128());
1266 x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128());
1267
1268 /* multiply by correct value : */
1269 r0 = _mm_mullo_epi16(t0, f0);
1270 r1 = _mm_mullo_epi16(x0, f0);
1271 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1272 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1));
1273 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1274 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2));
1275 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1276 r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3));
1277 /* give results back */
1278 _mm_store_si128((__m128i *) &dst[x], r0);
1279 _mm_storeu_si128((__m128i *) &dst[x + 8], r1);
1280 }
1281 src += srcstride;
1282 dst += dststride;
1283 }
1284 }else if(!(width & 7)){
1285 r1= _mm_setzero_si128();
1286 for (y = 0; y < height; y++) {
1287 for(x=0;x<width;x+=8){
1288 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1289 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1290 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1291 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1292
1293 t0 = _mm_unpacklo_epi8(x0, r1);
1294 t1 = _mm_unpacklo_epi8(x1, r1);
1295 t2 = _mm_unpacklo_epi8(x2, r1);
1296 t3 = _mm_unpacklo_epi8(x3, r1);
1297
1298
1299 /* multiply by correct value : */
1300 r0 = _mm_mullo_epi16(t0, f0);
1301 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1302 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1303 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1304 /* give results back */
1305 _mm_storeu_si128((__m128i *) &dst[x], r0);
1306 }
1307 src += srcstride;
1308 dst += dststride;
1309 }
1310 }else if(!(width & 3)){
1311 r1= _mm_setzero_si128();
1312 for (y = 0; y < height; y++) {
1313 for(x=0;x<width;x+=4){
1314 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1315 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1316 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1317 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1318
1319 t0 = _mm_unpacklo_epi8(x0, r1);
1320 t1 = _mm_unpacklo_epi8(x1, r1);
1321 t2 = _mm_unpacklo_epi8(x2, r1);
1322 t3 = _mm_unpacklo_epi8(x3, r1);
1323
1324
1325 /* multiply by correct value : */
1326 r0 = _mm_mullo_epi16(t0, f0);
1327 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1328 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1329 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1330 /* give results back */
1331 _mm_storel_epi64((__m128i *) &dst[x], r0);
1332 }
1333 src += srcstride;
1334 dst += dststride;
1335 }
1336 }else{
1337 r1= _mm_setzero_si128();
1338 for (y = 0; y < height; y++) {
1339 for(x=0;x<width;x+=2){
1340 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1341 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1342 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1343 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1344
1345 t0 = _mm_unpacklo_epi8(x0, r1);
1346 t1 = _mm_unpacklo_epi8(x1, r1);
1347 t2 = _mm_unpacklo_epi8(x2, r1);
1348 t3 = _mm_unpacklo_epi8(x3, r1);
1349
1350
1351 /* multiply by correct value : */
1352 r0 = _mm_mullo_epi16(t0, f0);
1353 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1354 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1355 r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1356 /* give results back */
1357 #if MASKMOVE
1358 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1359 #else
1360 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1361 #endif
1362 }
1363 src += srcstride;
1364 dst += dststride;
1365 }
1366 }
1367 }
1368
1369 #ifndef __native_client__
1370 void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
1371 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1372 int my, int16_t* mcbuffer) {
1373 int x, y;
1374 __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1, r2, r3;
1375 uint16_t *src = (uint16_t*) _src;
1376 ptrdiff_t srcstride = _srcstride >>1;
1377 const int8_t *filter = epel_filters[my - 1];
1378 int8_t filter_0 = filter[0];
1379 int8_t filter_1 = filter[1];
1380 int8_t filter_2 = filter[2];
1381 int8_t filter_3 = filter[3];
1382 f0 = _mm_set1_epi16(filter_0);
1383 f1 = _mm_set1_epi16(filter_1);
1384 f2 = _mm_set1_epi16(filter_2);
1385 f3 = _mm_set1_epi16(filter_3);
1386
1387 if(!(width & 7)){
1388 r1= _mm_setzero_si128();
1389 for (y = 0; y < height; y++) {
1390 for(x=0;x<width;x+=8){
1391 x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1392 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1393 x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1394 x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1395
1396 // multiply by correct value :
1397 r0 = _mm_mullo_epi16(x0, f0);
1398 t0 = _mm_mulhi_epi16(x0, f0);
1399
1400 x0= _mm_unpacklo_epi16(r0,t0);
1401 t0= _mm_unpackhi_epi16(r0,t0);
1402
1403 r1 = _mm_mullo_epi16(x1, f1);
1404 t1 = _mm_mulhi_epi16(x1, f1);
1405
1406 x1= _mm_unpacklo_epi16(r1,t1);
1407 t1= _mm_unpackhi_epi16(r1,t1);
1408
1409
1410 r2 = _mm_mullo_epi16(x2, f2);
1411 t2 = _mm_mulhi_epi16(x2, f2);
1412
1413 x2= _mm_unpacklo_epi16(r2,t2);
1414 t2= _mm_unpackhi_epi16(r2,t2);
1415
1416
1417 r3 = _mm_mullo_epi16(x3, f3);
1418 t3 = _mm_mulhi_epi16(x3, f3);
1419
1420 x3= _mm_unpacklo_epi16(r3,t3);
1421 t3= _mm_unpackhi_epi16(r3,t3);
1422
1423
1424 r0= _mm_add_epi32(x0,x1);
1425 r1= _mm_add_epi32(x2,x3);
1426
1427 t0= _mm_add_epi32(t0,t1);
1428 t1= _mm_add_epi32(t2,t3);
1429
1430 r0= _mm_add_epi32(r0,r1);
1431 t0= _mm_add_epi32(t0,t1);
1432
1433 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1434 t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8)
1435
1436 r0= _mm_packs_epi32(r0, t0);
1437 // give results back
1438 _mm_storeu_si128((__m128i *) &dst[x], r0);
1439 }
1440 src += srcstride;
1441 dst += dststride;
1442 }
1443 }else if(!(width & 3)){
1444 r1= _mm_setzero_si128();
1445 for (y = 0; y < height; y++) {
1446 for(x=0;x<width;x+=4){
1447 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1448 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1449 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1450 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1451
1452 /* multiply by correct value : */
1453 r0 = _mm_mullo_epi16(x0, f0);
1454 t0 = _mm_mulhi_epi16(x0, f0);
1455
1456 x0= _mm_unpacklo_epi16(r0,t0);
1457
1458 r1 = _mm_mullo_epi16(x1, f1);
1459 t1 = _mm_mulhi_epi16(x1, f1);
1460
1461 x1= _mm_unpacklo_epi16(r1,t1);
1462
1463
1464 r2 = _mm_mullo_epi16(x2, f2);
1465 t2 = _mm_mulhi_epi16(x2, f2);
1466
1467 x2= _mm_unpacklo_epi16(r2,t2);
1468
1469
1470 r3 = _mm_mullo_epi16(x3, f3);
1471 t3 = _mm_mulhi_epi16(x3, f3);
1472
1473 x3= _mm_unpacklo_epi16(r3,t3);
1474
1475
1476 r0= _mm_add_epi32(x0,x1);
1477 r1= _mm_add_epi32(x2,x3);
1478 r0= _mm_add_epi32(r0,r1);
1479 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1480
1481 r0= _mm_packs_epi32(r0, r0);
1482
1483 // give results back
1484 _mm_storel_epi64((__m128i *) &dst[x], r0);
1485 }
1486 src += srcstride;
1487 dst += dststride;
1488 }
1489 }else{
1490 r1= _mm_setzero_si128();
1491 for (y = 0; y < height; y++) {
1492 for(x=0;x<width;x+=2){
1493 x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1494 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1495 x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1496 x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1497
1498 /* multiply by correct value : */
1499 r0 = _mm_mullo_epi16(x0, f0);
1500 t0 = _mm_mulhi_epi16(x0, f0);
1501
1502 x0= _mm_unpacklo_epi16(r0,t0);
1503
1504 r1 = _mm_mullo_epi16(x1, f1);
1505 t1 = _mm_mulhi_epi16(x1, f1);
1506
1507 x1= _mm_unpacklo_epi16(r1,t1);
1508
1509 r2 = _mm_mullo_epi16(x2, f2);
1510 t2 = _mm_mulhi_epi16(x2, f2);
1511
1512 x2= _mm_unpacklo_epi16(r2,t2);
1513
1514 r3 = _mm_mullo_epi16(x3, f3);
1515 t3 = _mm_mulhi_epi16(x3, f3);
1516
1517 x3= _mm_unpacklo_epi16(r3,t3);
1518
1519 r0= _mm_add_epi32(x0,x1);
1520 r1= _mm_add_epi32(x2,x3);
1521 r0= _mm_add_epi32(r0,r1);
1522 r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1523
1524 r0= _mm_packs_epi32(r0, r0);
1525
1526 /* give results back */
1527 _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1528
1529 }
1530 src += srcstride;
1531 dst += dststride;
1532 }
1533 }
1534 }
1535 #endif
1536
1537 void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
1538 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1539 int my, int16_t* mcbuffer) {
1540 int x, y;
1541 uint8_t *src = (uint8_t*) _src;
1542 ptrdiff_t srcstride = _srcstride;
1543 const int8_t *filter_h = epel_filters[mx - 1];
1544 const int8_t *filter_v = epel_filters[my - 1];
1545 __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1546 f2, f3, r1, r2;
1547 int8_t filter_0 = filter_h[0];
1548 int8_t filter_1 = filter_h[1];
1549 int8_t filter_2 = filter_h[2];
1550 int8_t filter_3 = filter_h[3];
1551 int16_t *tmp = mcbuffer;
1552 r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1553 filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1554 filter_0, filter_3, filter_2, filter_1, filter_0);
1555 bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1556
1557 src -= epel_extra_before * srcstride;
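    /* separable filter: back src up by the epel_extra_before row so the horizontal pass
       below also produces the extra rows of intermediate values the vertical pass needs
       above and below the block (height + epel_extra rows in total). */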
1558
1559 f3 = _mm_set1_epi16(filter_v[3]);
1560 f1 = _mm_set1_epi16(filter_v[1]);
1561 f2 = _mm_set1_epi16(filter_v[2]);
1562 f0 = _mm_set1_epi16(filter_v[0]);
1563
1564 /* horizontal treatment */
1565 if(!(width & 7)){
1566 bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1567 4);
1568 for (y = 0; y < height + epel_extra; y++) {
1569 for (x = 0; x < width; x += 8) {
1570
1571 x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1572 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1573 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1574
1575 /* PMADDUBSW then PMADDW */
1576 x2 = _mm_maddubs_epi16(x2, r0);
1577 x3 = _mm_maddubs_epi16(x3, r0);
1578 x2 = _mm_hadd_epi16(x2, x3);
1579 _mm_store_si128((__m128i *) &tmp[x], x2);
1580 }
1581 src += srcstride;
1582 tmp += MAX_PB_SIZE;
1583 }
1584 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1585
1586 /* vertical treatment */
1587
1588 for (y = 0; y < height; y++) {
1589 for (x = 0; x < width; x += 8) {
1590 /* check if memory needs to be reloaded */
1591 x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]);
1592 x1 = _mm_load_si128((__m128i *) &tmp[x]);
1593 x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]);
1594 x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1595
1596 r0 = _mm_mullo_epi16(x0, f0);
1597 r1 = _mm_mulhi_epi16(x0, f0);
1598 r2 = _mm_mullo_epi16(x1, f1);
1599 t0 = _mm_unpacklo_epi16(r0, r1);
1600 x0 = _mm_unpackhi_epi16(r0, r1);
1601 r0 = _mm_mulhi_epi16(x1, f1);
1602 r1 = _mm_mullo_epi16(x2, f2);
1603 t1 = _mm_unpacklo_epi16(r2, r0);
1604 x1 = _mm_unpackhi_epi16(r2, r0);
1605 r2 = _mm_mulhi_epi16(x2, f2);
1606 r0 = _mm_mullo_epi16(x3, f3);
1607 t2 = _mm_unpacklo_epi16(r1, r2);
1608 x2 = _mm_unpackhi_epi16(r1, r2);
1609 r1 = _mm_mulhi_epi16(x3, f3);
1610 t3 = _mm_unpacklo_epi16(r0, r1);
1611 x3 = _mm_unpackhi_epi16(r0, r1);
1612
1613 /* multiply by correct value : */
1614 r0 = _mm_add_epi32(t0, t1);
1615 r1 = _mm_add_epi32(x0, x1);
1616 r0 = _mm_add_epi32(r0, t2);
1617 r1 = _mm_add_epi32(r1, x2);
1618 r0 = _mm_add_epi32(r0, t3);
1619 r1 = _mm_add_epi32(r1, x3);
1620 r0 = _mm_srai_epi32(r0, 6);
1621 r1 = _mm_srai_epi32(r1, 6);
1622
1623 /* give results back */
1624 r0 = _mm_packs_epi32(r0, r1);
1625 _mm_store_si128((__m128i *) &dst[x], r0);
1626 }
1627 tmp += MAX_PB_SIZE;
1628 dst += dststride;
1629 }
1630 }else if(!(width & 3)){
1631 for (y = 0; y < height + epel_extra; y ++) {
1632 for(x=0;x<width;x+=4){
1633 /* load data in register */
1634 x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1635
1636 x1 = _mm_shuffle_epi8(x1, bshuffle1);
1637
1638 /* PMADDUBSW then PMADDW */
1639 x1 = _mm_maddubs_epi16(x1, r0);
1640 x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1641
1642 /* give results back */
1643 _mm_storel_epi64((__m128i *) &tmp[x], x1);
1644
1645 }
1646 src += srcstride;
1647 tmp += MAX_PB_SIZE;
1648 }
1649 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1650
1651 /* vertical treatment */
1652
1653
1654 for (y = 0; y < height; y++) {
1655 for (x = 0; x < width; x += 4) {
1656 /* check if memory needs to be reloaded */
1657 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1658 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1659 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1660 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1661
1662 r0 = _mm_mullo_epi16(x0, f0);
1663 r1 = _mm_mulhi_epi16(x0, f0);
1664 r2 = _mm_mullo_epi16(x1, f1);
1665 t0 = _mm_unpacklo_epi16(r0, r1);
1666
1667 r0 = _mm_mulhi_epi16(x1, f1);
1668 r1 = _mm_mullo_epi16(x2, f2);
1669 t1 = _mm_unpacklo_epi16(r2, r0);
1670
1671 r2 = _mm_mulhi_epi16(x2, f2);
1672 r0 = _mm_mullo_epi16(x3, f3);
1673 t2 = _mm_unpacklo_epi16(r1, r2);
1674
1675 r1 = _mm_mulhi_epi16(x3, f3);
1676 t3 = _mm_unpacklo_epi16(r0, r1);
1677
1678
1679 /* multiply by correct value : */
1680 r0 = _mm_add_epi32(t0, t1);
1681 r0 = _mm_add_epi32(r0, t2);
1682 r0 = _mm_add_epi32(r0, t3);
1683 r0 = _mm_srai_epi32(r0, 6);
1684
1685 /* give results back */
1686 r0 = _mm_packs_epi32(r0, r0);
1687 _mm_storel_epi64((__m128i *) &dst[x], r0);
1688 }
1689 tmp += MAX_PB_SIZE;
1690 dst += dststride;
1691 }
1692 }else{
1693 #if MASKMOVE
1694 bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1695 #endif
1696 for (y = 0; y < height + epel_extra; y ++) {
1697 for(x=0;x<width;x+=2){
1698 /* load data in register */
1699 x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1700 x1 = _mm_shuffle_epi8(x1, bshuffle1);
1701
1702 /* PMADDUBSW then PMADDW */
1703 x1 = _mm_maddubs_epi16(x1, r0);
1704 x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1705
1706 /* give results back */
1707 #if MASKMOVE
1708 _mm_maskmoveu_si128(x1,bshuffle2,(char *) (tmp+x));
1709 #else
1710 *((uint32_t*)(tmp+x)) = _mm_cvtsi128_si32(x1);
1711 #endif
1712 }
1713 src += srcstride;
1714 tmp += MAX_PB_SIZE;
1715 }
1716
1717 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1718
1719 /* vertical treatment */
1720
1721 for (y = 0; y < height; y++) {
1722 for (x = 0; x < width; x += 2) {
1723 /* check if memory needs to be reloaded */
1724 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1725 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1726 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1727 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1728
1729 r0 = _mm_mullo_epi16(x0, f0);
1730 r1 = _mm_mulhi_epi16(x0, f0);
1731 r2 = _mm_mullo_epi16(x1, f1);
1732 t0 = _mm_unpacklo_epi16(r0, r1);
1733 r0 = _mm_mulhi_epi16(x1, f1);
1734 r1 = _mm_mullo_epi16(x2, f2);
1735 t1 = _mm_unpacklo_epi16(r2, r0);
1736 r2 = _mm_mulhi_epi16(x2, f2);
1737 r0 = _mm_mullo_epi16(x3, f3);
1738 t2 = _mm_unpacklo_epi16(r1, r2);
1739 r1 = _mm_mulhi_epi16(x3, f3);
1740 t3 = _mm_unpacklo_epi16(r0, r1);
1741
1742 /* multiply by correct value : */
1743 r0 = _mm_add_epi32(t0, t1);
1744 r0 = _mm_add_epi32(r0, t2);
1745 r0 = _mm_add_epi32(r0, t3);
1746 r0 = _mm_srai_epi32(r0, 6);
1747 /* give results back */
1748 r0 = _mm_packs_epi32(r0, r0);
1749 #if MASKMOVE
1750 _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1751 #else
1752 *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1753 #endif
1754 }
1755 tmp += MAX_PB_SIZE;
1756 dst += dststride;
1757 }
1758 }
1759
1760 }
1761
1762
1763 #ifndef __native_client__
1764 void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
1765 uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1766 int my, int16_t* mcbuffer) {
1767 int x, y;
1768 uint16_t *src = (uint16_t*) _src;
1769 ptrdiff_t srcstride = _srcstride>>1;
1770 const int8_t *filter_h = epel_filters[mx - 1];
1771 const int8_t *filter_v = epel_filters[my - 1];
1772 __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1773 f2, f3, r1, r2, r3;
1774 int8_t filter_0 = filter_h[0];
1775 int8_t filter_1 = filter_h[1];
1776 int8_t filter_2 = filter_h[2];
1777 int8_t filter_3 = filter_h[3];
1778 int16_t *tmp = mcbuffer;
1779
1780 r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1781 filter_0, filter_3, filter_2, filter_1, filter_0);
1782 bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1783
1784 src -= epel_extra_before * srcstride;
1785
1786 f0 = _mm_set1_epi16(filter_v[0]);
1787 f1 = _mm_set1_epi16(filter_v[1]);
1788 f2 = _mm_set1_epi16(filter_v[2]);
1789 f3 = _mm_set1_epi16(filter_v[3]);
1790
1791
1792 /* horizontal treatment */
1793 if(!(width & 3)){
1794 bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1795 for (y = 0; y < height + epel_extra; y ++) {
1796 for(x=0;x<width;x+=4){
1797
1798 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1799 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1800 x3 = _mm_shuffle_epi8(x1, bshuffle2);
1801
1802
1803 x2 = _mm_madd_epi16(x2, r0);
1804 x3 = _mm_madd_epi16(x3, r0);
1805 x2 = _mm_hadd_epi32(x2, x3);
1806 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1807
1808 x2 = _mm_packs_epi32(x2,r0);
1809 //give results back
1810 _mm_storel_epi64((__m128i *) &tmp[x], x2);
1811
1812 }
1813 src += srcstride;
1814 tmp += MAX_PB_SIZE;
1815 }
1816 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1817
1818 // vertical treatment
1819
1820
1821 for (y = 0; y < height; y++) {
1822 for (x = 0; x < width; x += 4) {
1823 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1824 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1825 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1826 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1827
1828 r0 = _mm_mullo_epi16(x0, f0);
1829 r1 = _mm_mulhi_epi16(x0, f0);
1830 r2 = _mm_mullo_epi16(x1, f1);
1831 t0 = _mm_unpacklo_epi16(r0, r1);
1832
1833 r0 = _mm_mulhi_epi16(x1, f1);
1834 r1 = _mm_mullo_epi16(x2, f2);
1835 t1 = _mm_unpacklo_epi16(r2, r0);
1836
1837 r2 = _mm_mulhi_epi16(x2, f2);
1838 r0 = _mm_mullo_epi16(x3, f3);
1839 t2 = _mm_unpacklo_epi16(r1, r2);
1840
1841 r1 = _mm_mulhi_epi16(x3, f3);
1842 t3 = _mm_unpacklo_epi16(r0, r1);
1843
1844
1845
1846 r0 = _mm_add_epi32(t0, t1);
1847 r0 = _mm_add_epi32(r0, t2);
1848 r0 = _mm_add_epi32(r0, t3);
1849 r0 = _mm_srai_epi32(r0, 6);
1850
1851 // give results back
1852 r0 = _mm_packs_epi32(r0, r0);
1853 _mm_storel_epi64((__m128i *) &dst[x], r0);
1854 }
1855 tmp += MAX_PB_SIZE;
1856 dst += dststride;
1857 }
1858 }else{
1859 bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1860 r1= _mm_setzero_si128();
1861 for (y = 0; y < height + epel_extra; y ++) {
1862 for(x=0;x<width;x+=2){
1863 /* load data in register */
1864 x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1865 x2 = _mm_shuffle_epi8(x1, bshuffle1);
1866
1867 /* PMADDUBSW then PMADDW */
1868 x2 = _mm_madd_epi16(x2, r0);
1869 x2 = _mm_hadd_epi32(x2, r1);
1870 x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8)
1871 x2 = _mm_packs_epi32(x2, r1);
1872 /* give results back */
1873 _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x));
1874 }
1875 src += srcstride;
1876 tmp += MAX_PB_SIZE;
1877 }
1878
1879 tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1880
1881 /* vertical treatment */
1882
1883 for (y = 0; y < height; y++) {
1884 for (x = 0; x < width; x += 2) {
1885 /* check if memory needs to be reloaded */
1886 x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1887 x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1888 x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1889 x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1890
1891 r0 = _mm_mullo_epi16(x0, f0);
1892 t0 = _mm_mulhi_epi16(x0, f0);
1893
1894 x0= _mm_unpacklo_epi16(r0,t0);
1895
1896 r1 = _mm_mullo_epi16(x1, f1);
1897 t1 = _mm_mulhi_epi16(x1, f1);
1898
1899 x1= _mm_unpacklo_epi16(r1,t1);
1900
1901 r2 = _mm_mullo_epi16(x2, f2);
1902 t2 = _mm_mulhi_epi16(x2, f2);
1903
1904 x2= _mm_unpacklo_epi16(r2,t2);
1905
1906 r3 = _mm_mullo_epi16(x3, f3);
1907 t3 = _mm_mulhi_epi16(x3, f3);
1908
1909 x3= _mm_unpacklo_epi16(r3,t3);
1910
1911 r0= _mm_add_epi32(x0,x1);
1912 r1= _mm_add_epi32(x2,x3);
1913 r0= _mm_add_epi32(r0,r1);
1914 r0 = _mm_srai_epi32(r0, 6);
1915 /* give results back */
1916 r0 = _mm_packs_epi32(r0, r0);
1917 _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1918 }
1919 tmp += MAX_PB_SIZE;
1920 dst += dststride;
1921 }
1922 }
1923 }
1924 #endif
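
/* Illustrative scalar sketch (hypothetical helper, never compiled): the SSE
 * routine above implements this separable two-pass chroma (epel) interpolation.
 * A horizontal 4-tap pass writes 16-bit intermediates into mcbuffer (one row
 * per MAX_PB_SIZE stride), then a vertical 4-tap pass accumulates them in
 * 32-bit precision and shifts the result down by 6. The sketch assumes 10-bit
 * input, matching the function above; the function name and bit_depth
 * parameter are purely for explanation and are not part of the library API.
 */
#if 0
static void epel_hv_scalar_sketch(int16_t *dst, ptrdiff_t dststride,
                                  const uint16_t *src, ptrdiff_t srcstride,
                                  int width, int height,
                                  const int8_t *filter_h, const int8_t *filter_v,
                                  int16_t *mcbuffer, int bit_depth)
{
    int16_t *tmp = mcbuffer;
    int x, y, k;

    src -= epel_extra_before * srcstride;

    /* horizontal pass: 4 taps around src[x], scaled to 14-bit intermediates */
    for (y = 0; y < height + epel_extra; y++) {
        for (x = 0; x < width; x++) {
            int sum = 0;
            for (k = 0; k < 4; k++)
                sum += filter_h[k] * src[x + k - 1];
            tmp[x] = sum >> (bit_depth - 8);
        }
        src += srcstride;
        tmp += MAX_PB_SIZE;
    }

    /* vertical pass: 4 taps over the intermediate rows, 32-bit accumulation */
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int sum = 0;
            for (k = 0; k < 4; k++)
                sum += filter_v[k] * tmp[x + (k - 1) * MAX_PB_SIZE];
            dst[x] = sum >> 6;
        }
        tmp += MAX_PB_SIZE;
        dst += dststride;
    }
}
#endif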
1925
1926 void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
1927 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
1928 int16_t* mcbuffer) {
1929 int x, y;
1930 __m128i x1, x2, x3, x0;
1931 uint8_t *src = (uint8_t*) _src;
1932 ptrdiff_t srcstride = _srcstride;
1933 x0= _mm_setzero_si128();
1934 if(!(width & 15)){
1935 for (y = 0; y < height; y++) {
1936 for (x = 0; x < width; x += 16) {
1937
1938 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1939 x2 = _mm_unpacklo_epi8(x1, x0);
1940
1941 x3 = _mm_unpackhi_epi8(x1, x0);
1942
1943 x2 = _mm_slli_epi16(x2, 6);
1944 x3 = _mm_slli_epi16(x3, 6);
1945 _mm_storeu_si128((__m128i *) &dst[x], x2);
1946 _mm_storeu_si128((__m128i *) &dst[x + 8], x3);
1947
1948 }
1949 src += srcstride;
1950 dst += dststride;
1951 }
1952 }else if(!(width & 7)){
1953 for (y = 0; y < height; y++) {
1954 for (x = 0; x < width; x += 8) {
1955
1956 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1957 x2 = _mm_unpacklo_epi8(x1, x0);
1958 x2 = _mm_slli_epi16(x2, 6);
1959 _mm_storeu_si128((__m128i *) &dst[x], x2);
1960
1961 }
1962 src += srcstride;
1963 dst += dststride;
1964 }
1965 }else if(!(width & 3)){
1966 for (y = 0; y < height; y++) {
1967 for(x=0;x<width;x+=4){
1968 x1 = _mm_loadu_si128((__m128i *) &src[x]);
1969 x2 = _mm_unpacklo_epi8(x1, x0);
1970 x2 = _mm_slli_epi16(x2, 6);
1971 _mm_storel_epi64((__m128i *) &dst[x], x2);
1972 }
1973 src += srcstride;
1974 dst += dststride;
1975 }
1976 }else{
1977 #if MASKMOVE
1978     __m128i x4 = _mm_set_epi32(0,0,0,-1); //mask to store (only declared and used when MASKMOVE is enabled)
1979 #endif
1980 for (y = 0; y < height; y++) {
1981 for(x=0;x<width;x+=2){
1982 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1983 x2 = _mm_unpacklo_epi8(x1, x0);
1984 x2 = _mm_slli_epi16(x2, 6);
1985 #if MASKMOVE
1986 _mm_maskmoveu_si128(x2,x4,(char *) (dst+x));
1987 #else
1988 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1989 #endif
1990 }
1991 src += srcstride;
1992 dst += dststride;
1993 }
1994 }
1995
1996
1997 }
1998
1999 #ifndef __native_client__
2000 void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
2001 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2002 int16_t* mcbuffer) {
2003 int x, y;
2004 __m128i x1, x2, x4;
2005 uint16_t *src = (uint16_t*) _src;
2006 ptrdiff_t srcstride = _srcstride>>1;
2007 if(!(width & 7)){
2008 for (y = 0; y < height; y++) {
2009 for (x = 0; x < width; x += 8) {
2010
2011 x1 = _mm_loadu_si128((__m128i *) &src[x]);
2012 x2 = _mm_slli_epi16(x1, 4); //14-BIT DEPTH
2013 _mm_storeu_si128((__m128i *) &dst[x], x2);
2014
2015 }
2016 src += srcstride;
2017 dst += dststride;
2018 }
2019 }else if(!(width & 3)){
2020 for (y = 0; y < height; y++) {
2021 for(x=0;x<width;x+=4){
2022 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
2023 x2 = _mm_slli_epi16(x1, 4);//14-BIT DEPTH
2024 _mm_storel_epi64((__m128i *) &dst[x], x2);
2025 }
2026 src += srcstride;
2027 dst += dststride;
2028 }
2029 }else{
2030 x4= _mm_set_epi32(0,0,0,-1); //mask to store
2031 for (y = 0; y < height; y++) {
2032 for(x=0;x<width;x+=2){
2033 x1 = _mm_loadl_epi64((__m128i *) &src[x]);
2034 x2 = _mm_slli_epi16(x1, 4);//14-BIT DEPTH
2035 _mm_maskmoveu_si128(x2,x4,(char *) (dst+x));
2036 }
2037 src += srcstride;
2038 dst += dststride;
2039 }
2040 }
2041
2042
2043 }
2044 #endif
2045
2046
2047 void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
2048 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2049 int16_t* mcbuffer) {
2050 int x, y;
2051 uint8_t *src = _src;
2052 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2053 __m128i x1, r0, x2, x3, x4, x5;
2054
2055 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
2056 -1);
2057
2058 if(!(width & 7)){
2059 for (y = 0; y < height; y++) {
2060 for (x = 0; x < width; x += 8) {
2061 /* load data in register */
2062 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2063 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2064 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2065 _mm_srli_si128(x1, 3));
2066 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2067 _mm_srli_si128(x1, 5));
2068 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2069 _mm_srli_si128(x1, 7));
2070
2071 /* PMADDUBSW then PMADDW */
2072 x2 = _mm_maddubs_epi16(x2, r0);
2073 x3 = _mm_maddubs_epi16(x3, r0);
2074 x4 = _mm_maddubs_epi16(x4, r0);
2075 x5 = _mm_maddubs_epi16(x5, r0);
2076 x2 = _mm_hadd_epi16(x2, x3);
2077 x4 = _mm_hadd_epi16(x4, x5);
2078 x2 = _mm_hadd_epi16(x2, x4);
2079 /* give results back */
2080 _mm_store_si128((__m128i *) &dst[x],x2);
2081
2082 }
2083 src += srcstride;
2084 dst += dststride;
2085 }
2086 }else if(!(width &3)){
2087
2088 for (y = 0; y < height; y ++) {
2089 for(x=0;x<width;x+=4){
2090 /* load data in register */
2091 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2092 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2093 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2094 _mm_srli_si128(x1, 3));
2095
2096 /* PMADDUBSW then PMADDW */
2097 x2 = _mm_maddubs_epi16(x2, r0);
2098 x3 = _mm_maddubs_epi16(x3, r0);
2099 x2 = _mm_hadd_epi16(x2, x3);
2100 x2 = _mm_hadd_epi16(x2, x2);
2101
2102 /* give results back */
2103 _mm_storel_epi64((__m128i *) &dst[x], x2);
2104 }
2105
2106 src += srcstride;
2107 dst += dststride;
2108 }
2109 }else{
2110 x5= _mm_setzero_si128();
2111 #if MASKMOVE
2112 x3= _mm_set_epi32(0,0,0,-1);
2113 #endif
2114 for (y = 0; y < height; y ++) {
2115 for(x=0;x<width;x+=4){
2116 /* load data in register */
2117 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2118 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2119
2120
2121
2122 /* PMADDUBSW then PMADDW */
2123 x2 = _mm_maddubs_epi16(x2, r0);
2124 x2 = _mm_hadd_epi16(x2,x5 );
2125 x2 = _mm_hadd_epi16(x2,x5 );
2126
2127 /* give results back */
2128 //_mm_storel_epi64((__m128i *) &dst[x], x2);
2129 #if MASKMOVE
2130 _mm_maskmoveu_si128(x2,x3,(char *) (dst+x));
2131 #else
2132 *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
2133 #endif
2134 }
2135
2136 src += srcstride;
2137 dst += dststride;
2138 }
2139 }
2140
2141 }
2142 #ifndef __native_client__
2143 /*
2144  * @TODO : profile (e.g. with Valgrind/callgrind) whether this SSE path is worthwhile, or wait for an AVX2 implementation
2145 */
2146 void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
2147 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2148 int16_t* mcbuffer) {
2149 int x, y;
2150 uint16_t *src = (uint16_t*)_src;
2151 ptrdiff_t srcstride = _srcstride>>1;
2152 __m128i x0, x1, x2, x3, r0;
2153
2154 r0 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
2155 x0= _mm_setzero_si128();
2156 x3= _mm_set_epi32(0,0,0,-1);
2157 for (y = 0; y < height; y ++) {
2158 for(x=0;x<width;x+=2){
2159 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2160            x2 = _mm_srli_si128(x1,2); //the last 16 bits are unused, so one load serves two output pixels
2161
2162 x1 = _mm_madd_epi16(x1,r0);
2163 x2 = _mm_madd_epi16(x2,r0);
2164
2165 x1 = _mm_hadd_epi32(x1,x2);
2166 x1 = _mm_hadd_epi32(x1,x0);
2167 x1= _mm_srai_epi32(x1,2); //>>BIT_DEPTH-8
2168 x1= _mm_packs_epi32(x1,x0);
2169 // dst[x]= _mm_extract_epi16(x1,0);
2170 _mm_maskmoveu_si128(x1,x3,(char *) (dst+x));
2171 }
2172 src += srcstride;
2173 dst += dststride;
2174 }
2175
2176 }
2177 #endif
2178
2179
2180 void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
2181 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2182 int16_t* mcbuffer) {
2183 int x, y;
2184 uint8_t *src = _src;
2185 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2186 __m128i x1, r0, x2, x3, x4, x5;
2187
2188 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
2189 4, -1);
2190
2191 /* LOAD src from memory to registers to limit memory bandwidth */
2192 if(!(width - 15)){
2193 for (y = 0; y < height; y++) {
2194 for (x = 0; x < width; x += 8) {
2195 /* load data in register */
2196 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2197 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2198 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2199 _mm_srli_si128(x1, 3));
2200 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2201 _mm_srli_si128(x1, 5));
2202 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2203 _mm_srli_si128(x1, 7));
2204
2205 /* PMADDUBSW then PMADDW */
2206 x2 = _mm_maddubs_epi16(x2, r0);
2207 x3 = _mm_maddubs_epi16(x3, r0);
2208 x4 = _mm_maddubs_epi16(x4, r0);
2209 x5 = _mm_maddubs_epi16(x5, r0);
2210 x2 = _mm_hadd_epi16(x2, x3);
2211 x4 = _mm_hadd_epi16(x4, x5);
2212 x2 = _mm_hadd_epi16(x2, x4);
2213 /* give results back */
2214 _mm_store_si128((__m128i *) &dst[x],x2);
2215 }
2216 src += srcstride;
2217 dst += dststride;
2218 }
2219
2220 }else{
2221
2222 for (y = 0; y < height; y ++) {
2223 for(x=0;x<width;x+=4){
2224 /* load data in register */
2225 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2226
2227 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2228 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2229 _mm_srli_si128(x1, 3));
2230
2231
2232 /* PMADDUBSW then PMADDW */
2233 x2 = _mm_maddubs_epi16(x2, r0);
2234 x3 = _mm_maddubs_epi16(x3, r0);
2235 x2 = _mm_hadd_epi16(x2, x3);
2236 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2237
2238 /* give results back */
2239 _mm_storel_epi64((__m128i *) &dst[x], x2);
2240
2241 }
2242 src += srcstride;
2243 dst += dststride;
2244 }
2245 }
2246
2247 }
2248
2249 #if 0
2250 static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
2251 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2252 int16_t* mcbuffer) {
2253 int x, y;
2254 uint8_t *src = _src;
2255 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2256 __m128i x1, r0, x2, x3, x4, x5;
2257
2258 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
2259 4, -1);
2260
2261 /* LOAD src from memory to registers to limit memory bandwidth */
2262 if(!(width & 7)){
2263 for (y = 0; y < height; y++) {
2264 for (x = 0; x < width; x += 8) {
2265 /* load data in register */
2266 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
2267 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2268 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2269 _mm_srli_si128(x1, 3));
2270 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2271 _mm_srli_si128(x1, 5));
2272 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2273 _mm_srli_si128(x1, 7));
2274
2275 /* PMADDUBSW then PMADDW */
2276 x2 = _mm_maddubs_epi16(x2, r0);
2277 x3 = _mm_maddubs_epi16(x3, r0);
2278 x4 = _mm_maddubs_epi16(x4, r0);
2279 x5 = _mm_maddubs_epi16(x5, r0);
2280 x2 = _mm_hadd_epi16(x2, x3);
2281 x4 = _mm_hadd_epi16(x4, x5);
2282 x2 = _mm_hadd_epi16(x2, x4);
2283 /* give results back */
2284 _mm_store_si128((__m128i *) &dst[x],x2);
2285 }
2286 src += srcstride;
2287 dst += dststride;
2288 }
2289
2290 }else{
2291
2292 for (y = 0; y < height; y ++) {
2293 for(x=0;x<width;x+=4){
2294 /* load data in register */
2295 x1 = _mm_loadu_si128((__m128i *) &src[x-3]);
2296
2297 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2298 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2299 _mm_srli_si128(x1, 3));
2300
2301
2302 /* PMADDUBSW then PMADDW */
2303 x2 = _mm_maddubs_epi16(x2, r0);
2304 x3 = _mm_maddubs_epi16(x3, r0);
2305 x2 = _mm_hadd_epi16(x2, x3);
2306 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2307
2308 /* give results back */
2309 _mm_storel_epi64((__m128i *) &dst[x], x2);
2310
2311 }
2312 src += srcstride;
2313 dst += dststride;
2314 }
2315 }
2316
2317 }
2318 static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
2319 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2320 int16_t* mcbuffer) {
2321 int x, y;
2322 uint8_t *src = _src;
2323 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2324 __m128i x1, r0, x2, x3, x4, x5;
2325
2326 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2327 0);
2328
2329 if(!(width & 7)){
2330 for (y = 0; y < height; y++) {
2331 for (x = 0; x < width; x += 8) {
2332 /* load data in register */
2333 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2334 x1 = _mm_slli_si128(x1, 1);
2335 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2336 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2337 _mm_srli_si128(x1, 3));
2338 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2339 _mm_srli_si128(x1, 5));
2340 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2341 _mm_srli_si128(x1, 7));
2342
2343 /* PMADDUBSW then PMADDW */
2344 x2 = _mm_maddubs_epi16(x2, r0);
2345 x3 = _mm_maddubs_epi16(x3, r0);
2346 x4 = _mm_maddubs_epi16(x4, r0);
2347 x5 = _mm_maddubs_epi16(x5, r0);
2348 x2 = _mm_hadd_epi16(x2, x3);
2349 x4 = _mm_hadd_epi16(x4, x5);
2350 x2 = _mm_hadd_epi16(x2, x4);
2351 /* give results back */
2352 _mm_store_si128((__m128i *) &dst[x],
2353 _mm_srli_si128(x2, BIT_DEPTH - 8));
2354 }
2355 src += srcstride;
2356 dst += dststride;
2357 }
2358 }else{
2359 for (y = 0; y < height; y ++) {
2360 for(x=0;x<width;x+=4){
2361 /* load data in register */
2362 x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2363 x1 = _mm_slli_si128(x1, 1);
2364 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2365 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2366 _mm_srli_si128(x1, 3));
2367
2368 /* PMADDUBSW then PMADDW */
2369 x2 = _mm_maddubs_epi16(x2, r0);
2370 x3 = _mm_maddubs_epi16(x3, r0);
2371 x2 = _mm_hadd_epi16(x2, x3);
2372 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2373 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
2374 /* give results back */
2375 _mm_storel_epi64((__m128i *) &dst[x], x2);
2376
2377 }
2378 src += srcstride;
2379 dst += dststride;
2380 }
2381 }
2382 }
2383 #endif
2384
2385 void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
2386 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2387 int16_t* mcbuffer) {
2388 int x, y;
2389 uint8_t *src = _src;
2390 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2391 __m128i x1, r0, x2, x3, x4, x5;
2392
2393 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2394 0);
2395
2396 if(!(width & 7)){
2397 for (y = 0; y < height; y++) {
2398 for (x = 0; x < width; x += 8) {
2399 /* load data in register */
2400 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2401 x1 = _mm_slli_si128(x1, 1);
2402 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2403 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2404 _mm_srli_si128(x1, 3));
2405 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2406 _mm_srli_si128(x1, 5));
2407 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2408 _mm_srli_si128(x1, 7));
2409
2410 /* PMADDUBSW then PMADDW */
2411 x2 = _mm_maddubs_epi16(x2, r0);
2412 x3 = _mm_maddubs_epi16(x3, r0);
2413 x4 = _mm_maddubs_epi16(x4, r0);
2414 x5 = _mm_maddubs_epi16(x5, r0);
2415 x2 = _mm_hadd_epi16(x2, x3);
2416 x4 = _mm_hadd_epi16(x4, x5);
2417 x2 = _mm_hadd_epi16(x2, x4);
2418 /* give results back */
2419 _mm_store_si128((__m128i *) &dst[x],x2);
2420 }
2421 src += srcstride;
2422 dst += dststride;
2423 }
2424 }else{
2425 for (y = 0; y < height; y ++) {
2426 for(x=0;x<width;x+=4){
2427 /* load data in register */
2428 x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2429 x1 = _mm_slli_si128(x1, 1);
2430 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2431 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2432 _mm_srli_si128(x1, 3));
2433
2434 /* PMADDUBSW then PMADDW */
2435 x2 = _mm_maddubs_epi16(x2, r0);
2436 x3 = _mm_maddubs_epi16(x3, r0);
2437 x2 = _mm_hadd_epi16(x2, x3);
2438 x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2439 /* give results back */
2440 _mm_storel_epi64((__m128i *) &dst[x], x2);
2441
2442 }
2443 src += srcstride;
2444 dst += dststride;
2445 }
2446 }
2447 }
2448 /**
2449 Vertical (column) MC treatment: 8 pixels per row are processed at a time by
2450 multiplying each input row by its filter tap and accumulating the products.
2451 An illustrative scalar sketch of the same computation follows (guarded out).
2452 */
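/* Scalar sketch referenced above (hypothetical helper, never compiled). The
 * taps are those loaded into r1 by ff_hevc_put_hevc_qpel_v_1_8_sse below; the
 * last tap is zero, which is why the SSE code only reads the seven rows
 * src[-3*stride] .. src[+3*stride].
 */
#if 0
static void qpel_v_1_scalar_sketch(int16_t *dst, ptrdiff_t dststride,
                                   const uint8_t *src, ptrdiff_t srcstride,
                                   int width, int height)
{
    static const int taps[8] = { -1, 4, -10, 58, 17, -5, 1, 0 };
    int x, y, k;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int sum = 0;
            for (k = 0; k < 8; k++)
                sum += taps[k] * src[x + (k - 3) * srcstride];
            dst[x] = sum;   /* the SSE code accumulates with 16-bit saturation */
        }
        src += srcstride;
        dst += dststride;
    }
}
#endif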
2453 void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
2454 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2455 int16_t* mcbuffer) {
2456 int x, y;
2457 uint8_t *src = (uint8_t*) _src;
2458 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2459 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2460 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2461 r1 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
2462
2463 if(!(width & 15)){
2464 x8 = _mm_setzero_si128();
2465 for (y = 0; y < height; y++) {
2466 for (x = 0; x < width; x += 16) {
2467                /* load the seven source rows used by the 8-tap vertical filter (the last tap is zero) */
2468 x1 = _mm_loadu_si128((__m128i *) &src[x - 3 * srcstride]);
2469 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2470 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2471 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2472 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2473 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2474 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2475
2476 t1 = _mm_unpacklo_epi8(x1,x8);
2477 t2 = _mm_unpacklo_epi8(x2, x8);
2478 t3 = _mm_unpacklo_epi8(x3, x8);
2479 t4 = _mm_unpacklo_epi8(x4, x8);
2480 t5 = _mm_unpacklo_epi8(x5, x8);
2481 t6 = _mm_unpacklo_epi8(x6, x8);
2482 t7 = _mm_unpacklo_epi8(x7, x8);
2483
2484 x1 = _mm_unpackhi_epi8(x1,x8);
2485 x2 = _mm_unpackhi_epi8(x2, x8);
2486 x3 = _mm_unpackhi_epi8(x3, x8);
2487 x4 = _mm_unpackhi_epi8(x4, x8);
2488 x5 = _mm_unpackhi_epi8(x5, x8);
2489 x6 = _mm_unpackhi_epi8(x6, x8);
2490 x7 = _mm_unpackhi_epi8(x7, x8);
2491
2492                /* multiply each row by its filter tap : */
2493 r0 = _mm_mullo_epi16(t1,
2494 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2495 r2 = _mm_mullo_epi16(x1,
2496 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2497 r0 = _mm_adds_epi16(r0,
2498 _mm_mullo_epi16(t2,
2499 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2500 r2 = _mm_adds_epi16(r2,
2501 _mm_mullo_epi16(x2,
2502 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2503 r0 = _mm_adds_epi16(r0,
2504 _mm_mullo_epi16(t3,
2505 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2506 r2 = _mm_adds_epi16(r2,
2507 _mm_mullo_epi16(x3,
2508 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2509
2510 r0 = _mm_adds_epi16(r0,
2511 _mm_mullo_epi16(t4,
2512 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2513 r2 = _mm_adds_epi16(r2,
2514 _mm_mullo_epi16(x4,
2515 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2516
2517 r0 = _mm_adds_epi16(r0,
2518 _mm_mullo_epi16(t5,
2519 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2520 r2 = _mm_adds_epi16(r2,
2521 _mm_mullo_epi16(x5,
2522 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2523
2524 r0 = _mm_adds_epi16(r0,
2525 _mm_mullo_epi16(t6,
2526 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2527 r2 = _mm_adds_epi16(r2,
2528 _mm_mullo_epi16(x6,
2529 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2530
2531 r0 = _mm_adds_epi16(r0,
2532 _mm_mullo_epi16(t7,
2533 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2534 r2 = _mm_adds_epi16(r2,
2535 _mm_mullo_epi16(x7,
2536 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2537
2538
2539 /* give results back */
2540 _mm_store_si128((__m128i *) &dst[x],r0);
2541 _mm_store_si128((__m128i *) &dst[x + 8],r2);
2542 }
2543 src += srcstride;
2544 dst += dststride;
2545 }
2546
2547 }else{
2548 x = 0;
2549 x8 = _mm_setzero_si128();
2550 t8 = _mm_setzero_si128();
2551 for (y = 0; y < height; y ++) {
2552 for(x=0;x<width;x+=4){
2553 /* load data in register */
2554 x1 = _mm_loadl_epi64((__m128i *) &src[x-(3 * srcstride)]);
2555 x2 = _mm_loadl_epi64((__m128i *) &src[x-(2 * srcstride)]);
2556 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2557 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2558 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2559 x6 = _mm_loadl_epi64((__m128i *) &src[x+(2 * srcstride)]);
2560 x7 = _mm_loadl_epi64((__m128i *) &src[x+(3 * srcstride)]);
2561
2562
2563
2564 x1 = _mm_unpacklo_epi8(x1, t8);
2565 x2 = _mm_unpacklo_epi8(x2, t8);
2566 x3 = _mm_unpacklo_epi8(x3, t8);
2567 x4 = _mm_unpacklo_epi8(x4, t8);
2568 x5 = _mm_unpacklo_epi8(x5, t8);
2569 x6 = _mm_unpacklo_epi8(x6, t8);
2570 x7 = _mm_unpacklo_epi8(x7, t8);
2571
2572
2573 r0 = _mm_mullo_epi16(x1, _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2574
2575 r0 = _mm_adds_epi16(r0,
2576 _mm_mullo_epi16(x2,
2577 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2578
2579
2580 r0 = _mm_adds_epi16(r0,
2581 _mm_mullo_epi16(x3,
2582 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2583
2584 r0 = _mm_adds_epi16(r0,
2585 _mm_mullo_epi16(x4,
2586 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2587
2588 r0 = _mm_adds_epi16(r0,
2589 _mm_mullo_epi16(x5,
2590 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2591
2592
2593 r0 = _mm_adds_epi16(r0,
2594 _mm_mullo_epi16(x6,
2595 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2596
2597
2598 r0 = _mm_adds_epi16(r0,
2599 _mm_mullo_epi16(x7,
2600 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2601
2602 /* give results back */
2603 _mm_storel_epi64((__m128i *) &dst[x], r0);
2604 }
2605 src += srcstride;
2606 dst += dststride;
2607 }
2608 }
2609 }
2610
2611 #if 0
2612 void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
2613 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2614 int16_t* mcbuffer) {
2615 int x, y;
2616 uint16_t *src = (uint16_t*) _src;
2617 ptrdiff_t srcstride = _srcstride >> 1;
2618 __m128i x1, x2, x3, x4, x5, x6, x7, r1;
2619 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2620
2621 t7= _mm_set1_epi32(1);
2622 t6= _mm_set1_epi32(-5);
2623 t5= _mm_set1_epi32(17);
2624 t4= _mm_set1_epi32(58);
2625 t3= _mm_set1_epi32(-10);
2626 t2= _mm_set1_epi32(4);
2627 t1= _mm_set1_epi32(-1);
2628 t8= _mm_setzero_si128();
2629
2630 for (y = 0; y < height; y ++) {
2631 for(x=0;x<width;x+=4){
2632 /* load data in register */
2633 x1 = _mm_loadl_epi64((__m128i *) &src[x-(3 * srcstride)]);
2634 x2 = _mm_loadl_epi64((__m128i *) &src[x-(2 * srcstride)]);
2635 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2636 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2637 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2638 x6 = _mm_loadl_epi64((__m128i *) &src[x+(2 * srcstride)]);
2639 x7 = _mm_loadl_epi64((__m128i *) &src[x+(3 * srcstride)]);
2640
2641
2642 x1 = _mm_unpacklo_epi16(x1, t8);
2643 x2 = _mm_unpacklo_epi16(x2, t8);
2644 x3 = _mm_unpacklo_epi16(x3, t8);
2645 x4 = _mm_unpacklo_epi16(x4, t8);
2646 x5 = _mm_unpacklo_epi16(x5, t8);
2647 x6 = _mm_unpacklo_epi16(x6, t8);
2648 x7 = _mm_unpacklo_epi16(x7, t8);
2649
2650
2651 r1 = _mm_mullo_epi32(x1,t1);
2652
2653 r1 = _mm_add_epi32(r1,
2654 _mm_mullo_epi32(x2,t2));
2655
2656
2657 r1 = _mm_add_epi32(r1,
2658 _mm_mullo_epi32(x3,t3));
2659
2660 r1 = _mm_add_epi32(r1,
2661 _mm_mullo_epi32(x4,t4));
2662
2663 r1 = _mm_add_epi32(r1,
2664 _mm_mullo_epi32(x5,t5));
2665
2666
2667 r1 = _mm_add_epi32(r1,
2668 _mm_mullo_epi32(x6,t6));
2669
2670
2671 r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x7,t7));
2672 r1 = _mm_srai_epi32(r1,2); //bit depth - 8
2673
2674
2675 r1 = _mm_packs_epi32(r1,t8);
2676
2677 // give results back
2678 _mm_storel_epi64((__m128i *) (dst + x), r1);
2679 }
2680 src += srcstride;
2681 dst += dststride;
2682 }
2683
2684 }
2685 #endif
2686
2687
2688
2689 void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
2690 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2691 int16_t* mcbuffer) {
2692 int x, y;
2693 uint8_t *src = (uint8_t*) _src;
2694 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2695 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2696 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2697 r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
2698
2699 if(!(width & 15)){
2700 for (y = 0; y < height; y++) {
2701 for (x = 0; x < width; x += 16) {
2702 r0 = _mm_setzero_si128();
2703                /* load the eight source rows used by the 8-tap vertical filter */
2704 x1 = _mm_loadu_si128((__m128i *) &src[x - 3 * srcstride]);
2705 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2706 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2707 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2708 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2709 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2710 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2711 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2712
2713 t1 = _mm_unpacklo_epi8(x1, r0);
2714 t2 = _mm_unpacklo_epi8(x2, r0);
2715 t3 = _mm_unpacklo_epi8(x3, r0);
2716 t4 = _mm_unpacklo_epi8(x4, r0);
2717 t5 = _mm_unpacklo_epi8(x5, r0);
2718 t6 = _mm_unpacklo_epi8(x6, r0);
2719 t7 = _mm_unpacklo_epi8(x7, r0);
2720 t8 = _mm_unpacklo_epi8(x8, r0);
2721
2722 x1 = _mm_unpackhi_epi8(x1, r0);
2723 x2 = _mm_unpackhi_epi8(x2, r0);
2724 x3 = _mm_unpackhi_epi8(x3, r0);
2725 x4 = _mm_unpackhi_epi8(x4, r0);
2726 x5 = _mm_unpackhi_epi8(x5, r0);
2727 x6 = _mm_unpackhi_epi8(x6, r0);
2728 x7 = _mm_unpackhi_epi8(x7, r0);
2729 x8 = _mm_unpackhi_epi8(x8, r0);
2730
2731                /* multiply each row by its filter tap : */
2732 r0 = _mm_mullo_epi16(t1,
2733 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2734 r2 = _mm_mullo_epi16(x1,
2735 _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2736 r0 = _mm_adds_epi16(r0,
2737 _mm_mullo_epi16(t2,
2738 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2739 r2 = _mm_adds_epi16(r2,
2740 _mm_mullo_epi16(x2,
2741 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2742 r0 = _mm_adds_epi16(r0,
2743 _mm_mullo_epi16(t3,
2744 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2745 r2 = _mm_adds_epi16(r2,
2746 _mm_mullo_epi16(x3,
2747 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2748
2749 r0 = _mm_adds_epi16(r0,
2750 _mm_mullo_epi16(t4,
2751 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2752 r2 = _mm_adds_epi16(r2,
2753 _mm_mullo_epi16(x4,
2754 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2755
2756 r0 = _mm_adds_epi16(r0,
2757 _mm_mullo_epi16(t5,
2758 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2759 r2 = _mm_adds_epi16(r2,
2760 _mm_mullo_epi16(x5,
2761 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2762
2763 r0 = _mm_adds_epi16(r0,
2764 _mm_mullo_epi16(t6,
2765 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2766 r2 = _mm_adds_epi16(r2,
2767 _mm_mullo_epi16(x6,
2768 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2769
2770 r0 = _mm_adds_epi16(r0,
2771 _mm_mullo_epi16(t7,
2772 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2773 r2 = _mm_adds_epi16(r2,
2774 _mm_mullo_epi16(x7,
2775 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2776
2777 r0 = _mm_adds_epi16(r0,
2778 _mm_mullo_epi16(t8,
2779 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2780 r2 = _mm_adds_epi16(r2,
2781 _mm_mullo_epi16(x8,
2782 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2783
2784 /* give results back */
2785 _mm_store_si128((__m128i *) &dst[x],r0);
2786 _mm_store_si128((__m128i *) &dst[x + 8],r2);
2787 }
2788 src += srcstride;
2789 dst += dststride;
2790 }
2791 }else{
2792 x = 0;
2793 for (y = 0; y < height; y ++) {
2794 for(x=0;x<width;x+=4){
2795 r0 = _mm_setzero_si128();
2796 /* load data in register */
2797 x1 = _mm_loadl_epi64((__m128i *) &src[x - 3 * srcstride]);
2798 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
2799 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2800 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2801 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2802 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
2803 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
2804 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
2805
2806 x1 = _mm_unpacklo_epi8(x1,r0);
2807 x2 = _mm_unpacklo_epi8(x2, r0);
2808 x3 = _mm_unpacklo_epi8(x3, r0);
2809 x4 = _mm_unpacklo_epi8(x4, r0);
2810 x5 = _mm_unpacklo_epi8(x5, r0);
2811 x6 = _mm_unpacklo_epi8(x6, r0);
2812 x7 = _mm_unpacklo_epi8(x7, r0);
2813 x8 = _mm_unpacklo_epi8(x8, r0);
2814
2815
2816 r0 = _mm_mullo_epi16(x1, _mm_set1_epi16(_mm_extract_epi16(r1, 0)));
2817
2818 r0 = _mm_adds_epi16(r0,
2819 _mm_mullo_epi16(x2,
2820 _mm_set1_epi16(_mm_extract_epi16(r1, 1))));
2821
2822
2823 r0 = _mm_adds_epi16(r0,
2824 _mm_mullo_epi16(x3,
2825 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
2826
2827
2828 r0 = _mm_adds_epi16(r0,
2829 _mm_mullo_epi16(x4,
2830 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
2831
2832
2833 r0 = _mm_adds_epi16(r0,
2834 _mm_mullo_epi16(x5,
2835 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
2836
2837
2838 r0 = _mm_adds_epi16(r0,
2839 _mm_mullo_epi16(x6,
2840 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
2841
2842
2843 r0 = _mm_adds_epi16(r0,
2844 _mm_mullo_epi16(x7,
2845 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
2846
2847
2848 r0 = _mm_adds_epi16(r0,
2849 _mm_mullo_epi16(x8,
2850 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
2851
2852
2853 /* give results back */
2854 _mm_storel_epi64((__m128i *) &dst[x], r0);
2855
2856 }
2857 src += srcstride;
2858 dst += dststride;
2859 }
2860 }
2861 }
2862
2863 #if 0
2864 void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
2865 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2866 int16_t* mcbuffer) {
2867 int x, y;
2868 uint16_t *src = (uint16_t*) _src;
2869 ptrdiff_t srcstride = _srcstride >> 1;
2870 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2871 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
2872 r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
2873
2874 t1= _mm_set1_epi32(-1);
2875 t2= _mm_set1_epi32(4);
2876 t3= _mm_set1_epi32(-11);
2877 t4= _mm_set1_epi32(40);
2878 t5= _mm_set1_epi32(40);
2879 t6= _mm_set1_epi32(-11);
2880 t7= _mm_set1_epi32(4);
2881 t8= _mm_set1_epi32(-1);
2882
2883 {
2884 x = 0;
2885 r0 = _mm_setzero_si128();
2886 for (y = 0; y < height; y ++) {
2887 for(x=0;x<width;x+=4){
2888
2889 /* load data in register */
2890 x1 = _mm_loadl_epi64((__m128i *) &src[x - 3 * srcstride]);
2891 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
2892 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
2893 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
2894 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
2895 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
2896 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
2897 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
2898
2899 x1 = _mm_unpacklo_epi16(x1, r0);
2900 x2 = _mm_unpacklo_epi16(x2, r0);
2901 x3 = _mm_unpacklo_epi16(x3, r0);
2902 x4 = _mm_unpacklo_epi16(x4, r0);
2903 x5 = _mm_unpacklo_epi16(x5, r0);
2904 x6 = _mm_unpacklo_epi16(x6, r0);
2905 x7 = _mm_unpacklo_epi16(x7, r0);
2906 x8 = _mm_unpacklo_epi16(x8, r0);
2907
2908
2909 r1 = _mm_mullo_epi32(x1, t1);
2910
2911 r1 = _mm_add_epi32(r1,
2912 _mm_mullo_epi32(x2,t2));
2913
2914
2915 r1 = _mm_add_epi32(r1,
2916 _mm_mullo_epi32(x3,t3));
2917
2918
2919 r1 = _mm_add_epi32(r1,
2920 _mm_mullo_epi32(x4,t4));
2921
2922
2923 r1 = _mm_add_epi32(r1,
2924 _mm_mullo_epi32(x5,t5));
2925
2926
2927 r1 = _mm_add_epi32(r1,
2928 _mm_mullo_epi32(x6,t6));
2929
2930
2931 r1 = _mm_add_epi32(r1,
2932 _mm_mullo_epi32(x7,t7));
2933
2934
2935 r1 = _mm_add_epi32(r1,
2936 _mm_mullo_epi32(x8,t8));
2937
2938
2939 r1= _mm_srai_epi32(r1,2); //bit depth - 8
2940
2941 r1= _mm_packs_epi32(r1,t8);
2942
2943 /* give results back */
2944 _mm_storel_epi64((__m128i *) (dst+x), r1);
2945
2946 }
2947 src += srcstride;
2948 dst += dststride;
2949 }
2950 }
2951 }
2952 #endif
2953
2954 #if 0
2955 static void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
2956 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2957 int16_t* mcbuffer) {
2958 int x, y;
2959 uint8_t *src = (uint8_t*) _src;
2960 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2961 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2962 __m128i t2, t3, t4, t5, t6, t7, t8;
2963 r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
2964
2965 if(!(width & 15)){
2966 for (y = 0; y < height; y++) {
2967 for (x = 0; x < width; x += 16) {
2968 /* check if memory needs to be reloaded */
2969 x1 = _mm_setzero_si128();
2970 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2971 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2972 x4 = _mm_loadu_si128((__m128i *) &src[x]);
2973 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2974 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2975 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2976 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2977
2978 t2 = _mm_unpacklo_epi8(x2, x1);
2979 t3 = _mm_unpacklo_epi8(x3, x1);
2980 t4 = _mm_unpacklo_epi8(x4, x1);
2981 t5 = _mm_unpacklo_epi8(x5, x1);
2982 t6 = _mm_unpacklo_epi8(x6, x1);
2983 t7 = _mm_unpacklo_epi8(x7, x1);
2984 t8 = _mm_unpacklo_epi8(x8, x1);
2985
2986 x2 = _mm_unpackhi_epi8(x2, x1);
2987 x3 = _mm_unpackhi_epi8(x3, x1);
2988 x4 = _mm_unpackhi_epi8(x4, x1);
2989 x5 = _mm_unpackhi_epi8(x5, x1);
2990 x6 = _mm_unpackhi_epi8(x6, x1);
2991 x7 = _mm_unpackhi_epi8(x7, x1);
2992 x8 = _mm_unpackhi_epi8(x8, x1);
2993
2994 /* multiply by correct value : */
2995 r0 = _mm_mullo_epi16(t2,
2996 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
2997 r2 = _mm_mullo_epi16(x2,
2998 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
2999
3000 r0 = _mm_adds_epi16(r0,
3001 _mm_mullo_epi16(t3,
3002 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3003 r2 = _mm_adds_epi16(r2,
3004 _mm_mullo_epi16(x3,
3005 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3006
3007 r0 = _mm_adds_epi16(r0,
3008 _mm_mullo_epi16(t4,
3009 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3010 r2 = _mm_adds_epi16(r2,
3011 _mm_mullo_epi16(x4,
3012 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3013
3014 r0 = _mm_adds_epi16(r0,
3015 _mm_mullo_epi16(t5,
3016 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3017 r2 = _mm_adds_epi16(r2,
3018 _mm_mullo_epi16(x5,
3019 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3020
3021 r0 = _mm_adds_epi16(r0,
3022 _mm_mullo_epi16(t6,
3023 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3024 r2 = _mm_adds_epi16(r2,
3025 _mm_mullo_epi16(x6,
3026 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3027
3028 r0 = _mm_adds_epi16(r0,
3029 _mm_mullo_epi16(t7,
3030 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3031 r2 = _mm_adds_epi16(r2,
3032 _mm_mullo_epi16(x7,
3033 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3034
3035 r0 = _mm_adds_epi16(r0,
3036 _mm_mullo_epi16(t8,
3037 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3038 r2 = _mm_adds_epi16(r2,
3039 _mm_mullo_epi16(x8,
3040 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3041
3042 /* give results back */
3043 _mm_store_si128((__m128i *) &dst[x],
3044 _mm_srli_epi16(r0, BIT_DEPTH - 8));
3045 _mm_store_si128((__m128i *) &dst[x + 8],
3046 _mm_srli_epi16(r2, BIT_DEPTH - 8));
3047 }
3048 src += srcstride;
3049 dst += dststride;
3050 }
3051 }else{
3052 x = 0;
3053 for (y = 0; y < height; y ++) {
3054 for(x=0;x<width;x+=4){
3055 r0 = _mm_set1_epi16(0);
3056 /* load data in register */
3057 //x1 = _mm_setzero_si128();
3058 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3059 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3060 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3061 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3062 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3063 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3064 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3065
3066 x1 = _mm_unpacklo_epi8(x1,r0);
3067 x2 = _mm_unpacklo_epi8(x2, r0);
3068 x3 = _mm_unpacklo_epi8(x3, r0);
3069 x4 = _mm_unpacklo_epi8(x4, r0);
3070 x5 = _mm_unpacklo_epi8(x5, r0);
3071 x6 = _mm_unpacklo_epi8(x6, r0);
3072 x7 = _mm_unpacklo_epi8(x7, r0);
3073 x8 = _mm_unpacklo_epi8(x8, r0);
3074
3075
3076 r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3077
3078
3079 r0 = _mm_adds_epi16(r0,
3080 _mm_mullo_epi16(x3,
3081 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3082
3083
3084 r0 = _mm_adds_epi16(r0,
3085 _mm_mullo_epi16(x4,
3086 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3087
3088
3089 r0 = _mm_adds_epi16(r0,
3090 _mm_mullo_epi16(x5,
3091 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3092
3093
3094 r0 = _mm_adds_epi16(r0,
3095 _mm_mullo_epi16(x6,
3096 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3097
3098
3099 r0 = _mm_adds_epi16(r0,
3100 _mm_mullo_epi16(x7,
3101 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3102
3103
3104 r0 = _mm_adds_epi16(r0,
3105 _mm_mullo_epi16(x8,
3106 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3107
3108
3109 r0 = _mm_srli_epi16(r0, BIT_DEPTH - 8);
3110 /* give results back */
3111 _mm_storel_epi64((__m128i *) &dst[x], r0);
3112
3113 }
3114 src += srcstride;
3115 dst += dststride;
3116 }
3117 }
3118
3119 }
3120 #endif
3121
3122 void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
3123 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3124 int16_t* mcbuffer) {
3125 int x, y;
3126 uint8_t *src = (uint8_t*) _src;
3127 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3128 __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
3129 __m128i t2, t3, t4, t5, t6, t7, t8;
3130 r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3131
3132 if(!(width & 15)){
3133 for (y = 0; y < height; y++) {
3134 for (x = 0; x < width; x += 16) {
3135                /* load the seven source rows used by the 8-tap vertical filter (the first tap is zero) */
3136 x1 = _mm_setzero_si128();
3137 x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
3138 x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
3139 x4 = _mm_loadu_si128((__m128i *) &src[x]);
3140 x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
3141 x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
3142 x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
3143 x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
3144
3145 t2 = _mm_unpacklo_epi8(x2, x1);
3146 t3 = _mm_unpacklo_epi8(x3, x1);
3147 t4 = _mm_unpacklo_epi8(x4, x1);
3148 t5 = _mm_unpacklo_epi8(x5, x1);
3149 t6 = _mm_unpacklo_epi8(x6, x1);
3150 t7 = _mm_unpacklo_epi8(x7, x1);
3151 t8 = _mm_unpacklo_epi8(x8, x1);
3152
3153 x2 = _mm_unpackhi_epi8(x2, x1);
3154 x3 = _mm_unpackhi_epi8(x3, x1);
3155 x4 = _mm_unpackhi_epi8(x4, x1);
3156 x5 = _mm_unpackhi_epi8(x5, x1);
3157 x6 = _mm_unpackhi_epi8(x6, x1);
3158 x7 = _mm_unpackhi_epi8(x7, x1);
3159 x8 = _mm_unpackhi_epi8(x8, x1);
3160
3161                /* multiply each row by its filter tap : */
3162 r0 = _mm_mullo_epi16(t2,
3163 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3164 r2 = _mm_mullo_epi16(x2,
3165 _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3166
3167 r0 = _mm_adds_epi16(r0,
3168 _mm_mullo_epi16(t3,
3169 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3170 r2 = _mm_adds_epi16(r2,
3171 _mm_mullo_epi16(x3,
3172 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3173
3174 r0 = _mm_adds_epi16(r0,
3175 _mm_mullo_epi16(t4,
3176 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3177 r2 = _mm_adds_epi16(r2,
3178 _mm_mullo_epi16(x4,
3179 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3180
3181 r0 = _mm_adds_epi16(r0,
3182 _mm_mullo_epi16(t5,
3183 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3184 r2 = _mm_adds_epi16(r2,
3185 _mm_mullo_epi16(x5,
3186 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3187
3188 r0 = _mm_adds_epi16(r0,
3189 _mm_mullo_epi16(t6,
3190 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3191 r2 = _mm_adds_epi16(r2,
3192 _mm_mullo_epi16(x6,
3193 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3194
3195 r0 = _mm_adds_epi16(r0,
3196 _mm_mullo_epi16(t7,
3197 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3198 r2 = _mm_adds_epi16(r2,
3199 _mm_mullo_epi16(x7,
3200 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3201
3202 r0 = _mm_adds_epi16(r0,
3203 _mm_mullo_epi16(t8,
3204 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3205 r2 = _mm_adds_epi16(r2,
3206 _mm_mullo_epi16(x8,
3207 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3208
3209 /* give results back */
3210 _mm_store_si128((__m128i *) &dst[x],r0);
3211 _mm_store_si128((__m128i *) &dst[x + 8],r2);
3212 }
3213 src += srcstride;
3214 dst += dststride;
3215 }
3216 }else{
3217 x = 0;
3218 for (y = 0; y < height; y ++) {
3219 for(x=0;x<width;x+=4){
3220 r0 = _mm_set1_epi16(0);
3221 /* load data in register */
3222 x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3223 x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3224 x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3225 x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3226 x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3227 x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3228 x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3229
3230 x2 = _mm_unpacklo_epi8(x2, r0);
3231 x3 = _mm_unpacklo_epi8(x3, r0);
3232 x4 = _mm_unpacklo_epi8(x4, r0);
3233 x5 = _mm_unpacklo_epi8(x5, r0);
3234 x6 = _mm_unpacklo_epi8(x6, r0);
3235 x7 = _mm_unpacklo_epi8(x7, r0);
3236 x8 = _mm_unpacklo_epi8(x8, r0);
3237
3238 r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3239
3240 r0 = _mm_adds_epi16(r0,
3241 _mm_mullo_epi16(x3,
3242 _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3243
3244 r0 = _mm_adds_epi16(r0,
3245 _mm_mullo_epi16(x4,
3246 _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3247
3248 r0 = _mm_adds_epi16(r0,
3249 _mm_mullo_epi16(x5,
3250 _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3251
3252 r0 = _mm_adds_epi16(r0,
3253 _mm_mullo_epi16(x6,
3254 _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3255
3256 r0 = _mm_adds_epi16(r0,
3257 _mm_mullo_epi16(x7,
3258 _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3259
3260 r0 = _mm_adds_epi16(r0,
3261 _mm_mullo_epi16(x8,
3262 _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3263
3264 /* give results back */
3265 _mm_storel_epi64((__m128i *) &dst[x], r0);
3266
3267 }
3268 src += srcstride;
3269 dst += dststride;
3270 }
3271 }
3272
3273 }
3274
3275
3276 #if 0
3277 void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
3278 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3279 int16_t* mcbuffer) {
3280 int x, y;
3281 uint16_t *src = (uint16_t*) _src;
3282 ptrdiff_t srcstride = _srcstride >> 1;
3283 __m128i x1, x2, x3, x4, x5, x6, x7, r0;
3284 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3285
3286 t7 = _mm_set1_epi32(-1);
3287 t6 = _mm_set1_epi32(4);
3288 t5 = _mm_set1_epi32(-10);
3289 t4 = _mm_set1_epi32(58);
3290 t3 = _mm_set1_epi32(17);
3291 t2 = _mm_set1_epi32(-5);
3292 t1 = _mm_set1_epi32(1);
3293 t8= _mm_setzero_si128();
3294 {
3295
3296 for (y = 0; y < height; y ++) {
3297 for(x=0;x<width;x+=4){
3298 /* load data in register */
3299 x1 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3300 x2 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3301 x3 = _mm_loadl_epi64((__m128i *) &src[x]);
3302 x4 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3303 x5 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3304 x6 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3305 x7 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3306
3307 x1 = _mm_unpacklo_epi16(x1, t8);
3308 x2 = _mm_unpacklo_epi16(x2, t8);
3309 x3 = _mm_unpacklo_epi16(x3, t8);
3310 x4 = _mm_unpacklo_epi16(x4, t8);
3311 x5 = _mm_unpacklo_epi16(x5, t8);
3312 x6 = _mm_unpacklo_epi16(x6, t8);
3313 x7 = _mm_unpacklo_epi16(x7, t8);
3314
3315 r0 = _mm_mullo_epi32(x1, t1);
3316
3317 r0 = _mm_add_epi32(r0,
3318 _mm_mullo_epi32(x2,t2));
3319
3320 r0 = _mm_add_epi32(r0,
3321 _mm_mullo_epi32(x3,t3));
3322
3323 r0 = _mm_add_epi32(r0,
3324 _mm_mullo_epi32(x4,t4));
3325
3326 r0 = _mm_add_epi32(r0,
3327 _mm_mullo_epi32(x5,t5));
3328
3329 r0 = _mm_add_epi32(r0,
3330 _mm_mullo_epi32(x6,t6));
3331
3332 r0 = _mm_add_epi32(r0,
3333 _mm_mullo_epi32(x7,t7));
3334
3335 r0= _mm_srai_epi32(r0,2);
3336
3337 r0= _mm_packs_epi32(r0,t8);
3338
3339 /* give results back */
3340 _mm_storel_epi64((__m128i *) &dst[x], r0);
3341
3342 }
3343 src += srcstride;
3344 dst += dststride;
3345 }
3346 }
3347
3348 }
3349 #endif
3350
3351
3352
3353 void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3354 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3355 int16_t* mcbuffer) {
3356 int x, y;
3357 uint8_t* src = (uint8_t*) _src;
3358 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3359 int16_t *tmp = mcbuffer;
3360 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3361 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3362
3363 src -= qpel_extra_before[1] * srcstride;
3364 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3365 -1);
3366
3367 /* LOAD src from memory to registers to limit memory bandwidth */
3368 if (width == 4) {
3369
3370 for (y = 0; y < height + qpel_extra[1]; y += 2) {
3371 /* load data in register */
3372 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3373 src += srcstride;
3374 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3375 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3376 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3377 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3378 _mm_srli_si128(x1, 3));
3379 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3380 _mm_srli_si128(t1, 3));
3381
3382 /* PMADDUBSW then PMADDW */
3383 x2 = _mm_maddubs_epi16(x2, r0);
3384 t2 = _mm_maddubs_epi16(t2, r0);
3385 x3 = _mm_maddubs_epi16(x3, r0);
3386 t3 = _mm_maddubs_epi16(t3, r0);
3387 x2 = _mm_hadd_epi16(x2, x3);
3388 t2 = _mm_hadd_epi16(t2, t3);
3389 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3390 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3391 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3392 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3393 /* give results back */
3394 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3395
3396 tmp += MAX_PB_SIZE;
3397 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3398
3399 src += srcstride;
3400 tmp += MAX_PB_SIZE;
3401 }
3402 } else
3403 for (y = 0; y < height + qpel_extra[1]; y++) {
3404 for (x = 0; x < width; x += 8) {
3405 /* load data in register */
3406 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3407 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3408 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3409 _mm_srli_si128(x1, 3));
3410 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3411 _mm_srli_si128(x1, 5));
3412 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3413 _mm_srli_si128(x1, 7));
3414
3415 /* PMADDUBSW then PMADDW */
3416 x2 = _mm_maddubs_epi16(x2, r0);
3417 x3 = _mm_maddubs_epi16(x3, r0);
3418 x4 = _mm_maddubs_epi16(x4, r0);
3419 x5 = _mm_maddubs_epi16(x5, r0);
3420 x2 = _mm_hadd_epi16(x2, x3);
3421 x4 = _mm_hadd_epi16(x4, x5);
3422 x2 = _mm_hadd_epi16(x2, x4);
3423 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3424
3425 /* give results back */
3426 _mm_store_si128((__m128i *) &tmp[x], x2);
3427
3428 }
3429 src += srcstride;
3430 tmp += MAX_PB_SIZE;
3431 }
3432
3433 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3434 srcstride = MAX_PB_SIZE;
3435
3436     /* vertical treatment on the temp table : tmp holds 16-bit values, so the register
3437      calculations need 32-bit intermediates (see the widening-multiply sketch after this function) */
3438 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3439 for (y = 0; y < height; y++) {
3440 for (x = 0; x < width; x += 8) {
3441
3442 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3443 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3444 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3445 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3446 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3447 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3448 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3449
3450 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3451 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3452 t8 = _mm_mullo_epi16(x1, r0);
3453 rBuffer = _mm_mulhi_epi16(x1, r0);
3454 t7 = _mm_mullo_epi16(x2, r1);
3455 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3456 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3457
3458 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3459 rBuffer = _mm_mulhi_epi16(x2, r1);
3460 t8 = _mm_mullo_epi16(x3, r0);
3461 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3462 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3463
3464 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3465 rBuffer = _mm_mulhi_epi16(x3, r0);
3466 t7 = _mm_mullo_epi16(x4, r1);
3467 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3468 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3469
3470 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3471 rBuffer = _mm_mulhi_epi16(x4, r1);
3472 t8 = _mm_mullo_epi16(x5, r0);
3473 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3474 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3475
3476 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3477 rBuffer = _mm_mulhi_epi16(x5, r0);
3478 t7 = _mm_mullo_epi16(x6, r1);
3479 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3480 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3481
3482 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3483 rBuffer = _mm_mulhi_epi16(x6, r1);
3484 t8 = _mm_mullo_epi16(x7, r0);
3485 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3486 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3487
3488 rBuffer = _mm_mulhi_epi16(x7, r0);
3489 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3490 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3491
3492
3493
3494             /* accumulate the partial products : */
3495
3496 r1 = _mm_add_epi32(x1, x2);
3497 x3 = _mm_add_epi32(x3, x4);
3498 x5 = _mm_add_epi32(x5, x6);
3499 r1 = _mm_add_epi32(r1, x3);
3500
3501 r1 = _mm_add_epi32(r1, x5);
3502
3503 r0 = _mm_add_epi32(t1, t2);
3504 t3 = _mm_add_epi32(t3, t4);
3505 t5 = _mm_add_epi32(t5, t6);
3506 r0 = _mm_add_epi32(r0, t3);
3507 r0 = _mm_add_epi32(r0, t5);
3508 r1 = _mm_add_epi32(r1, x7);
3509 r0 = _mm_add_epi32(r0, t7);
3510 r1 = _mm_srli_epi32(r1, 6);
3511 r0 = _mm_srli_epi32(r0, 6);
3512
3513 r1 = _mm_and_si128(r1,
3514 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3515 r0 = _mm_and_si128(r0,
3516 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3517 r0 = _mm_hadd_epi16(r0, r1);
3518 _mm_store_si128((__m128i *) &dst[x], r0);
3519
3520 }
3521 tmp += MAX_PB_SIZE;
3522 dst += dststride;
3523 }
3524 }
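
/* Illustrative sketch (hypothetical helpers, never compiled) of the widening
 * multiply used by the vertical passes above: _mm_mullo_epi16/_mm_mulhi_epi16
 * produce the low and high halves of each signed 16x16-bit product, and
 * interleaving them with _mm_unpacklo_epi16/_mm_unpackhi_epi16 yields the full
 * 32-bit products that are then summed with _mm_add_epi32.
 */
#if 0
#include <emmintrin.h>  /* SSE2 */

/* widen the four low lanes of a*b into 32-bit products */
static inline __m128i mul16_widen_lo_sketch(__m128i a, __m128i b)
{
    __m128i lo = _mm_mullo_epi16(a, b);  /* low 16 bits of each product  */
    __m128i hi = _mm_mulhi_epi16(a, b);  /* high 16 bits of each product */
    return _mm_unpacklo_epi16(lo, hi);   /* lanes 0..3 as 32-bit values  */
}

/* widen the four high lanes of a*b into 32-bit products */
static inline __m128i mul16_widen_hi_sketch(__m128i a, __m128i b)
{
    __m128i lo = _mm_mullo_epi16(a, b);
    __m128i hi = _mm_mulhi_epi16(a, b);
    return _mm_unpackhi_epi16(lo, hi);   /* lanes 4..7 as 32-bit values  */
}
#endif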
3525 void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
3526 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3527 int16_t* mcbuffer) {
3528 int x, y;
3529 uint8_t *src = (uint8_t*) _src;
3530 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3531 int16_t *tmp = mcbuffer;
3532 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3533 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3534
3535 src -= qpel_extra_before[2] * srcstride;
3536 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3537 -1);
3538
3539 /* LOAD src from memory to registers to limit memory bandwidth */
3540 if (width == 4) {
3541
3542 for (y = 0; y < height + qpel_extra[2]; y += 2) {
3543 /* load data in register */
3544 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3545 src += srcstride;
3546 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3547 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3548 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3549 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3550 _mm_srli_si128(x1, 3));
3551 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3552 _mm_srli_si128(t1, 3));
3553
3554 /* PMADDUBSW then PMADDW */
3555 x2 = _mm_maddubs_epi16(x2, r0);
3556 t2 = _mm_maddubs_epi16(t2, r0);
3557 x3 = _mm_maddubs_epi16(x3, r0);
3558 t3 = _mm_maddubs_epi16(t3, r0);
3559 x2 = _mm_hadd_epi16(x2, x3);
3560 t2 = _mm_hadd_epi16(t2, t3);
3561 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3562 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3563 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3564 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3565 /* give results back */
3566 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3567
3568 tmp += MAX_PB_SIZE;
3569 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3570
3571 src += srcstride;
3572 tmp += MAX_PB_SIZE;
3573 }
3574 } else
3575 for (y = 0; y < height + qpel_extra[2]; y++) {
3576 for (x = 0; x < width; x += 8) {
3577 /* load data in register */
3578 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3579 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3580 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3581 _mm_srli_si128(x1, 3));
3582 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3583 _mm_srli_si128(x1, 5));
3584 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3585 _mm_srli_si128(x1, 7));
3586
3587 /* PMADDUBSW then PMADDW */
3588 x2 = _mm_maddubs_epi16(x2, r0);
3589 x3 = _mm_maddubs_epi16(x3, r0);
3590 x4 = _mm_maddubs_epi16(x4, r0);
3591 x5 = _mm_maddubs_epi16(x5, r0);
3592 x2 = _mm_hadd_epi16(x2, x3);
3593 x4 = _mm_hadd_epi16(x4, x5);
3594 x2 = _mm_hadd_epi16(x2, x4);
3595 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3596
3597 /* give results back */
3598 _mm_store_si128((__m128i *) &tmp[x], x2);
3599
3600 }
3601 src += srcstride;
3602 tmp += MAX_PB_SIZE;
3603 }
3604
3605 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
3606 srcstride = MAX_PB_SIZE;
3607
3608     /* vertical treatment on the temp table : tmp holds 16-bit values, so the register
3609      calculations need 32-bit intermediates */
3610 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
3611 for (y = 0; y < height; y++) {
3612 for (x = 0; x < width; x += 8) {
3613
3614 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3615 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3616 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3617 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3618 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3619 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3620 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3621 x8 = _mm_loadu_si128((__m128i *) &tmp[x + 4 * srcstride]);
3622
3623 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3624 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3625 t8 = _mm_mullo_epi16(x1, r0);
3626 rBuffer = _mm_mulhi_epi16(x1, r0);
3627 t7 = _mm_mullo_epi16(x2, r1);
3628 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3629 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3630
3631 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3632 rBuffer = _mm_mulhi_epi16(x2, r1);
3633 t8 = _mm_mullo_epi16(x3, r0);
3634 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3635 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3636
3637 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3638 rBuffer = _mm_mulhi_epi16(x3, r0);
3639 t7 = _mm_mullo_epi16(x4, r1);
3640 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3641 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3642
3643 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3644 rBuffer = _mm_mulhi_epi16(x4, r1);
3645 t8 = _mm_mullo_epi16(x5, r0);
3646 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3647 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3648
3649 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3650 rBuffer = _mm_mulhi_epi16(x5, r0);
3651 t7 = _mm_mullo_epi16(x6, r1);
3652 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3653 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3654
3655 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3656 rBuffer = _mm_mulhi_epi16(x6, r1);
3657 t8 = _mm_mullo_epi16(x7, r0);
3658 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3659 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3660
3661 rBuffer = _mm_mulhi_epi16(x7, r0);
3662 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3663 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3664
3665 t8 = _mm_unpacklo_epi16(
3666 _mm_mullo_epi16(x8,
3667 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3668 _mm_mulhi_epi16(x8,
3669 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3670 x8 = _mm_unpackhi_epi16(
3671 _mm_mullo_epi16(x8,
3672 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3673 _mm_mulhi_epi16(x8,
3674 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3675
3676             /* accumulate the partial products : */
3677
3678 r1 = _mm_add_epi32(x1, x2);
3679 x3 = _mm_add_epi32(x3, x4);
3680 x5 = _mm_add_epi32(x5, x6);
3681 r1 = _mm_add_epi32(r1, x3);
3682 x7 = _mm_add_epi32(x7, x8);
3683 r1 = _mm_add_epi32(r1, x5);
3684
3685 r0 = _mm_add_epi32(t1, t2);
3686 t3 = _mm_add_epi32(t3, t4);
3687 t5 = _mm_add_epi32(t5, t6);
3688 r0 = _mm_add_epi32(r0, t3);
3689 t7 = _mm_add_epi32(t7, t8);
3690 r0 = _mm_add_epi32(r0, t5);
3691 r1 = _mm_add_epi32(r1, x7);
3692 r0 = _mm_add_epi32(r0, t7);
3693 r1 = _mm_srli_epi32(r1, 6);
3694 r0 = _mm_srli_epi32(r0, 6);
3695
3696 r1 = _mm_and_si128(r1,
3697 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3698 r0 = _mm_and_si128(r0,
3699 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3700 r0 = _mm_hadd_epi16(r0, r1);
3701 _mm_store_si128((__m128i *) &dst[x], r0);
3702
3703 }
3704 tmp += MAX_PB_SIZE;
3705 dst += dststride;
3706 }
3707 }
3708 void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
3709 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3710 int16_t* mcbuffer) {
3711 int x, y;
3712 uint8_t *src = (uint8_t*) _src;
3713 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3714 int16_t *tmp = mcbuffer;
3715 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3716 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3717
3718 src -= qpel_extra_before[3] * srcstride;
3719 r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3720 -1);
3721
3722 /* LOAD src from memory to registers to limit memory bandwidth */
3723 if (width == 4) {
3724
3725 for (y = 0; y < height + qpel_extra[3]; y += 2) {
3726 /* load data in register */
3727 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3728 src += srcstride;
3729 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3730 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3731 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3732 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3733 _mm_srli_si128(x1, 3));
3734 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3735 _mm_srli_si128(t1, 3));
3736
3737 /* PMADDUBSW then PHADDW */
3738 x2 = _mm_maddubs_epi16(x2, r0);
3739 t2 = _mm_maddubs_epi16(t2, r0);
3740 x3 = _mm_maddubs_epi16(x3, r0);
3741 t3 = _mm_maddubs_epi16(t3, r0);
3742 x2 = _mm_hadd_epi16(x2, x3);
3743 t2 = _mm_hadd_epi16(t2, t3);
3744 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3745 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3746 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3747 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3748 /* give results back */
3749 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3750
3751 tmp += MAX_PB_SIZE;
3752 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3753
3754 src += srcstride;
3755 tmp += MAX_PB_SIZE;
3756 }
3757 } else
3758 for (y = 0; y < height + qpel_extra[3]; y++) {
3759 for (x = 0; x < width; x += 8) {
3760 /* load data in register */
3761 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3762 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3763 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3764 _mm_srli_si128(x1, 3));
3765 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3766 _mm_srli_si128(x1, 5));
3767 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3768 _mm_srli_si128(x1, 7));
3769
3770 /* PMADDUBSW then PHADDW */
3771 x2 = _mm_maddubs_epi16(x2, r0);
3772 x3 = _mm_maddubs_epi16(x3, r0);
3773 x4 = _mm_maddubs_epi16(x4, r0);
3774 x5 = _mm_maddubs_epi16(x5, r0);
3775 x2 = _mm_hadd_epi16(x2, x3);
3776 x4 = _mm_hadd_epi16(x4, x5);
3777 x2 = _mm_hadd_epi16(x2, x4);
3778 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3779
3780 /* give results back */
3781 _mm_store_si128((__m128i *) &tmp[x], x2);
3782
3783 }
3784 src += srcstride;
3785 tmp += MAX_PB_SIZE;
3786 }
3787
3788 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
3789 srcstride = MAX_PB_SIZE;
3790
3791 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
3792 must be accumulated in 32-bit registers */
3793 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3794 for (y = 0; y < height; y++) {
3795 for (x = 0; x < width; x += 8) {
3796
3797 x1 = _mm_setzero_si128();
3798 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3799 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3800 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3801 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3802 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3803 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3804 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
3805
3806
3807 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3808
3809 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3810 t7 = _mm_mullo_epi16(x2, r1);
3811 rBuffer = _mm_mulhi_epi16(x2, r1);
3812 t8 = _mm_mullo_epi16(x3, r0);
3813 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3814 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3815
3816 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3817 rBuffer = _mm_mulhi_epi16(x3, r0);
3818 t7 = _mm_mullo_epi16(x4, r1);
3819 t3 = _mm_unpacklo_epi16(t8, rBuffer);
3820 x3 = _mm_unpackhi_epi16(t8, rBuffer);
3821
3822 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3823 rBuffer = _mm_mulhi_epi16(x4, r1);
3824 t8 = _mm_mullo_epi16(x5, r0);
3825 t4 = _mm_unpacklo_epi16(t7, rBuffer);
3826 x4 = _mm_unpackhi_epi16(t7, rBuffer);
3827
3828 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3829 rBuffer = _mm_mulhi_epi16(x5, r0);
3830 t7 = _mm_mullo_epi16(x6, r1);
3831 t5 = _mm_unpacklo_epi16(t8, rBuffer);
3832 x5 = _mm_unpackhi_epi16(t8, rBuffer);
3833
3834 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3835 rBuffer = _mm_mulhi_epi16(x6, r1);
3836 t8 = _mm_mullo_epi16(x7, r0);
3837 t6 = _mm_unpacklo_epi16(t7, rBuffer);
3838 x6 = _mm_unpackhi_epi16(t7, rBuffer);
3839
3840 rBuffer = _mm_mulhi_epi16(x7, r0);
3841 t7 = _mm_unpacklo_epi16(t8, rBuffer);
3842 x7 = _mm_unpackhi_epi16(t8, rBuffer);
3843
3844 t8 = _mm_unpacklo_epi16(
3845 _mm_mullo_epi16(x8,
3846 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3847 _mm_mulhi_epi16(x8,
3848 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3849 x8 = _mm_unpackhi_epi16(
3850 _mm_mullo_epi16(x8,
3851 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3852 _mm_mulhi_epi16(x8,
3853 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3854
3855 /* sum the 32-bit tap products */
3856
3857 x3 = _mm_add_epi32(x3, x4);
3858 x5 = _mm_add_epi32(x5, x6);
3859 r1 = _mm_add_epi32(x2, x3);
3860 x7 = _mm_add_epi32(x7, x8);
3861 r1 = _mm_add_epi32(r1, x5);
3862
3863 t3 = _mm_add_epi32(t3, t4);
3864 t5 = _mm_add_epi32(t5, t6);
3865 r0 = _mm_add_epi32(t2, t3);
3866 t7 = _mm_add_epi32(t7, t8);
3867 r0 = _mm_add_epi32(r0, t5);
3868 r1 = _mm_add_epi32(r1, x7);
3869 r0 = _mm_add_epi32(r0, t7);
3870 r1 = _mm_srli_epi32(r1, 6);
3871 r0 = _mm_srli_epi32(r0, 6);
3872
3873 r1 = _mm_and_si128(r1,
3874 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3875 r0 = _mm_and_si128(r0,
3876 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3877 r0 = _mm_hadd_epi16(r0, r1);
3878 _mm_store_si128((__m128i *) &dst[x], r0);
3879
3880 }
3881 tmp += MAX_PB_SIZE;
3882 dst += dststride;
3883 }
3884 }
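In the vertical pass just above, each 16-bit intermediate value is multiplied by a 16-bit coefficient and the full 32-bit product is needed, but SSE2 only returns that product as separate low and high halves (_mm_mullo_epi16 / _mm_mulhi_epi16), which the code then reassembles with _mm_unpacklo_epi16 / _mm_unpackhi_epi16. A minimal sketch of that widening-multiply idiom, assuming SSE2; the helper name is illustrative.

#include <emmintrin.h>
#include <stdint.h>

/* Multiply eight signed 16-bit lanes by one coefficient and return the
   eight full 32-bit products: lanes 0..3 in *lo, lanes 4..7 in *hi. */
static inline void widen_mul16(__m128i v, int16_t coeff,
                               __m128i *lo, __m128i *hi)
{
    __m128i c = _mm_set1_epi16(coeff);
    __m128i l = _mm_mullo_epi16(v, c);   /* low 16 bits of each product  */
    __m128i h = _mm_mulhi_epi16(v, c);   /* high 16 bits of each product */
    *lo = _mm_unpacklo_epi16(l, h);      /* interleave -> 32-bit lanes 0..3 */
    *hi = _mm_unpackhi_epi16(l, h);      /* interleave -> 32-bit lanes 4..7 */
}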
3885 void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3886 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3887 int16_t* mcbuffer) {
3888 int x, y;
3889 uint8_t *src = (uint8_t*) _src;
3890 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3891 int16_t *tmp = mcbuffer;
3892 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3893 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3894
3895 src -= qpel_extra_before[1] * srcstride;
3896 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
3897 4, -1);
3898
3899 /* LOAD src from memory to registers to limit memory bandwidth */
3900 if (width == 4) {
3901
3902 for (y = 0; y < height + qpel_extra[1]; y += 2) {
3903 /* load data in register */
3904 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3905 src += srcstride;
3906 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3907 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3908 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3909 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3910 _mm_srli_si128(x1, 3));
3911 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3912 _mm_srli_si128(t1, 3));
3913
3914 /* PMADDUBSW then PHADDW */
3915 x2 = _mm_maddubs_epi16(x2, r0);
3916 t2 = _mm_maddubs_epi16(t2, r0);
3917 x3 = _mm_maddubs_epi16(x3, r0);
3918 t3 = _mm_maddubs_epi16(t3, r0);
3919 x2 = _mm_hadd_epi16(x2, x3);
3920 t2 = _mm_hadd_epi16(t2, t3);
3921 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3922 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3923 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3924 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3925 /* give results back */
3926 _mm_storel_epi64((__m128i *) &tmp[0], x2);
3927
3928 tmp += MAX_PB_SIZE;
3929 _mm_storel_epi64((__m128i *) &tmp[0], t2);
3930
3931 src += srcstride;
3932 tmp += MAX_PB_SIZE;
3933 }
3934 } else
3935 for (y = 0; y < height + qpel_extra[1]; y++) {
3936 for (x = 0; x < width; x += 8) {
3937 /* load data in register */
3938 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3939 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3940 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3941 _mm_srli_si128(x1, 3));
3942 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3943 _mm_srli_si128(x1, 5));
3944 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3945 _mm_srli_si128(x1, 7));
3946
3947 /* PMADDUBSW then PHADDW */
3948 x2 = _mm_maddubs_epi16(x2, r0);
3949 x3 = _mm_maddubs_epi16(x3, r0);
3950 x4 = _mm_maddubs_epi16(x4, r0);
3951 x5 = _mm_maddubs_epi16(x5, r0);
3952 x2 = _mm_hadd_epi16(x2, x3);
3953 x4 = _mm_hadd_epi16(x4, x5);
3954 x2 = _mm_hadd_epi16(x2, x4);
3955 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3956
3957 /* give results back */
3958 _mm_store_si128((__m128i *) &tmp[x], x2);
3959
3960 }
3961 src += srcstride;
3962 tmp += MAX_PB_SIZE;
3963 }
3964
3965 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3966 srcstride = MAX_PB_SIZE;
3967
3968 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
3969 must be accumulated in 32-bit registers */
3970 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3971 for (y = 0; y < height; y++) {
3972 for (x = 0; x < width; x += 8) {
3973
3974 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3975 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3976 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3977 x4 = _mm_load_si128((__m128i *) &tmp[x]);
3978 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3979 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3980 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3981
3982 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3983 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3984 t8 = _mm_mullo_epi16(x1, r0);
3985 rBuffer = _mm_mulhi_epi16(x1, r0);
3986 t7 = _mm_mullo_epi16(x2, r1);
3987 t1 = _mm_unpacklo_epi16(t8, rBuffer);
3988 x1 = _mm_unpackhi_epi16(t8, rBuffer);
3989
3990 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3991 rBuffer = _mm_mulhi_epi16(x2, r1);
3992 t8 = _mm_mullo_epi16(x3, r0);
3993 t2 = _mm_unpacklo_epi16(t7, rBuffer);
3994 x2 = _mm_unpackhi_epi16(t7, rBuffer);
3995
3996 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3997 rBuffer = _mm_mulhi_epi16(x3, r0);
3998 t7 = _mm_mullo_epi16(x4, r1);
3999 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4000 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4001
4002 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4003 rBuffer = _mm_mulhi_epi16(x4, r1);
4004 t8 = _mm_mullo_epi16(x5, r0);
4005 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4006 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4007
4008 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4009 rBuffer = _mm_mulhi_epi16(x5, r0);
4010 t7 = _mm_mullo_epi16(x6, r1);
4011 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4012 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4013
4014 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4015 rBuffer = _mm_mulhi_epi16(x6, r1);
4016 t8 = _mm_mullo_epi16(x7, r0);
4017 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4018 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4019
4020 rBuffer = _mm_mulhi_epi16(x7, r0);
4021 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4022 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4023
4024
4025
4026 /* sum the 32-bit tap products */
4027
4028 r1 = _mm_add_epi32(x1, x2);
4029 x3 = _mm_add_epi32(x3, x4);
4030 x5 = _mm_add_epi32(x5, x6);
4031 r1 = _mm_add_epi32(r1, x3);
4032 r1 = _mm_add_epi32(r1, x5);
4033
4034 r0 = _mm_add_epi32(t1, t2);
4035 t3 = _mm_add_epi32(t3, t4);
4036 t5 = _mm_add_epi32(t5, t6);
4037 r0 = _mm_add_epi32(r0, t3);
4038 r0 = _mm_add_epi32(r0, t5);
4039 r1 = _mm_add_epi32(r1, x7);
4040 r0 = _mm_add_epi32(r0, t7);
4041 r1 = _mm_srli_epi32(r1, 6);
4042 r0 = _mm_srli_epi32(r0, 6);
4043
4044 r1 = _mm_and_si128(r1,
4045 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4046 r0 = _mm_and_si128(r0,
4047 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4048 r0 = _mm_hadd_epi16(r0, r1);
4049 _mm_store_si128((__m128i *) &dst[x], r0);
4050
4051 }
4052 tmp += MAX_PB_SIZE;
4053 dst += dststride;
4054 }
4055 }
4056 void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4057 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4058 int16_t* mcbuffer) {
4059 int x, y;
4060 uint8_t *src = (uint8_t*) _src;
4061 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4062 int16_t *tmp = mcbuffer;
4063 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4064 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4065
4066 src -= qpel_extra_before[2] * srcstride;
4067 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4068 4, -1);
4069
4070 /* LOAD src from memory to registers to limit memory bandwidth */
4071 if (width == 4) {
4072
4073 for (y = 0; y < height + qpel_extra[2]; y += 2) {
4074 /* load data in register */
4075 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4076 src += srcstride;
4077 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4078 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4079 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4080 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4081 _mm_srli_si128(x1, 3));
4082 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4083 _mm_srli_si128(t1, 3));
4084
4085 /* PMADDUBSW then PHADDW */
4086 x2 = _mm_maddubs_epi16(x2, r0);
4087 t2 = _mm_maddubs_epi16(t2, r0);
4088 x3 = _mm_maddubs_epi16(x3, r0);
4089 t3 = _mm_maddubs_epi16(t3, r0);
4090 x2 = _mm_hadd_epi16(x2, x3);
4091 t2 = _mm_hadd_epi16(t2, t3);
4092 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4093 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4094 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4095 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4096 /* give results back */
4097 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4098
4099 tmp += MAX_PB_SIZE;
4100 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4101
4102 src += srcstride;
4103 tmp += MAX_PB_SIZE;
4104 }
4105 } else
4106 for (y = 0; y < height + qpel_extra[2]; y++) {
4107 for (x = 0; x < width; x += 8) {
4108 /* load data in register */
4109 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4110 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4111 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4112 _mm_srli_si128(x1, 3));
4113 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4114 _mm_srli_si128(x1, 5));
4115 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4116 _mm_srli_si128(x1, 7));
4117
4118 /* PMADDUBSW then PHADDW */
4119 x2 = _mm_maddubs_epi16(x2, r0);
4120 x3 = _mm_maddubs_epi16(x3, r0);
4121 x4 = _mm_maddubs_epi16(x4, r0);
4122 x5 = _mm_maddubs_epi16(x5, r0);
4123 x2 = _mm_hadd_epi16(x2, x3);
4124 x4 = _mm_hadd_epi16(x4, x5);
4125 x2 = _mm_hadd_epi16(x2, x4);
4126 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4127
4128 /* give results back */
4129 _mm_store_si128((__m128i *) &tmp[x], x2);
4130
4131 }
4132 src += srcstride;
4133 tmp += MAX_PB_SIZE;
4134 }
4135
4136 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4137 srcstride = MAX_PB_SIZE;
4138
4139 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
4140 must be accumulated in 32-bit registers */
4141 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4142 for (y = 0; y < height; y++) {
4143 for (x = 0; x < width; x += 8) {
4144
4145 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4146 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4147 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4148 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4149 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4150 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4151 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4152 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4153
4154 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4155 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4156 t8 = _mm_mullo_epi16(x1, r0);
4157 rBuffer = _mm_mulhi_epi16(x1, r0);
4158 t7 = _mm_mullo_epi16(x2, r1);
4159 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4160 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4161
4162 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4163 rBuffer = _mm_mulhi_epi16(x2, r1);
4164 t8 = _mm_mullo_epi16(x3, r0);
4165 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4166 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4167
4168 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4169 rBuffer = _mm_mulhi_epi16(x3, r0);
4170 t7 = _mm_mullo_epi16(x4, r1);
4171 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4172 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4173
4174 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4175 rBuffer = _mm_mulhi_epi16(x4, r1);
4176 t8 = _mm_mullo_epi16(x5, r0);
4177 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4178 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4179
4180 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4181 rBuffer = _mm_mulhi_epi16(x5, r0);
4182 t7 = _mm_mullo_epi16(x6, r1);
4183 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4184 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4185
4186 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4187 rBuffer = _mm_mulhi_epi16(x6, r1);
4188 t8 = _mm_mullo_epi16(x7, r0);
4189 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4190 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4191
4192 rBuffer = _mm_mulhi_epi16(x7, r0);
4193 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4194 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4195
4196 t8 = _mm_unpacklo_epi16(
4197 _mm_mullo_epi16(x8,
4198 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4199 _mm_mulhi_epi16(x8,
4200 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4201 x8 = _mm_unpackhi_epi16(
4202 _mm_mullo_epi16(x8,
4203 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4204 _mm_mulhi_epi16(x8,
4205 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4206
4207 /* sum the 32-bit tap products */
4208
4209 r1 = _mm_add_epi32(x1, x2);
4210 x3 = _mm_add_epi32(x3, x4);
4211 x5 = _mm_add_epi32(x5, x6);
4212 r1 = _mm_add_epi32(r1, x3);
4213 x7 = _mm_add_epi32(x7, x8);
4214 r1 = _mm_add_epi32(r1, x5);
4215
4216 r0 = _mm_add_epi32(t1, t2);
4217 t3 = _mm_add_epi32(t3, t4);
4218 t5 = _mm_add_epi32(t5, t6);
4219 r0 = _mm_add_epi32(r0, t3);
4220 t7 = _mm_add_epi32(t7, t8);
4221 r0 = _mm_add_epi32(r0, t5);
4222 r1 = _mm_add_epi32(r1, x7);
4223 r0 = _mm_add_epi32(r0, t7);
4224 r1 = _mm_srli_epi32(r1, 6);
4225 r0 = _mm_srli_epi32(r0, 6);
4226
4227 r1 = _mm_and_si128(r1,
4228 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4229 r0 = _mm_and_si128(r0,
4230 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4231 r0 = _mm_hadd_epi16(r0, r1);
4232 _mm_store_si128((__m128i *) &dst[x], r0);
4233
4234 }
4235 tmp += MAX_PB_SIZE;
4236 dst += dststride;
4237 }
4238 }
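Once the tap products have been summed and shifted right by 6, the two registers of 32-bit results are narrowed back to eight 16-bit values: each 32-bit lane is masked to its low word (the 0/65535 pattern above) and _mm_hadd_epi16 then compacts both registers into a single store. A minimal sketch of that narrowing step, assuming SSSE3 for _mm_hadd_epi16; the helper name is illustrative.

#include <tmmintrin.h>

/* Narrow 2x4 32-bit sums (already >> 6 and known to fit in 16 bits) to
   eight 16-bit values: r0 lanes land in words 0..3, r1 lanes in 4..7. */
static inline __m128i narrow32_to_16(__m128i r0, __m128i r1)
{
    /* -1 per low word is the same mask as the 65535 constants above */
    const __m128i mask = _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1);
    r0 = _mm_and_si128(r0, mask);        /* keep the low 16 bits of each lane */
    r1 = _mm_and_si128(r1, mask);
    return _mm_hadd_epi16(r0, r1);       /* pairwise adds with zero compact the lanes */
}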
4239 void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4240 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4241 int16_t* mcbuffer) {
4242 int x, y;
4243 uint8_t *src = (uint8_t*) _src;
4244 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4245 int16_t *tmp = mcbuffer;
4246 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4247 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4248
4249 src -= qpel_extra_before[3] * srcstride;
4250 r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4251 4, -1);
4252
4253 /* LOAD src from memory to registers to limit memory bandwidth */
4254 if (width == 4) {
4255
4256 for (y = 0; y < height + qpel_extra[3]; y += 2) {
4257 /* load data in register */
4258 x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4259 src += srcstride;
4260 t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4261 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4262 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4263 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4264 _mm_srli_si128(x1, 3));
4265 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4266 _mm_srli_si128(t1, 3));
4267
4268 /* PMADDUBSW then PHADDW */
4269 x2 = _mm_maddubs_epi16(x2, r0);
4270 t2 = _mm_maddubs_epi16(t2, r0);
4271 x3 = _mm_maddubs_epi16(x3, r0);
4272 t3 = _mm_maddubs_epi16(t3, r0);
4273 x2 = _mm_hadd_epi16(x2, x3);
4274 t2 = _mm_hadd_epi16(t2, t3);
4275 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4276 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4277 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4278 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4279 /* give results back */
4280 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4281
4282 tmp += MAX_PB_SIZE;
4283 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4284
4285 src += srcstride;
4286 tmp += MAX_PB_SIZE;
4287 }
4288 } else
4289 for (y = 0; y < height + qpel_extra[3]; y++) {
4290 for (x = 0; x < width; x += 8) {
4291 /* load data in register */
4292 x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4293 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4294 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4295 _mm_srli_si128(x1, 3));
4296 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4297 _mm_srli_si128(x1, 5));
4298 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4299 _mm_srli_si128(x1, 7));
4300
4301 /* PMADDUBSW then PHADDW */
4302 x2 = _mm_maddubs_epi16(x2, r0);
4303 x3 = _mm_maddubs_epi16(x3, r0);
4304 x4 = _mm_maddubs_epi16(x4, r0);
4305 x5 = _mm_maddubs_epi16(x5, r0);
4306 x2 = _mm_hadd_epi16(x2, x3);
4307 x4 = _mm_hadd_epi16(x4, x5);
4308 x2 = _mm_hadd_epi16(x2, x4);
4309 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4310
4311 /* give results back */
4312 _mm_store_si128((__m128i *) &tmp[x], x2);
4313
4314 }
4315 src += srcstride;
4316 tmp += MAX_PB_SIZE;
4317 }
4318
4319 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4320 srcstride = MAX_PB_SIZE;
4321
4322 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
4323 must be accumulated in 32-bit registers */
4324 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4325 for (y = 0; y < height; y++) {
4326 for (x = 0; x < width; x += 8) {
4327
4328 x1 = _mm_setzero_si128();
4329 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4330 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4331 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4332 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4333 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4334 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4335 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4336
4337 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4338
4339 t7 = _mm_mullo_epi16(x2, r1);
4340
4341
4342 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4343 rBuffer = _mm_mulhi_epi16(x2, r1);
4344 t8 = _mm_mullo_epi16(x3, r0);
4345 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4346 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4347
4348 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4349 rBuffer = _mm_mulhi_epi16(x3, r0);
4350 t7 = _mm_mullo_epi16(x4, r1);
4351 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4352 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4353
4354 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4355 rBuffer = _mm_mulhi_epi16(x4, r1);
4356 t8 = _mm_mullo_epi16(x5, r0);
4357 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4358 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4359
4360 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4361 rBuffer = _mm_mulhi_epi16(x5, r0);
4362 t7 = _mm_mullo_epi16(x6, r1);
4363 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4364 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4365
4366 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4367 rBuffer = _mm_mulhi_epi16(x6, r1);
4368 t8 = _mm_mullo_epi16(x7, r0);
4369 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4370 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4371
4372 rBuffer = _mm_mulhi_epi16(x7, r0);
4373 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4374 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4375
4376 t8 = _mm_unpacklo_epi16(
4377 _mm_mullo_epi16(x8,
4378 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4379 _mm_mulhi_epi16(x8,
4380 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4381 x8 = _mm_unpackhi_epi16(
4382 _mm_mullo_epi16(x8,
4383 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4384 _mm_mulhi_epi16(x8,
4385 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4386
4387 /* sum the 32-bit tap products */
4388
4389 x3 = _mm_add_epi32(x3, x4);
4390 x5 = _mm_add_epi32(x5, x6);
4391 r1 = _mm_add_epi32(x2, x3);
4392 x7 = _mm_add_epi32(x7, x8);
4393 r1 = _mm_add_epi32(r1, x5);
4394
4395 t3 = _mm_add_epi32(t3, t4);
4396 t5 = _mm_add_epi32(t5, t6);
4397 r0 = _mm_add_epi32(t2, t3);
4398 t7 = _mm_add_epi32(t7, t8);
4399 r0 = _mm_add_epi32(r0, t5);
4400 r1 = _mm_add_epi32(r1, x7);
4401 r0 = _mm_add_epi32(r0, t7);
4402 r1 = _mm_srli_epi32(r1, 6);
4403 r0 = _mm_srli_epi32(r0, 6);
4404
4405 r1 = _mm_and_si128(r1,
4406 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4407 r0 = _mm_and_si128(r0,
4408 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4409 r0 = _mm_hadd_epi16(r0, r1);
4410 _mm_store_si128((__m128i *) &dst[x], r0);
4411
4412 }
4413 tmp += MAX_PB_SIZE;
4414 dst += dststride;
4415 }
4416 }
4417 void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
4418 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4419 int16_t* mcbuffer) {
4420 int x, y;
4421 uint8_t *src = (uint8_t*) _src;
4422 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4423 int16_t *tmp = mcbuffer;
4424 __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
4425 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4426
4427 src -= qpel_extra_before[1] * srcstride;
4428 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4429 0);
4430
4431 /* LOAD src from memory to registers to limit memory bandwidth */
4432 if (width == 4) {
4433
4434 for (y = 0; y < height + qpel_extra[1]; y += 2) {
4435 /* load data in register */
4436 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4437 x1 = _mm_slli_si128(x1, 1);
4438 src += srcstride;
4439 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4440 t1 = _mm_slli_si128(t1, 1);
4441 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4442 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4443 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4444 _mm_srli_si128(x1, 3));
4445 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4446 _mm_srli_si128(t1, 3));
4447
4448 /* PMADDUBSW then PHADDW */
4449 x2 = _mm_maddubs_epi16(x2, r0);
4450 t2 = _mm_maddubs_epi16(t2, r0);
4451 x3 = _mm_maddubs_epi16(x3, r0);
4452 t3 = _mm_maddubs_epi16(t3, r0);
4453 x2 = _mm_hadd_epi16(x2, x3);
4454 t2 = _mm_hadd_epi16(t2, t3);
4455 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4456 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4457 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4458 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4459 /* give results back */
4460 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4461
4462 tmp += MAX_PB_SIZE;
4463 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4464
4465 src += srcstride;
4466 tmp += MAX_PB_SIZE;
4467 }
4468 } else
4469 for (y = 0; y < height + qpel_extra[1]; y++) {
4470 for (x = 0; x < width; x += 8) {
4471 /* load data in register */
4472 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4473 x1 = _mm_slli_si128(x1, 1);
4474 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4475 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4476 _mm_srli_si128(x1, 3));
4477 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4478 _mm_srli_si128(x1, 5));
4479 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4480 _mm_srli_si128(x1, 7));
4481
4482 /* PMADDUBSW then PHADDW */
4483 x2 = _mm_maddubs_epi16(x2, r0);
4484 x3 = _mm_maddubs_epi16(x3, r0);
4485 x4 = _mm_maddubs_epi16(x4, r0);
4486 x5 = _mm_maddubs_epi16(x5, r0);
4487 x2 = _mm_hadd_epi16(x2, x3);
4488 x4 = _mm_hadd_epi16(x4, x5);
4489 x2 = _mm_hadd_epi16(x2, x4);
4490 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4491
4492 /* give results back */
4493 _mm_store_si128((__m128i *) &tmp[x], x2);
4494
4495 }
4496 src += srcstride;
4497 tmp += MAX_PB_SIZE;
4498 }
4499
4500 tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
4501 srcstride = MAX_PB_SIZE;
4502
4503 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
4504 must be accumulated in 32-bit registers */
4505 rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
4506 for (y = 0; y < height; y++) {
4507 for (x = 0; x < width; x += 8) {
4508
4509 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4510 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4511 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4512 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4513 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4514 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4515 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4516
4517 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4518 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4519 t8 = _mm_mullo_epi16(x1, r0);
4520 rBuffer = _mm_mulhi_epi16(x1, r0);
4521 t7 = _mm_mullo_epi16(x2, r1);
4522 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4523 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4524
4525 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4526 rBuffer = _mm_mulhi_epi16(x2, r1);
4527 t8 = _mm_mullo_epi16(x3, r0);
4528 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4529 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4530
4531 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4532 rBuffer = _mm_mulhi_epi16(x3, r0);
4533 t7 = _mm_mullo_epi16(x4, r1);
4534 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4535 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4536
4537 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4538 rBuffer = _mm_mulhi_epi16(x4, r1);
4539 t8 = _mm_mullo_epi16(x5, r0);
4540 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4541 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4542
4543 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4544 rBuffer = _mm_mulhi_epi16(x5, r0);
4545 t7 = _mm_mullo_epi16(x6, r1);
4546 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4547 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4548
4549 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4550 rBuffer = _mm_mulhi_epi16(x6, r1);
4551 t8 = _mm_mullo_epi16(x7, r0);
4552 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4553 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4554
4555 rBuffer = _mm_mulhi_epi16(x7, r0);
4556 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4557 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4558
4559
4560 /* sum the 32-bit tap products */
4561
4562 r1 = _mm_add_epi32(x1, x2);
4563 x3 = _mm_add_epi32(x3, x4);
4564 x5 = _mm_add_epi32(x5, x6);
4565 r1 = _mm_add_epi32(r1, x3);
4566 r1 = _mm_add_epi32(r1, x5);
4567
4568 r0 = _mm_add_epi32(t1, t2);
4569 t3 = _mm_add_epi32(t3, t4);
4570 t5 = _mm_add_epi32(t5, t6);
4571 r0 = _mm_add_epi32(r0, t3);
4572 r0 = _mm_add_epi32(r0, t5);
4573 r1 = _mm_add_epi32(r1, x7);
4574 r0 = _mm_add_epi32(r0, t7);
4575 r1 = _mm_srli_epi32(r1, 6);
4576 r0 = _mm_srli_epi32(r0, 6);
4577
4578 r1 = _mm_and_si128(r1,
4579 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4580 r0 = _mm_and_si128(r0,
4581 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4582 r0 = _mm_hadd_epi16(r0, r1);
4583 _mm_store_si128((__m128i *) &dst[x], r0);
4584
4585 }
4586 tmp += MAX_PB_SIZE;
4587 dst += dststride;
4588 }
4589 }
4590 void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4591 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4592 int16_t* mcbuffer) {
4593 int x, y;
4594 uint8_t *src = (uint8_t*) _src;
4595 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4596 int16_t *tmp = mcbuffer;
4597 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4598 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4599
4600 src -= qpel_extra_before[2] * srcstride;
4601 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4602 0);
4603
4604 /* LOAD src from memory to registers to limit memory bandwidth */
4605 if (width == 4) {
4606
4607 for (y = 0; y < height + qpel_extra[2]; y += 2) {
4608 /* load data in register */
4609 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4610 x1 = _mm_slli_si128(x1, 1);
4611 src += srcstride;
4612 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4613 t1 = _mm_slli_si128(t1, 1);
4614 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4615 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4616 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4617 _mm_srli_si128(x1, 3));
4618 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4619 _mm_srli_si128(t1, 3));
4620
4621 /* PMADDUBSW then PHADDW */
4622 x2 = _mm_maddubs_epi16(x2, r0);
4623 t2 = _mm_maddubs_epi16(t2, r0);
4624 x3 = _mm_maddubs_epi16(x3, r0);
4625 t3 = _mm_maddubs_epi16(t3, r0);
4626 x2 = _mm_hadd_epi16(x2, x3);
4627 t2 = _mm_hadd_epi16(t2, t3);
4628 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4629 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4630 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4631 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4632 /* give results back */
4633 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4634
4635 tmp += MAX_PB_SIZE;
4636 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4637
4638 src += srcstride;
4639 tmp += MAX_PB_SIZE;
4640 }
4641 } else
4642 for (y = 0; y < height + qpel_extra[2]; y++) {
4643 for (x = 0; x < width; x += 8) {
4644 /* load data in register */
4645 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4646 x1 = _mm_slli_si128(x1, 1);
4647 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4648 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4649 _mm_srli_si128(x1, 3));
4650 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4651 _mm_srli_si128(x1, 5));
4652 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4653 _mm_srli_si128(x1, 7));
4654
4655 /* PMADDUBSW then PHADDW */
4656 x2 = _mm_maddubs_epi16(x2, r0);
4657 x3 = _mm_maddubs_epi16(x3, r0);
4658 x4 = _mm_maddubs_epi16(x4, r0);
4659 x5 = _mm_maddubs_epi16(x5, r0);
4660 x2 = _mm_hadd_epi16(x2, x3);
4661 x4 = _mm_hadd_epi16(x4, x5);
4662 x2 = _mm_hadd_epi16(x2, x4);
4663 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4664
4665 /* give results back */
4666 _mm_store_si128((__m128i *) &tmp[x], x2);
4667
4668 }
4669 src += srcstride;
4670 tmp += MAX_PB_SIZE;
4671 }
4672
4673 tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4674 srcstride = MAX_PB_SIZE;
4675
4676 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
4677 must be accumulated in 32-bit registers */
4678 rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4679 for (y = 0; y < height; y++) {
4680 for (x = 0; x < width; x += 8) {
4681
4682 x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4683 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4684 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4685 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4686 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4687 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4688 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4689 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4690
4691 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4692 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4693 t8 = _mm_mullo_epi16(x1, r0);
4694 rBuffer = _mm_mulhi_epi16(x1, r0);
4695 t7 = _mm_mullo_epi16(x2, r1);
4696 t1 = _mm_unpacklo_epi16(t8, rBuffer);
4697 x1 = _mm_unpackhi_epi16(t8, rBuffer);
4698
4699 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4700 rBuffer = _mm_mulhi_epi16(x2, r1);
4701 t8 = _mm_mullo_epi16(x3, r0);
4702 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4703 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4704
4705 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4706 rBuffer = _mm_mulhi_epi16(x3, r0);
4707 t7 = _mm_mullo_epi16(x4, r1);
4708 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4709 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4710
4711 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4712 rBuffer = _mm_mulhi_epi16(x4, r1);
4713 t8 = _mm_mullo_epi16(x5, r0);
4714 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4715 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4716
4717 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4718 rBuffer = _mm_mulhi_epi16(x5, r0);
4719 t7 = _mm_mullo_epi16(x6, r1);
4720 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4721 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4722
4723 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4724 rBuffer = _mm_mulhi_epi16(x6, r1);
4725 t8 = _mm_mullo_epi16(x7, r0);
4726 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4727 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4728
4729 rBuffer = _mm_mulhi_epi16(x7, r0);
4730 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4731 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4732
4733 t8 = _mm_unpacklo_epi16(
4734 _mm_mullo_epi16(x8,
4735 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4736 _mm_mulhi_epi16(x8,
4737 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4738 x8 = _mm_unpackhi_epi16(
4739 _mm_mullo_epi16(x8,
4740 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4741 _mm_mulhi_epi16(x8,
4742 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4743
4744 /* sum the 32-bit tap products */
4745
4746 r1 = _mm_add_epi32(x1, x2);
4747 x3 = _mm_add_epi32(x3, x4);
4748 x5 = _mm_add_epi32(x5, x6);
4749 r1 = _mm_add_epi32(r1, x3);
4750 x7 = _mm_add_epi32(x7, x8);
4751 r1 = _mm_add_epi32(r1, x5);
4752
4753 r0 = _mm_add_epi32(t1, t2);
4754 t3 = _mm_add_epi32(t3, t4);
4755 t5 = _mm_add_epi32(t5, t6);
4756 r0 = _mm_add_epi32(r0, t3);
4757 t7 = _mm_add_epi32(t7, t8);
4758 r0 = _mm_add_epi32(r0, t5);
4759 r1 = _mm_add_epi32(r1, x7);
4760 r0 = _mm_add_epi32(r0, t7);
4761 r1 = _mm_srli_epi32(r1, 6);
4762 r0 = _mm_srli_epi32(r0, 6);
4763
4764 r1 = _mm_and_si128(r1,
4765 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4766 r0 = _mm_and_si128(r0,
4767 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4768 r0 = _mm_hadd_epi16(r0, r1);
4769 _mm_store_si128((__m128i *) &dst[x], r0);
4770
4771 }
4772 tmp += MAX_PB_SIZE;
4773 dst += dststride;
4774 }
4775 }
4776 void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4777 uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4778 int16_t* mcbuffer) {
4779 int x, y;
4780 uint8_t *src = (uint8_t*) _src;
4781 ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4782 int16_t *tmp = mcbuffer;
4783 __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4784 __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4785
4786 src -= qpel_extra_before[3] * srcstride;
4787 r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4788 0);
4789
4790 /* LOAD src from memory to registers to limit memory bandwidth */
4791 if (width == 4) {
4792
4793 for (y = 0; y < height + qpel_extra[3]; y += 2) {
4794 /* load data in register */
4795 x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4796 x1 = _mm_slli_si128(x1, 1);
4797 src += srcstride;
4798 t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4799 t1 = _mm_slli_si128(t1, 1);
4800 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4801 t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4802 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4803 _mm_srli_si128(x1, 3));
4804 t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4805 _mm_srli_si128(t1, 3));
4806
4807 /* PMADDUBSW then PHADDW */
4808 x2 = _mm_maddubs_epi16(x2, r0);
4809 t2 = _mm_maddubs_epi16(t2, r0);
4810 x3 = _mm_maddubs_epi16(x3, r0);
4811 t3 = _mm_maddubs_epi16(t3, r0);
4812 x2 = _mm_hadd_epi16(x2, x3);
4813 t2 = _mm_hadd_epi16(t2, t3);
4814 x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4815 t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4816 x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4817 t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4818 /* give results back */
4819 _mm_storel_epi64((__m128i *) &tmp[0], x2);
4820
4821 tmp += MAX_PB_SIZE;
4822 _mm_storel_epi64((__m128i *) &tmp[0], t2);
4823
4824 src += srcstride;
4825 tmp += MAX_PB_SIZE;
4826 }
4827 } else
4828 for (y = 0; y < height + qpel_extra[3]; y++) {
4829 for (x = 0; x < width; x += 8) {
4830 /* load data in register */
4831 x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4832 x1 = _mm_slli_si128(x1, 1);
4833 x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4834 x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4835 _mm_srli_si128(x1, 3));
4836 x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4837 _mm_srli_si128(x1, 5));
4838 x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4839 _mm_srli_si128(x1, 7));
4840
4841 /* PMADDUBSW then PHADDW */
4842 x2 = _mm_maddubs_epi16(x2, r0);
4843 x3 = _mm_maddubs_epi16(x3, r0);
4844 x4 = _mm_maddubs_epi16(x4, r0);
4845 x5 = _mm_maddubs_epi16(x5, r0);
4846 x2 = _mm_hadd_epi16(x2, x3);
4847 x4 = _mm_hadd_epi16(x4, x5);
4848 x2 = _mm_hadd_epi16(x2, x4);
4849 x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4850
4851 /* give results back */
4852 _mm_store_si128((__m128i *) &tmp[x], x2);
4853
4854 }
4855 src += srcstride;
4856 tmp += MAX_PB_SIZE;
4857 }
4858
4859 tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4860 srcstride = MAX_PB_SIZE;
4861
4862 /* vertical pass over the temporary buffer: tmp holds 16-bit values, so the products
4863 must be accumulated in 32-bit registers */
4864 rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4865 for (y = 0; y < height; y++) {
4866 for (x = 0; x < width; x += 8) {
4867
4868 x1 = _mm_setzero_si128();
4869 x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4870 x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4871 x4 = _mm_load_si128((__m128i *) &tmp[x]);
4872 x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4873 x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4874 x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4875 x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4876
4877 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4878 t7 = _mm_mullo_epi16(x2, r1);
4879
4880
4881 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4882 rBuffer = _mm_mulhi_epi16(x2, r1);
4883 t8 = _mm_mullo_epi16(x3, r0);
4884 t2 = _mm_unpacklo_epi16(t7, rBuffer);
4885 x2 = _mm_unpackhi_epi16(t7, rBuffer);
4886
4887 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4888 rBuffer = _mm_mulhi_epi16(x3, r0);
4889 t7 = _mm_mullo_epi16(x4, r1);
4890 t3 = _mm_unpacklo_epi16(t8, rBuffer);
4891 x3 = _mm_unpackhi_epi16(t8, rBuffer);
4892
4893 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4894 rBuffer = _mm_mulhi_epi16(x4, r1);
4895 t8 = _mm_mullo_epi16(x5, r0);
4896 t4 = _mm_unpacklo_epi16(t7, rBuffer);
4897 x4 = _mm_unpackhi_epi16(t7, rBuffer);
4898
4899 r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4900 rBuffer = _mm_mulhi_epi16(x5, r0);
4901 t7 = _mm_mullo_epi16(x6, r1);
4902 t5 = _mm_unpacklo_epi16(t8, rBuffer);
4903 x5 = _mm_unpackhi_epi16(t8, rBuffer);
4904
4905 r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4906 rBuffer = _mm_mulhi_epi16(x6, r1);
4907 t8 = _mm_mullo_epi16(x7, r0);
4908 t6 = _mm_unpacklo_epi16(t7, rBuffer);
4909 x6 = _mm_unpackhi_epi16(t7, rBuffer);
4910
4911 rBuffer = _mm_mulhi_epi16(x7, r0);
4912 t7 = _mm_unpacklo_epi16(t8, rBuffer);
4913 x7 = _mm_unpackhi_epi16(t8, rBuffer);
4914
4915 t8 = _mm_unpacklo_epi16(
4916 _mm_mullo_epi16(x8,
4917 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4918 _mm_mulhi_epi16(x8,
4919 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4920 x8 = _mm_unpackhi_epi16(
4921 _mm_mullo_epi16(x8,
4922 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4923 _mm_mulhi_epi16(x8,
4924 _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4925
4926 /* sum the 32-bit tap products */
4927
4928 x3 = _mm_add_epi32(x3, x4);
4929 x5 = _mm_add_epi32(x5, x6);
4930 r1 = _mm_add_epi32(x2, x3);
4931 x7 = _mm_add_epi32(x7, x8);
4932 r1 = _mm_add_epi32(r1, x5);
4933
4934 t3 = _mm_add_epi32(t3, t4);
4935 t5 = _mm_add_epi32(t5, t6);
4936 r0 = _mm_add_epi32(t2, t3);
4937 t7 = _mm_add_epi32(t7, t8);
4938 r0 = _mm_add_epi32(r0, t5);
4939 r1 = _mm_add_epi32(r1, x7);
4940 r0 = _mm_add_epi32(r0, t7);
4941 r1 = _mm_srli_epi32(r1, 6);
4942 r0 = _mm_srli_epi32(r0, 6);
4943
4944 r1 = _mm_and_si128(r1,
4945 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4946 r0 = _mm_and_si128(r0,
4947 _mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4948 r0 = _mm_hadd_epi16(r0, r1);
4949 _mm_store_si128((__m128i *) &dst[x], r0);
4950
4951 }
4952 tmp += MAX_PB_SIZE;
4953 dst += dststride;
4954 }
4955 }
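All sixteen h/v combinations defined in this file are reached through the accel->put_hevc_qpel_8[.][.] function-pointer table that sse.c (below) fills in; following the h_*/v_* naming, the first index appears to be the horizontal and the second the vertical fractional position. A hedged sketch of such a dispatch; the table, call-site, and buffer names are illustrative assumptions, not the decoder's actual API.

#include <stdint.h>
#include <stddef.h>

typedef void (*qpel_fn)(int16_t *dst, ptrdiff_t dststride,
                        uint8_t *src, ptrdiff_t srcstride,
                        int width, int height, int16_t *mcbuffer);

/* illustrative dispatch: pick the variant for fractional offsets (mx, my);
   mcbuffer must be a 16-byte aligned scratch area for the horizontal pass */
static void predict_qpel(qpel_fn table[4][4], int mx, int my,
                         int16_t *dst, ptrdiff_t dststride,
                         uint8_t *src, ptrdiff_t srcstride,
                         int width, int height, int16_t *mcbuffer)
{
    table[mx][my](dst, dststride, src, srcstride, width, height, mcbuffer);
}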
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 openHEVC contributors
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
020
121 #ifndef SSE_MOTION_H
222 #define SSE_MOTION_H
+0 -84 libde265/x86/sse.c
0 #ifdef _MSC_VER
1 #include <intrin.h>
2 #endif
3
4 #include "x86/sse.h"
5 #include "x86/sse-motion.h"
6 #include "x86/sse-dct.h"
7
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #ifdef __GNUC__
13 #include <cpuid.h>
14 #endif
15
16 void init_acceleration_functions_sse(struct acceleration_functions* accel)
17 {
18 uint32_t ecx=0,edx=0;
19
20 #ifdef _MSC_VER
21 uint32_t regs[4];
22 int a = 1;
23
24 __cpuid((int *)regs, (int)a);
25
26 ecx = regs[2];
27 edx = regs[3];
28 #else
29 uint32_t eax,ebx;
30 __get_cpuid(1, &eax,&ebx,&ecx,&edx);
31 #endif
32
33 // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]);
34
35 //int have_MMX = !!(edx & (1<<23));
36 int have_SSE = !!(edx & (1<<25));
37 int have_SSE4_1 = !!(ecx & (1<<19));
38
39 // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1);
40
41 if (have_SSE) {
42 }
43
44 #if HAVE_SSE4_1
45 if (have_SSE4_1) {
46 accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse;
47 accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;
48
49 accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse;
50 accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse;
51 accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse;
52 accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;
53
54 accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
55 accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
56 accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
57 accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
58 accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
59 accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
60 accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
61 accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
62 accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
63 accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
64 accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
65 accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
66 accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
67 accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
68 accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
69 accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;
70
71 accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;
72
73 // actually, for these two functions, the scalar fallback seems to be faster than the SSE code
74 //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
75 //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4;
76
77 accel->transform_8x8_add_8 = ff_hevc_transform_8x8_add_8_sse4;
78 accel->transform_16x16_add_8 = ff_hevc_transform_16x16_add_8_sse4;
79 accel->transform_32x32_add_8 = ff_hevc_transform_32x32_add_8_sse4;
80 }
81 #endif
82 }
83
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifdef _MSC_VER
21 #include <intrin.h>
22 #endif
23
24 #include "x86/sse.h"
25 #include "x86/sse-motion.h"
26 #include "x86/sse-dct.h"
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #ifdef __GNUC__
33 #include <cpuid.h>
34 #endif
35
36 void init_acceleration_functions_sse(struct acceleration_functions* accel)
37 {
38 uint32_t ecx=0,edx=0;
39
40 #ifdef _MSC_VER
41 uint32_t regs[4];
42 int a = 1;
43
44 __cpuid((int *)regs, (int)a);
45
46 ecx = regs[2];
47 edx = regs[3];
48 #else
49 uint32_t eax,ebx;
50 __get_cpuid(1, &eax,&ebx,&ecx,&edx);
51 #endif
52
53 // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]);
54
55 //int have_MMX = !!(edx & (1<<23));
56 int have_SSE = !!(edx & (1<<25));
57 int have_SSE4_1 = !!(ecx & (1<<19));
58
59 // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1);
60
61 if (have_SSE) {
62 }
63
64 #if HAVE_SSE4_1
65 if (have_SSE4_1) {
66 accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse;
67 accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;
68
69 accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse;
70 accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse;
71 accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse;
72 accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;
73
74 accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
75 accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
76 accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
77 accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
78 accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
79 accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
80 accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
81 accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
82 accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
83 accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
84 accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
85 accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
86 accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
87 accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
88 accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
89 accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;
90
91 accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;
92
93 // actually, for these two functions, the scalar fallback seems to be faster than the SSE code
94 //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
95 //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4;
96
97 accel->transform_8x8_add_8 = ff_hevc_transform_8x8_add_8_sse4;
98 accel->transform_16x16_add_8 = ff_hevc_transform_16x16_add_8_sse4;
99 accel->transform_32x32_add_8 = ff_hevc_transform_32x32_add_8_sse4;
100 }
101 #endif
102 }
103
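init_acceleration_functions_sse() gates everything on CPUID leaf 1: EDX bit 25 reports SSE and ECX bit 19 reports SSE4.1, and only the SSE4.1 branch currently registers functions. A small self-contained sketch of that feature probe, assuming GCC's <cpuid.h> or MSVC's <intrin.h>; purely illustrative.

#include <stdint.h>
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif

int main(void)
{
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

#ifdef _MSC_VER
    int regs[4];
    __cpuid(regs, 1);                  /* CPUID leaf 1 */
    ecx = (uint32_t) regs[2];
    edx = (uint32_t) regs[3];
#else
    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
#endif

    printf("SSE: %d  SSE4.1: %d\n",
           !!(edx & (1u << 25)),       /* EDX bit 25: SSE    */
           !!(ecx & (1u << 19)));      /* ECX bit 19: SSE4.1 */
    return 0;
}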
0 /*
1 * H.265 video codec.
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
3 *
4 * This file is part of libde265.
5 *
6 * libde265 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * libde265 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
18 */
019
120 #ifndef DE265_SSE_H
221 #define DE265_SSE_H
88 Version: @VERSION@
99 Requires:
1010 Libs: -lde265 -L@libdir@
11 Libs.private: @LIBS@ -lstdc++
1112 Cflags: -I@includedir@
6969 # compiler: $LTCC
7070 # compiler flags: $LTCFLAGS
7171 # linker: $LD (gnu? $with_gnu_ld)
72 # $progname: (GNU libtool) 2.4.2 Debian-2.4.2-1.3ubuntu1
72 # $progname: (GNU libtool) 2.4.2 Debian-2.4.2-1.7ubuntu1
7373 # automake: $automake_version
7474 # autoconf: $autoconf_version
7575 #
7979
8080 PROGRAM=libtool
8181 PACKAGE=libtool
82 VERSION="2.4.2 Debian-2.4.2-1.3ubuntu1"
82 VERSION="2.4.2 Debian-2.4.2-1.7ubuntu1"
8383 TIMESTAMP=""
8484 package_revision=1.3337
8585
13111311 rm -rf conftest*
13121312 ;;
13131313
1314 x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
1314 x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
13151315 s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
13161316 # Find out which ABI we are using.
13171317 echo 'int i;' > conftest.$ac_ext
13321332 ;;
13331333 esac
13341334 ;;
1335 ppc64-*linux*|powerpc64-*linux*)
1335 powerpc64le-*)
1336 LD="${LD-ld} -m elf32lppclinux"
1337 ;;
1338 powerpc64-*)
13361339 LD="${LD-ld} -m elf32ppclinux"
13371340 ;;
13381341 s390x-*linux*)
13511354 x86_64-*linux*)
13521355 LD="${LD-ld} -m elf_x86_64"
13531356 ;;
1354 ppc*-*linux*|powerpc*-*linux*)
1357 powerpcle-*)
1358 LD="${LD-ld} -m elf64lppc"
1359 ;;
1360 powerpc-*)
13551361 LD="${LD-ld} -m elf64ppc"
13561362 ;;
13571363 s390*-*linux*|s390*-*tpf*)
00 #! /bin/sh
11 # Common wrapper for a few potentially missing GNU programs.
22
3 scriptversion=2012-06-26.16; # UTC
3 scriptversion=2013-10-28.13; # UTC
44
55 # Copyright (C) 1996-2013 Free Software Foundation, Inc.
66 # Originally written by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
159159 ;;
160160 autom4te*)
161161 echo "You might have modified some maintainer files that require"
162 echo "the 'automa4te' program to be rebuilt."
162 echo "the 'autom4te' program to be rebuilt."
163163 program_details 'autom4te'
164164 ;;
165165 bison*|yacc*)
0 GNU GENERAL PUBLIC LICENSE
1 Version 3, 29 June 2007
2
3 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
4 Everyone is permitted to copy and distribute verbatim copies
5 of this license document, but changing it is not allowed.
6
7 Preamble
8
9 The GNU General Public License is a free, copyleft license for
10 software and other kinds of works.
11
12 The licenses for most software and other practical works are designed
13 to take away your freedom to share and change the works. By contrast,
14 the GNU General Public License is intended to guarantee your freedom to
15 share and change all versions of a program--to make sure it remains free
16 software for all its users. We, the Free Software Foundation, use the
17 GNU General Public License for most of our software; it applies also to
18 any other work released this way by its authors. You can apply it to
19 your programs, too.
20
21 When we speak of free software, we are referring to freedom, not
22 price. Our General Public Licenses are designed to make sure that you
23 have the freedom to distribute copies of free software (and charge for
24 them if you wish), that you receive source code or can get it if you
25 want it, that you can change the software or use pieces of it in new
26 free programs, and that you know you can do these things.
27
28 To protect your rights, we need to prevent others from denying you
29 these rights or asking you to surrender the rights. Therefore, you have
30 certain responsibilities if you distribute copies of the software, or if
31 you modify it: responsibilities to respect the freedom of others.
32
33 For example, if you distribute copies of such a program, whether
34 gratis or for a fee, you must pass on to the recipients the same
35 freedoms that you received. You must make sure that they, too, receive
36 or can get the source code. And you must show them these terms so they
37 know their rights.
38
39 Developers that use the GNU GPL protect your rights with two steps:
40 (1) assert copyright on the software, and (2) offer you this License
41 giving you legal permission to copy, distribute and/or modify it.
42
43 For the developers' and authors' protection, the GPL clearly explains
44 that there is no warranty for this free software. For both users' and
45 authors' sake, the GPL requires that modified versions be marked as
46 changed, so that their problems will not be attributed erroneously to
47 authors of previous versions.
48
49 Some devices are designed to deny users access to install or run
50 modified versions of the software inside them, although the manufacturer
51 can do so. This is fundamentally incompatible with the aim of
52 protecting users' freedom to change the software. The systematic
53 pattern of such abuse occurs in the area of products for individuals to
54 use, which is precisely where it is most unacceptable. Therefore, we
55 have designed this version of the GPL to prohibit the practice for those
56 products. If such problems arise substantially in other domains, we
57 stand ready to extend this provision to those domains in future versions
58 of the GPL, as needed to protect the freedom of users.
59
60 Finally, every program is threatened constantly by software patents.
61 States should not allow patents to restrict development and use of
62 software on general-purpose computers, but in those that do, we wish to
63 avoid the special danger that patents applied to a free program could
64 make it effectively proprietary. To prevent this, the GPL assures that
65 patents cannot be used to render the program non-free.
66
67 The precise terms and conditions for copying, distribution and
68 modification follow.
69
70 TERMS AND CONDITIONS
71
72 0. Definitions.
73
74 "This License" refers to version 3 of the GNU General Public License.
75
76 "Copyright" also means copyright-like laws that apply to other kinds of
77 works, such as semiconductor masks.
78
79 "The Program" refers to any copyrightable work licensed under this
80 License. Each licensee is addressed as "you". "Licensees" and
81 "recipients" may be individuals or organizations.
82
83 To "modify" a work means to copy from or adapt all or part of the work
84 in a fashion requiring copyright permission, other than the making of an
85 exact copy. The resulting work is called a "modified version" of the
86 earlier work or a work "based on" the earlier work.
87
88 A "covered work" means either the unmodified Program or a work based
89 on the Program.
90
91 To "propagate" a work means to do anything with it that, without
92 permission, would make you directly or secondarily liable for
93 infringement under applicable copyright law, except executing it on a
94 computer or modifying a private copy. Propagation includes copying,
95 distribution (with or without modification), making available to the
96 public, and in some countries other activities as well.
97
98 To "convey" a work means any kind of propagation that enables other
99 parties to make or receive copies. Mere interaction with a user through
100 a computer network, with no transfer of a copy, is not conveying.
101
102 An interactive user interface displays "Appropriate Legal Notices"
103 to the extent that it includes a convenient and prominently visible
104 feature that (1) displays an appropriate copyright notice, and (2)
105 tells the user that there is no warranty for the work (except to the
106 extent that warranties are provided), that licensees may convey the
107 work under this License, and how to view a copy of this License. If
108 the interface presents a list of user commands or options, such as a
109 menu, a prominent item in the list meets this criterion.
110
111 1. Source Code.
112
113 The "source code" for a work means the preferred form of the work
114 for making modifications to it. "Object code" means any non-source
115 form of a work.
116
117 A "Standard Interface" means an interface that either is an official
118 standard defined by a recognized standards body, or, in the case of
119 interfaces specified for a particular programming language, one that
120 is widely used among developers working in that language.
121
122 The "System Libraries" of an executable work include anything, other
123 than the work as a whole, that (a) is included in the normal form of
124 packaging a Major Component, but which is not part of that Major
125 Component, and (b) serves only to enable use of the work with that
126 Major Component, or to implement a Standard Interface for which an
127 implementation is available to the public in source code form. A
128 "Major Component", in this context, means a major essential component
129 (kernel, window system, and so on) of the specific operating system
130 (if any) on which the executable work runs, or a compiler used to
131 produce the work, or an object code interpreter used to run it.
132
133 The "Corresponding Source" for a work in object code form means all
134 the source code needed to generate, install, and (for an executable
135 work) run the object code and to modify the work, including scripts to
136 control those activities. However, it does not include the work's
137 System Libraries, or general-purpose tools or generally available free
138 programs which are used unmodified in performing those activities but
139 which are not part of the work. For example, Corresponding Source
140 includes interface definition files associated with source files for
141 the work, and the source code for shared libraries and dynamically
142 linked subprograms that the work is specifically designed to require,
143 such as by intimate data communication or control flow between those
144 subprograms and other parts of the work.
145
146 The Corresponding Source need not include anything that users
147 can regenerate automatically from other parts of the Corresponding
148 Source.
149
150 The Corresponding Source for a work in source code form is that
151 same work.
152
153 2. Basic Permissions.
154
155 All rights granted under this License are granted for the term of
156 copyright on the Program, and are irrevocable provided the stated
157 conditions are met. This License explicitly affirms your unlimited
158 permission to run the unmodified Program. The output from running a
159 covered work is covered by this License only if the output, given its
160 content, constitutes a covered work. This License acknowledges your
161 rights of fair use or other equivalent, as provided by copyright law.
162
163 You may make, run and propagate covered works that you do not
164 convey, without conditions so long as your license otherwise remains
165 in force. You may convey covered works to others for the sole purpose
166 of having them make modifications exclusively for you, or provide you
167 with facilities for running those works, provided that you comply with
168 the terms of this License in conveying all material for which you do
169 not control copyright. Those thus making or running the covered works
170 for you must do so exclusively on your behalf, under your direction
171 and control, on terms that prohibit them from making any copies of
172 your copyrighted material outside their relationship with you.
173
174 Conveying under any other circumstances is permitted solely under
175 the conditions stated below. Sublicensing is not allowed; section 10
176 makes it unnecessary.
177
178 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
179
180 No covered work shall be deemed part of an effective technological
181 measure under any applicable law fulfilling obligations under article
182 11 of the WIPO copyright treaty adopted on 20 December 1996, or
183 similar laws prohibiting or restricting circumvention of such
184 measures.
185
186 When you convey a covered work, you waive any legal power to forbid
187 circumvention of technological measures to the extent such circumvention
188 is effected by exercising rights under this License with respect to
189 the covered work, and you disclaim any intention to limit operation or
190 modification of the work as a means of enforcing, against the work's
191 users, your or third parties' legal rights to forbid circumvention of
192 technological measures.
193
194 4. Conveying Verbatim Copies.
195
196 You may convey verbatim copies of the Program's source code as you
197 receive it, in any medium, provided that you conspicuously and
198 appropriately publish on each copy an appropriate copyright notice;
199 keep intact all notices stating that this License and any
200 non-permissive terms added in accord with section 7 apply to the code;
201 keep intact all notices of the absence of any warranty; and give all
202 recipients a copy of this License along with the Program.
203
204 You may charge any price or no price for each copy that you convey,
205 and you may offer support or warranty protection for a fee.
206
207 5. Conveying Modified Source Versions.
208
209 You may convey a work based on the Program, or the modifications to
210 produce it from the Program, in the form of source code under the
211 terms of section 4, provided that you also meet all of these conditions:
212
213 a) The work must carry prominent notices stating that you modified
214 it, and giving a relevant date.
215
216 b) The work must carry prominent notices stating that it is
217 released under this License and any conditions added under section
218 7. This requirement modifies the requirement in section 4 to
219 "keep intact all notices".
220
221 c) You must license the entire work, as a whole, under this
222 License to anyone who comes into possession of a copy. This
223 License will therefore apply, along with any applicable section 7
224 additional terms, to the whole of the work, and all its parts,
225 regardless of how they are packaged. This License gives no
226 permission to license the work in any other way, but it does not
227 invalidate such permission if you have separately received it.
228
229 d) If the work has interactive user interfaces, each must display
230 Appropriate Legal Notices; however, if the Program has interactive
231 interfaces that do not display Appropriate Legal Notices, your
232 work need not make them do so.
233
234 A compilation of a covered work with other separate and independent
235 works, which are not by their nature extensions of the covered work,
236 and which are not combined with it such as to form a larger program,
237 in or on a volume of a storage or distribution medium, is called an
238 "aggregate" if the compilation and its resulting copyright are not
239 used to limit the access or legal rights of the compilation's users
240 beyond what the individual works permit. Inclusion of a covered work
241 in an aggregate does not cause this License to apply to the other
242 parts of the aggregate.
243
244 6. Conveying Non-Source Forms.
245
246 You may convey a covered work in object code form under the terms
247 of sections 4 and 5, provided that you also convey the
248 machine-readable Corresponding Source under the terms of this License,
249 in one of these ways:
250
251 a) Convey the object code in, or embodied in, a physical product
252 (including a physical distribution medium), accompanied by the
253 Corresponding Source fixed on a durable physical medium
254 customarily used for software interchange.
255
256 b) Convey the object code in, or embodied in, a physical product
257 (including a physical distribution medium), accompanied by a
258 written offer, valid for at least three years and valid for as
259 long as you offer spare parts or customer support for that product
260 model, to give anyone who possesses the object code either (1) a
261 copy of the Corresponding Source for all the software in the
262 product that is covered by this License, on a durable physical
263 medium customarily used for software interchange, for a price no
264 more than your reasonable cost of physically performing this
265 conveying of source, or (2) access to copy the
266 Corresponding Source from a network server at no charge.
267
268 c) Convey individual copies of the object code with a copy of the
269 written offer to provide the Corresponding Source. This
270 alternative is allowed only occasionally and noncommercially, and
271 only if you received the object code with such an offer, in accord
272 with subsection 6b.
273
274 d) Convey the object code by offering access from a designated
275 place (gratis or for a charge), and offer equivalent access to the
276 Corresponding Source in the same way through the same place at no
277 further charge. You need not require recipients to copy the
278 Corresponding Source along with the object code. If the place to
279 copy the object code is a network server, the Corresponding Source
280 may be on a different server (operated by you or a third party)
281 that supports equivalent copying facilities, provided you maintain
282 clear directions next to the object code saying where to find the
283 Corresponding Source. Regardless of what server hosts the
284 Corresponding Source, you remain obligated to ensure that it is
285 available for as long as needed to satisfy these requirements.
286
287 e) Convey the object code using peer-to-peer transmission, provided
288 you inform other peers where the object code and Corresponding
289 Source of the work are being offered to the general public at no
290 charge under subsection 6d.
291
292 A separable portion of the object code, whose source code is excluded
293 from the Corresponding Source as a System Library, need not be
294 included in conveying the object code work.
295
296 A "User Product" is either (1) a "consumer product", which means any
297 tangible personal property which is normally used for personal, family,
298 or household purposes, or (2) anything designed or sold for incorporation
299 into a dwelling. In determining whether a product is a consumer product,
300 doubtful cases shall be resolved in favor of coverage. For a particular
301 product received by a particular user, "normally used" refers to a
302 typical or common use of that class of product, regardless of the status
303 of the particular user or of the way in which the particular user
304 actually uses, or expects or is expected to use, the product. A product
305 is a consumer product regardless of whether the product has substantial
306 commercial, industrial or non-consumer uses, unless such uses represent
307 the only significant mode of use of the product.
308
309 "Installation Information" for a User Product means any methods,
310 procedures, authorization keys, or other information required to install
311 and execute modified versions of a covered work in that User Product from
312 a modified version of its Corresponding Source. The information must
313 suffice to ensure that the continued functioning of the modified object
314 code is in no case prevented or interfered with solely because
315 modification has been made.
316
317 If you convey an object code work under this section in, or with, or
318 specifically for use in, a User Product, and the conveying occurs as
319 part of a transaction in which the right of possession and use of the
320 User Product is transferred to the recipient in perpetuity or for a
321 fixed term (regardless of how the transaction is characterized), the
322 Corresponding Source conveyed under this section must be accompanied
323 by the Installation Information. But this requirement does not apply
324 if neither you nor any third party retains the ability to install
325 modified object code on the User Product (for example, the work has
326 been installed in ROM).
327
328 The requirement to provide Installation Information does not include a
329 requirement to continue to provide support service, warranty, or updates
330 for a work that has been modified or installed by the recipient, or for
331 the User Product in which it has been modified or installed. Access to a
332 network may be denied when the modification itself materially and
333 adversely affects the operation of the network or violates the rules and
334 protocols for communication across the network.
335
336 Corresponding Source conveyed, and Installation Information provided,
337 in accord with this section must be in a format that is publicly
338 documented (and with an implementation available to the public in
339 source code form), and must require no special password or key for
340 unpacking, reading or copying.
341
342 7. Additional Terms.
343
344 "Additional permissions" are terms that supplement the terms of this
345 License by making exceptions from one or more of its conditions.
346 Additional permissions that are applicable to the entire Program shall
347 be treated as though they were included in this License, to the extent
348 that they are valid under applicable law. If additional permissions
349 apply only to part of the Program, that part may be used separately
350 under those permissions, but the entire Program remains governed by
351 this License without regard to the additional permissions.
352
353 When you convey a copy of a covered work, you may at your option
354 remove any additional permissions from that copy, or from any part of
355 it. (Additional permissions may be written to require their own
356 removal in certain cases when you modify the work.) You may place
357 additional permissions on material, added by you to a covered work,
358 for which you have or can give appropriate copyright permission.
359
360 Notwithstanding any other provision of this License, for material you
361 add to a covered work, you may (if authorized by the copyright holders of
362 that material) supplement the terms of this License with terms:
363
364 a) Disclaiming warranty or limiting liability differently from the
365 terms of sections 15 and 16 of this License; or
366
367 b) Requiring preservation of specified reasonable legal notices or
368 author attributions in that material or in the Appropriate Legal
369 Notices displayed by works containing it; or
370
371 c) Prohibiting misrepresentation of the origin of that material, or
372 requiring that modified versions of such material be marked in
373 reasonable ways as different from the original version; or
374
375 d) Limiting the use for publicity purposes of names of licensors or
376 authors of the material; or
377
378 e) Declining to grant rights under trademark law for use of some
379 trade names, trademarks, or service marks; or
380
381 f) Requiring indemnification of licensors and authors of that
382 material by anyone who conveys the material (or modified versions of
383 it) with contractual assumptions of liability to the recipient, for
384 any liability that these contractual assumptions directly impose on
385 those licensors and authors.
386
387 All other non-permissive additional terms are considered "further
388 restrictions" within the meaning of section 10. If the Program as you
389 received it, or any part of it, contains a notice stating that it is
390 governed by this License along with a term that is a further
391 restriction, you may remove that term. If a license document contains
392 a further restriction but permits relicensing or conveying under this
393 License, you may add to a covered work material governed by the terms
394 of that license document, provided that the further restriction does
395 not survive such relicensing or conveying.
396
397 If you add terms to a covered work in accord with this section, you
398 must place, in the relevant source files, a statement of the
399 additional terms that apply to those files, or a notice indicating
400 where to find the applicable terms.
401
402 Additional terms, permissive or non-permissive, may be stated in the
403 form of a separately written license, or stated as exceptions;
404 the above requirements apply either way.
405
406 8. Termination.
407
408 You may not propagate or modify a covered work except as expressly
409 provided under this License. Any attempt otherwise to propagate or
410 modify it is void, and will automatically terminate your rights under
411 this License (including any patent licenses granted under the third
412 paragraph of section 11).
413
414 However, if you cease all violation of this License, then your
415 license from a particular copyright holder is reinstated (a)
416 provisionally, unless and until the copyright holder explicitly and
417 finally terminates your license, and (b) permanently, if the copyright
418 holder fails to notify you of the violation by some reasonable means
419 prior to 60 days after the cessation.
420
421 Moreover, your license from a particular copyright holder is
422 reinstated permanently if the copyright holder notifies you of the
423 violation by some reasonable means, this is the first time you have
424 received notice of violation of this License (for any work) from that
425 copyright holder, and you cure the violation prior to 30 days after
426 your receipt of the notice.
427
428 Termination of your rights under this section does not terminate the
429 licenses of parties who have received copies or rights from you under
430 this License. If your rights have been terminated and not permanently
431 reinstated, you do not qualify to receive new licenses for the same
432 material under section 10.
433
434 9. Acceptance Not Required for Having Copies.
435
436 You are not required to accept this License in order to receive or
437 run a copy of the Program. Ancillary propagation of a covered work
438 occurring solely as a consequence of using peer-to-peer transmission
439 to receive a copy likewise does not require acceptance. However,
440 nothing other than this License grants you permission to propagate or
441 modify any covered work. These actions infringe copyright if you do
442 not accept this License. Therefore, by modifying or propagating a
443 covered work, you indicate your acceptance of this License to do so.
444
445 10. Automatic Licensing of Downstream Recipients.
446
447 Each time you convey a covered work, the recipient automatically
448 receives a license from the original licensors, to run, modify and
449 propagate that work, subject to this License. You are not responsible
450 for enforcing compliance by third parties with this License.
451
452 An "entity transaction" is a transaction transferring control of an
453 organization, or substantially all assets of one, or subdividing an
454 organization, or merging organizations. If propagation of a covered
455 work results from an entity transaction, each party to that
456 transaction who receives a copy of the work also receives whatever
457 licenses to the work the party's predecessor in interest had or could
458 give under the previous paragraph, plus a right to possession of the
459 Corresponding Source of the work from the predecessor in interest, if
460 the predecessor has it or can get it with reasonable efforts.
461
462 You may not impose any further restrictions on the exercise of the
463 rights granted or affirmed under this License. For example, you may
464 not impose a license fee, royalty, or other charge for exercise of
465 rights granted under this License, and you may not initiate litigation
466 (including a cross-claim or counterclaim in a lawsuit) alleging that
467 any patent claim is infringed by making, using, selling, offering for
468 sale, or importing the Program or any portion of it.
469
470 11. Patents.
471
472 A "contributor" is a copyright holder who authorizes use under this
473 License of the Program or a work on which the Program is based. The
474 work thus licensed is called the contributor's "contributor version".
475
476 A contributor's "essential patent claims" are all patent claims
477 owned or controlled by the contributor, whether already acquired or
478 hereafter acquired, that would be infringed by some manner, permitted
479 by this License, of making, using, or selling its contributor version,
480 but do not include claims that would be infringed only as a
481 consequence of further modification of the contributor version. For
482 purposes of this definition, "control" includes the right to grant
483 patent sublicenses in a manner consistent with the requirements of
484 this License.
485
486 Each contributor grants you a non-exclusive, worldwide, royalty-free
487 patent license under the contributor's essential patent claims, to
488 make, use, sell, offer for sale, import and otherwise run, modify and
489 propagate the contents of its contributor version.
490
491 In the following three paragraphs, a "patent license" is any express
492 agreement or commitment, however denominated, not to enforce a patent
493 (such as an express permission to practice a patent or covenant not to
494 sue for patent infringement). To "grant" such a patent license to a
495 party means to make such an agreement or commitment not to enforce a
496 patent against the party.
497
498 If you convey a covered work, knowingly relying on a patent license,
499 and the Corresponding Source of the work is not available for anyone
500 to copy, free of charge and under the terms of this License, through a
501 publicly available network server or other readily accessible means,
502 then you must either (1) cause the Corresponding Source to be so
503 available, or (2) arrange to deprive yourself of the benefit of the
504 patent license for this particular work, or (3) arrange, in a manner
505 consistent with the requirements of this License, to extend the patent
506 license to downstream recipients. "Knowingly relying" means you have
507 actual knowledge that, but for the patent license, your conveying the
508 covered work in a country, or your recipient's use of the covered work
509 in a country, would infringe one or more identifiable patents in that
510 country that you have reason to believe are valid.
511
512 If, pursuant to or in connection with a single transaction or
513 arrangement, you convey, or propagate by procuring conveyance of, a
514 covered work, and grant a patent license to some of the parties
515 receiving the covered work authorizing them to use, propagate, modify
516 or convey a specific copy of the covered work, then the patent license
517 you grant is automatically extended to all recipients of the covered
518 work and works based on it.
519
520 A patent license is "discriminatory" if it does not include within
521 the scope of its coverage, prohibits the exercise of, or is
522 conditioned on the non-exercise of one or more of the rights that are
523 specifically granted under this License. You may not convey a covered
524 work if you are a party to an arrangement with a third party that is
525 in the business of distributing software, under which you make payment
526 to the third party based on the extent of your activity of conveying
527 the work, and under which the third party grants, to any of the
528 parties who would receive the covered work from you, a discriminatory
529 patent license (a) in connection with copies of the covered work
530 conveyed by you (or copies made from those copies), or (b) primarily
531 for and in connection with specific products or compilations that
532 contain the covered work, unless you entered into that arrangement,
533 or that patent license was granted, prior to 28 March 2007.
534
535 Nothing in this License shall be construed as excluding or limiting
536 any implied license or other defenses to infringement that may
537 otherwise be available to you under applicable patent law.
538
539 12. No Surrender of Others' Freedom.
540
541 If conditions are imposed on you (whether by court order, agreement or
542 otherwise) that contradict the conditions of this License, they do not
543 excuse you from the conditions of this License. If you cannot convey a
544 covered work so as to satisfy simultaneously your obligations under this
545 License and any other pertinent obligations, then as a consequence you may
546 not convey it at all. For example, if you agree to terms that obligate you
547 to collect a royalty for further conveying from those to whom you convey
548 the Program, the only way you could satisfy both those terms and this
549 License would be to refrain entirely from conveying the Program.
550
551 13. Use with the GNU Affero General Public License.
552
553 Notwithstanding any other provision of this License, you have
554 permission to link or combine any covered work with a work licensed
555 under version 3 of the GNU Affero General Public License into a single
556 combined work, and to convey the resulting work. The terms of this
557 License will continue to apply to the part which is the covered work,
558 but the special requirements of the GNU Affero General Public License,
559 section 13, concerning interaction through a network will apply to the
560 combination as such.
561
562 14. Revised Versions of this License.
563
564 The Free Software Foundation may publish revised and/or new versions of
565 the GNU General Public License from time to time. Such new versions will
566 be similar in spirit to the present version, but may differ in detail to
567 address new problems or concerns.
568
569 Each version is given a distinguishing version number. If the
570 Program specifies that a certain numbered version of the GNU General
571 Public License "or any later version" applies to it, you have the
572 option of following the terms and conditions either of that numbered
573 version or of any later version published by the Free Software
574 Foundation. If the Program does not specify a version number of the
575 GNU General Public License, you may choose any version ever published
576 by the Free Software Foundation.
577
578 If the Program specifies that a proxy can decide which future
579 versions of the GNU General Public License can be used, that proxy's
580 public statement of acceptance of a version permanently authorizes you
581 to choose that version for the Program.
582
583 Later license versions may give you additional or different
584 permissions. However, no additional obligations are imposed on any
585 author or copyright holder as a result of your choosing to follow a
586 later version.
587
588 15. Disclaimer of Warranty.
589
590 THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
591 APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
592 HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
593 OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
594 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
595 PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
596 IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
597 ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
598
599 16. Limitation of Liability.
600
601 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
602 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
603 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
604 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
605 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
606 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
607 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
608 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
609 SUCH DAMAGES.
610
611 17. Interpretation of Sections 15 and 16.
612
613 If the disclaimer of warranty and limitation of liability provided
614 above cannot be given local legal effect according to their terms,
615 reviewing courts shall apply local law that most closely approximates
616 an absolute waiver of all civil liability in connection with the
617 Program, unless a warranty or assumption of liability accompanies a
618 copy of the Program in return for a fee.
619
620 END OF TERMS AND CONDITIONS
621
622 How to Apply These Terms to Your New Programs
623
624 If you develop a new program, and you want it to be of the greatest
625 possible use to the public, the best way to achieve this is to make it
626 free software which everyone can redistribute and change under these terms.
627
628 To do so, attach the following notices to the program. It is safest
629 to attach them to the start of each source file to most effectively
630 state the exclusion of warranty; and each file should have at least
631 the "copyright" line and a pointer to where the full notice is found.
632
633 <one line to give the program's name and a brief idea of what it does.>
634 Copyright (C) <year> <name of author>
635
636 This program is free software: you can redistribute it and/or modify
637 it under the terms of the GNU General Public License as published by
638 the Free Software Foundation, either version 3 of the License, or
639 (at your option) any later version.
640
641 This program is distributed in the hope that it will be useful,
642 but WITHOUT ANY WARRANTY; without even the implied warranty of
643 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
644 GNU General Public License for more details.
645
646 You should have received a copy of the GNU General Public License
647 along with this program. If not, see <http://www.gnu.org/licenses/>.
648
649 Also add information on how to contact you by electronic and paper mail.
650
651 If the program does terminal interaction, make it output a short
652 notice like this when it starts in an interactive mode:
653
654 <program> Copyright (C) <year> <name of author>
655 This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
656 This is free software, and you are welcome to redistribute it
657 under certain conditions; type `show c' for details.
658
659 The hypothetical commands `show w' and `show c' should show the appropriate
660 parts of the General Public License. Of course, your program's commands
661 might be different; for a GUI interface, you would use an "about box".
662
663 You should also get your employer (if you work as a programmer) or school,
664 if any, to sign a "copyright disclaimer" for the program, if necessary.
665 For more information on this, and how to apply and follow the GNU GPL, see
666 <http://www.gnu.org/licenses/>.
667
668 The GNU General Public License does not permit incorporating your program
669 into proprietary programs. If your program is a subroutine library, you
670 may consider it more useful to permit linking proprietary applications with
671 the library. If this is what you want to do, use the GNU Lesser General
672 Public License instead of this License. But first, please read
673 <http://www.gnu.org/philosophy/why-not-lgpl.html>.
33 AM_CPPFLAGS = -I../libde265
44
55 sherlock265_DEPENDENCIES = ../libde265/libde265.la
6 sherlock265_CXXFLAGS = $(QT_CFLAGS) $(VIDEOGFX_CFLAGS)
7 sherlock265_LDFLAGS = $(QT_LIBS) $(VIDEOGFX_LIBS)
6 sherlock265_CXXFLAGS = $(QT_CFLAGS) -std=c++0x
7 sherlock265_LDFLAGS = $(QT_LIBS)
88 sherlock265_LDADD = ../libde265/libde265.la -lstdc++ -lpthread
99 sherlock265_SOURCES = \
1010 sherlock265.cc \
1818 VideoDecoder.hh \
1919 VideoWidget.hh
2020
21 if HAVE_VIDEOGFX
22 sherlock265_CXXFLAGS += $(VIDEOGFX_CFLAGS)
23 sherlock265_LDFLAGS += $(VIDEOGFX_LIBS)
24 endif
25
26 if HAVE_SWSCALE
27 sherlock265_CXXFLAGS += $(SWSCALE_CFLAGS)
28 sherlock265_LDFLAGS += $(SWSCALE_LIBS)
29 endif
30
2131 moc_VideoWidget.cpp: VideoWidget.hh
2232 /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) VideoWidget.hh -o moc_VideoWidget.cpp
2333
2636
2737 moc_VideoDecoder.cpp: VideoDecoder.hh
2838 /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) VideoDecoder.hh -o moc_VideoDecoder.cpp
39
40 EXTRA_DIST = \
41 README
0 # Makefile.in generated by automake 1.13.3 from Makefile.am.
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
11 # @configure_input@
22
33 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
7979 host_triplet = @host@
8080 target_triplet = @target@
8181 bin_PROGRAMS = sherlock265$(EXEEXT)
82 @HAVE_VIDEOGFX_TRUE@am__append_1 = $(VIDEOGFX_CFLAGS)
83 @HAVE_VIDEOGFX_TRUE@am__append_2 = $(VIDEOGFX_LIBS)
84 @HAVE_SWSCALE_TRUE@am__append_3 = $(SWSCALE_CFLAGS)
85 @HAVE_SWSCALE_TRUE@am__append_4 = $(SWSCALE_LIBS)
8286 subdir = sherlock265
8387 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
84 $(top_srcdir)/depcomp
88 $(top_srcdir)/depcomp COPYING README
8589 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
8690 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \
8791 $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
266270 SET_MAKE = @SET_MAKE@
267271 SHELL = @SHELL@
268272 STRIP = @STRIP@
273 SWSCALE_CFLAGS = @SWSCALE_CFLAGS@
274 SWSCALE_LIBS = @SWSCALE_LIBS@
269275 VERSION = @VERSION@
270276 VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@
271277 VIDEOGFX_LIBS = @VIDEOGFX_LIBS@
328334 top_srcdir = @top_srcdir@
329335 AM_CPPFLAGS = -I../libde265
330336 sherlock265_DEPENDENCIES = ../libde265/libde265.la
331 sherlock265_CXXFLAGS = $(QT_CFLAGS) $(VIDEOGFX_CFLAGS)
332 sherlock265_LDFLAGS = $(QT_LIBS) $(VIDEOGFX_LIBS)
337 sherlock265_CXXFLAGS = $(QT_CFLAGS) -std=c++0x $(am__append_1) \
338 $(am__append_3)
339 sherlock265_LDFLAGS = $(QT_LIBS) $(am__append_2) $(am__append_4)
333340 sherlock265_LDADD = ../libde265/libde265.la -lstdc++ -lpthread
334341 sherlock265_SOURCES = \
335342 sherlock265.cc \
342349 VideoPlayer.hh \
343350 VideoDecoder.hh \
344351 VideoWidget.hh
352
353 EXTRA_DIST = \
354 README
345355
346356 all: all-am
347357
0
1 description of graphical overlays
2 ---------------------------------
3
4 CB - Show Coding Block quadtree structure. Prediction modes are
5 signalled at this level. CBs can be further subdivided into
6 PBs for prediction and TBs for residual transforms.
7
8 PB - Show Prediction Block structure. CB blocks may be further
9 subdivided, possibly using asymmetric partitionings. This is
10 the level on which motion compensation and intra prediction are
11 performed.
12
13 TB - Show Transformation Block structure. DCT/DSTs are carried out
14 on this level.
15
16 QP - Show the Quantization Parameter as a greyscale value.
17 Brighter blocks correspond to larger QP values (lower quality).
18
19 IntraPred - Show intra prediction mode.
20 * Directional prediction is depicted with a line in the prediction direction
21 (out of 32 possible directions)
22 * Planar prediction is depicted by a square.
23 * DC prediction is depicted by a circle.
24
25 PredMode - Show prediction mode.
26 * red: intra
27 * blue: inter
28 * green: skip = inter mode with no PB subdivision and candidate from merge list
29
30 MV - Show motion vectors. Vectors from reference list L0 are drawn in red,
31 vectors from list L1 in green. Vectors are magnified by a factor of 4.
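
The overlays listed above are drawn directly onto the decoded RGB frame by the
draw_* helpers from visualize.h, as can be seen in VideoDecoder::show_frame()
later in this diff. The following minimal sketch simply collects those calls in
one place: the function names, argument order, and color constants are taken
verbatim from this diff, while treating the trailing argument 4 as the
bytes-per-pixel of the RGB32 buffer is my assumption.

    // Hedged sketch: apply the sherlock265 overlay helpers to a decoded frame
    // held in a 32-bit QImage. All draw_* calls below appear with the same
    // arguments in VideoDecoder::show_frame() in this diff; interpreting the
    // trailing 4 as bytes per pixel of the RGB32 buffer is an assumption.
    #include <QImage>
    #include "de265.h"
    #include "visualize.h"

    static void draw_all_overlays(const de265_image* img, QImage& qimg)
    {
      uchar* ptr = qimg.bits();          // destination pixels (Format_RGB32)
      int    bpl = qimg.bytesPerLine();  // stride in bytes

      draw_QuantPY         (img, ptr, bpl, 4);              // QP as greyscale
      draw_PB_pred_modes   (img, ptr, bpl, 4);              // intra/inter/skip coloring
      draw_intra_pred_modes(img, ptr, bpl, 0x009090ff, 4);  // intra direction markers
      draw_TB_grid         (img, ptr, bpl, 0x00ff6000, 4);  // TB partitioning
      draw_PB_grid         (img, ptr, bpl, 0x00e000,   4);  // PB partitioning
      draw_CB_grid         (img, ptr, bpl, 0x00FFFFFF, 4);  // CB quadtree
      draw_Motion          (img, ptr, bpl, 4);              // L0 (red) / L1 (green) vectors
      draw_Slices          (img, ptr, bpl, 4);              // slice boundaries
      draw_Tiles           (img, ptr, bpl, 4);              // tile boundaries
    }

In sherlock265 each overlay is toggled by its own button, so only the enabled
subset of these calls runs for a given frame.
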
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #include "VideoDecoder.hh"
21 #ifdef HAVE_VIDEOGFX
2122 #include <libvideogfx.hh>
22
23
23 #endif
24
25
26 #ifdef HAVE_VIDEOGFX
2427 using namespace videogfx;
25
26 extern "C" {
27 #include "decctx.h"
28 }
28 #endif
29
30 //#include "decctx.h"
31 #include "visualize.h"
2932
3033
3134 VideoDecoder::VideoDecoder()
32 : mNextBuffer(0),
35 : ctx(NULL),
36 img(NULL),
37 mNextBuffer(0),
3338 mFrameCount(0),
3439 mPlayingVideo(false),
3540 mVideoEnded(false),
4045 mPBShowPartitioning(false),
4146 mShowPBPredMode(false),
4247 mShowIntraPredMode(false),
48 mShowQuantPY(false),
49 mShowSlices(false),
50 mShowTiles(false),
4351 mFH(NULL)
52 #ifdef HAVE_SWSCALE
53 , sws(NULL)
54 , width(0)
55 , height(0)
56 #endif
4457 {
4558 }
4659
4861 VideoDecoder::~VideoDecoder()
4962 {
5063 free_decoder();
64 #ifdef HAVE_SWSCALE
65 if (sws != NULL) {
66 sws_freeContext(sws);
67 }
68 #endif
5169 }
5270
5371 void VideoDecoder::run()
91109 for (;;)
92110 {
93111 if (mPlayingVideo) {
94 de265_release_next_picture(ctx);
95
96 const de265_image* img = de265_peek_next_picture(ctx);
112 mutex.lock();
113
114 if (img) {
115 img = NULL;
116 de265_release_next_picture(ctx);
117 }
118
119 img = de265_peek_next_picture(ctx);
97120 while (img==NULL)
98121 {
99 /*
100 int err;
101 err = read_nal_unit(&inputctx.ctx, &buf);
102 if (err!=DE265_OK)
122 mutex.unlock();
123 int more=1;
124 de265_error err = de265_decode(ctx, &more);
125 mutex.lock();
126
127 if (more && err == DE265_OK) {
128 // try again to get picture
129
130 img = de265_peek_next_picture(ctx);
131 }
132 else if (more && err == DE265_ERROR_WAITING_FOR_INPUT_DATA) {
133 uint8_t buf[4096];
134 int buf_size = fread(buf,1,sizeof(buf),mFH);
135 int err = de265_push_data(ctx,buf,buf_size ,0,0);
136 }
137 else if (!more)
103138 {
104139 mVideoEnded=true;
105140 mPlayingVideo=false; // TODO: send signal back
106141 break;
107142 }
108
109 err = de265_decode_NAL(&ctx, &buf);
110 if (err!=DE265_OK)
111 {
112 mVideoEnded=true;
113 mPlayingVideo=false; // TODO: send signal back
114 break;
115 }
116 */
117
118 uint8_t buf[4096];
119 int buf_size = fread(buf,1,sizeof(buf),mFH);
120 int err = de265_decode_data(ctx,buf,buf_size);
121 if (err!=DE265_OK)
122 {
123 mVideoEnded=true;
124 mPlayingVideo=false; // TODO: send signal back
125 break;
126 }
127
128 // try again to get picture
129
130 img = de265_peek_next_picture(ctx);
131143 }
132144
133145
142154 }
143155 }
144156
157 mutex.unlock();
145158
146159 // process events
147160
153166 }
154167 }
155168
156
157 void VideoDecoder::show_frame(const de265_image* img)
158 {
169 #ifdef HAVE_VIDEOGFX
170 void VideoDecoder::convert_frame_libvideogfx(const de265_image* img, QImage & qimg)
171 {
172 // --- convert to RGB ---
173
174 Image<Pixel> visu;
175 visu.Create(img->get_width(), img->get_height(), Colorspace_YUV, Chroma_420);
176
177 for (int y=0;y<img->get_height(0);y++) {
178 memcpy(visu.AskFrameY()[y], img->get_image_plane_at_pos(0, 0,y), img->get_width(0));
179 }
180
181 for (int y=0;y<img->get_height(1);y++) {
182 memcpy(visu.AskFrameU()[y], img->get_image_plane_at_pos(1, 0,y), img->get_width(1));
183 }
184
185 for (int y=0;y<img->get_height(2);y++) {
186 memcpy(visu.AskFrameV()[y], img->get_image_plane_at_pos(2, 0,y), img->get_width(2));
187 }
188
159189 Image<Pixel> debugvisu;
160
161 if (mShowDecodedImage) {
162 Image<Pixel> visu;
163 visu.Create(img->width, img->height, Colorspace_YUV, Chroma_420);
164
165 for (int y=0;y<img->height;y++) {
166 memcpy(visu.AskFrameY()[y], img->y + y*img->stride, img->width);
167 }
168
169 for (int y=0;y<img->chroma_height;y++) {
170 memcpy(visu.AskFrameU()[y], img->cb + y*img->chroma_stride, img->chroma_width);
171 }
172
173 for (int y=0;y<img->chroma_height;y++) {
174 memcpy(visu.AskFrameV()[y], img->cr + y*img->chroma_stride, img->chroma_width);
175 }
176
177
178 ChangeColorspace(debugvisu, visu, Colorspace_RGB);
179 }
180 else {
181 debugvisu.Create(img->width,img->height, Colorspace_RGB);
182 Clear(debugvisu, Color<Pixel>(0,0,0));
183 }
184
185
186 const decoder_context* cx = (const decoder_context*)ctx;
187
188 if (1) {
189 if (mShowPBPredMode)
190 {
191 draw_PB_pred_modes(cx,
192 debugvisu.AskFrameR()[0],
193 debugvisu.AskFrameG()[0],
194 debugvisu.AskFrameB()[0],
195 debugvisu.AskBitmapB().AskStride());
196 }
197
198 if (mShowIntraPredMode)
199 {
200 draw_intra_pred_modes(cx,debugvisu.AskFrameR()[0],debugvisu.AskBitmapR().AskStride(),140);
201 draw_intra_pred_modes(cx,debugvisu.AskFrameG()[0],debugvisu.AskBitmapG().AskStride(),140);
202 draw_intra_pred_modes(cx,debugvisu.AskFrameB()[0],debugvisu.AskBitmapB().AskStride(),255);
203 }
204
205 if (mTBShowPartitioning)
206 {
207 draw_TB_grid(cx, debugvisu.AskFrameR()[0], debugvisu.AskBitmapR().AskStride(),255);
208 draw_TB_grid(cx, debugvisu.AskFrameG()[0], debugvisu.AskBitmapG().AskStride(), 80);
209 draw_TB_grid(cx, debugvisu.AskFrameB()[0], debugvisu.AskBitmapB().AskStride(), 0);
210 }
211
212 if (mPBShowPartitioning)
213 {
214 draw_PB_grid(cx, debugvisu.AskFrameR()[0], debugvisu.AskBitmapR().AskStride(), 0);
215 draw_PB_grid(cx, debugvisu.AskFrameG()[0], debugvisu.AskBitmapG().AskStride(),200);
216 draw_PB_grid(cx, debugvisu.AskFrameB()[0], debugvisu.AskBitmapB().AskStride(), 0);
217 }
218
219 if (mCBShowPartitioning)
220 {
221 draw_CB_grid(cx, debugvisu.AskFrameR()[0], debugvisu.AskBitmapR().AskStride(),255);
222 draw_CB_grid(cx, debugvisu.AskFrameG()[0], debugvisu.AskBitmapG().AskStride(),255);
223 draw_CB_grid(cx, debugvisu.AskFrameB()[0], debugvisu.AskBitmapB().AskStride(),255);
224 }
225 }
226
227
228 // --- convert to QImage and show ---
229
230 if (mFrameCount==0) {
231 mImgBuffers[0] = QImage(QSize(img->width,img->height), QImage::Format_RGB32);
232 mImgBuffers[1] = QImage(QSize(img->width,img->height), QImage::Format_RGB32);
233 }
234
235 QImage* qimg = &mImgBuffers[mNextBuffer];
236 uchar* ptr = qimg->bits();
237 int bpl = qimg->bytesPerLine();
238
239 for (int y=0;y<img->height;y++)
240 {
241 for (int x=0;x<img->width;x++)
190 ChangeColorspace(debugvisu, visu, Colorspace_RGB);
191
192 // --- convert to QImage ---
193
194 uchar* ptr = qimg.bits();
195 int bpl = qimg.bytesPerLine();
196
197 for (int y=0;y<img->get_height();y++)
198 {
199 for (int x=0;x<img->get_width();x++)
242200 {
243201 *(uint32_t*)(ptr+x*4) = ((debugvisu.AskFrameR()[y][x] << 16) |
244202 (debugvisu.AskFrameG()[y][x] << 8) |
247205
248206 ptr += bpl;
249207 }
250
251
252 if (0) {
253 if (mTBShowPartitioning)
254 {
255 draw_TB_grid(cx, qimg->bits(), bpl, 0xff9000);
256 }
257
258 if (mCBShowPartitioning)
259 {
260 draw_CB_grid(cx, qimg->bits(), bpl, 0xffffff);
261 }
262 }
263
208 }
209 #endif
210
211 #ifdef HAVE_SWSCALE
212 void VideoDecoder::convert_frame_swscale(const de265_image* img, QImage & qimg)
213 {
214 if (sws == NULL || img->get_width() != width || img->get_height() != height) {
215 if (sws != NULL) {
216 sws_freeContext(sws);
217 }
218 width = img->get_width();
219 height = img->get_height();
220 sws = sws_getContext(width, height, PIX_FMT_YUV420P, width, height, PIX_FMT_BGRA, SWS_FAST_BILINEAR, NULL, NULL, NULL);
221 }
222
223 int stride[3];
224 const uint8_t *data[3];
225 for (int c=0;c<3;c++) {
226 data[c] = img->get_image_plane(c);
227 stride[c] = img->get_image_stride(c);
228 }
229
230 uint8_t *qdata[1] = { (uint8_t *) qimg.bits() };
231 int qstride[1] = { qimg.bytesPerLine() };
232 sws_scale(sws, data, stride, 0, img->get_height(), qdata, qstride);
233 }
234 #endif
235
236 void VideoDecoder::show_frame(const de265_image* img)
237 {
238 if (mFrameCount==0) {
239 mImgBuffers[0] = QImage(QSize(img->get_width(),img->get_height()), QImage::Format_RGB32);
240 mImgBuffers[1] = QImage(QSize(img->get_width(),img->get_height()), QImage::Format_RGB32);
241 }
242
243 // --- convert to RGB (or generate a black image if video image is disabled) ---
244
245 QImage* qimg = &mImgBuffers[mNextBuffer];
246 uchar* ptr = qimg->bits();
247 int bpl = qimg->bytesPerLine();
248
249 if (mShowDecodedImage) {
250 #ifdef HAVE_VIDEOGFX
251 convert_frame_libvideogfx(img, *qimg);
252 #elif HAVE_SWSCALE
253 convert_frame_swscale(img, *qimg);
254 #else
255 qimg->fill(QColor(0, 0, 0));
256 #endif
257 } else {
258 qimg->fill(QColor(0, 0, 0));
259 }
260
261 // --- overlay coding-mode visualization ---
262
263 if (mShowQuantPY)
264 {
265 draw_QuantPY(img, ptr, bpl, 4);
266 }
267
268 if (mShowPBPredMode)
269 {
270 draw_PB_pred_modes(img, ptr, bpl, 4);
271 }
272
273 if (mShowIntraPredMode)
274 {
275 draw_intra_pred_modes(img, ptr, bpl, 0x009090ff, 4);
276 }
277
278 if (mTBShowPartitioning)
279 {
280 draw_TB_grid(img, ptr, bpl, 0x00ff6000, 4);
281 }
282
283 if (mPBShowPartitioning)
284 {
285 draw_PB_grid(img, ptr, bpl, 0x00e000, 4);
286 }
287
288 if (mCBShowPartitioning)
289 {
290 draw_CB_grid(img, ptr, bpl, 0x00FFFFFF, 4);
291 }
292
293 if (mShowMotionVec)
294 {
295 draw_Motion(img, ptr, bpl, 4);
296 }
297
298 if (mShowSlices)
299 {
300 draw_Slices(img, ptr, bpl, 4);
301 }
302
303 if (mShowTiles)
304 {
305 draw_Tiles(img, ptr, bpl, 4);
306 }
264307
265308 emit displayImage(qimg);
266309 mNextBuffer = 1-mNextBuffer;
272315 {
273316 mCBShowPartitioning=flag;
274317
275 const de265_image* img = de265_peek_next_picture(ctx);
276 if (img != NULL) { show_frame(img); }
318 mutex.lock();
319 if (img != NULL) { show_frame(img); }
320 mutex.unlock();
277321 }
278322
279323
281325 {
282326 mTBShowPartitioning=flag;
283327
284 const de265_image* img = de265_peek_next_picture(ctx);
285 if (img != NULL) { show_frame(img); }
328 mutex.lock();
329 if (img != NULL) { show_frame(img); }
330 mutex.unlock();
286331 }
287332
288333 void VideoDecoder::showPBPartitioning(bool flag)
289334 {
290335 mPBShowPartitioning=flag;
291336
292 const de265_image* img = de265_peek_next_picture(ctx);
293 if (img != NULL) { show_frame(img); }
337 mutex.lock();
338 if (img != NULL) { show_frame(img); }
339 mutex.unlock();
294340 }
295341
296342 void VideoDecoder::showIntraPredMode(bool flag)
297343 {
298344 mShowIntraPredMode=flag;
299345
300 const de265_image* img = de265_peek_next_picture(ctx);
301 if (img != NULL) { show_frame(img); }
346 mutex.lock();
347 if (img != NULL) { show_frame(img); }
348 mutex.unlock();
302349 }
303350
304351 void VideoDecoder::showPBPredMode(bool flag)
305352 {
306353 mShowPBPredMode=flag;
307354
308 const de265_image* img = de265_peek_next_picture(ctx);
309 if (img != NULL) { show_frame(img); }
355 mutex.lock();
356 if (img != NULL) { show_frame(img); }
357 mutex.unlock();
358 }
359
360 void VideoDecoder::showQuantPY(bool flag)
361 {
362 mShowQuantPY=flag;
363
364 mutex.lock();
365 if (img != NULL) { show_frame(img); }
366 mutex.unlock();
367 }
368
369 void VideoDecoder::showMotionVec(bool flag)
370 {
371 mShowMotionVec=flag;
372
373 mutex.lock();
374 if (img != NULL) { show_frame(img); }
375 mutex.unlock();
310376 }
311377
312378 void VideoDecoder::showDecodedImage(bool flag)
313379 {
314380 mShowDecodedImage=flag;
315381
316 const de265_image* img = de265_peek_next_picture(ctx);
317 if (img != NULL) { show_frame(img); }
382 mutex.lock();
383 if (img != NULL) { show_frame(img); }
384 mutex.unlock();
385 }
386
387 void VideoDecoder::showTiles(bool flag)
388 {
389 mShowTiles=flag;
390
391 mutex.lock();
392 if (img != NULL) { show_frame(img); }
393 mutex.unlock();
394 }
395
396 void VideoDecoder::showSlices(bool flag)
397 {
398 mShowSlices=flag;
399
400 mutex.lock();
401 if (img != NULL) { show_frame(img); }
402 mutex.unlock();
318403 }
319404
320405
327412 //rbsp_buffer_init(&buf);
328413
329414 ctx = de265_new_decoder();
415 de265_start_worker_threads(ctx, 4); // start 4 background threads
330416 }
331417
332418 void VideoDecoder::free_decoder()
333419 {
334420 if (mFH) { fclose(mFH); }
335421
336 de265_free_decoder(ctx);
337 }
422 if (ctx) { de265_free_decoder(ctx); }
423 }
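
For reference, the new decoding loop in VideoDecoder::run() above follows
libde265's push/pull pattern: raw bitstream bytes are fed with de265_push_data()
whenever de265_decode() reports DE265_ERROR_WAITING_FOR_INPUT_DATA, and finished
frames are fetched with de265_peek_next_picture() / de265_release_next_picture().
Below is a minimal stand-alone sketch of the same pattern without the Qt and
mutex plumbing; the input path and the 4-thread count are placeholders, and the
error handling is deliberately simplistic.

    // Hedged sketch of the push/pull decode loop used in VideoDecoder::run().
    // All de265_* calls appear in this diff or in the public de265.h API.
    #include <cstdio>
    #include <cstdint>
    #include "de265.h"

    int main(int argc, char** argv)
    {
      if (argc < 2) { fprintf(stderr, "usage: %s <bitstream.bin>\n", argv[0]); return 1; }

      FILE* fh = fopen(argv[1], "rb");
      if (!fh) { return 1; }

      de265_decoder_context* ctx = de265_new_decoder();
      de265_start_worker_threads(ctx, 4);   // as in sherlock265: 4 background threads

      int more = 1;
      while (more) {
        de265_error err = de265_decode(ctx, &more);

        if (err == DE265_ERROR_WAITING_FOR_INPUT_DATA) {
          uint8_t buf[4096];
          int n = fread(buf, 1, sizeof(buf), fh);
          if (n > 0) { de265_push_data(ctx, buf, n, 0, 0); }
          else       { de265_flush_data(ctx); }   // end of file: flush pending data
        }
        else if (err != DE265_OK) {
          break;                                  // give up on hard errors
        }

        // show (and release) any picture that is ready
        const de265_image* img = de265_peek_next_picture(ctx);
        if (img) {
          printf("decoded picture %dx%d\n",
                 de265_get_image_width(img, 0),
                 de265_get_image_height(img, 0));
          de265_release_next_picture(ctx);
        }
      }

      de265_free_decoder(ctx);
      fclose(fh);
      return 0;
    }
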
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #ifndef VIDEODECODER_HH
2121 #define VIDEODECODER_HH
2222
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
2327 #include <QtGui>
28 #ifdef HAVE_SWSCALE
29 #ifdef __cplusplus
30 extern "C" {
31 #endif
32 #include <libswscale/swscale.h>
33 #ifdef __cplusplus
34 }
35 #endif
36 #endif
2437
2538 #include "VideoWidget.hh"
2639 #include "de265.h"
4962 void showPBPartitioning(bool flag);
5063 void showIntraPredMode(bool flag);
5164 void showPBPredMode(bool flag);
65 void showQuantPY(bool flag);
66 void showMotionVec(bool flag);
67 void showTiles(bool flag);
68 void showSlices(bool flag);
5269 void showDecodedImage(bool flag);
5370
5471 signals:
6178 //input_context_FILE inputctx;
6279 //rbsp_buffer buf;
6380 de265_decoder_context* ctx;
81 const de265_image* img;
82
83 QMutex mutex;
6484
6585 QImage mImgBuffers[2];
6686 int mNextBuffer;
7292
7393
7494 bool mShowDecodedImage;
95 bool mShowQuantPY;
7596 bool mCBShowPartitioning;
7697 bool mTBShowPartitioning;
7798 bool mPBShowPartitioning;
7899 bool mShowIntraPredMode;
79100 bool mShowPBPredMode;
101 bool mShowMotionVec;
102 bool mShowTiles;
103 bool mShowSlices;
80104
81105 void decoder_loop();
82106
83107 void init_decoder(const char* filename);
84 void free_decoder();
108 void free_decoder();
85109
86110 void show_frame(const de265_image* img);
111 #ifdef HAVE_VIDEOGFX
112 void convert_frame_libvideogfx(const de265_image* img, QImage & qimg);
113 #endif
114 #ifdef HAVE_SWSCALE
115 SwsContext* sws;
116 int width;
117 int height;
118 void convert_frame_swscale(const de265_image* img, QImage & qimg);
119 #endif
87120 };
88121
89122 #endif
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #include "VideoPlayer.hh"
6868 QObject::connect(showPBPredModeButton, SIGNAL(toggled(bool)),
6969 mDecoder, SLOT(showPBPredMode(bool)));
7070
71 QPushButton* showQuantPYButton = new QPushButton("Quant");
72 showQuantPYButton->setCheckable(true);
73 QObject::connect(showQuantPYButton, SIGNAL(toggled(bool)),
74 mDecoder, SLOT(showQuantPY(bool)));
75
76 QPushButton* showMotionVecButton = new QPushButton("MotionVec");
77 showMotionVecButton->setCheckable(true);
78 QObject::connect(showMotionVecButton, SIGNAL(toggled(bool)),
79 mDecoder, SLOT(showMotionVec(bool)));
80
81 QPushButton* showTilesButton = new QPushButton("Tiles");
82 showTilesButton->setCheckable(true);
83 QObject::connect(showTilesButton, SIGNAL(toggled(bool)),
84 mDecoder, SLOT(showTiles(bool)));
85
86 QPushButton* showSlicesButton = new QPushButton("Slices");
87 showSlicesButton->setCheckable(true);
88 QObject::connect(showSlicesButton, SIGNAL(toggled(bool)),
89 mDecoder, SLOT(showSlices(bool)));
90
7191 QPushButton* showDecodedImageButton = new QPushButton("image");
7292 showDecodedImageButton->setCheckable(true);
7393 showDecodedImageButton->setChecked(true);
7595 mDecoder, SLOT(showDecodedImage(bool)));
7696
7797 QGridLayout *layout = new QGridLayout;
78 layout->addWidget(videoWidget, 0,0,1,6);
98 layout->addWidget(videoWidget, 0,0,1,7);
7999 layout->addWidget(startButton, 1,0,1,1);
80100 layout->addWidget(stopButton, 1,1,1,1);
81101 layout->addWidget(stepButton, 1,2,1,1);
102 layout->addWidget(showDecodedImageButton, 1,6,1,1);
103 layout->addWidget(showTilesButton, 1,5,1,1);
104 layout->addWidget(showSlicesButton, 1,4,1,1);
82105 layout->addWidget(showCBPartitioningButton,2,0,1,1);
83106 layout->addWidget(showTBPartitioningButton,2,1,1,1);
84107 layout->addWidget(showPBPartitioningButton,2,2,1,1);
85108 layout->addWidget(showIntraPredModeButton, 2,3,1,1);
86109 layout->addWidget(showPBPredModeButton, 2,4,1,1);
87 layout->addWidget(showDecodedImageButton, 2,5,1,1);
110 layout->addWidget(showQuantPYButton, 2,5,1,1);
111 layout->addWidget(showMotionVecButton, 2,6,1,1);
88112 setLayout(layout);
89113
90114
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #ifndef VIDEOPLAYER_HH
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #include "VideoWidget.hh"
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #ifndef VIDEOWIDGET_HH
00 /****************************************************************************
11 ** Meta object code from reading C++ file 'VideoDecoder.hh'
22 **
3 ** Created: Tue Oct 22 15:51:14 2013
4 ** by: The Qt Meta Object Compiler version 63 (Qt 4.8.1)
3 ** Created by: The Qt Meta Object Compiler version 63 (Qt 4.8.6)
54 **
65 ** WARNING! All changes made in this file will be lost!
76 *****************************************************************************/
109 #if !defined(Q_MOC_OUTPUT_REVISION)
1110 #error "The header file 'VideoDecoder.hh' doesn't include <QObject>."
1211 #elif Q_MOC_OUTPUT_REVISION != 63
13 #error "This file was generated using the moc from 4.8.1. It"
12 #error "This file was generated using the moc from 4.8.6. It"
1413 #error "cannot be used with the include files from this version of Qt."
1514 #error "(The moc has changed too much.)"
1615 #endif
2221 6, // revision
2322 0, // classname
2423 0, 0, // classinfo
25 10, 14, // methods
24 14, 14, // methods
2625 0, 0, // properties
2726 0, 0, // enums/sets
2827 0, 0, // constructors
4241 165, 85, 13, 13, 0x0a,
4342 189, 85, 13, 13, 0x0a,
4443 210, 85, 13, 13, 0x0a,
44 228, 85, 13, 13, 0x0a,
45 248, 85, 13, 13, 0x0a,
46 264, 85, 13, 13, 0x0a,
47 281, 85, 13, 13, 0x0a,
4548
4649 0 // eod
4750 };
5457 "showTBPartitioning(bool)\0"
5558 "showPBPartitioning(bool)\0"
5659 "showIntraPredMode(bool)\0showPBPredMode(bool)\0"
60 "showQuantPY(bool)\0showMotionVec(bool)\0"
61 "showTiles(bool)\0showSlices(bool)\0"
5762 "showDecodedImage(bool)\0"
5863 };
5964
7277 case 6: _t->showPBPartitioning((*reinterpret_cast< bool(*)>(_a[1]))); break;
7378 case 7: _t->showIntraPredMode((*reinterpret_cast< bool(*)>(_a[1]))); break;
7479 case 8: _t->showPBPredMode((*reinterpret_cast< bool(*)>(_a[1]))); break;
75 case 9: _t->showDecodedImage((*reinterpret_cast< bool(*)>(_a[1]))); break;
80 case 9: _t->showQuantPY((*reinterpret_cast< bool(*)>(_a[1]))); break;
81 case 10: _t->showMotionVec((*reinterpret_cast< bool(*)>(_a[1]))); break;
82 case 11: _t->showTiles((*reinterpret_cast< bool(*)>(_a[1]))); break;
83 case 12: _t->showSlices((*reinterpret_cast< bool(*)>(_a[1]))); break;
84 case 13: _t->showDecodedImage((*reinterpret_cast< bool(*)>(_a[1]))); break;
7685 default: ;
7786 }
7887 }
110119 if (_id < 0)
111120 return _id;
112121 if (_c == QMetaObject::InvokeMetaMethod) {
113 if (_id < 10)
122 if (_id < 14)
114123 qt_static_metacall(this, _c, _id, _a);
115 _id -= 10;
124 _id -= 14;
116125 }
117126 return _id;
118127 }
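Note on the regenerated moc output above: the method count in the meta-object table grows from 10 to 14 because four new slots are now dispatched in qt_static_metacall (cases 9-12: showQuantPY, showMotionVec, showTiles, showSlices, each taking a bool). The declarations that moc read from VideoDecoder.hh presumably look like the following; the header itself is not part of this hunk and the parameter name is illustrative.

    public slots:
      void showQuantPY(bool show);
      void showMotionVec(bool show);
      void showTiles(bool show);
      void showSlices(bool show);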
00 /****************************************************************************
11 ** Meta object code from reading C++ file 'VideoPlayer.hh'
22 **
3 ** Created: Tue Oct 22 15:51:14 2013
4 ** by: The Qt Meta Object Compiler version 63 (Qt 4.8.1)
3 ** Created by: The Qt Meta Object Compiler version 63 (Qt 4.8.6)
54 **
65 ** WARNING! All changes made in this file will be lost!
76 *****************************************************************************/
109 #if !defined(Q_MOC_OUTPUT_REVISION)
1110 #error "The header file 'VideoPlayer.hh' doesn't include <QObject>."
1211 #elif Q_MOC_OUTPUT_REVISION != 63
13 #error "This file was generated using the moc from 4.8.1. It"
12 #error "This file was generated using the moc from 4.8.6. It"
1413 #error "cannot be used with the include files from this version of Qt."
1514 #error "(The moc has changed too much.)"
1615 #endif
00 /****************************************************************************
11 ** Meta object code from reading C++ file 'VideoWidget.hh'
22 **
3 ** Created: Tue Oct 22 15:51:14 2013
4 ** by: The Qt Meta Object Compiler version 63 (Qt 4.8.1)
3 ** Created by: The Qt Meta Object Compiler version 63 (Qt 4.8.6)
54 **
65 ** WARNING! All changes made in this file will be lost!
76 *****************************************************************************/
109 #if !defined(Q_MOC_OUTPUT_REVISION)
1110 #error "The header file 'VideoWidget.hh' doesn't include <QObject>."
1211 #elif Q_MOC_OUTPUT_REVISION != 63
13 #error "This file was generated using the moc from 4.8.1. It"
12 #error "This file was generated using the moc from 4.8.6. It"
1413 #error "cannot be used with the include files from this version of Qt."
1514 #error "(The moc has changed too much.)"
1615 #endif
00 /*
1 * H.265 video codec.
2 * Copyright (c) 2013 StrukturAG, Dirk Farin, <farin@struktur.de>
1 * libde265 example application "sherlock265".
2 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
33 *
4 * This file is part of libde265.
4 * This file is part of sherlock265, an example application using libde265.
55 *
6 * libde265 is free software: you can redistribute it and/or modify
6 * sherlock265 is free software: you can redistribute it and/or modify
77 * it under the terms of the GNU General Public License as published by
88 * the Free Software Foundation, either version 3 of the License, or
99 * (at your option) any later version.
1010 *
11 * libde265 is distributed in the hope that it will be useful,
11 * sherlock265 is distributed in the hope that it will be useful,
1212 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1313 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414 * GNU General Public License for more details.
1515 *
1616 * You should have received a copy of the GNU General Public License
17 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
17 * along with sherlock265. If not, see <http://www.gnu.org/licenses/>.
1818 */
1919
2020 #include "VideoPlayer.hh"
3333
3434 VideoPlayer videoPlayer(argv[1]);
3535 videoPlayer.show();
36
36
3737 return app.exec();
3838 }
3939