unit MRVJpegDec;

(*

 RVMedia uses this unit under Mozilla Public License (MPL).
 The Fast Jpeg Decoder can be used in Delphi VCL projects for the 32-bit
 Windows platform only. It is not compatible with C++Builder, FireMonkey,
 Lazarus, or VCL for 64-bit Windows projects.

 To activate it, open Source\MRV_Defs.inc and remove the dot from the line
 {.$DEFINE RVMUSEFASTJPEGDECODER}

 Changes made by TRichView for RVMedia:
 - the unit is renamed from jpegdec to MRVJpegDec
 - {$IFDEF RVMUSEFASTJPEGDECODER} is added
 - Synopse's include file and Delphi versions $defines are replaced by ours.

*)



/// JPEG picture fast decoder, using SSE/SSE2
// - released under MPL/GPL/LGPL tri-license; version 1.18

(*
    This file is a JPEG picture fast decoder for Delphi

    Copyright (C) 2004 Dr. Manhattan
     Initial SSE and SSE2 assembly code - http://sourceforge.net/projects/jpegdec
    Copyright (C) 2010 Arnaud Bouchez
     Synopse Informatique - http://synopse.info
    Copyright (C) 2017 Mark Griffiths
      http://www.marktg.com/jpegdec

  *** BEGIN LICENSE BLOCK *****
  Version: MPL 1.1/GPL 2.0/LGPL 2.1

  The contents of this file are subject to the Mozilla Public License Version
  1.1 (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at
  http://www.mozilla.org/MPL

  Software distributed under the License is distributed on an "AS IS" basis,
  WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  for the specific language governing rights and limitations under the License.

  The Original Code is jpegdec.

  The Initial Developer of the Original Code is Dr. Manhattan.

  Portions created by the Initial Developer are Copyright (C) 2004
    the Initial Developer. All Rights Reserved.

  Contributor(s):
    March 2010: Arnaud Bouchez for Delphi integration
      http://synopse.info
    2017: Mark Griffiths made the code fully thread safe.
      http://www.marktg.com/jpegdec

  Alternatively, the contents of this file may be used under the terms of
  either the GNU General Public License Version 2 or later (the "GPL"), or
  the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  in which case the provisions of the GPL or the LGPL are applicable instead
  of those above. If you wish to allow use of your version of this file only
  under the terms of either the GPL or the LGPL, and not to allow others to
  use your version of this file under the terms of the MPL, indicate your
  decision by deleting the provisions above and replace them with the notice
  and other provisions required by the GPL or the LGPL. If you do not delete
  the provisions above, a recipient may use your version of this file under
  the terms of any one of the MPL, the GPL or the LGPL.

  ***** END LICENSE BLOCK *****


   Arnaud Bouchez notes:
   - Most of the hack was to include the SSE/SSE2 assembly code into a true
   Delphi unit. Conversion was made difficult because the Delphi compiler
   doesn't allow to align code or data at 16 bytes boundaries, which is
   required by the SSE/SSE2 operations. This is a well known limitation
   of the Delphi compiler. See @http://qc.embarcadero.com/wc/qcmain.aspx?d=1116
   A solution was found by copying the whole used tables into memory-allocated
   buffer, and by creating the TBL_64 table by code.
   - Since we use Win32 VirtualAlloc API for memory allocation (which is always
   16 bytes aligned and set to zero, as expected by the code), the TJpegDecode
   object instance has an unusual creator, the JpegDecode() function:
   don't try to allocate any TJpegDecode object on the stack or via the Delphi heap
   - there is no TPicture descendant implementation yet, since it should be
   more useful to use a resulting TBitmap in your code
   - direct access to the picture bitmap without creating any TBitmap resource
   is allowed via the TJpegDecode.DrawTo() methods: so you can use very big
   pictures, without any resource limitations (under Win 2K or XP, allocating
   big TBitmap instances raises errors)
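   - decoding is not thread safe by now; if you really need it, please ask
     (note: superseded by Version 1.18 below, which removed the global variable
     to allow multithreaded use)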
   - tested under Delphi 7 and 2009
   - initial code by Dr Manhattan (a french guy also?) was GREAT! Thanks! :)

  Version 1.0
   - initial release for Delphi, under LGPL license

  Version 1.1
   - licensed under a MPL/GPL/LGPL tri-license
   - memory leak resolution after decoding error
   - avoid GPF issues in TJpegDecode method call
   - it was identified that this decoder is not able to decode some kind of
     jpeg files; comes from original jpegdec code, not our Delphi conversion;
     in such cases, the original libjpeg library must be used instead

  Version 1.2
   - resource leak in TJpegDecode.ToBitmap fixed (thanks Esmond for the report)
   - potential GPF issue fixed in TJpegDecode.Free

  Version 1.18
   - Removed use of a global variable to allow full multithreaded use of
     this library - thanks Mark Griffiths for the patch!

*)

interface

// NOTE(review): the shared include below is disabled (note the dot) and the
// feature symbol is defined unconditionally right after it, so this unit is
// always compiled in, and conditional symbols that the include presumably
// provides (e.g. MRVDELPHIXE2, tested in the uses clause) are never defined
// here. The unit header states that activation is meant to be done inside
// MRV_Defs.inc - confirm whether this local override is intentional.
{.$I MRV_Defs.inc}

{$DEFINE RVMUSEFASTJPEGDECODER}

{$IFDEF RVMUSEFASTJPEGDECODER}

uses
  Windows,
  SysUtils,
  {$IFDEF MRVDELPHIXE2}
  VCL.Graphics;
  {$ELSE}
  Graphics;
  {$ENDIF}

type
  /// error codes returned by JpegDecode()
  // - JPEG_SUCCESS is the only value reporting a successfully decoded picture
  // - NOTE(review): the ordinal order of the items is presumably relied upon
  // by the assembly decoder, which produces these values - do not reorder
  // without checking the implementation
  TJpegDecodeError =
    (JPEG_EOF, JPEG_OUTOFMEM, JPEG_SUCCESS, JPEG_CPUNOTSUPPORTED,
     JPEG_BADFILE, JPEG_FORMATNOTSUPPORTED);

{$A-}
  /// the memory header of a decoded image, as returned by JpegDecode()
  // - must be allocated by the JpegDecode() function, neither on the stack
  // nor on the heap
  // - declared between {$A-} and {$A+}, i.e. with field alignment disabled:
  // do not add, remove or reorder fields
  // - code example to draw a picture on a canvas (taken from JpegDraw procedure):
  // !var Img: PJpegDecode;
  // !begin
  // !  if JpegDecode(Buffer,BufferLen,Img)=JPEG_SUCCESS then
  // !  try
  // !    Img.DrawTo(Canvas,X,Y);
  // !  finally
  // !    Img.Free;
  // !  end;
  // !end;
  TJpegDecode = object
    /// picture width
    width: integer;
    /// picture height
    height: integer;
    /// picture scan length
    scanlength: cardinal;
    /// pointer to RGB data
    pRGB: PAnsiChar;
    /// picture bits per pixel
    bitsPixel: cardinal;
    /// picture number of components
    ComponentsCount: cardinal;
    /// initialize the associated Win32 Bitmap header
    procedure ToBMI(var BMI: TBitmapInfo);
    /// convert the resulting image to a Win32 bitmap context
    // - return nil on error
    // - the PJpegDecode can be freed by JPEG_Free()
    function ToBitmap: TBitmap;
    /// direct draw of the picture to a canvas
    // - use Win32 fast API for fast drawing, without any TBitmap conversion
    // and/or resource allocation (useful for very big pictures)
    procedure DrawTo(Canvas: TCanvas; X,Y: integer); overload;
    /// direct stretch draw of the picture to a canvas
    // - use Win32 fast API for fast drawing, without any TBitmap conversion
    // and/or resource allocation (useful for very big pictures)
    procedure DrawTo(Canvas: TCanvas; const Dest: TRect); overload;
    /// direct stretch draw of a source rectangle of the picture to a canvas
    // - use Win32 fast API for fast drawing, without any TBitmap conversion
    // and/or resource allocation (useful for very big pictures)
    procedure DrawTo(Canvas: TCanvas; const Source, Dest: TRect); overload;
    /// release used bitmap memory and the corresponding PJpegDecode instance
    // - use this method, and neither freemem() nor a stack-allocated instance
    function Free: boolean;
  end;
{$A+}
  /// reference pointer to a jpeg picture content, as returned by JpegDecode()
  PJpegDecode = ^TJpegDecode;

/// decode a .JPEG buffer into a bitmap array
// - written entirely in assembly with SSE and SSE2 optimizations. Uses floating
// point internally for maximum precision and image quality. Assembly code
// integrated from @http://sourceforge.net/projects/jpegdec into Delphi
// - very fast, but needs an SSE or SSE2 compatible CPU
// - this function will allocate the bitmap image into the pImg parameter
// - returns an error code, or JPEG_SUCCESS on success
// - all memory must be freed by a pImg^.Free call (in a try...finally block)
function JpegDecode(Buffer: pointer; BufferLen: cardinal;
  var pImg: PJpegDecode): TJpegDecodeError; stdcall; overload;

/// helper function which creates a TBitmap from a raw .JPEG memory buffer
// - NOTE(review): presumably returns nil when the decoding fails, as
// TJpegDecode.ToBitmap does - confirm against the implementation
function JpegDecode(Buffer: pointer; BufferLen: integer): TBitmap; overload;

/// helper function which draws a raw .JPEG memory buffer into a Canvas
// - performs the JPEG decompression at every call: if you want to draw a JPEG
// in a WM_PAINT event (i.e. Delphi OnPaint event), don't use this procedure
// but perform the JPEG decompression once, save the PJpegDecode instance,
// and draw the picture on request by using the fast TJpegDecode.DrawTo() methods;
// another possibility (more VCLish) is to use a temporary TBitmap for the drawing
procedure JpegDraw(Buffer: pointer; BufferLen: integer; Canvas: TCanvas; X,Y: integer);

{$ENDIF}


implementation

{$IFDEF RVMUSEFASTJPEGDECODER}


/// allocate Size bytes of zero-filled memory directly from the OS
// - calls VirtualAlloc(nil, Size, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE):
// the returned block is page aligned (hence valid for the 16 bytes aligned
// SSE/SSE2 accesses) and is set to zero by Windows
// - this is virtual memory, neither the Windows heap nor the Delphi heap: it
// must be released with VirtualFree(..., 0, MEM_RELEASE)
// - returns nil if the allocation failed
// - inside this unit, this declaration hides the RTL function of the same name
function AllocMem(Size: integer): pointer;
asm // in: eax=Size (register calling convention) - out: eax=VirtualAlloc result
        push        PAGE_READWRITE             // flProtect
        push        MEM_COMMIT or MEM_RESERVE  // flAllocationType: reserve and commit in one step
        push        eax                        // dwSize
        push        0                          // lpAddress=nil: the system chooses the address
        call        VirtualAlloc
end;

/// release a memory block which was obtained from VirtualAlloc
// - does nothing if buf is nil
// - the outcome of VirtualFree is not reported to the caller: the function
// returns 0 in every case
function ReleaseMem(buf: pointer): integer;
begin
  Result := 0;
  if buf = nil then
    Exit;
  // dwSize must be 0 together with MEM_RELEASE: the whole region is freed
  VirtualFree(buf, 0, MEM_RELEASE);
end;

const
  /// size of data to be copied from TBL into 16 bytes aligned memory
  // (otherwise SSE2 will fail)
  // - NOTE(review): the data emitted by the TBL procedure adds up to $AF0
  // bytes ($F0 of constants and pointer tables, 4 tables of $80 bytes and
  // 8 tables of $100 bytes); $B00 is presumably that size rounded up - confirm
  TBLSize = $B00;
  /// normal PJpegDecode needed data size -> TBL is copied after this
  TBLOffset = $1B10;
  // TBL_64 offset in PJpegDecode: located right after the copied TBL data
  TBL64Offset = TBLOffset+TBLSize;
  // TBL_64 size, in bytes (65*8 = 520)
  TBL64Size = 65*8;

/// constant tables of the SSE/SSE2 decoder, emitted as raw bytes
// - this "procedure" contains data only, no instruction: it must never be
// called, only read through its address
// - every db group of 4 bytes is a little-endian IEEE-754 single precision
// float (e.g. 00H,00H,00H,3EH = 0.125)
// - the "TBL+xxxH" offsets quoted in the comments are relative to the start
// of this procedure
// - the content is meant to be copied into 16 bytes aligned memory before
// use (see the TBLSize constant), since the Delphi compiler cannot align
// it here
procedure TBL;
asm
// _TBL_MultCR; 4 singles: 0.0, -0.71414, 1.402, 0.0
        db 00H, 00H, 00H, 00H, 0E1H, 0D1H, 36H, 0BFH
        db 0BCH, 74H, 0B3H, 3FH, 00H, 00H, 00H, 00H
// _TBL_MultCB: TBL+010H; 4 singles: 1.772, -0.34414, 0.0, 0.0
        db 0E5H, 0D0H, 0E2H, 3FH, 1EH, 33H, 0B0H, 0BEH
        db 00H, 00H, 00H, 00H, 00H, 00H, 00H, 00H
// each of the following tg*/cos* entries is one single replicated 4 times
// _TBL_tg1_16; TBL+020H
        db 0AFH, 0AFH, 4BH, 3EH, 0AFH, 0AFH, 4BH, 3EH
        db 0AFH, 0AFH, 4BH, 3EH, 0AFH, 0AFH, 4BH, 3EH
// _TBL_tg2_16; TBL+030H
        db 0CDH, 13H, 0D4H, 3EH, 0CDH, 13H, 0D4H, 3EH
        db 0CDH, 13H, 0D4H, 3EH, 0CDH, 13H, 0D4H, 3EH
// _TBL_tg3_16; TBL+040H
        db 0C1H, 0DH, 2BH, 3FH, 0C1H, 0DH, 2BH, 3FH
        db 0C1H, 0DH, 2BH, 3FH, 0C1H, 0DH, 2BH, 3FH
// _TBL_tg1_32; TBL+050H
        db 0DCH, 0B5H, 0C9H, 3DH, 0DCH, 0B5H, 0C9H, 3DH
        db 0DCH, 0B5H, 0C9H, 3DH, 0DCH, 0B5H, 0C9H, 3DH
// _TBL_tg3_32; TBL+060H
        db 42H, 50H, 9BH, 3EH, 42H, 50H, 9BH, 3EH
        db 42H, 50H, 9BH, 3EH, 42H, 50H, 9BH, 3EH
// _TBL_tg5_32; TBL+070H
        db 0B9H, 0D5H, 08H, 3FH, 0B9H, 0D5H, 08H, 3FH
        db 0B9H, 0D5H, 08H, 3FH, 0B9H, 0D5H, 08H, 3FH
// _TBL_tg7_32; TBL+080H
        db 01H, 18H, 52H, 3FH, 01H, 18H, 52H, 3FH
        db 01H, 18H, 52H, 3FH, 01H, 18H, 52H, 3FH
// _TBL_cos2_16; TBL+090H
        db 5EH, 83H, 6CH, 3FH, 5EH, 83H, 6CH, 3FH
        db 5EH, 83H, 6CH, 3FH, 5EH, 83H, 6CH, 3FH
// _TBL_cos4_16; TBL+0A0H
        db 0F3H, 04H, 35H, 3FH, 0F3H, 04H, 35H, 3FH
        db 0F3H, 04H, 35H, 3FH, 0F3H, 04H, 35H, 3FH
// _TBL_MultRow8x8; TBL+0B0H
// 8 pointers to the 4 @TBL_8x8_* tables below (each table is $80 bytes)
        dd offset @TBL_8x8_04 // offsets will be recalculated in JpegDecode()
        dd offset @TBL_8x8_17
        dd offset @TBL_8x8_26
        dd offset @TBL_8x8_35
        dd offset @TBL_8x8_04
        dd offset @TBL_8x8_35
        dd offset @TBL_8x8_26
        dd offset @TBL_8x8_17
//_TBL_MultRow16x16: _TBL_MultRow8x8+020H = TBL+0D0H
// 8 pointers to the 8 @TBL_16x16_* tables below (each table is $100 bytes)
        dd offset @TBL_16x16_08 // offsets will be recalculated in JpegDecode()
        dd offset @TBL_16x16_1F
        dd offset @TBL_16x16_2E
        dd offset @TBL_16x16_3D
        dd offset @TBL_16x16_4C
        dd offset @TBL_16x16_5B
        dd offset @TBL_16x16_6A
        dd offset @TBL_16x16_79
@TBL_8x8_04:
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 75H, 3DH, 27H, 3EH, 0D4H, 8BH, 8AH, 3DH
        db 0D4H, 8BH, 8AH, 0BDH, 75H, 3DH, 27H, 0BEH
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 0BEH
        db 00H, 00H, 00H, 0BEH, 00H, 00H, 00H, 3EH
        db 0D4H, 8BH, 8AH, 3DH, 75H, 3DH, 27H, 0BEH
        db 75H, 3DH, 27H, 3EH, 0D4H, 8BH, 8AH, 0BDH
        db 86H, 8AH, 31H, 3EH, 17H, 83H, 16H, 3EH
        db 4EH, 23H, 0C9H, 3DH, 0AFH, 42H, 0DH, 3DH
        db 17H, 83H, 16H, 3EH, 0AFH, 42H, 0DH, 0BDH
        db 86H, 8AH, 31H, 0BEH, 4EH, 23H, 0C9H, 0BDH
        db 4EH, 23H, 0C9H, 3DH, 86H, 8AH, 31H, 0BEH
        db 0AFH, 42H, 0DH, 3DH, 17H, 83H, 16H, 3EH
        db 0AFH, 42H, 0DH, 3DH, 4EH, 23H, 0C9H, 0BDH
        db 17H, 83H, 16H, 3EH, 86H, 8AH, 31H, 0BEH
@TBL_8x8_17:
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 0F8H, 0F7H, 67H, 3EH, 4AH, 2BH, 0C0H, 3DH
        db 4AH, 2BH, 0C0H, 0BDH, 0F8H, 0F7H, 67H, 0BEH
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 0BEH
        db 86H, 8AH, 31H, 0BEH, 86H, 8AH, 31H, 3EH
        db 4AH, 2BH, 0C0H, 3DH, 0F8H, 0F7H, 67H, 0BEH
        db 0F8H, 0F7H, 67H, 3EH, 4AH, 2BH, 0C0H, 0BDH
        db 0AFH, 41H, 76H, 3EH, 29H, 0C4H, 50H, 3EH
        db 3FH, 7EH, 0BH, 3EH, 15H, 0EFH, 43H, 3DH
        db 29H, 0C4H, 50H, 3EH, 15H, 0EFH, 43H, 0BDH
        db 0AFH, 41H, 76H, 0BEH, 3FH, 7EH, 0BH, 0BEH
        db 3FH, 7EH, 0BH, 3EH, 0AFH, 41H, 76H, 0BEH
        db 15H, 0EFH, 43H, 3DH, 29H, 0C4H, 50H, 3EH
        db 15H, 0EFH, 43H, 3DH, 3FH, 7EH, 0BH, 0BEH
        db 29H, 0C4H, 50H, 3EH, 0AFH, 41H, 76H, 0BEH
@TBL_8x8_26:
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 7AH, 82H, 5AH, 3EH, 0F3H, 04H, 0B5H, 3DH
        db 0F3H, 04H, 0B5H, 0BDH, 7AH, 82H, 5AH, 0BEH
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 0BEH
        db 75H, 3DH, 27H, 0BEH, 75H, 3DH, 27H, 3EH
        db 0F3H, 04H, 0B5H, 3DH, 7AH, 82H, 5AH, 0BEH
        db 7AH, 82H, 5AH, 3EH, 0F3H, 04H, 0B5H, 0BDH
        db 0F8H, 0F7H, 67H, 3EH, 4CH, 0A7H, 44H, 3EH
        db 51H, 66H, 03H, 3EH, 0D3H, 90H, 38H, 3DH
        db 4CH, 0A7H, 44H, 3EH, 0D3H, 90H, 38H, 0BDH
        db 0F8H, 0F7H, 67H, 0BEH, 51H, 66H, 03H, 0BEH
        db 51H, 66H, 03H, 3EH, 0F8H, 0F7H, 67H, 0BEH
        db 0D3H, 90H, 38H, 3DH, 4CH, 0A7H, 44H, 3EH
        db 0D3H, 90H, 38H, 3DH, 51H, 66H, 03H, 0BEH
        db 4CH, 0A7H, 44H, 3EH, 0F8H, 0F7H, 67H, 0BEH
@TBL_8x8_35:
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 4CH, 0A7H, 44H, 3EH, 0C1H, 0E9H, 0A2H, 3DH
        db 0C1H, 0E9H, 0A2H, 0BDH, 4CH, 0A7H, 44H, 0BEH
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 0BEH
        db 17H, 83H, 16H, 0BEH, 17H, 83H, 16H, 3EH
        db 0C1H, 0E9H, 0A2H, 3DH, 4CH, 0A7H, 44H, 0BEH
        db 4CH, 0A7H, 44H, 3EH, 0C1H, 0E9H, 0A2H, 0BDH
        db 29H, 0C4H, 50H, 3EH, 0C5H, 0FBH, 30H, 3EH
        db 5EH, 83H, 0ECH, 3DH, 0D1H, 1AH, 26H, 3DH
        db 0C5H, 0FBH, 30H, 3EH, 0D1H, 1AH, 26H, 0BDH
        db 29H, 0C4H, 50H, 0BEH, 5EH, 83H, 0ECH, 0BDH
        db 5EH, 83H, 0ECH, 3DH, 29H, 0C4H, 50H, 0BEH
        db 0D1H, 1AH, 26H, 3DH, 0C5H, 0FBH, 30H, 3EH
        db 0D1H, 1AH, 26H, 3DH, 5EH, 83H, 0ECH, 0BDH
        db 0C5H, 0FBH, 30H, 3EH, 29H, 0C4H, 50H, 0BEH
@TBL_16x16_08:
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 86H, 8AH, 31H, 3EH, 17H, 83H, 16H, 3EH
        db 4EH, 23H, 0C9H, 3DH, 0AFH, 42H, 0DH, 3DH
        db 75H, 3DH, 27H, 3EH, 0D4H, 8BH, 8AH, 3DH
        db 0D4H, 8BH, 8AH, 0BDH, 75H, 3DH, 27H, 0BEH
        db 17H, 83H, 16H, 3EH, 0AFH, 42H, 0DH, 0BDH
        db 86H, 8AH, 31H, 0BEH, 4EH, 23H, 0C9H, 0BDH
        db 0CEH, 25H, 34H, 3EH, 86H, 39H, 2DH, 3EH
        db 12H, 0A5H, 1FH, 3EH, 0AH, 0EEH, 0BH, 3EH
        db 86H, 39H, 2DH, 3EH, 0C6H, 0ACH, 0E5H, 3DH
        db 0A9H, 0F1H, 8DH, 3CH, 0F2H, 0A9H, 0AAH, 0BDH
        db 12H, 0A5H, 1FH, 3EH, 0A9H, 0F1H, 8DH, 3CH
        db 0AH, 0EEH, 0BH, 0BEH, 86H, 39H, 2DH, 0BEH
        db 0AH, 0EEH, 0BH, 3EH, 0F2H, 0A9H, 0AAH, 0BDH
        db 86H, 39H, 2DH, 0BEH, 0A9H, 0F1H, 8DH, 3CH
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 00H, 00H, 00H, 3EH, 00H, 00H, 00H, 3EH
        db 0AFH, 42H, 0DH, 0BDH, 4EH, 23H, 0C9H, 0BDH
        db 17H, 83H, 16H, 0BEH, 86H, 8AH, 31H, 0BEH
        db 75H, 3DH, 27H, 0BEH, 0D4H, 8BH, 8AH, 0BDH
        db 0D4H, 8BH, 8AH, 3DH, 75H, 3DH, 27H, 3EH
        db 4EH, 23H, 0C9H, 3DH, 86H, 8AH, 31H, 3EH
        db 0AFH, 42H, 0DH, 3DH, 17H, 83H, 16H, 0BEH
        db 0C6H, 0ACH, 0E5H, 3DH, 0F2H, 0A9H, 0AAH, 3DH
        db 45H, 30H, 52H, 3DH, 0A9H, 0F1H, 8DH, 3CH
        db 12H, 0A5H, 1FH, 0BEH, 0CEH, 25H, 34H, 0BEH
        db 0AH, 0EEH, 0BH, 0BEH, 45H, 30H, 52H, 0BDH
        db 45H, 30H, 52H, 0BDH, 0C6H, 0ACH, 0E5H, 3DH
        db 0CEH, 25H, 34H, 3EH, 0F2H, 0A9H, 0AAH, 3DH
        db 0CEH, 25H, 34H, 3EH, 45H, 30H, 52H, 3DH
        db 12H, 0A5H, 1FH, 0BEH, 0C6H, 0ACH, 0E5H, 0BDH
@TBL_16x16_1F:
        db 0CEH, 25H, 34H, 3EH, 0CEH, 25H, 34H, 3EH
        db 0CEH, 25H, 34H, 3EH, 0CEH, 25H, 34H, 3EH
        db 3CH, 0DFH, 79H, 3EH, 0CDH, 0D4H, 53H, 3EH
        db 87H, 8AH, 0DH, 3EH, 7FH, 0CFH, 46H, 3DH
        db 0D1H, 5FH, 6BH, 3EH, 8EH, 0FDH, 0C2H, 3DH
        db 8EH, 0FDH, 0C2H, 0BDH, 0D1H, 5FH, 6BH, 0BEH
        db 0CDH, 0D4H, 53H, 3EH, 7FH, 0CFH, 46H, 0BDH
        db 3CH, 0DFH, 79H, 0BEH, 87H, 8AH, 0DH, 0BEH
        db 5FH, 8AH, 7DH, 3EH, 0EH, 0CCH, 73H, 3EH
        db 48H, 0AFH, 60H, 3EH, 12H, 0F0H, 44H, 3EH
        db 0EH, 0CCH, 73H, 3EH, 67H, 9FH, 21H, 3EH
        db 0C2H, 0C5H, 0C7H, 3CH, 64H, 31H, 0F0H, 0BDH
        db 48H, 0AFH, 60H, 3EH, 0C2H, 0C5H, 0C7H, 3CH
        db 12H, 0F0H, 44H, 0BEH, 0EH, 0CCH, 73H, 0BEH
        db 12H, 0F0H, 44H, 3EH, 64H, 31H, 0F0H, 0BDH
        db 0EH, 0CCH, 73H, 0BEH, 0C2H, 0C5H, 0C7H, 3CH
        db 0CEH, 25H, 34H, 3EH, 0CEH, 25H, 34H, 3EH
        db 0CEH, 25H, 34H, 3EH, 0CEH, 25H, 34H, 3EH
        db 7FH, 0CFH, 46H, 0BDH, 87H, 8AH, 0DH, 0BEH
        db 0CDH, 0D4H, 53H, 0BEH, 3CH, 0DFH, 79H, 0BEH
        db 0D1H, 5FH, 6BH, 0BEH, 8EH, 0FDH, 0C2H, 0BDH
        db 8EH, 0FDH, 0C2H, 3DH, 0D1H, 5FH, 6BH, 3EH
        db 87H, 8AH, 0DH, 3EH, 3CH, 0DFH, 79H, 3EH
        db 7FH, 0CFH, 46H, 3DH, 0CDH, 0D4H, 53H, 0BEH
        db 67H, 9FH, 21H, 3EH, 64H, 31H, 0F0H, 3DH
        db 0FBH, 0E8H, 93H, 3DH, 0C2H, 0C5H, 0C7H, 3CH
        db 48H, 0AFH, 60H, 0BEH, 5FH, 8AH, 7DH, 0BEH
        db 12H, 0F0H, 44H, 0BEH, 0FBH, 0E8H, 93H, 0BDH
        db 0FBH, 0E8H, 93H, 0BDH, 67H, 9FH, 21H, 3EH
        db 5FH, 8AH, 7DH, 3EH, 64H, 31H, 0F0H, 3DH
        db 5FH, 8AH, 7DH, 3EH, 0FBH, 0E8H, 93H, 3DH
        db 48H, 0AFH, 60H, 0BEH, 67H, 9FH, 21H, 0BEH
@TBL_16x16_2E:
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 0AFH, 41H, 76H, 3EH, 29H, 0C4H, 50H, 3EH
        db 3FH, 7EH, 0BH, 3EH, 15H, 0EFH, 43H, 3DH
        db 0F8H, 0F7H, 67H, 3EH, 4AH, 2BH, 0C0H, 3DH
        db 4AH, 2BH, 0C0H, 0BDH, 0F8H, 0F7H, 67H, 0BEH
        db 29H, 0C4H, 50H, 3EH, 15H, 0EFH, 43H, 0BDH
        db 0AFH, 41H, 76H, 0BEH, 3FH, 7EH, 0BH, 0BEH
        db 3CH, 0DFH, 79H, 3EH, 02H, 45H, 70H, 3EH
        db 07H, 6FH, 5DH, 3EH, 98H, 16H, 42H, 3EH
        db 02H, 45H, 70H, 3EH, 0BCH, 48H, 1FH, 3EH
        db 0C8H, 0E1H, 0C4H, 3CH, 0B2H, 0B7H, 0ECH, 0BDH
        db 07H, 6FH, 5DH, 3EH, 0C8H, 0E1H, 0C4H, 3CH
        db 98H, 16H, 42H, 0BEH, 02H, 45H, 70H, 0BEH
        db 98H, 16H, 42H, 3EH, 0B2H, 0B7H, 0ECH, 0BDH
        db 02H, 45H, 70H, 0BEH, 0C8H, 0E1H, 0C4H, 3CH
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 86H, 8AH, 31H, 3EH, 86H, 8AH, 31H, 3EH
        db 15H, 0EFH, 43H, 0BDH, 3FH, 7EH, 0BH, 0BEH
        db 29H, 0C4H, 50H, 0BEH, 0AFH, 41H, 76H, 0BEH
        db 0F8H, 0F7H, 67H, 0BEH, 4AH, 2BH, 0C0H, 0BDH
        db 4AH, 2BH, 0C0H, 3DH, 0F8H, 0F7H, 67H, 3EH
        db 3FH, 7EH, 0BH, 3EH, 0AFH, 41H, 76H, 3EH
        db 15H, 0EFH, 43H, 3DH, 29H, 0C4H, 50H, 0BEH
        db 0BCH, 48H, 1FH, 3EH, 0B2H, 0B7H, 0ECH, 3DH
        db 1CH, 0C5H, 91H, 3DH, 0C8H, 0E1H, 0C4H, 3CH
        db 07H, 6FH, 5DH, 0BEH, 3CH, 0DFH, 79H, 0BEH
        db 98H, 16H, 42H, 0BEH, 1CH, 0C5H, 91H, 0BDH
        db 1CH, 0C5H, 91H, 0BDH, 0BCH, 48H, 1FH, 3EH
        db 3CH, 0DFH, 79H, 3EH, 0B2H, 0B7H, 0ECH, 3DH
        db 3CH, 0DFH, 79H, 3EH, 1CH, 0C5H, 91H, 3DH
        db 07H, 6FH, 5DH, 0BEH, 0BCH, 48H, 1FH, 0BEH
@TBL_16x16_3D:
        db 86H, 39H, 2DH, 3EH, 86H, 39H, 2DH, 3EH
        db 86H, 39H, 2DH, 3EH, 86H, 39H, 2DH, 3EH
        db 02H, 45H, 70H, 3EH, 0D2H, 0B0H, 4BH, 3EH
        db 0EH, 1AH, 08H, 3EH, 9CH, 2BH, 3FH, 3DH
        db 38H, 54H, 62H, 3EH, 40H, 7FH, 0BBH, 3DH
        db 40H, 7FH, 0BBH, 0BDH, 38H, 54H, 62H, 0BEH
        db 0D2H, 0B0H, 4BH, 3EH, 9CH, 2BH, 3FH, 0BDH
        db 02H, 45H, 70H, 0BEH, 0EH, 1AH, 08H, 0BEH
        db 0EH, 0CCH, 73H, 3EH, 99H, 6DH, 6AH, 3EH
        db 0D9H, 0CH, 58H, 3EH, 9CH, 5EH, 3DH, 3EH
        db 99H, 6DH, 6AH, 3EH, 5EH, 69H, 1BH, 3EH
        db 68H, 18H, 0C0H, 3CH, 64H, 0F6H, 0E6H, 0BDH
        db 0D9H, 0CH, 58H, 3EH, 68H, 18H, 0C0H, 3CH
        db 9CH, 5EH, 3DH, 0BEH, 99H, 6DH, 6AH, 0BEH
        db 9CH, 5EH, 3DH, 3EH, 64H, 0F6H, 0E6H, 0BDH
        db 99H, 6DH, 6AH, 0BEH, 68H, 18H, 0C0H, 3CH
        db 86H, 39H, 2DH, 3EH, 86H, 39H, 2DH, 3EH
        db 86H, 39H, 2DH, 3EH, 86H, 39H, 2DH, 3EH
        db 9CH, 2BH, 3FH, 0BDH, 0EH, 1AH, 08H, 0BEH
        db 0D2H, 0B0H, 4BH, 0BEH, 02H, 45H, 70H, 0BEH
        db 38H, 54H, 62H, 0BEH, 40H, 7FH, 0BBH, 0BDH
        db 40H, 7FH, 0BBH, 3DH, 38H, 54H, 62H, 3EH
        db 0EH, 1AH, 08H, 3EH, 02H, 45H, 70H, 3EH
        db 9CH, 2BH, 3FH, 3DH, 0D2H, 0B0H, 4BH, 0BEH
        db 5EH, 69H, 1BH, 3EH, 64H, 0F6H, 0E6H, 3DH
        db 0DAH, 39H, 8EH, 3DH, 68H, 18H, 0C0H, 3CH
        db 0D9H, 0CH, 58H, 0BEH, 0EH, 0CCH, 73H, 0BEH
        db 9CH, 5EH, 3DH, 0BEH, 0DAH, 39H, 8EH, 0BDH
        db 0DAH, 39H, 8EH, 0BDH, 5EH, 69H, 1BH, 3EH
        db 0EH, 0CCH, 73H, 3EH, 64H, 0F6H, 0E6H, 3DH
        db 0EH, 0CCH, 73H, 3EH, 0DAH, 39H, 8EH, 3DH
        db 0D9H, 0CH, 58H, 0BEH, 5EH, 69H, 1BH, 0BEH
@TBL_16x16_4C:
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 0F8H, 0F7H, 67H, 3EH, 4CH, 0A7H, 44H, 3EH
        db 51H, 66H, 03H, 3EH, 0D3H, 90H, 38H, 3DH
        db 7AH, 82H, 5AH, 3EH, 0F3H, 04H, 0B5H, 3DH
        db 0F3H, 04H, 0B5H, 0BDH, 7AH, 82H, 5AH, 0BEH
        db 4CH, 0A7H, 44H, 3EH, 0D3H, 90H, 38H, 0BDH
        db 0F8H, 0F7H, 67H, 0BEH, 51H, 66H, 03H, 0BEH
        db 0D1H, 5FH, 6BH, 3EH, 38H, 54H, 62H, 3EH
        db 03H, 96H, 50H, 3EH, 0C0H, 0D3H, 36H, 3EH
        db 38H, 54H, 62H, 3EH, 0D8H, 0AH, 16H, 3EH
        db 70H, 75H, 0B9H, 3CH, 0AAH, 0FBH, 0DEH, 0BDH
        db 03H, 96H, 50H, 3EH, 70H, 75H, 0B9H, 3CH
        db 0C0H, 0D3H, 36H, 0BEH, 38H, 54H, 62H, 0BEH
        db 0C0H, 0D3H, 36H, 3EH, 0AAH, 0FBH, 0DEH, 0BDH
        db 38H, 54H, 62H, 0BEH, 70H, 75H, 0B9H, 3CH
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 75H, 3DH, 27H, 3EH, 75H, 3DH, 27H, 3EH
        db 0D3H, 90H, 38H, 0BDH, 51H, 66H, 03H, 0BEH
        db 4CH, 0A7H, 44H, 0BEH, 0F8H, 0F7H, 67H, 0BEH
        db 7AH, 82H, 5AH, 0BEH, 0F3H, 04H, 0B5H, 0BDH
        db 0F3H, 04H, 0B5H, 3DH, 7AH, 82H, 5AH, 3EH
        db 51H, 66H, 03H, 3EH, 0F8H, 0F7H, 67H, 3EH
        db 0D3H, 90H, 38H, 3DH, 4CH, 0A7H, 44H, 0BEH
        db 0D8H, 0AH, 16H, 3EH, 0AAH, 0FBH, 0DEH, 3DH
        db 0F2H, 4FH, 89H, 3DH, 70H, 75H, 0B9H, 3CH
        db 03H, 96H, 50H, 0BEH, 0D1H, 5FH, 6BH, 0BEH
        db 0C0H, 0D3H, 36H, 0BEH, 0F2H, 4FH, 89H, 0BDH
        db 0F2H, 4FH, 89H, 0BDH, 0D8H, 0AH, 16H, 3EH
        db 0D1H, 5FH, 6BH, 3EH, 0AAH, 0FBH, 0DEH, 3DH
        db 0D1H, 5FH, 6BH, 3EH, 0F2H, 4FH, 89H, 3DH
        db 03H, 96H, 50H, 0BEH, 0D8H, 0AH, 16H, 0BEH
@TBL_16x16_5B:
        db 12H, 0A5H, 1FH, 3EH, 12H, 0A5H, 1FH, 3EH
        db 12H, 0A5H, 1FH, 3EH, 12H, 0A5H, 1FH, 3EH
        db 07H, 6FH, 5DH, 3EH, 0F1H, 0B8H, 3BH, 3EH
        db 3EH, 0DDH, 0FAH, 3DH, 01H, 2FH, 30H, 3DH
        db 03H, 96H, 50H, 3EH, 5DH, 0CCH, 0ACH, 3DH
        db 5DH, 0CCH, 0ACH, 0BDH, 03H, 96H, 50H, 0BEH
        db 0F1H, 0B8H, 3BH, 3EH, 01H, 2FH, 30H, 0BDH
        db 07H, 6FH, 5DH, 0BEH, 3EH, 0DDH, 0FAH, 0BDH
        db 48H, 0AFH, 60H, 3EH, 0D9H, 0CH, 58H, 3EH
        db 0EDH, 1CH, 47H, 3EH, 25H, 86H, 2EH, 3EH
        db 0D9H, 0CH, 58H, 3EH, 67H, 3AH, 0FH, 3EH
        db 3DH, 09H, 0B1H, 3CH, 31H, 0DBH, 0D4H, 0BDH
        db 0EDH, 1CH, 47H, 3EH, 3DH, 09H, 0B1H, 3CH
        db 25H, 86H, 2EH, 0BEH, 0D9H, 0CH, 58H, 0BEH
        db 25H, 86H, 2EH, 3EH, 31H, 0DBH, 0D4H, 0BDH
        db 0D9H, 0CH, 58H, 0BEH, 3DH, 09H, 0B1H, 3CH
        db 12H, 0A5H, 1FH, 3EH, 12H, 0A5H, 1FH, 3EH
        db 12H, 0A5H, 1FH, 3EH, 12H, 0A5H, 1FH, 3EH
        db 01H, 2FH, 30H, 0BDH, 3EH, 0DDH, 0FAH, 0BDH
        db 0F1H, 0B8H, 3BH, 0BEH, 07H, 6FH, 5DH, 0BEH
        db 03H, 96H, 50H, 0BEH, 5DH, 0CCH, 0ACH, 0BDH
        db 5DH, 0CCH, 0ACH, 3DH, 03H, 96H, 50H, 3EH
        db 3EH, 0DDH, 0FAH, 3DH, 07H, 6FH, 5DH, 3EH
        db 01H, 2FH, 30H, 3DH, 0F1H, 0B8H, 3BH, 0BEH
        db 67H, 3AH, 0FH, 3EH, 31H, 0DBH, 0D4H, 3DH
        db 83H, 13H, 83H, 3DH, 3DH, 09H, 0B1H, 3CH
        db 0EDH, 1CH, 47H, 0BEH, 48H, 0AFH, 60H, 0BEH
        db 25H, 86H, 2EH, 0BEH, 83H, 13H, 83H, 0BDH
        db 83H, 13H, 83H, 0BDH, 67H, 3AH, 0FH, 3EH
        db 48H, 0AFH, 60H, 3EH, 31H, 0DBH, 0D4H, 3DH
        db 48H, 0AFH, 60H, 3EH, 83H, 13H, 83H, 3DH
        db 0EDH, 1CH, 47H, 0BEH, 67H, 3AH, 0FH, 0BEH
@TBL_16x16_6A:
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 29H, 0C4H, 50H, 3EH, 0C5H, 0FBH, 30H, 3EH
        db 5EH, 83H, 0ECH, 3DH, 0D1H, 1AH, 26H, 3DH
        db 4CH, 0A7H, 44H, 3EH, 0C1H, 0E9H, 0A2H, 3DH
        db 0C1H, 0E9H, 0A2H, 0BDH, 4CH, 0A7H, 44H, 0BEH
        db 0C5H, 0FBH, 30H, 3EH, 0D1H, 1AH, 26H, 0BDH
        db 29H, 0C4H, 50H, 0BEH, 5EH, 83H, 0ECH, 0BDH
        db 0CDH, 0D4H, 53H, 3EH, 0D2H, 0B0H, 4BH, 3EH
        db 0F1H, 0B8H, 3BH, 3EH, 43H, 8AH, 24H, 3EH
        db 0D2H, 0B0H, 4BH, 3EH, 0D9H, 08H, 07H, 3EH
        db 91H, 0E8H, 0A6H, 3CH, 0F1H, 0ADH, 0C8H, 0BDH
        db 0F1H, 0B8H, 3BH, 3EH, 91H, 0E8H, 0A6H, 3CH
        db 43H, 8AH, 24H, 0BEH, 0D2H, 0B0H, 4BH, 0BEH
        db 43H, 8AH, 24H, 3EH, 0F1H, 0ADH, 0C8H, 0BDH
        db 0D2H, 0B0H, 4BH, 0BEH, 91H, 0E8H, 0A6H, 3CH
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 17H, 83H, 16H, 3EH, 17H, 83H, 16H, 3EH
        db 0D1H, 1AH, 26H, 0BDH, 5EH, 83H, 0ECH, 0BDH
        db 0C5H, 0FBH, 30H, 0BEH, 29H, 0C4H, 50H, 0BEH
        db 4CH, 0A7H, 44H, 0BEH, 0C1H, 0E9H, 0A2H, 0BDH
        db 0C1H, 0E9H, 0A2H, 3DH, 4CH, 0A7H, 44H, 3EH
        db 5EH, 83H, 0ECH, 3DH, 29H, 0C4H, 50H, 3EH
        db 0D1H, 1AH, 26H, 3DH, 0C5H, 0FBH, 30H, 0BEH
        db 0D9H, 08H, 07H, 3EH, 0F1H, 0ADH, 0C8H, 3DH
        db 0D5H, 27H, 77H, 3DH, 91H, 0E8H, 0A6H, 3CH
        db 0F1H, 0B8H, 3BH, 0BEH, 0CDH, 0D4H, 53H, 0BEH
        db 43H, 8AH, 24H, 0BEH, 0D5H, 27H, 77H, 0BDH
        db 0D5H, 27H, 77H, 0BDH, 0D9H, 08H, 07H, 3EH
        db 0CDH, 0D4H, 53H, 3EH, 0F1H, 0ADH, 0C8H, 3DH
        db 0CDH, 0D4H, 53H, 3EH, 0D5H, 27H, 77H, 3DH
        db 0F1H, 0B8H, 3BH, 0BEH, 0D9H, 08H, 07H, 0BEH
@TBL_16x16_79:
        db 0AH, 0EEH, 0BH, 3EH, 0AH, 0EEH, 0BH, 3EH
        db 0AH, 0EEH, 0BH, 3EH, 0AH, 0EEH, 0BH, 3EH
        db 98H, 16H, 42H, 3EH, 43H, 8AH, 24H, 3EH
        db 64H, 0E2H, 0DBH, 3DH, 1DH, 6DH, 1AH, 3DH
        db 0C0H, 0D3H, 36H, 3EH, 7FH, 75H, 97H, 3DH
        db 7FH, 75H, 97H, 0BDH, 0C0H, 0D3H, 36H, 0BEH
        db 43H, 8AH, 24H, 3EH, 1DH, 6DH, 1AH, 0BDH
        db 98H, 16H, 42H, 0BEH, 64H, 0E2H, 0DBH, 0BDH
        db 12H, 0F0H, 44H, 3EH, 9CH, 5EH, 3DH, 3EH
        db 25H, 86H, 2EH, 3EH, 0B8H, 0F8H, 18H, 3EH
        db 9CH, 5EH, 3DH, 3EH, 0BEH, 14H, 0FBH, 3DH
        db 66H, 2CH, 9BH, 3CH, 0EEH, 91H, 0BAH, 0BDH
        db 25H, 86H, 2EH, 3EH, 66H, 2CH, 9BH, 3CH
        db 0B8H, 0F8H, 18H, 0BEH, 9CH, 5EH, 3DH, 0BEH
        db 0B8H, 0F8H, 18H, 3EH, 0EEH, 91H, 0BAH, 0BDH
        db 9CH, 5EH, 3DH, 0BEH, 66H, 2CH, 9BH, 3CH
        db 0AH, 0EEH, 0BH, 3EH, 0AH, 0EEH, 0BH, 3EH
        db 0AH, 0EEH, 0BH, 3EH, 0AH, 0EEH, 0BH, 3EH
        db 1DH, 6DH, 1AH, 0BDH, 64H, 0E2H, 0DBH, 0BDH
        db 43H, 8AH, 24H, 0BEH, 98H, 16H, 42H, 0BEH
        db 0C0H, 0D3H, 36H, 0BEH, 7FH, 75H, 97H, 0BDH
        db 7FH, 75H, 97H, 3DH, 0C0H, 0D3H, 36H, 3EH
        db 64H, 0E2H, 0DBH, 3DH, 98H, 16H, 42H, 3EH
        db 1DH, 6DH, 1AH, 3DH, 43H, 8AH, 24H, 0BEH
        db 0BEH, 14H, 0FBH, 3DH, 0EEH, 91H, 0BAH, 3DH
        db 4DH, 0C7H, 65H, 3DH, 66H, 2CH, 9BH, 3CH
        db 25H, 86H, 2EH, 0BEH, 12H, 0F0H, 44H, 0BEH
        db 0B8H, 0F8H, 18H, 0BEH, 4DH, 0C7H, 65H, 0BDH
        db 4DH, 0C7H, 65H, 0BDH, 0BEH, 14H, 0FBH, 3DH
        db 12H, 0F0H, 44H, 3EH, 0EEH, 91H, 0BAH, 3DH
        db 12H, 0F0H, 44H, 3EH, 4DH, 0C7H, 65H, 3DH
        db 25H, 86H, 2EH, 0BEH, 0BEH, 14H, 0FBH, 0BDH
end;

/// expand single-channel float samples into output bytes (SSE2 version)
// - takes no Pascal parameter: all state is read and written relative to ebp,
// which the caller must have set up (presumably to the decoder working
// data - confirm against the calling code)
// - walks a list of 8-byte entries ending at [ebp+0D4H]; each entry holds a
// pointer to 8 source floats and the number of bytes to advance the output
// - every sample is biased by the 4 floats at [ebp+0E0H], converted to an
// integer, clamped to 0..255 and written as 4 identical bytes (presumably one
// 32-bit gray pixel), i.e. 32 output bytes per entry
// - movaps/movdqa are used: the source floats, [ebp+0E0H] and the output
// pointer must all be 16 bytes aligned
// - preserves esi; modifies eax, ecx, edx and xmm0..xmm4, xmm7
procedure RGB_GrayConv_SSE2;
asm
        push    esi
        mov     ecx, dword ptr [ebp + 000000C8H]        // ecx = current output pointer
        mov     edx, dword ptr [ebp + 000000B8H]
        mov     esi, dword ptr [ebp + 000000D4H]        // esi = end of the entry list
        shr     edx, 2
        movaps  xmm7, dqword ptr [ebp + 000000E0H]      // xmm7 = 4 floats added to every sample
        neg     edx                                     // edx = -([ebp+0B8H] shr 2): negative offset from esi
@@016:  mov     eax, dword ptr [edx + esi]              // entry+0: pointer to 8 source floats
        movaps  xmm0, dqword ptr [eax]                  // samples 0..3
        movaps  xmm1, dqword ptr [eax + 10H]            // samples 4..7
        addps   xmm0, xmm7
        addps   xmm1, xmm7
        cvtps2dq xmm0, xmm0                             // float -> int32 (current MXCSR rounding)
        cvtps2dq xmm1, xmm1
        pshufd  xmm2, xmm0, -1                          // broadcast sample 3 (0FFH)
        pshufd  xmm3, xmm0, -86                         // broadcast sample 2 (0AAH)
        pshufd  xmm4, xmm0, 85                          // broadcast sample 1 (055H)
        pshufd  xmm0, xmm0, 0                           // broadcast sample 0
        packssdw xmm3, xmm2                             // words: 4 x sample 2, 4 x sample 3
        packssdw xmm0, xmm4                             // words: 4 x sample 0, 4 x sample 1
        pshufd  xmm2, xmm1, -1                          // same broadcasts for samples 4..7
        packuswb xmm0, xmm3                             // bytes, unsigned saturation: 4 x each of samples 0..3
        pshufd  xmm3, xmm1, -86
        pshufd  xmm4, xmm1, 85
        pshufd  xmm1, xmm1, 0
        packssdw xmm3, xmm2
        packssdw xmm1, xmm4
        packuswb xmm1, xmm3                             // bytes: 4 x each of samples 4..7
        movdqa  dqword ptr [ecx], xmm0                  // store 16 bytes for samples 0..3
        movdqa  dqword ptr [ecx + 10H], xmm1            // store 16 bytes for samples 4..7
        add     ecx, dword ptr [edx + esi + 04H]        // entry+4: output pointer increment
        add     edx, 8                                  // next entry
        jnz     @@016                                   // until the offset reaches 0 (= esi)
        dec     dword ptr [ebp + 000000D0H]             // count down the counter at [ebp+0D0H]
        jz      @@018
@@017:  mov     dword ptr [ebp + 000000C8H], ecx        // save the output pointer for the next call
        pop     esi
        ret

@@018:  add     ecx, dword ptr [ebp + 000000D8H]        // counter expired: extra output advance
        mov     eax, dword ptr [ebp + 000000CCH]        // and reload the counter from [ebp+0CCH]
        mov     dword ptr [ebp + 000000D0H], eax
        jmp     @@017
end;

// RGB_GrayConv_SSE
// SSE (+MMX) counterpart of RGB_GrayConv_SSE2: converts runs of 8
// single-precision samples into eight 4-byte pixels whose four bytes all carry
// the same saturated 0..255 value. The float->int conversion and the packing
// are done in MMX registers because no SSE2 integer instructions are used.
//
// State read/written through EBP (same offsets as the SSE2 routine):
//   [EBP+0C8H] destination pointer (updated on exit)
//   [EBP+0B8H] size value; (value shr 2) = byte length of the entry table
//   [EBP+0D4H] END address of the table of 8-byte entries
//              (+0: source pointer, +4: destination advance)
//   [EBP+0E0H] 4-float bias added to every sample
//   [EBP+0D0H] down-counter, reloaded from [EBP+0CCH] when it hits zero, at
//              which point [EBP+0D8H] extra bytes are added to the destination
// Clobbers EAX, ECX, EDX, XMM0, XMM1, XMM7, MM1-MM7. ESI is preserved.
// NOTE(review): no EMMS is executed here; presumably the caller clears the
// MMX state before any x87 code runs - confirm.
procedure RGB_GrayConv_SSE;
asm
        push    esi
        mov     ecx, dword ptr [ebp + 000000C8H]        // ECX = destination pointer
        mov     edx, dword ptr [ebp + 000000B8H]
        mov     esi, dword ptr [ebp + 000000D4H]        // ESI = end of the entry table
        shr     edx, 2
        movaps  xmm7, dqword ptr [ebp + 000000E0H]      // XMM7 = bias
        neg     edx                                     // negative offset, counts up to 0
@@019:  mov     eax, dword ptr [edx + esi]              // EAX = source pointer of this entry
        movaps  xmm0, dqword ptr [eax]
        movaps  xmm1, dqword ptr [eax + 10H]
        addps   xmm0, xmm7
        addps   xmm1, xmm7
        // CVTPS2PI converts only the two low floats, so the high halves are
        // moved down with MOVHLPS and converted in a second step
        cvtps2pi mm1, xmm0
        cvtps2pi mm2, xmm1
        movhlps xmm0, xmm0
        movhlps xmm1, xmm1
        cvtps2pi mm3, xmm0
        cvtps2pi mm4, xmm1
        packssdw mm1, mm3                               // MM1 = samples 0..3 as words
        packssdw mm2, mm4                               // MM2 = samples 4..7 as words
        // broadcast each word (imm -1 = word 3, -86 = word 2, 85 = word 1,
        // 0 = word 0) and pack to bytes with unsigned saturation
        pshufw  mm5, mm1, -1
        pshufw  mm6, mm1, -86
        pshufw  mm7, mm1, 85
        pshufw  mm1, mm1, 0
        packuswb mm6, mm5                               // pixels 2,3
        packuswb mm1, mm7                               // pixels 0,1
        pshufw  mm3, mm2, -1
        pshufw  mm4, mm2, -86
        pshufw  mm5, mm2, 85
        pshufw  mm2, mm2, 0
        packuswb mm4, mm3                               // pixels 6,7
        packuswb mm2, mm5                               // pixels 4,5
        movq    qword ptr [ecx], mm1
        movq    qword ptr [ecx + 08H], mm6
        movq    qword ptr [ecx + 10H], mm2
        movq    qword ptr [ecx + 18H], mm4
        add     ecx, dword ptr [edx + esi + 04H]        // advance destination by the entry's step
        add     edx, 8                                  // next 8-byte entry
        jnz     @@019
        dec     dword ptr [ebp + 000000D0H]
        jz      @@021
@@020:  mov     dword ptr [ebp + 000000C8H], ecx        // store updated destination pointer
        pop     esi
        ret

        // counter expired: apply the extra destination offset and reload the counter
@@021:  add     ecx, dword ptr [ebp + 000000D8H]
        mov     eax, dword ptr [ebp + 000000CCH]
        mov     dword ptr [ebp + 000000D0H], eax
        jmp     @@020
end;

// RGB_YCbCrConv_SSE2
// Combines three planes of single-precision samples into 4-byte pixels (SSE2).
// For every sample index i the routine computes the 4-float vector
//     (A[i] + bias) + B[i] * K1 + C[i] * K0
// converts it to integers and packs it with unsigned saturation into 4 bytes.
//   A, B, C = samples read through EDI, EBX, ECX
//             (NOTE(review): presumably Y, Cb and Cr in that order - confirm)
//   bias    = 4 floats at [EBP+0E0H]
//   K0      = 4 floats at [EBP+TBLOffset]      (kept in XMM7)
//   K1      = 4 floats at [EBP+TBLOffset+16]
//
// State accessed through EBP:
//   [EBP+0C8H] destination pointer (updated on exit)
//   [EBP+0B8H] size value; (value shr 1) = byte length of the entry table
//   [EBP+0D4H] END address of a table of 16-byte entries
//              (+0, +4, +8: the three source pointers, +0CH: destination advance)
//   [EBP+0D0H] down-counter, reloaded from [EBP+0CCH] when it hits zero, at
//              which point [EBP+0D8H] extra bytes are added to the destination
// Each table entry handles 8 samples per plane and writes 32 bytes.
// Clobbers EAX, ECX, EDX, XMM0-XMM7. ESI, EDI, EBX are preserved.
// All vector loads/stores are aligned (MOVAPS/MOVDQA).
procedure RGB_YCbCrConv_SSE2;
asm
        push    esi
        push    edi
        push    ebx
        mov     edx, dword ptr [ebp + 000000C8H]        // EDX = destination pointer
        mov     eax, dword ptr [ebp + 000000B8H]
        mov     esi, dword ptr [ebp + 000000D4H]        // ESI = end of the entry table
        shr     eax, 1
        neg     eax                                     // EAX = negative offset, counts up to 0
        movaps  xmm7, dqword ptr [ebp+TBLOffset]        // XMM7 = K0 for the whole routine
@@010:  mov     edi, dword ptr [eax + esi]              // plane A source
        mov     ebx, dword ptr [eax + esi + 04H]        // plane B source
        mov     ecx, dword ptr [eax + esi + 08H]        // plane C source
        // ---- samples 0..3 ----
        movaps  xmm6, dqword ptr [ebp+TBLOffset+16]     // XMM6 = K1 (reloaded: XMM6 is reused as a temp below)
        movaps  xmm0, dqword ptr [edi]
        movaps  xmm1, dqword ptr [ebx]
        movaps  xmm2, dqword ptr [ecx]
        addps   xmm0, dqword ptr [ebp + 000000E0H]      // bias is applied to plane A only
        // pixel 0: broadcast element 0 of each plane
        pshufd  xmm3, xmm0, 0
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm2, 0
        mulps   xmm4, xmm6
        mulps   xmm5, xmm7
        addps   xmm3, xmm4
        addps   xmm3, xmm5
        // pixel 1: broadcast element 1 (imm 85)
        pshufd  xmm4, xmm0, 85
        pshufd  xmm5, xmm1, 85
        pshufd  xmm6, xmm2, 85
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        cvtps2dq xmm3, xmm3
        cvtps2dq xmm4, xmm4
        packssdw xmm3, xmm4                             // XMM3 = pixels 0,1 as words
        // pixel 2: broadcast element 2 (imm -86)
        pshufd  xmm4, xmm0, -86
        pshufd  xmm5, xmm1, -86
        pshufd  xmm6, xmm2, -86
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        // pixel 3: broadcast element 3 (imm -1)
        pshufd  xmm0, xmm0, -1
        pshufd  xmm1, xmm1, -1
        pshufd  xmm2, xmm2, -1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm2, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        cvtps2dq xmm4, xmm4
        cvtps2dq xmm0, xmm0
        packssdw xmm4, xmm0                             // XMM4 = pixels 2,3 as words
        packuswb xmm3, xmm4                             // saturate to bytes: pixels 0..3
        movdqa  dqword ptr [edx], xmm3
        // ---- samples 4..7: same sequence on the second 16 bytes of each plane ----
        movaps  xmm6, dqword ptr [ebp+TBLOffset+16]
        movaps  xmm0, dqword ptr [edi + 10H]
        movaps  xmm1, dqword ptr [ebx + 10H]
        movaps  xmm2, dqword ptr [ecx + 10H]
        addps   xmm0, dqword ptr [ebp + 000000E0H]
        pshufd  xmm3, xmm0, 0
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm2, 0
        mulps   xmm4, xmm6
        mulps   xmm5, xmm7
        addps   xmm3, xmm4
        addps   xmm3, xmm5
        pshufd  xmm4, xmm0, 85
        pshufd  xmm5, xmm1, 85
        pshufd  xmm6, xmm2, 85
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        cvtps2dq xmm3, xmm3
        cvtps2dq xmm4, xmm4
        packssdw xmm3, xmm4
        pshufd  xmm4, xmm0, -86
        pshufd  xmm5, xmm1, -86
        pshufd  xmm6, xmm2, -86
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        // SHUFPS with identical source and destination and imm -1 broadcasts
        // element 3, i.e. the same effect as the PSHUFD used in the first half
        shufps  xmm0, xmm0, -1
        shufps  xmm1, xmm1, -1
        shufps  xmm2, xmm2, -1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm2, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        cvtps2dq xmm4, xmm4
        cvtps2dq xmm0, xmm0
        packssdw xmm4, xmm0
        packuswb xmm3, xmm4
        movdqa  dqword ptr [edx + 10H], xmm3            // pixels 4..7
        add     edx, dword ptr [eax + esi + 0CH]        // advance destination by the entry's step
        add     eax, 16                                 // next 16-byte entry
        jne     @@010
        dec     dword ptr [ebp + 000000D0H]
        jz      @@012
@@011:  mov     dword ptr [ebp + 000000C8H], edx        // store updated destination pointer
        pop     ebx
        pop     edi
        pop     esi
        ret

        // counter expired: apply the extra destination offset and reload the counter
@@012:  add     edx, dword ptr [ebp + 000000D8H]
        mov     eax, dword ptr [ebp + 000000CCH]
        mov     dword ptr [ebp + 000000D0H], eax
        jmp     @@011
end;

// RGB_YCbCrConv_SSE
// SSE (+MMX) counterpart of RGB_YCbCrConv_SSE2. For every sample index i it
// computes the 4-float vector
//     (A[i] + bias) + B[i] * K1 + C[i] * K0
// and packs it with unsigned saturation into one 4-byte pixel.
//   A, B, C = samples read through EDI, EBX, ECX
//             (NOTE(review): presumably Y, Cb and Cr in that order - confirm)
//   bias    = 4 floats at [EBP+0E0H]
//   K0      = 4 floats at [EBP+TBLOffset]      (kept in XMM7)
//   K1      = 4 floats at [EBP+TBLOffset+16]
// Broadcasts use MOVAPS+SHUFPS and the float->int conversion uses CVTPS2PI
// into MMX registers, so no SSE2 instruction is required.
//
// State accessed through EBP:
//   [EBP+0C8H] destination pointer (updated on exit)
//   [EBP+0B8H] size value; (value shr 1) = byte length of the entry table
//   [EBP+0D4H] END address of a table of 16-byte entries
//              (+0, +4, +8: the three source pointers, +0CH: destination advance)
//   [EBP+0D0H] down-counter, reloaded from [EBP+0CCH] when it hits zero, at
//              which point [EBP+0D8H] extra bytes are added to the destination
// Clobbers EAX, ECX, EDX, XMM0-XMM7, MM1-MM4. ESI, EDI, EBX are preserved.
// NOTE(review): no EMMS is executed here; presumably the caller clears the
// MMX state before any x87 code runs - confirm.
procedure RGB_YCbCrConv_SSE;
asm
        push    esi
        push    edi
        push    ebx
        mov     edx, dword ptr [ebp + 000000C8H]        // EDX = destination pointer
        mov     eax, dword ptr [ebp + 000000B8H]
        mov     esi, dword ptr [ebp + 000000D4H]        // ESI = end of the entry table
        shr     eax, 1
        neg     eax                                     // EAX = negative offset, counts up to 0
        movaps  xmm7, dqword ptr [ebp+TBLOffset]        // XMM7 = K0 for the whole routine
@@013:  mov     edi, dword ptr [eax + esi]              // plane A source
        mov     ebx, dword ptr [eax + esi + 04H]        // plane B source
        mov     ecx, dword ptr [eax + esi + 08H]        // plane C source
        // ---- samples 0..3 ----
        movaps  xmm6, dqword ptr [ebp+TBLOffset+16]     // XMM6 = K1 (XMM6 is reused as a temp below)
        movaps  xmm0, dqword ptr [edi]
        movaps  xmm1, dqword ptr [ebx]
        movaps  xmm2, dqword ptr [ecx]
        addps   xmm0, dqword ptr [ebp + 000000E0H]      // bias is applied to plane A only
        // pixel 0: broadcast element 0 of each plane
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm2
        shufps  xmm3, xmm3, 0
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, 0
        mulps   xmm4, xmm6
        mulps   xmm5, xmm7
        addps   xmm3, xmm4
        addps   xmm3, xmm5
        // pixel 1: broadcast element 1 (imm 85)
        movaps  xmm4, xmm0
        movaps  xmm5, xmm1
        movaps  xmm6, xmm2
        shufps  xmm4, xmm4, 85
        shufps  xmm5, xmm5, 85
        shufps  xmm6, xmm6, 85
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        // convert two floats at a time (low half, then high half via MOVHLPS)
        cvtps2pi mm1, xmm3
        cvtps2pi mm2, xmm4
        movhlps xmm3, xmm3
        movhlps xmm4, xmm4
        cvtps2pi mm3, xmm3
        cvtps2pi mm4, xmm4
        packssdw mm1, mm3                               // pixel 0 as 4 words
        packssdw mm2, mm4                               // pixel 1 as 4 words
        packuswb mm1, mm2                               // saturate to bytes
        movq    qword ptr [edx], mm1                    // pixels 0,1
        // pixel 2: broadcast element 2 (imm -86)
        movaps  xmm4, xmm0
        movaps  xmm5, xmm1
        movaps  xmm6, xmm2
        shufps  xmm4, xmm4, -86
        shufps  xmm5, xmm5, -86
        shufps  xmm6, xmm6, -86
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        // pixel 3: broadcast element 3 (imm -1), done in place
        shufps  xmm0, xmm0, -1
        shufps  xmm1, xmm1, -1
        shufps  xmm2, xmm2, -1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm2, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        cvtps2pi mm1, xmm4
        cvtps2pi mm2, xmm0
        movhlps xmm4, xmm4
        movhlps xmm0, xmm0
        cvtps2pi mm3, xmm4
        cvtps2pi mm4, xmm0
        packssdw mm1, mm3
        packssdw mm2, mm4
        packuswb mm1, mm2
        movq    qword ptr [edx + 08H], mm1              // pixels 2,3
        // ---- samples 4..7: same sequence on the second 16 bytes of each plane ----
        movaps  xmm6, dqword ptr [ebp+TBLOffset+16]
        movaps  xmm0, dqword ptr [edi + 10H]
        movaps  xmm1, dqword ptr [ebx + 10H]
        movaps  xmm2, dqword ptr [ecx + 10H]
        addps   xmm0, dqword ptr [ebp + 000000E0H]
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm2
        shufps  xmm3, xmm3, 0
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, 0
        mulps   xmm4, xmm6
        mulps   xmm5, xmm7
        addps   xmm3, xmm4
        addps   xmm3, xmm5
        movaps  xmm4, xmm0
        movaps  xmm5, xmm1
        movaps  xmm6, xmm2
        shufps  xmm4, xmm4, 85
        shufps  xmm5, xmm5, 85
        shufps  xmm6, xmm6, 85
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        cvtps2pi mm1, xmm3
        cvtps2pi mm2, xmm4
        movhlps xmm3, xmm3
        movhlps xmm4, xmm4
        cvtps2pi mm3, xmm3
        cvtps2pi mm4, xmm4
        packssdw mm1, mm3
        packssdw mm2, mm4
        packuswb mm1, mm2
        movq    qword ptr [edx + 10H], mm1              // pixels 4,5
        movaps  xmm4, xmm0
        movaps  xmm5, xmm1
        movaps  xmm6, xmm2
        shufps  xmm4, xmm4, -86
        shufps  xmm5, xmm5, -86
        shufps  xmm6, xmm6, -86
        mulps   xmm5, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm6, xmm7
        addps   xmm4, xmm5
        addps   xmm4, xmm6
        shufps  xmm0, xmm0, -1
        shufps  xmm1, xmm1, -1
        shufps  xmm2, xmm2, -1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+16]
        mulps   xmm2, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        cvtps2pi mm1, xmm4
        cvtps2pi mm2, xmm0
        movhlps xmm4, xmm4
        movhlps xmm0, xmm0
        cvtps2pi mm3, xmm4
        cvtps2pi mm4, xmm0
        packssdw mm1, mm3
        packssdw mm2, mm4
        packuswb mm1, mm2
        movq    qword ptr [edx + 18H], mm1              // pixels 6,7
        add     edx, dword ptr [eax + esi + 0CH]        // advance destination by the entry's step
        add     eax, 16                                 // next 16-byte entry
        jne     @@013
        dec     dword ptr [ebp + 000000D0H]
        jz      @@015
@@014:  mov     dword ptr [ebp + 000000C8H], edx        // store updated destination pointer
        pop     ebx
        pop     edi
        pop     esi
        ret

        // counter expired: apply the extra destination offset and reload the counter
@@015:  add     edx, dword ptr [ebp + 000000D8H]
        mov     eax, dword ptr [ebp + 000000CCH]
        mov     dword ptr [ebp + 000000D0H], eax
        jmp     @@014
end;

// _RGB_Init
// Selects the table of colour-conversion routines and stores its address at
// [EBP+100H] (the address is also left in ECX).
//   In : EAX = selector; because the lookup is biased by -8 bytes, EAX = 2
//              picks the first entry of @TBL_RGBConv (SSE table) and EAX = 3
//              or 4 pick the SSE2 table
//              (NOTE(review): presumably a detected CPU feature level - confirm)
//        EBP = base of the state block
// No range check is performed on EAX.
// Each selected table has three dwords: grayscale routine, a zero placeholder,
// and the YCbCr routine (NOTE(review): presumably indexed elsewhere by
// component count minus one - confirm against the code that reads [EBP+100H]).
procedure _RGB_Init;
asm
        mov     ecx, dword ptr [@TBL_RGBConv - 00000008H + eax*4]
        mov     dword ptr [ebp + 00000100H], ecx
        ret
        nop; nop    // padding before the data tables that follow the code
@TBL_RGBConv:
        dd offset @TBL_RGBConv_SSE      // EAX = 2
        dd offset @TBL_RGBConv_SSE2     // EAX = 3
        dd offset @TBL_RGBConv_SSE2     // EAX = 4
@TBL_RGBConv_SSE2:
        dd offset RGB_GrayConv_SSE2
        dd 00000000H
        dd offset RGB_YCbCrConv_SSE2
@TBL_RGBConv_SSE:
        dd offset RGB_GrayConv_SSE
        dd 00000000H
        dd offset RGB_YCbCrConv_SSE
end;


// _RGB_GenPointer
// Fills one dword column of an interleaved table, starting at EDI and stepping
// by ([EBP+14H]*4 + 4) bytes per entry. The 8- and 16-byte entry layouts read
// by the RGB_*Conv_* routines look like the consumers of this table
// (NOTE(review): presumably - confirm against the callers).
//
// Inputs as used by the code:
//   stack arg (dword, removed by RET 4) = mode
//       mode  = 1 : every entry is written as the constant 32, except that the
//                   last entry of each row is replaced by
//                   [EBP+08H]*4 - 32*([EBP+0B0H]-1) and the very last entry by
//                   32 - [EBP+0D8H]
//                   (NOTE(review): these look like destination advances - confirm)
//       mode <> 1 : every entry is written as a running address derived from EBX
//                   (NOTE(review): presumably source block pointers - confirm)
//   EDX = selector; (EDX shr 2) indexes @TBL_C0 / @TBL_C1
//   EBX = starting address used for the mode <> 1 entries
//   EDI = address of the first table slot to write
//   ESI = pointer to two dwords (horizontal count, vertical count) that replace
//         [EBP+0B0H] / [EBP+0B4H] when mode <> 1
//   EBP = base of the state block
// EDI and EBX are restored on exit; EAX, ECX and EDX are clobbered.
//
// Local frame (56 bytes):
//   [ESP+00H] c0 = @TBL_C0[sel]      [ESP+04H] c1 = @TBL_C1[sel]
//   [ESP+08H] c0*c1                  [ESP+0CH] end-of-row value (mode 1)
//   [ESP+10H] final value (mode 1)   [ESP+14H] outer loop counter
//   [ESP+18H] row counter            [ESP+1CH] EBX at the start of the row
//   [ESP+20H] EBX after the first row [ESP+24H] saved EDI
//   [ESP+28H] saved EBX              [ESP+2CH] inner repeat count
//   [ESP+30H] outer repeat count     [ESP+34H] table stride in bytes
//   [ESP+38H] return address         [ESP+3CH] mode argument
// NOTE(review): the inner @@005 loop terminates only when c1 is a non-zero
// multiple of 32, so selectors that map to the zero bytes of @TBL_C1 must
// presumably never be passed in - confirm.
procedure _RGB_GenPointer;
asm
        sub     esp, 56
        mov     dword ptr [esp + 24H], edi              // preserve caller's EDI / EBX
        mov     dword ptr [esp + 28H], ebx
        shr     edx, 2
        movzx   eax, byte ptr [@TBL_C0 + edx]
        mov     dword ptr [esp], eax                    // c0
        movzx   ecx, byte ptr [@TBL_C1 + edx]
        mov     dword ptr [esp + 04H], ecx              // c1
        imul    eax, ecx
        mov     dword ptr [esp + 08H], eax              // c0*c1
        mov     ecx, dword ptr [ebp + 08H]
        shl     ecx, 2
        mov     eax, dword ptr [ebp + 000000B0H]
        mov     dword ptr [esp + 2CH], eax              // default inner repeat count
        dec     eax
        shl     eax, 5
        neg     eax
        add     eax, ecx
        mov     dword ptr [esp + 0CH], eax              // [EBP+08H]*4 - 32*([EBP+0B0H]-1)
        mov     eax, dword ptr [ebp + 000000B4H]
        mov     dword ptr [esp + 30H], eax              // default outer repeat count
        mov     eax, 32
        sub     eax, dword ptr [ebp + 000000D8H]
        mov     dword ptr [esp + 10H], eax              // 32 - [EBP+0D8H]
        mov     eax, dword ptr [ebp + 14H]
        lea     eax, [eax*4 + 00000004H]
        mov     dword ptr [esp + 34H], eax              // table stride in bytes
        cmp     dword ptr [esp + 3CH], 1
        jz      @@001
        // mode <> 1: take both repeat counts from the two dwords at ESI instead
        mov     eax, dword ptr [esi]
        mov     dword ptr [esp + 2CH], eax
        mov     eax, dword ptr [esi + 04H]
        mov     dword ptr [esp + 30H], eax
@@001:  mov     eax, dword ptr [esp + 30H]
        mov     dword ptr [esp + 14H], eax              // outer counter
@@002:  mov     eax, dword ptr [esp]
        mov     dword ptr [esp + 18H], eax              // row counter = c0
@@003:  mov     dword ptr [esp + 1CH], ebx              // remember EBX at row start
        mov     edx, dword ptr [esp + 2CH]              // inner repeat count
@@004:  mov     ecx, dword ptr [esp + 04H]              // c1, consumed 32 at a time
        mov     eax, ebx
@@005:  mov     dword ptr [edi], 32                     // mode 1 value
        cmp     dword ptr [esp + 3CH], 1
        jz      @@006
        mov     dword ptr [edi], eax                    // mode <> 1: store running address instead
        add     eax, 32
@@006:  add     edi, dword ptr [esp + 34H]              // next table slot
        sub     ecx, 32
        jnz     @@005
        add     ebx, dword ptr [esp + 08H]              // advance by c0*c1
        dec     edx
        jnz     @@004
        mov     eax, dword ptr [esp]
        cmp     dword ptr [esp + 18H], eax
        jnz     @@007
        mov     dword ptr [esp + 20H], ebx              // first row only: keep EBX for the next outer pass
@@007:  mov     eax, dword ptr [esp + 04H]
        add     eax, dword ptr [esp + 1CH]
        mov     ebx, eax                                // next row starts c1 past this row's start
        cmp     dword ptr [esp + 3CH], 1
        jnz     @@008
        // mode 1: overwrite the slot written last with the end-of-row value
        mov     ecx, dword ptr [esp + 34H]
        neg     ecx
        mov     eax, dword ptr [esp + 0CH]
        mov     dword ptr [ecx + edi], eax
@@008:  dec     dword ptr [esp + 18H]
        jnz     @@003
        mov     ebx, dword ptr [esp + 20H]
        dec     dword ptr [esp + 14H]
        jnz     @@002
        cmp     dword ptr [esp + 3CH], 1
        jnz     @@009
        // mode 1: overwrite the very last slot with 32 - [EBP+0D8H]
        mov     ecx, dword ptr [esp + 34H]
        neg     ecx
        mov     eax, dword ptr [esp + 10H]
        mov     dword ptr [ecx + edi], eax
@@009:  mov     edi, dword ptr [esp + 24H]              // restore caller's EDI / EBX
        mov     ebx, dword ptr [esp + 28H]
        add     esp, 56
        ret     4                                       // pops the mode argument
        nop; nop; nop // dword align
@TBL_C0:db 10H, 08H, 10H, 08H
@TBL_C1:db 20H, 40H, 40H, 20H, 00H, 00H, 00H, 00H
end;

// _IDCT_8x8_SSE2
// Two-pass 8x8 transform of one coefficient block using SSE2.
//
// Inputs as used by the code:
//   [ESP .. ESP+0FFH]  64 int32 coefficients (8 rows of 8)
//   EAX                64 floats that each coefficient is multiplied by
//                      (NOTE(review): presumably the scaled dequantisation
//                      table - confirm)
//   EDI                output buffer for 64 floats, 16-byte aligned
//   EBP                base of the state block; per-row matrix pointers are read
//                      from [EBP+TBLOffset+0B0H ..] and column constants from
//                      [EBP+TBLOffset+20H/30H/40H/0A0H]
// On exit EDI has been advanced by 256 bytes and control is transferred with
// an indirect JMP through [EBP+0FCH]; there is no RET.
// NOTE(review): row 0 is read from [ESP], so this routine is presumably
// entered by JMP rather than CALL - confirm. ESP must also be 16-byte aligned,
// since CVTDQ2PS is used with a memory operand.
// Clobbers ECX, EDX, XMM0-XMM7.
procedure _IDCT_8x8_SSE2;
asm
        // ---- pass 1: one row per iteration (ECX = -32 .. -4, step 4) ----
        mov     ecx, -32
@@001:  mov     edx, dword ptr [ebp+TBLOffset+0B0H + 00000020H + ecx]    // EDX = matrix for this row
        cvtdq2ps xmm0, dqword ptr [esp + ecx*8 + 00000100H]              // coefficients 0..3 -> float
        cvtdq2ps xmm1, dqword ptr [esp + ecx*8 + 00000110H]              // coefficients 4..7 -> float
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // even-indexed coefficients (0, 2, 4, 6) times the first four matrix vectors
        pshufd  xmm2, xmm0, 0
        pshufd  xmm3, xmm0, -86
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm1, -86
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        pshufd  xmm3, xmm0, 85
        addps   xmm2, xmm4                              // XMM2 = even part
        // odd-indexed coefficients (1, 3, 5, 7) times the last four matrix vectors
        pshufd  xmm0, xmm0, -1
        pshufd  xmm4, xmm1, 85
        pshufd  xmm1, xmm1, -1
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4                              // XMM3 = odd part
        movaps  xmm4, xmm2
        addps   xmm2, xmm3                              // outputs 0..3 = even + odd
        subps   xmm4, xmm3                              // even - odd ...
        shufps  xmm4, xmm4, 27                          // ... reversed (imm 1BH) = outputs 4..7
        movaps  dqword ptr [edi + ecx*8 + 00000100H], xmm2
        movaps  dqword ptr [edi + ecx*8 + 00000110H], xmm4
        add     ecx, 4
        jne     @@001
        // ---- pass 2: four columns per iteration (ECX = -32, -16) ----
        // Row r of the intermediate block is at [ECX + EDI + 20H*(r+1)].
        mov     ecx, -32
@@002:  movaps  xmm0, dqword ptr [ecx + edi + 40H]      // row 1
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]// row 7
        movaps  xmm2, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+020H]
        subps   xmm0, xmm3
        addps   xmm2, xmm1
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]// row 3
        movaps  xmm3, dqword ptr [ecx + edi + 000000C0H]// row 5
        movaps  xmm4, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm5, xmm3
        mulps   xmm3, dqword ptr [ebp+TBLOffset+040H]
        subps   xmm5, xmm1
        addps   xmm3, xmm4
        movaps  xmm1, xmm2
        movaps  xmm4, xmm0
        addps   xmm2, xmm3
        subps   xmm1, xmm3
        addps   xmm0, xmm5
        subps   xmm4, xmm5
        movaps  xmm3, xmm4
        addps   xmm4, xmm1
        subps   xmm1, xmm3
        mulps   xmm4, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+0A0H]
        // rows 1 and 3 are reused as scratch for odd-part intermediates
        movaps  dqword ptr [ecx + edi + 40H], xmm4
        movaps  dqword ptr [ecx + edi + 00000080H], xmm0
        movaps  xmm5, dqword ptr [ecx + edi + 000000A0H]// row 4
        movaps  xmm3, dqword ptr [ecx + edi + 20H]      // row 0
        movaps  xmm6, xmm5
        addps   xmm5, xmm3
        subps   xmm3, xmm6
        movaps  xmm0, dqword ptr [ecx + edi + 60H]      // row 2
        movaps  xmm4, dqword ptr [ecx + edi + 000000E0H]// row 6
        movaps  xmm6, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm7, xmm4
        mulps   xmm4, dqword ptr [ebp+TBLOffset+030H]
        subps   xmm0, xmm7
        addps   xmm4, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm3
        subps   xmm3, xmm6
        movaps  xmm6, xmm4
        addps   xmm4, xmm5
        subps   xmm5, xmm6
        // final butterflies combining even-row and odd-row results
        movaps  xmm6, xmm2
        addps   xmm2, xmm4
        subps   xmm4, xmm6
        movaps  xmm7, xmm1
        addps   xmm1, xmm3
        subps   xmm3, xmm7
        movaps  xmm6, dqword ptr [ecx + edi + 00000080H]// reload intermediate parked in row 3
        movaps  xmm7, xmm5
        subps   xmm5, xmm6
        addps   xmm7, xmm6
        movaps  dqword ptr [ecx + edi + 20H], xmm2      // output row 0
        movaps  dqword ptr [ecx + edi + 00000100H], xmm4// output row 7
        movaps  dqword ptr [ecx + edi + 60H], xmm1      // output row 2
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm3// output row 5
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5// output row 4
        movaps  dqword ptr [ecx + edi + 00000080H], xmm7// output row 3
        movaps  xmm2, dqword ptr [ecx + edi + 40H]      // reload intermediate parked in row 1
        movaps  xmm3, xmm0
        addps   xmm0, xmm2
        subps   xmm3, xmm2
        movaps  dqword ptr [ecx + edi + 40H], xmm0      // output row 1
        movaps  dqword ptr [ecx + edi + 000000E0H], xmm3// output row 6
        add     ecx, 16
        jne     @@002
        add     edi, 256                                // step past the 64 floats just written
        jmp     dword ptr [ebp + 000000FCH]             // continue at the handler stored in the state block
end;

procedure _IDCT_16x16_SSE2;
asm
        movd    mm1, ebp
        mov     ecx, -32
        mov     ebp, -512
@@003:  //mov     edx, dword ptr [ebp+TBLOffset+0D0H + 00000020H + ecx]
        movd edx,mm1
        mov edx,[edx+TBLOffset+0D0H + 00000020H + ecx]
        cvtdq2ps xmm0, dqword ptr [esp + ecx*8 + 00000100H]
        cvtdq2ps xmm1, dqword ptr [esp + ecx*8 + 00000110H]
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        pshufd  xmm2, xmm0, 0
        pshufd  xmm3, xmm0, -86
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm1, -86
        movaps  xmm6, xmm0
        movaps  xmm7, xmm1
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        pshufd  xmm3, xmm0, 85
        addps   xmm2, xmm4
        pshufd  xmm0, xmm0, -1
        pshufd  xmm4, xmm1, 85
        pshufd  xmm1, xmm1, -1
        mulps   xmm3, dqword ptr [edx + 40H]           
        mulps   xmm0, dqword ptr [edx + 50H]           
        mulps   xmm4, dqword ptr [edx + 60H]           
        mulps   xmm1, dqword ptr [edx + 70H]           
        addps   xmm3, xmm0                              
        addps   xmm4, xmm1                              
        addps   xmm3, xmm4                              
        movaps  xmm4, xmm2                              
        addps   xmm2, xmm3                              
        subps   xmm4, xmm3                              
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000200H], xmm2
        movaps  dqword ptr [ebp + edi + 00000230H], xmm4
        pshufd  xmm2, xmm6, 0                           
        pshufd  xmm3, xmm6, -86                         
        pshufd  xmm4, xmm7, 0                           
        pshufd  xmm5, xmm7, -86                         
        mulps   xmm2, dqword ptr [edx + 00000080H]     
        mulps   xmm3, dqword ptr [edx + 00000090H]     
        mulps   xmm4, dqword ptr [edx + 000000A0H]
        mulps   xmm5, dqword ptr [edx + 000000B0H]     
        addps   xmm2, xmm3                              
        addps   xmm4, xmm5                              
        pshufd  xmm3, xmm6, 85                          
        addps   xmm2, xmm4                              
        pshufd  xmm6, xmm6, -1                          
        pshufd  xmm4, xmm7, 85                          
        pshufd  xmm7, xmm7, -1                          
        mulps   xmm3, dqword ptr [edx + 000000C0H]     
        mulps   xmm6, dqword ptr [edx + 000000D0H]
        mulps   xmm4, dqword ptr [edx + 000000E0H]     
        mulps   xmm7, dqword ptr [edx + 000000F0H]     
        addps   xmm3, xmm6                              
        addps   xmm4, xmm7
        addps   xmm3, xmm4                              
        movaps  xmm4, xmm2                              
        addps   xmm2, xmm3                              
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000210H], xmm2
        movaps  dqword ptr [ebp + edi + 00000220H], xmm4
        add     ebp, 64
        add     ecx, 4
        jne     @@003
        mov     ecx, -64
        movd    ebp, mm1
@@004:  movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm4, xmm0                              
        movaps  xmm5, xmm0                              
        addps   xmm0, xmm3                              
        subps   xmm2, xmm3                              
        addps   xmm4, xmm1                              
        subps   xmm5, xmm1
        movaps  dqword ptr [ecx + edi + 40H], xmm0
        movaps  xmm0, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm1, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm3, xmm0                              
        movaps  xmm6, xmm1                              
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]         
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]         
        movaps  xmm7, xmm3                              
        addps   xmm3, xmm6
        subps   xmm7, xmm6                              
        movaps  xmm6, xmm0                              
        addps   xmm0, xmm1                              
        subps   xmm6, xmm1                              
        movaps  xmm1, xmm7                              
        addps   xmm7, xmm0                              
        subps   xmm1, xmm0                              
        movaps  xmm0, dqword ptr [ebp+TBLOffset+0A0H]        
        mulps   xmm7, xmm0                              
        mulps   xmm1, xmm0                              
        movaps  xmm0, xmm4                              
        addps   xmm4, xmm7                              
        subps   xmm0, xmm7                              
        movaps  xmm7, xmm5                              
        addps   xmm5, xmm1                              
        subps   xmm7, xmm1                              
        movaps  xmm1, xmm2                              
        addps   xmm2, xmm6                              
        subps   xmm1, xmm6
        movaps  dqword ptr [ecx + edi + 00000280H], xmm0
        movaps  dqword ptr [ecx + edi + 000002C0H], xmm7
        movaps  dqword ptr [ecx + edi + 00000300H], xmm1
        movaps  dqword ptr [ecx + edi + 00000340H], xmm2
        movaps  dqword ptr [ecx + edi + 00000380H], xmm5
        movaps  dqword ptr [ecx + edi + 000003C0H], xmm4
        movaps  xmm0, dqword ptr [ecx + edi + 40H]     
        movaps  xmm1, xmm0                              
        addps   xmm0, xmm3                              
        subps   xmm1, xmm3
        movaps  dqword ptr [ecx + edi + 00000400H], xmm0
        movaps  dqword ptr [ecx + edi + 00000240H], xmm1
        movaps  xmm0, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1                              
        mulps   xmm0, dqword ptr [ebp+TBLOffset+050H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+080H]         
        movaps  xmm4, xmm2                              
        addps   xmm2, xmm3                              
        subps   xmm4, xmm3
        movaps  xmm5, xmm0                              
        addps   xmm0, xmm1                              
        subps   xmm5, xmm1                              
        movaps  dqword ptr [ecx + edi + 40H], xmm2
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm2, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm3, xmm1
        movaps  xmm6, xmm2                              
        mulps   xmm1, dqword ptr [ebp+TBLOffset+060H]         
        mulps   xmm2, dqword ptr [ebp+TBLOffset+070H]         
        movaps  xmm7, xmm3                              
        addps   xmm3, xmm6                              
        subps   xmm7, xmm6                              
        movaps  xmm6, xmm1                              
        addps   xmm1, xmm2                              
        subps   xmm6, xmm2                              
        movaps  xmm2, xmm4                              
        addps   xmm4, xmm1                              
        subps   xmm2, xmm1                              
        movaps  xmm1, xmm7                              
        addps   xmm7, xmm0                              
        subps   xmm1, xmm0                              
        movaps  xmm0, xmm5                              
        addps   xmm5, xmm6                              
        subps   xmm0, xmm6                              
        movaps  xmm6, xmm3                              
        addps   xmm3, dqword ptr [ecx + edi + 40H]     
        subps   xmm6, dqword ptr [ecx + edi + 40H]     
        movaps  dqword ptr [ecx + edi + 40H], xmm3     
        movaps  xmm3, xmm5                              
        addps   xmm5, xmm6                              
        subps   xmm3, xmm6                              
        mulps   xmm5, dqword ptr [ebp+TBLOffset+0A0H]        
        mulps   xmm3, dqword ptr [ebp+TBLOffset+0A0H]
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000100H], xmm3
        movaps  xmm3, dqword ptr [ebp+TBLOffset+030H]         
        movaps  xmm5, xmm4                              
        mulps   xmm4, xmm3                              
        movaps  xmm6, xmm7                              
        mulps   xmm7, xmm3                              
        subps   xmm4, xmm6                              
        addps   xmm5, xmm7                              
        movaps  xmm6, xmm1                              
        mulps   xmm1, xmm3
        movaps  xmm7, xmm2                              
        mulps   xmm2, xmm3                              
        addps   xmm7, xmm1                              
        movaps  xmm3, dqword ptr [ebp+TBLOffset+090H]        
        subps   xmm2, xmm6
        mulps   xmm5, xmm3
        mulps   xmm7, xmm3                              
        mulps   xmm2, xmm3                              
        mulps   xmm4, xmm3                              
        movaps  xmm1, dqword ptr [ecx + edi + 40H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000400H]
        movaps  xmm6, xmm3                              
        addps   xmm3, xmm1                              
        subps   xmm6, xmm1                              
        movaps  dqword ptr [ecx + edi + 40H], xmm3
        movaps  dqword ptr [ecx + edi + 00000400H], xmm6
        movaps  xmm1, dqword ptr [ecx + edi + 00000380H]
        movaps  xmm3, dqword ptr [ecx + edi + 000003C0H]
        movaps  xmm6, xmm1                              
        addps   xmm1, xmm7                              
        subps   xmm6, xmm7                              
        movaps  xmm7, xmm3                              
        addps   xmm3, xmm5                              
        subps   xmm7, xmm5
        movaps  dqword ptr [ecx + edi + 00000080H], xmm3
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm1
        movaps  dqword ptr [ecx + edi + 00000380H], xmm6
        movaps  dqword ptr [ecx + edi + 000003C0H], xmm7
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000300H]
        movaps  xmm6, dqword ptr [ecx + edi + 00000340H]
        movaps  xmm7, xmm5                              
        subps   xmm5, xmm3                              
        addps   xmm7, xmm3                              
        movaps  xmm3, xmm6                              
        addps   xmm6, xmm1                              
        subps   xmm3, xmm1
        movaps  dqword ptr [ecx + edi + 00000100H], xmm6
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000300H], xmm7
        movaps  dqword ptr [ecx + edi + 00000340H], xmm3
        movaps  xmm1, dqword ptr [ecx + edi + 00000240H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000280H]
        movaps  xmm5, dqword ptr [ecx + edi + 000002C0H]
        movaps  xmm6, xmm3                              
        addps   xmm3, xmm4                              
        subps   xmm6, xmm4                              
        movaps  xmm4, xmm5                              
        addps   xmm5, xmm2
        subps   xmm4, xmm2
        movaps  xmm7, xmm1
        addps   xmm1, xmm0
        subps   xmm7, xmm0
        movaps  dqword ptr [ecx + edi + 00000180H], xmm5
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm1
        movaps  dqword ptr [ecx + edi + 00000240H], xmm7
        movaps  dqword ptr [ecx + edi + 00000280H], xmm6
        movaps  dqword ptr [ecx + edi + 000002C0H], xmm4
        add     ecx, 16
        jne     @@004
        add     edi, 1024
        jmp     dword ptr [ebp + 000000FCH]
end;
                           
procedure _IDCT_8x16_SSE2;
// Two-pass SSE2 kernel. Judging by its name this is the inverse DCT variant
// that expands an 8x8 coefficient block to 8 columns x 16 rows of floats
// (NOTE(review): naming-based, confirm against the caller).
//
// There are no Pascal parameters; everything is passed in registers, as read
// from the code below:
//   esp - 64 int32 values are read from [esp .. esp+0FFH]
//   eax - 64 single-precision factors at [eax .. eax+0FFH], multiplied
//         element-wise with the converted integers (presumably the
//         dequantisation scale - TODO confirm)
//   ebp - base of the constant area: per-row table pointers at
//         ebp+TBLOffset+0B0H.., constant vectors at ebp+TBLOffset+020H..0A0H,
//         and the continuation address at [ebp+0FCH]
//   edi - work/output buffer; 512 bytes at [edi .. edi+1FFH] are written,
//         then edi is advanced by 512
// Overwrites ecx, edx and xmm0-xmm7. Leaves through an indirect jmp, not ret.
// movaps/mulps/cvtdq2ps memory operands must be 16-byte aligned.
asm
        // ---- Pass 1: 8 iterations (ecx = -32..-4 step 4), one 8-value row each.
        mov     ecx, -32
        // edx = table pointer for this row (8 dword pointers at TBLOffset+0B0H..0CCH)
@@005:  mov     edx, dword ptr [ebp+TBLOffset+0B0H + 00000020H + ecx]
        // int32 -> float, then scale by the factors at eax
        cvtdq2ps xmm0, dqword ptr [esp + ecx*8 + 00000100H]
        cvtdq2ps xmm1, dqword ptr [esp + ecx*8 + 00000110H]
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // broadcast elements 0, 2 (of xmm0) and 4, 6 (elements 0, 2 of xmm1)
        pshufd  xmm2, xmm0, 0
        pshufd  xmm3, xmm0, -86
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm1, -86
        // xmm2 = sum of even-index elements times table vectors [edx+00H..30H]
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        // broadcast elements 1, 3, 5, 7 (interleaved with the even-part sum)
        pshufd  xmm3, xmm0, 85
        addps   xmm2, xmm4
        pshufd  xmm0, xmm0, -1
        pshufd  xmm4, xmm1, 85
        pshufd  xmm1, xmm1, -1
        // xmm3 = sum of odd-index elements times table vectors [edx+40H..70H]
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4
        // first four outputs = even + odd; last four = (even - odd) in reversed
        // lane order (shufps imm 27 = 1BH reverses the four lanes)
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [edi + ecx*8 + 00000100H], xmm2
        movaps  dqword ptr [edi + ecx*8 + 00000110H], xmm4
        add     ecx, 4
        jne     @@005
        // ---- Pass 2: 2 iterations (ecx = -32, -16), four columns each.
        // Reads the 8 rows left by pass 1 (32-byte stride, row k at slot
        // 20H + k*20H) and writes 16 rows at slots 20H..200H of the same buffer.
        mov     ecx, -32
        // rows 0 and 4: sum, difference, and r0 +/- r4 * const[030H]
@@006:  movaps  xmm0, dqword ptr [ecx + edi + 20H]
        movaps  xmm1, dqword ptr [ecx + edi + 000000A0H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm4, xmm0
        movaps  xmm5, xmm0
        addps   xmm0, xmm3
        subps   xmm2, xmm3
        addps   xmm4, xmm1
        subps   xmm5, xmm1
        // r0 + r4 is parked in slot 20H; it is reloaded further down
        movaps  dqword ptr [ecx + edi + 20H], xmm0
        // rows 2 and 6: plain sum/difference plus a weighted sum/difference
        // using const[020H] and const[040H]
        movaps  xmm0, dqword ptr [ecx + edi + 60H]
        movaps  xmm1, dqword ptr [ecx + edi + 000000E0H]
        movaps  xmm3, xmm0
        movaps  xmm6, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm1
        subps   xmm6, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        // both combinations are scaled by const[0A0H]
        movaps  xmm0, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm7, xmm0
        mulps   xmm1, xmm0
        // merge the row 0/4 terms with the row 2/6 terms (sum/difference pairs)
        movaps  xmm0, xmm4
        addps   xmm4, xmm7
        subps   xmm0, xmm7
        movaps  xmm7, xmm5
        addps   xmm5, xmm1
        subps   xmm7, xmm1
        movaps  xmm1, xmm2
        addps   xmm2, xmm6
        subps   xmm1, xmm6
        // park six intermediates in slots 140H..1E0H (not yet final results)
        movaps  dqword ptr [ecx + edi + 00000140H], xmm0
        movaps  dqword ptr [ecx + edi + 00000160H], xmm7
        movaps  dqword ptr [ecx + edi + 00000180H], xmm1
        movaps  dqword ptr [ecx + edi + 000001A0H], xmm2
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm5
        movaps  dqword ptr [ecx + edi + 000001E0H], xmm4
        // (r0 + r4) +/- (r2 + r6), parked in slots 200H and 120H
        movaps  xmm0, dqword ptr [ecx + edi + 20H]
        movaps  xmm1, xmm0
        addps   xmm0, xmm3
        subps   xmm1, xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm0
        movaps  dqword ptr [ecx + edi + 00000120H], xmm1
        // rows 1 and 7: plain and weighted (const[050H], const[080H]) sum/difference
        movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+050H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+080H]
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        movaps  xmm5, xmm0
        addps   xmm0, xmm1
        subps   xmm5, xmm1
        // r1 + r7 is parked in slot 20H (its previous content was consumed above)
        movaps  dqword ptr [ecx + edi + 20H], xmm2
        // rows 3 and 5: plain and weighted (const[060H], const[070H]) sum/difference
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm2, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm3, xmm1
        movaps  xmm6, xmm2
        mulps   xmm1, dqword ptr [ebp+TBLOffset+060H]
        mulps   xmm2, dqword ptr [ebp+TBLOffset+070H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm1
        addps   xmm1, xmm2
        subps   xmm6, xmm2
        // cross-combine the row 1/7 terms with the row 3/5 terms
        movaps  xmm2, xmm4
        addps   xmm4, xmm1
        subps   xmm2, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        movaps  xmm0, xmm5
        addps   xmm5, xmm6
        subps   xmm0, xmm6
        // (r3 + r5) +/- (r1 + r7); the sum replaces the value in slot 20H
        movaps  xmm6, xmm3
        addps   xmm3, dqword ptr [ecx + edi + 20H]
        subps   xmm6, dqword ptr [ecx + edi + 20H]
        movaps  dqword ptr [ecx + edi + 20H], xmm3
        movaps  xmm3, xmm5
        addps   xmm5, xmm6
        subps   xmm3, xmm6
        // scale by const[0A0H] and park in slots 0A0H / 80H
        mulps   xmm5, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm3, dqword ptr [ebp+TBLOffset+0A0H]
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5
        movaps  dqword ptr [ecx + edi + 00000080H], xmm3
        // pairwise a*c -/+ b combinations with c = const[030H],
        // then all four results are scaled by const[090H]
        movaps  xmm3, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm5, xmm4
        mulps   xmm4, xmm3
        movaps  xmm6, xmm7
        mulps   xmm7, xmm3
        subps   xmm4, xmm6
        addps   xmm5, xmm7
        movaps  xmm6, xmm1
        mulps   xmm1, xmm3
        movaps  xmm7, xmm2
        mulps   xmm2, xmm3
        addps   xmm7, xmm1
        movaps  xmm3, dqword ptr [ebp+TBLOffset+090H]
        subps   xmm2, xmm6
        mulps   xmm5, xmm3
        mulps   xmm7, xmm3
        mulps   xmm2, xmm3
        mulps   xmm4, xmm3
        // final combination: each parked term +/- its partner gives two output rows.
        // slots 20H and 200H
        movaps  xmm1, dqword ptr [ecx + edi + 20H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm1
        subps   xmm6, xmm1
        movaps  dqword ptr [ecx + edi + 20H], xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm6
        // slots 40H, 60H, 1C0H, 1E0H
        movaps  xmm1, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm3, dqword ptr [ecx + edi + 000001E0H]
        movaps  xmm6, xmm1
        addps   xmm1, xmm7
        subps   xmm6, xmm7
        movaps  xmm7, xmm3
        addps   xmm3, xmm5
        subps   xmm7, xmm5
        movaps  dqword ptr [ecx + edi + 40H], xmm3
        movaps  dqword ptr [ecx + edi + 60H], xmm1
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm6
        movaps  dqword ptr [ecx + edi + 000001E0H], xmm7
        // slots 80H, 0A0H, 180H, 1A0H
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm3, dqword ptr [ecx + edi + 000000A0H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm6, dqword ptr [ecx + edi + 000001A0H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm3
        addps   xmm7, xmm3
        movaps  xmm3, xmm6
        addps   xmm6, xmm1
        subps   xmm3, xmm1
        movaps  dqword ptr [ecx + edi + 00000080H], xmm6
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5
        movaps  dqword ptr [ecx + edi + 00000180H], xmm7
        movaps  dqword ptr [ecx + edi + 000001A0H], xmm3
        // slots 0C0H..160H
        movaps  xmm1, dqword ptr [ecx + edi + 00000120H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000160H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm4
        subps   xmm6, xmm4
        movaps  xmm4, xmm5
        addps   xmm5, xmm2
        subps   xmm4, xmm2
        movaps  xmm7, xmm1
        addps   xmm1, xmm0
        subps   xmm7, xmm0
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm5
        movaps  dqword ptr [ecx + edi + 000000E0H], xmm3
        movaps  dqword ptr [ecx + edi + 00000100H], xmm1
        movaps  dqword ptr [ecx + edi + 00000120H], xmm7
        movaps  dqword ptr [ecx + edi + 00000140H], xmm6
        movaps  dqword ptr [ecx + edi + 00000160H], xmm4
        // next group of four columns
        add     ecx, 16
        jne     @@006
        // advance past the 512 bytes produced and continue at the address
        // stored at [ebp+0FCH] (tail jump; this procedure never executes ret)
        add     edi, 512
        jmp     dword ptr [ebp + 000000FCH]
end;

procedure _IDCT_16x8_SSE2;
// Two-pass SSE2 kernel. Judging by its name this is the inverse DCT variant
// that expands an 8x8 coefficient block to 16 columns x 8 rows of floats
// (NOTE(review): naming-based, confirm against the caller).
//
// There are no Pascal parameters; everything is passed in registers, as read
// from the code below:
//   esp - 64 int32 values are read from [esp .. esp+0FFH]
//   eax - 64 single-precision factors at [eax .. eax+0FFH], multiplied
//         element-wise with the converted integers (presumably the
//         dequantisation scale - TODO confirm)
//   ebp - base of the constant area: per-row table pointers at
//         ebp+TBLOffset+0D0H.., constant vectors at ebp+TBLOffset+020H..0A0H,
//         and the continuation address at [ebp+0FCH]
//   edi - work/output buffer; 512 bytes at [edi .. edi+1FFH] are written,
//         then edi is advanced by 512
// Overwrites ecx, edx, mm1 and xmm0-xmm7; ebp is restored before exit.
// mm1 (an MMX register) holds ebp during pass 1 and no EMMS is issued here
// (NOTE(review): x87/MMX state clean-up is presumably done elsewhere - confirm).
// Leaves through an indirect jmp, not ret.
// movaps/mulps/cvtdq2ps memory operands must be 16-byte aligned.
asm
        // ---- Pass 1: 8 iterations; each 8-value input row yields 16 floats.
        // ebp is parked in mm1 so it can serve as the output offset
        // (-512..-64 step 64, i.e. one 64-byte output row per iteration).
        movd    mm1, ebp
        mov     ecx, -32
        mov     ebp, -512
        // The original single-instruction load below is kept for reference; it
        // cannot be used because ebp no longer holds the table base here.
@@007:  //mov     edx, dword ptr [ebp+TBLOffset+0D0H + 00000020H + ecx]
        // edx = table pointer for this row, fetched via the saved ebp in mm1
        movd edx,mm1
        mov edx,[edx+TBLOffset+0D0H + 00000020H + ecx]
        // int32 -> float, then scale by the factors at eax
        cvtdq2ps xmm0, dqword ptr [esp + ecx*8 + 00000100H]
        cvtdq2ps xmm1, dqword ptr [esp + ecx*8 + 00000110H]
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // broadcast elements 0, 2 (of xmm0) and 4, 6 (elements 0, 2 of xmm1)
        pshufd  xmm2, xmm0, 0
        pshufd  xmm3, xmm0, -86
        pshufd  xmm4, xmm1, 0
        pshufd  xmm5, xmm1, -86
        // keep the scaled inputs for the second table half ([edx+80H..0F0H])
        movaps  xmm6, xmm0
        movaps  xmm7, xmm1
        // first table half: even-index elements times vectors [edx+00H..30H]
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        pshufd  xmm3, xmm0, 85
        addps   xmm2, xmm4
        pshufd  xmm0, xmm0, -1
        pshufd  xmm4, xmm1, 85
        pshufd  xmm1, xmm1, -1
        // odd-index elements times vectors [edx+40H..70H]
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4
        // even + odd -> floats 0..3 of the output row; lane-reversed
        // (even - odd) -> floats 12..15 (shufps imm 27 = 1BH reverses lanes)
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000200H], xmm2
        movaps  dqword ptr [ebp + edi + 00000230H], xmm4
        // second table half: same scheme with vectors [edx+80H..0F0H]
        pshufd  xmm2, xmm6, 0
        pshufd  xmm3, xmm6, -86
        pshufd  xmm4, xmm7, 0
        pshufd  xmm5, xmm7, -86
        mulps   xmm2, dqword ptr [edx + 00000080H]
        mulps   xmm3, dqword ptr [edx + 00000090H]
        mulps   xmm4, dqword ptr [edx + 000000A0H]
        mulps   xmm5, dqword ptr [edx + 000000B0H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        pshufd  xmm3, xmm6, 85
        addps   xmm2, xmm4
        pshufd  xmm6, xmm6, -1
        pshufd  xmm4, xmm7, 85
        pshufd  xmm7, xmm7, -1
        mulps   xmm3, dqword ptr [edx + 000000C0H]
        mulps   xmm6, dqword ptr [edx + 000000D0H]
        mulps   xmm4, dqword ptr [edx + 000000E0H]
        mulps   xmm7, dqword ptr [edx + 000000F0H]
        addps   xmm3, xmm6
        addps   xmm4, xmm7
        addps   xmm3, xmm4
        // even + odd -> floats 4..7; lane-reversed (even - odd) -> floats 8..11
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000210H], xmm2
        movaps  dqword ptr [ebp + edi + 00000220H], xmm4
        // next output row / next input row; loop ends when ecx reaches 0
        add     ebp, 64
        add     ecx, 4
        jne     @@007
        // ---- Pass 2: 4 iterations (ecx = -64..-16 step 16), four columns each.
        // Works in place on the 8 rows left by pass 1 (64-byte stride, row k
        // at slot 40H + k*40H).
        mov     ecx, -64
        // restore the constant-area base
        movd ebp,mm1
        // rows 1 and 7: r1*c - r7 and r1 + r7*c with c = const[020H]
@@008:  movaps  xmm0, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm2, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+020H]
        subps   xmm0, xmm3
        addps   xmm2, xmm1
        // rows 3 and 5: r5 - r3*c and r5*c + r3 with c = const[040H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm4, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm5, xmm3
        mulps   xmm3, dqword ptr [ebp+TBLOffset+040H]
        subps   xmm5, xmm1
        addps   xmm3, xmm4
        // sum/difference pairs over the four odd-row terms
        movaps  xmm1, xmm2
        movaps  xmm4, xmm0
        addps   xmm2, xmm3
        subps   xmm1, xmm3
        addps   xmm0, xmm5
        subps   xmm4, xmm5
        movaps  xmm3, xmm4
        addps   xmm4, xmm1
        subps   xmm1, xmm3
        // the two cross terms are scaled by const[0A0H]
        mulps   xmm4, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+0A0H]
        // park two odd-part intermediates in slots 80H and 100H (reloaded below)
        movaps  dqword ptr [ecx + edi + 00000080H], xmm4
        movaps  dqword ptr [ecx + edi + 00000100H], xmm0
        // rows 4 and 0: xmm5 = r4 + r0, xmm3 = r0 - r4
        movaps  xmm5, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm3, dqword ptr [ecx + edi + 40H]
        movaps  xmm6, xmm5
        addps   xmm5, xmm3
        subps   xmm3, xmm6
        // rows 2 and 6: r2*c - r6 and r6*c + r2 with c = const[030H]
        movaps  xmm0, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm4, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm6, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm7, xmm4
        mulps   xmm4, dqword ptr [ebp+TBLOffset+030H]
        subps   xmm0, xmm7
        addps   xmm4, xmm6
        // merge the row 0/4 terms with the row 2/6 terms
        movaps  xmm6, xmm0
        addps   xmm0, xmm3
        subps   xmm3, xmm6
        movaps  xmm6, xmm4
        addps   xmm4, xmm5
        subps   xmm5, xmm6
        // combine even-row and odd-row terms into sum/difference output pairs
        movaps  xmm6, xmm2
        addps   xmm2, xmm4
        subps   xmm4, xmm6
        movaps  xmm7, xmm1
        addps   xmm1, xmm3
        subps   xmm3, xmm7
        // reload the intermediate parked in slot 100H
        movaps  xmm6, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm6
        addps   xmm7, xmm6
        // store six of the eight output rows
        movaps  dqword ptr [ecx + edi + 40H], xmm2
        movaps  dqword ptr [ecx + edi + 00000200H], xmm4
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm1
        movaps  dqword ptr [ecx + edi + 00000180H], xmm3
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000100H], xmm7
        // reload the intermediate parked in slot 80H and produce the last two rows
        movaps  xmm2, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm3, xmm0
        addps   xmm0, xmm2
        subps   xmm3, xmm2
        movaps  dqword ptr [ecx + edi + 00000080H], xmm0
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm3
        // next group of four columns
        add     ecx, 16
        jne     @@008
        // advance past the 512 bytes produced and continue at the address
        // stored at [ebp+0FCH] (tail jump; this procedure never executes ret)
        add     edi, 512
        jmp     dword ptr [ebp + 000000FCH]
end;

procedure _IDCT_8x8_SSE;
asm
        mov     ecx, -32
@@009:  mov     edx, dword ptr [ebp+TBLOffset+0B0H + 00000020H + ecx]
        cvtpi2ps xmm0, qword ptr [esp + ecx*8 + 00000100H]
        cvtpi2ps xmm2, qword ptr [esp + ecx*8 + 00000108H]
        cvtpi2ps xmm1, qword ptr [esp + ecx*8 + 00000110H]
        cvtpi2ps xmm3, qword ptr [esp + ecx*8 + 00000118H]
        shufps  xmm0, xmm2, 68                          
        shufps  xmm1, xmm3, 68
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        movaps  xmm2, xmm0                             
        movaps  xmm3, xmm0                             
        movaps  xmm4, xmm1                             
        movaps  xmm5, xmm1                             
        shufps  xmm2, xmm2, 0                          
        shufps  xmm3, xmm3, -86                        
        shufps  xmm4, xmm4, 0                          
        shufps  xmm5, xmm5, -86                        
        mulps   xmm2, dqword ptr [edx]                 
        mulps   xmm3, dqword ptr [edx + 10H]           
        mulps   xmm4, dqword ptr [edx + 20H]           
        mulps   xmm5, dqword ptr [edx + 30H]           
        addps   xmm2, xmm3                             
        addps   xmm4, xmm5                             
        addps   xmm2, xmm4                             
        movaps  xmm3, xmm0                             
        movaps  xmm4, xmm1                             
        shufps  xmm3, xmm3, 85                         
        shufps  xmm0, xmm0, -1                         
        shufps  xmm4, xmm4, 85                         
        shufps  xmm1, xmm1, -1
        mulps   xmm3, dqword ptr [edx + 40H]           
        mulps   xmm0, dqword ptr [edx + 50H]           
        mulps   xmm4, dqword ptr [edx + 60H]           
        mulps   xmm1, dqword ptr [edx + 70H]           
        addps   xmm3, xmm0                             
        addps   xmm4, xmm1                             
        addps   xmm3, xmm4                             
        movaps  xmm4, xmm2                             
        addps   xmm2, xmm3                             
        subps   xmm4, xmm3                             
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [edi + ecx*8 + 00000100H], xmm2
        movaps  dqword ptr [edi + ecx*8 + 00000110H], xmm4
        add     ecx, 4                                  
        jne     @@009                                   
        mov     ecx, -32                                
@@010:  movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm2, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+020H]
        subps   xmm0, xmm3
        addps   xmm2, xmm1
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm3, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm4, xmm1                             
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]         
        movaps  xmm5, xmm3                             
        mulps   xmm3, dqword ptr [ebp+TBLOffset+040H]         
        subps   xmm5, xmm1                             
        addps   xmm3, xmm4                             
        movaps  xmm1, xmm2                             
        movaps  xmm4, xmm0                             
        addps   xmm2, xmm3
        subps   xmm1, xmm3                             
        addps   xmm0, xmm5                             
        subps   xmm4, xmm5                             
        movaps  xmm3, xmm4                             
        addps   xmm4, xmm1                             
        subps   xmm1, xmm3                             
        mulps   xmm4, dqword ptr [ebp+TBLOffset+0A0H]        
        mulps   xmm1, dqword ptr [ebp+TBLOffset+0A0H]        
        movaps  dqword ptr [ecx + edi + 40H], xmm4
        movaps  dqword ptr [ecx + edi + 00000080H], xmm0
        movaps  xmm5, dqword ptr [ecx + edi + 000000A0H]
        movaps  xmm3, dqword ptr [ecx + edi + 20H]     
        movaps  xmm6, xmm5                             
        addps   xmm5, xmm3                             
        subps   xmm3, xmm6                             
        movaps  xmm0, dqword ptr [ecx + edi + 60H]
        movaps  xmm4, dqword ptr [ecx + edi + 000000E0H]
        movaps  xmm6, xmm0                              
        mulps   xmm0, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm7, xmm4
        mulps   xmm4, dqword ptr [ebp+TBLOffset+030H]
        subps   xmm0, xmm7
        addps   xmm4, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm3
        subps   xmm3, xmm6
        movaps  xmm6, xmm4
        addps   xmm4, xmm5
        subps   xmm5, xmm6
        movaps  xmm6, xmm2
        addps   xmm2, xmm4
        subps   xmm4, xmm6
        movaps  xmm7, xmm1
        addps   xmm1, xmm3
        subps   xmm3, xmm7
        movaps  xmm6, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm6
        addps   xmm7, xmm6
        movaps  dqword ptr [ecx + edi + 20H], xmm2
        movaps  dqword ptr [ecx + edi + 00000100H], xmm4
        movaps  dqword ptr [ecx + edi + 60H], xmm1
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm3
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5
        movaps  dqword ptr [ecx + edi + 00000080H], xmm7
        movaps  xmm2, dqword ptr [ecx + edi + 40H]
        movaps  xmm3, xmm0                              
        addps   xmm0, xmm2                              
        subps   xmm3, xmm2                              
        movaps  dqword ptr [ecx + edi + 40H], xmm0
        movaps  dqword ptr [ecx + edi + 000000E0H], xmm3
        add     ecx, 16
        jne     @@010
        add     edi, 256
        jmp     dword ptr [ebp + 000000FCH]
end;

procedure _IDCT_16x16_SSE;
// SSE kernel that turns one block of 64 32-bit integers into a 16x16 block of
// single-precision floats (16 rows of 16 floats = 1024 bytes) at [edi].
//
// Register usage, as visible in the code below:
//   esp - 64 signed dwords are read from [esp .. esp+0FFH], 8 per row
//   eax - 64 floats at [eax .. eax+0FFH], multiplied element-wise with the
//         converted input (presumably the dequantization/scale table - confirm
//         against the caller)
//   ebp - base of a context/table area: a table of 8 pointers at
//         ebp+TBLOffset+0D0H, 16-byte constant vectors at ebp+TBLOffset+020H..0A0H
//         and a jump target at [ebp+0FCH]
//   edi - output pointer; advanced by 1024 on exit
// Clobbers ecx, edx, xmm0-xmm7 and mm1. ebp is restored before the final jump.
// All movaps/mulps memory operands must be 16-byte aligned.
//
// NOTE(review): the routine reads its input directly at [esp] and leaves through
// an indirect jmp instead of ret, so it looks like it is reached by a jump from
// a dispatcher rather than by a normal call - confirm against the caller.
// NOTE(review): mm1 is written and no emms is executed here; presumably the
// caller takes care of that - confirm.
asm
        // ebp doubles as the output offset in the first loop, so the incoming
        // ebp is parked in mm1 for the duration of that loop
        movd    mm1, ebp
        // ecx: input row counter, -32..-4 step 4 (8 iterations); scaled by 8 it
        //      is also the byte offset of the input row (32 bytes per row)
        // ebp: output row offset, -512..-64 step 64 (64 bytes = 16 floats per row)
        mov     ecx, -32
        mov     ebp, -512
        // ---- Pass 1: each input row of 8 values -> one output row of 16 floats ----
@@011:  //mov     edx, dword ptr [ebp+TBLOffset+0D0H + 00000020H + ecx]
        // edx = per-row coefficient table pointer, fetched through the saved ebp
        movd edx,mm1
        mov edx,[edx+TBLOffset+0D0H + 00000020H + ecx]
        // convert 8 signed dwords to floats, two at a time, then merge pairs:
        // xmm0 = x0..x3, xmm1 = x4..x7
        cvtpi2ps xmm0, qword ptr [esp + ecx*8 + 00000100H]
        cvtpi2ps xmm2, qword ptr [esp + ecx*8 + 00000108H]
        cvtpi2ps xmm1, qword ptr [esp + ecx*8 + 00000110H]
        cvtpi2ps xmm3, qword ptr [esp + ecx*8 + 00000118H]
        shufps  xmm0, xmm2, 68
        shufps  xmm1, xmm3, 68
        // element-wise multiply with the table at eax
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // broadcast x0, x2, x4, x6 (shufps 0 = lane 0, -86 = lane 2)
        movaps  xmm2, xmm0
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm1
        shufps  xmm2, xmm2, 0
        shufps  xmm3, xmm3, -86
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, -86
        // keep copies of x0..x3 / x4..x7 for the second half of the row;
        // xmm0/xmm1 are destroyed by the lane-3 broadcasts below
        movaps  xmm6, xmm0
        movaps  xmm7, xmm1
        // xmm2 = x0*[edx] + x2*[edx+10H] + x4*[edx+20H] + x6*[edx+30H]
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        addps   xmm2, xmm4
        // broadcast x1, x3, x5, x7 (shufps 85 = lane 1, -1 = lane 3)
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        shufps  xmm3, xmm3, 85
        shufps  xmm0, xmm0, -1
        shufps  xmm4, xmm4, 85
        shufps  xmm1, xmm1, -1
        // xmm3 = x1*[edx+40H] + x3*[edx+50H] + x5*[edx+60H] + x7*[edx+70H]
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4
        // sum -> floats 0..3 of the output row; difference with its lanes
        // reversed (shufps 27) -> floats 12..15
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000200H], xmm2
        movaps  dqword ptr [ebp + edi + 00000230H], xmm4
        // same computation with the second coefficient set at edx+80H..0F0H,
        // using the saved copies in xmm6/xmm7
        movaps  xmm2, xmm6
        movaps  xmm3, xmm6
        movaps  xmm4, xmm7
        movaps  xmm5, xmm7
        shufps  xmm2, xmm2, 0
        shufps  xmm3, xmm3, -86
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, -86
        mulps   xmm2, dqword ptr [edx + 00000080H]
        mulps   xmm3, dqword ptr [edx + 00000090H]
        mulps   xmm4, dqword ptr [edx + 000000A0H]
        mulps   xmm5, dqword ptr [edx + 000000B0H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        addps   xmm2, xmm4
        movaps  xmm3, xmm6
        movaps  xmm4, xmm7
        shufps  xmm3, xmm3, 85
        shufps  xmm6, xmm6, -1
        shufps  xmm4, xmm4, 85
        shufps  xmm7, xmm7, -1
        mulps   xmm3, dqword ptr [edx + 000000C0H]
        mulps   xmm6, dqword ptr [edx + 000000D0H]
        mulps   xmm4, dqword ptr [edx + 000000E0H]
        mulps   xmm7, dqword ptr [edx + 000000F0H]
        addps   xmm3, xmm6
        addps   xmm4, xmm7
        addps   xmm3, xmm4
        // sum -> floats 4..7; lane-reversed difference -> floats 8..11
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000210H], xmm2
        movaps  dqword ptr [ebp + edi + 00000220H], xmm4
        // next output row (64 bytes) / next input row; loop ends when ecx hits 0
        add     ebp, 64
        add     ecx, 4
        jne     @@011
        // ---- Pass 2: columns. 4 iterations, 4 columns (one XMM) per iteration ----
        // ecx runs -64..-16 step 16; [ecx + edi + 40H*(k+1)] addresses row k of the
        // current column group (row stride 40H). Rows 0..7 hold the pass-1 results;
        // rows 8..15 and already-consumed row slots are used as scratch before the
        // final 16 rows are written.
        mov     ecx, -64
        // restore the context pointer for the constant loads and the final jump
        movd ebp,mm1
        // rows 0 and 4; row 4 is also scaled by the constant at TBLOffset+030H
@@012:  movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm4, xmm0
        movaps  xmm5, xmm0
        addps   xmm0, xmm3
        subps   xmm2, xmm3
        addps   xmm4, xmm1
        subps   xmm5, xmm1
        // row0 + row4 is parked in the row-0 slot
        movaps  dqword ptr [ecx + edi + 40H], xmm0
        // rows 2 and 6, scaled by the constants at TBLOffset+020H / +040H
        movaps  xmm0, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm1, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm3, xmm0
        movaps  xmm6, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm1
        subps   xmm6, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        // both butterfly outputs are scaled by the constant at TBLOffset+0A0H
        movaps  xmm0, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm7, xmm0
        mulps   xmm1, xmm0
        movaps  xmm0, xmm4
        addps   xmm4, xmm7
        subps   xmm0, xmm7
        movaps  xmm7, xmm5
        addps   xmm5, xmm1
        subps   xmm7, xmm1
        movaps  xmm1, xmm2
        addps   xmm2, xmm6
        subps   xmm1, xmm6
        // park the partial results from the even-indexed rows in row slots 9..14
        movaps  dqword ptr [ecx + edi + 00000280H], xmm0
        movaps  dqword ptr [ecx + edi + 000002C0H], xmm7
        movaps  dqword ptr [ecx + edi + 00000300H], xmm1
        movaps  dqword ptr [ecx + edi + 00000340H], xmm2
        movaps  dqword ptr [ecx + edi + 00000380H], xmm5
        movaps  dqword ptr [ecx + edi + 000003C0H], xmm4
        // (row0 + row4) +/- (row2 + row6) -> slots 15 and 8
        movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, xmm0
        addps   xmm0, xmm3
        subps   xmm1, xmm3
        movaps  dqword ptr [ecx + edi + 00000400H], xmm0
        movaps  dqword ptr [ecx + edi + 00000240H], xmm1
        // rows 1 and 7, scaled by the constants at TBLOffset+050H / +080H
        movaps  xmm0, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+050H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+080H]
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        movaps  xmm5, xmm0
        addps   xmm0, xmm1
        subps   xmm5, xmm1
        // row1 + row7 is parked in the row-0 slot (its previous content was consumed)
        movaps  dqword ptr [ecx + edi + 40H], xmm2
        // rows 3 and 5, scaled by the constants at TBLOffset+060H / +070H
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm2, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm3, xmm1
        movaps  xmm6, xmm2
        mulps   xmm1, dqword ptr [ebp+TBLOffset+060H]
        mulps   xmm2, dqword ptr [ebp+TBLOffset+070H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm1
        addps   xmm1, xmm2
        subps   xmm6, xmm2
        movaps  xmm2, xmm4
        addps   xmm4, xmm1
        subps   xmm2, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        movaps  xmm0, xmm5
        addps   xmm5, xmm6
        subps   xmm0, xmm6
        // combine (row3 + row5) with the parked (row1 + row7)
        movaps  xmm6, xmm3
        addps   xmm3, dqword ptr [ecx + edi + 40H]
        subps   xmm6, dqword ptr [ecx + edi + 40H]
        movaps  dqword ptr [ecx + edi + 40H], xmm3
        movaps  xmm3, xmm5
        addps   xmm5, xmm6
        subps   xmm3, xmm6
        mulps   xmm5, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm3, dqword ptr [ebp+TBLOffset+0A0H]
        // two terms from the odd-indexed rows are parked in slots 4 and 3
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000100H], xmm3
        // cross-multiply pairs by the constant at TBLOffset+030H, then scale all
        // four results by the constant at TBLOffset+090H
        movaps  xmm3, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm5, xmm4
        mulps   xmm4, xmm3
        movaps  xmm6, xmm7
        mulps   xmm7, xmm3
        subps   xmm4, xmm6
        addps   xmm5, xmm7
        movaps  xmm6, xmm1
        mulps   xmm1, xmm3
        movaps  xmm7, xmm2
        mulps   xmm2, xmm3
        addps   xmm7, xmm1
        movaps  xmm3, dqword ptr [ebp+TBLOffset+090H]
        subps   xmm2, xmm6
        mulps   xmm5, xmm3
        mulps   xmm7, xmm3
        mulps   xmm2, xmm3
        mulps   xmm4, xmm3
        // ---- final combination: parked even-row terms +/- odd-row terms ----
        // output rows 0 and 15
        movaps  xmm1, dqword ptr [ecx + edi + 40H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000400H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm1
        subps   xmm6, xmm1
        movaps  dqword ptr [ecx + edi + 40H], xmm3
        movaps  dqword ptr [ecx + edi + 00000400H], xmm6
        // output rows 1, 2, 13 and 14
        movaps  xmm1, dqword ptr [ecx + edi + 00000380H]
        movaps  xmm3, dqword ptr [ecx + edi + 000003C0H]
        movaps  xmm6, xmm1
        addps   xmm1, xmm7
        subps   xmm6, xmm7
        movaps  xmm7, xmm3
        addps   xmm3, xmm5
        subps   xmm7, xmm5
        movaps  dqword ptr [ecx + edi + 00000080H], xmm3
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm1
        movaps  dqword ptr [ecx + edi + 00000380H], xmm6
        movaps  dqword ptr [ecx + edi + 000003C0H], xmm7
        // output rows 3, 4, 11 and 12
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000300H]
        movaps  xmm6, dqword ptr [ecx + edi + 00000340H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm3
        addps   xmm7, xmm3
        movaps  xmm3, xmm6
        addps   xmm6, xmm1
        subps   xmm3, xmm1
        movaps  dqword ptr [ecx + edi + 00000100H], xmm6
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000300H], xmm7
        movaps  dqword ptr [ecx + edi + 00000340H], xmm3
        // output rows 5..10
        movaps  xmm1, dqword ptr [ecx + edi + 00000240H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000280H]
        movaps  xmm5, dqword ptr [ecx + edi + 000002C0H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm4
        subps   xmm6, xmm4
        movaps  xmm4, xmm5
        addps   xmm5, xmm2
        subps   xmm4, xmm2
        movaps  xmm7, xmm1
        addps   xmm1, xmm0
        subps   xmm7, xmm0
        movaps  dqword ptr [ecx + edi + 00000180H], xmm5
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm1
        movaps  dqword ptr [ecx + edi + 00000240H], xmm7
        movaps  dqword ptr [ecx + edi + 00000280H], xmm6
        movaps  dqword ptr [ecx + edi + 000002C0H], xmm4
        // next group of 4 columns; loop ends when ecx reaches 0
        add     ecx, 16
        jne     @@012
        // step the output pointer past the 16x16 float block just written
        add     edi, 1024
        // leave through the address stored at [ebp+0FCH] (no ret is executed)
        jmp     dword ptr [ebp + 000000FCH]
end;

procedure _IDCT_8x16_SSE;
// SSE kernel that turns one block of 64 32-bit integers into a block of
// 16 rows of 8 single-precision floats (512 bytes) at [edi].
//
// Register usage, as visible in the code below:
//   esp - 64 signed dwords are read from [esp .. esp+0FFH], 8 per row
//   eax - 64 floats at [eax .. eax+0FFH], multiplied element-wise with the
//         converted input (presumably the dequantization/scale table - confirm
//         against the caller)
//   ebp - base of a context/table area: a table of 8 pointers at
//         ebp+TBLOffset+0B0H, 16-byte constant vectors at ebp+TBLOffset+020H..0A0H
//         and a jump target at [ebp+0FCH]; ebp itself is not modified
//   edi - output pointer; advanced by 512 on exit
// Clobbers ecx, edx and xmm0-xmm7.
// All movaps/mulps memory operands must be 16-byte aligned.
//
// NOTE(review): the routine reads its input directly at [esp] and leaves through
// an indirect jmp instead of ret, so it looks like it is reached by a jump from
// a dispatcher rather than by a normal call - confirm against the caller.
asm
        // ---- Pass 1: each input row of 8 values -> one output row of 8 floats ----
        // ecx runs -32..-4 step 4 (8 iterations); scaled by 8 it is the byte
        // offset of the current row in both input and output (32 bytes per row)
        mov     ecx, -32
        // edx = per-row coefficient table pointer
@@013:  mov     edx, dword ptr [ebp+TBLOffset+0B0H + 00000020H + ecx]
        // convert 8 signed dwords to floats, two at a time, then merge pairs:
        // xmm0 = x0..x3, xmm1 = x4..x7
        cvtpi2ps xmm0, qword ptr [esp + ecx*8 + 00000100H]
        cvtpi2ps xmm2, qword ptr [esp + ecx*8 + 00000108H]
        cvtpi2ps xmm1, qword ptr [esp + ecx*8 + 00000110H]
        cvtpi2ps xmm3, qword ptr [esp + ecx*8 + 00000118H]
        shufps  xmm0, xmm2, 68
        shufps  xmm1, xmm3, 68
        // element-wise multiply with the table at eax
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // broadcast x0, x2, x4, x6 (shufps 0 = lane 0, -86 = lane 2)
        movaps  xmm2, xmm0
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm1
        shufps  xmm2, xmm2, 0
        shufps  xmm3, xmm3, -86
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, -86
        // xmm2 = x0*[edx] + x2*[edx+10H] + x4*[edx+20H] + x6*[edx+30H]
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        addps   xmm2, xmm4
        // broadcast x1, x3, x5, x7 (shufps 85 = lane 1, -1 = lane 3)
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        shufps  xmm3, xmm3, 85
        shufps  xmm0, xmm0, -1
        shufps  xmm4, xmm4, 85
        shufps  xmm1, xmm1, -1
        // xmm3 = x1*[edx+40H] + x3*[edx+50H] + x5*[edx+60H] + x7*[edx+70H]
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4
        // sum -> floats 0..3 of the output row; difference with its lanes
        // reversed (shufps 27) -> floats 4..7
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [edi + ecx*8 + 00000100H], xmm2
        movaps  dqword ptr [edi + ecx*8 + 00000110H], xmm4
        // next row; loop ends when ecx reaches 0
        add     ecx, 4
        jne     @@013
        // ---- Pass 2: columns. 2 iterations, 4 columns (one XMM) per iteration ----
        // ecx runs -32..-16 step 16; [ecx + edi + 20H*(k+1)] addresses row k of the
        // current column group (row stride 20H). Rows 0..7 hold the pass-1 results;
        // rows 8..15 and already-consumed row slots are used as scratch before the
        // final 16 rows are written.
        mov     ecx, -32
        // rows 0 and 4; row 4 is also scaled by the constant at TBLOffset+030H
@@014:  movaps  xmm0, dqword ptr [ecx + edi + 20H]
        movaps  xmm1, dqword ptr [ecx + edi + 000000A0H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm4, xmm0
        movaps  xmm5, xmm0
        addps   xmm0, xmm3
        subps   xmm2, xmm3
        addps   xmm4, xmm1
        subps   xmm5, xmm1
        // row0 + row4 is parked in the row-0 slot
        movaps  dqword ptr [ecx + edi + 20H], xmm0
        // rows 2 and 6, scaled by the constants at TBLOffset+020H / +040H
        movaps  xmm0, dqword ptr [ecx + edi + 60H]
        movaps  xmm1, dqword ptr [ecx + edi + 000000E0H]
        movaps  xmm3, xmm0
        movaps  xmm6, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm1
        subps   xmm6, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        // both butterfly outputs are scaled by the constant at TBLOffset+0A0H
        movaps  xmm0, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm7, xmm0
        mulps   xmm1, xmm0
        movaps  xmm0, xmm4
        addps   xmm4, xmm7
        subps   xmm0, xmm7
        movaps  xmm7, xmm5
        addps   xmm5, xmm1
        subps   xmm7, xmm1
        movaps  xmm1, xmm2
        addps   xmm2, xmm6
        subps   xmm1, xmm6
        // park the partial results from the even-indexed rows in row slots 9..14
        movaps  dqword ptr [ecx + edi + 00000140H], xmm0
        movaps  dqword ptr [ecx + edi + 00000160H], xmm7
        movaps  dqword ptr [ecx + edi + 00000180H], xmm1
        movaps  dqword ptr [ecx + edi + 000001A0H], xmm2
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm5
        movaps  dqword ptr [ecx + edi + 000001E0H], xmm4
        // (row0 + row4) +/- (row2 + row6) -> slots 15 and 8
        movaps  xmm0, dqword ptr [ecx + edi + 20H]
        movaps  xmm1, xmm0
        addps   xmm0, xmm3
        subps   xmm1, xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm0
        movaps  dqword ptr [ecx + edi + 00000120H], xmm1
        // rows 1 and 7, scaled by the constants at TBLOffset+050H / +080H
        movaps  xmm0, dqword ptr [ecx + edi + 40H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm2, xmm0
        movaps  xmm3, xmm1
        mulps   xmm0, dqword ptr [ebp+TBLOffset+050H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+080H]
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        movaps  xmm5, xmm0
        addps   xmm0, xmm1
        subps   xmm5, xmm1
        // row1 + row7 is parked in the row-0 slot (its previous content was consumed)
        movaps  dqword ptr [ecx + edi + 20H], xmm2
        // rows 3 and 5, scaled by the constants at TBLOffset+060H / +070H
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm2, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm3, xmm1
        movaps  xmm6, xmm2
        mulps   xmm1, dqword ptr [ebp+TBLOffset+060H]
        mulps   xmm2, dqword ptr [ebp+TBLOffset+070H]
        movaps  xmm7, xmm3
        addps   xmm3, xmm6
        subps   xmm7, xmm6
        movaps  xmm6, xmm1
        addps   xmm1, xmm2
        subps   xmm6, xmm2
        movaps  xmm2, xmm4
        addps   xmm4, xmm1
        subps   xmm2, xmm1
        movaps  xmm1, xmm7
        addps   xmm7, xmm0
        subps   xmm1, xmm0
        movaps  xmm0, xmm5
        addps   xmm5, xmm6
        subps   xmm0, xmm6
        // combine (row3 + row5) with the parked (row1 + row7)
        movaps  xmm6, xmm3
        addps   xmm3, dqword ptr [ecx + edi + 20H]
        subps   xmm6, dqword ptr [ecx + edi + 20H]
        movaps  dqword ptr [ecx + edi + 20H], xmm3
        movaps  xmm3, xmm5
        addps   xmm5, xmm6
        subps   xmm3, xmm6
        mulps   xmm5, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm3, dqword ptr [ebp+TBLOffset+0A0H]
        // two terms from the odd-indexed rows are parked in slots 4 and 3
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5
        movaps  dqword ptr [ecx + edi + 00000080H], xmm3
        // cross-multiply pairs by the constant at TBLOffset+030H, then scale all
        // four results by the constant at TBLOffset+090H
        movaps  xmm3, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm5, xmm4
        mulps   xmm4, xmm3
        movaps  xmm6, xmm7
        mulps   xmm7, xmm3
        subps   xmm4, xmm6
        addps   xmm5, xmm7
        movaps  xmm6, xmm1
        mulps   xmm1, xmm3
        movaps  xmm7, xmm2
        mulps   xmm2, xmm3
        addps   xmm7, xmm1
        movaps  xmm3, dqword ptr [ebp+TBLOffset+090H]
        subps   xmm2, xmm6
        mulps   xmm5, xmm3
        mulps   xmm7, xmm3
        mulps   xmm2, xmm3
        mulps   xmm4, xmm3
        // ---- final combination: parked even-row terms +/- odd-row terms ----
        // output rows 0 and 15
        movaps  xmm1, dqword ptr [ecx + edi + 20H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm1
        subps   xmm6, xmm1
        movaps  dqword ptr [ecx + edi + 20H], xmm3
        movaps  dqword ptr [ecx + edi + 00000200H], xmm6
        // output rows 1, 2, 13 and 14
        movaps  xmm1, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm3, dqword ptr [ecx + edi + 000001E0H]
        movaps  xmm6, xmm1
        addps   xmm1, xmm7
        subps   xmm6, xmm7
        movaps  xmm7, xmm3
        addps   xmm3, xmm5
        subps   xmm7, xmm5
        movaps  dqword ptr [ecx + edi + 40H], xmm3
        movaps  dqword ptr [ecx + edi + 60H], xmm1
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm6
        movaps  dqword ptr [ecx + edi + 000001E0H], xmm7
        // output rows 3, 4, 11 and 12
        movaps  xmm1, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm3, dqword ptr [ecx + edi + 000000A0H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm6, dqword ptr [ecx + edi + 000001A0H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm3
        addps   xmm7, xmm3
        movaps  xmm3, xmm6
        addps   xmm6, xmm1
        subps   xmm3, xmm1
        movaps  dqword ptr [ecx + edi + 00000080H], xmm6
        movaps  dqword ptr [ecx + edi + 000000A0H], xmm5
        movaps  dqword ptr [ecx + edi + 00000180H], xmm7
        movaps  dqword ptr [ecx + edi + 000001A0H], xmm3
        // output rows 5..10
        movaps  xmm1, dqword ptr [ecx + edi + 00000120H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm5, dqword ptr [ecx + edi + 00000160H]
        movaps  xmm6, xmm3
        addps   xmm3, xmm4
        subps   xmm6, xmm4
        movaps  xmm4, xmm5
        addps   xmm5, xmm2
        subps   xmm4, xmm2
        movaps  xmm7, xmm1
        addps   xmm1, xmm0
        subps   xmm7, xmm0
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm5
        movaps  dqword ptr [ecx + edi + 000000E0H], xmm3
        movaps  dqword ptr [ecx + edi + 00000100H], xmm1
        movaps  dqword ptr [ecx + edi + 00000120H], xmm7
        movaps  dqword ptr [ecx + edi + 00000140H], xmm6
        movaps  dqword ptr [ecx + edi + 00000160H], xmm4
        // next group of 4 columns; loop ends when ecx reaches 0
        add     ecx, 16
        jne     @@014
        // step the output pointer past the 16-row by 8-float block just written
        add     edi, 512
        // leave through the address stored at [ebp+0FCH] (no ret is executed)
        jmp     dword ptr [ebp + 000000FCH]
end;

// _IDCT_16x8_SSE
// Single-precision SSE transform kernel: turns 8 rows of 8 integer
// coefficients into 8 rows of 16 floats (512 bytes) at [edi], in two passes.
//
// Register usage as visible in this routine:
//   esp - 64 dwords of integer input are read from [esp .. esp+0FFH]
//         (NOTE(review): no return address is skipped, so this routine appears
//          to be entered by a jump rather than a call - confirm at the caller)
//   eax - base of 64 float multipliers applied element-wise to the input
//         (presumably the dequantization table - TODO confirm)
//   edi - output buffer; advanced by 512 on exit
//   ebp - decoder context; parked in mm1 while ebp is reused as a counter
// Exit is a jump through the pointer stored at [ebp + 0FCH].
// Clobbers ecx, edx, mm1, xmm0-xmm7.
procedure _IDCT_16x8_SSE;
asm
        // Save the context pointer: ebp doubles as the output offset below.
        movd    mm1, ebp
        // Pass 1: 8 iterations (ecx = -32..-4 step 4, ebp = -512..-64 step 64).
        mov     ecx, -32
        mov     ebp, -512
@@015:  //mov     edx, dword ptr [ebp+TBLOffset+0D0H + 00000020H + ecx]
        // edx = per-row pointer to a table of 16 four-float vectors, fetched
        // from the context (still held in mm1).
        movd edx,mm1
        mov edx,[edx+TBLOffset+0D0H + 00000020H + ecx]
        // Convert this row's 8 int32 values to float: xmm0 = elements 0..3,
        // xmm1 = elements 4..7 (shufps 68 joins the two low pairs).
        cvtpi2ps xmm0, qword ptr [esp + ecx*8 + 00000100H]
        cvtpi2ps xmm2, qword ptr [esp + ecx*8 + 00000108H]
        cvtpi2ps xmm1, qword ptr [esp + ecx*8 + 00000110H]
        cvtpi2ps xmm3, qword ptr [esp + ecx*8 + 00000118H]
        shufps  xmm0, xmm2, 68
        shufps  xmm1, xmm3, 68
        // Scale by the matching 8 floats of the table at eax.
        mulps   xmm0, dqword ptr [eax + ecx*8 + 00000100H]
        mulps   xmm1, dqword ptr [eax + ecx*8 + 00000110H]
        // Broadcast elements 0,2,4,6 (shufps 0 / -86 = element 0 / element 2).
        movaps  xmm2, xmm0
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm1
        shufps  xmm2, xmm2, 0
        shufps  xmm3, xmm3, -86
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, -86
        // Keep unmodified copies of both halves for the second output group.
        movaps  xmm6, xmm0
        movaps  xmm7, xmm1
        // xmm2 = sum of even-indexed elements times table vectors 0..3.
        mulps   xmm2, dqword ptr [edx]
        mulps   xmm3, dqword ptr [edx + 10H]
        mulps   xmm4, dqword ptr [edx + 20H]
        mulps   xmm5, dqword ptr [edx + 30H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        addps   xmm2, xmm4
        // Broadcast elements 1,3,5,7 (shufps 85 / -1 = element 1 / element 3).
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        shufps  xmm3, xmm3, 85
        shufps  xmm0, xmm0, -1
        shufps  xmm4, xmm4, 85
        shufps  xmm1, xmm1, -1
        // xmm3 = sum of odd-indexed elements times table vectors 4..7.
        mulps   xmm3, dqword ptr [edx + 40H]
        mulps   xmm0, dqword ptr [edx + 50H]
        mulps   xmm4, dqword ptr [edx + 60H]
        mulps   xmm1, dqword ptr [edx + 70H]
        addps   xmm3, xmm0
        addps   xmm4, xmm1
        addps   xmm3, xmm4
        // Outputs 0..3 = even + odd; outputs 12..15 = (even - odd) in
        // reversed element order (shufps 27).
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000200H], xmm2
        movaps  dqword ptr [ebp + edi + 00000230H], xmm4
        // Same scheme with table vectors 8..15 for outputs 4..7 and 8..11.
        movaps  xmm2, xmm6
        movaps  xmm3, xmm6
        movaps  xmm4, xmm7
        movaps  xmm5, xmm7
        shufps  xmm2, xmm2, 0
        shufps  xmm3, xmm3, -86
        shufps  xmm4, xmm4, 0
        shufps  xmm5, xmm5, -86
        mulps   xmm2, dqword ptr [edx + 00000080H]
        mulps   xmm3, dqword ptr [edx + 00000090H]
        mulps   xmm4, dqword ptr [edx + 000000A0H]
        mulps   xmm5, dqword ptr [edx + 000000B0H]
        addps   xmm2, xmm3
        addps   xmm4, xmm5
        addps   xmm2, xmm4
        movaps  xmm3, xmm6
        movaps  xmm4, xmm7
        shufps  xmm3, xmm3, 85
        shufps  xmm6, xmm6, -1
        shufps  xmm4, xmm4, 85
        shufps  xmm7, xmm7, -1
        mulps   xmm3, dqword ptr [edx + 000000C0H]
        mulps   xmm6, dqword ptr [edx + 000000D0H]
        mulps   xmm4, dqword ptr [edx + 000000E0H]
        mulps   xmm7, dqword ptr [edx + 000000F0H]
        addps   xmm3, xmm6
        addps   xmm4, xmm7
        addps   xmm3, xmm4
        movaps  xmm4, xmm2
        addps   xmm2, xmm3
        subps   xmm4, xmm3
        shufps  xmm4, xmm4, 27
        movaps  dqword ptr [ebp + edi + 00000210H], xmm2
        movaps  dqword ptr [ebp + edi + 00000220H], xmm4
        // Next row: 64 output bytes, 4 pointer-table bytes / 32 input bytes.
        add     ebp, 64
        add     ecx, 4
        jne     @@015
        // Pass 2: 4 iterations (ecx = -64..-16 step 16), each combining one
        // group of four floats across the eight 64-byte rows in place. The
        // rows sit at [ecx + edi + 40H + k*40H], k = 0..7. Constants come from
        // the context at TBLOffset+020H, +030H, +040H and +0A0H.
        mov     ecx, -64
        // Restore the context pointer saved at entry.
        movd ebp,mm1
@@016:  movaps  xmm0, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm1, dqword ptr [ecx + edi + 00000200H]
        movaps  xmm2, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+020H]
        movaps  xmm3, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+020H]
        subps   xmm0, xmm3
        addps   xmm2, xmm1
        movaps  xmm1, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm3, dqword ptr [ecx + edi + 00000180H]
        movaps  xmm4, xmm1
        mulps   xmm1, dqword ptr [ebp+TBLOffset+040H]
        movaps  xmm5, xmm3
        mulps   xmm3, dqword ptr [ebp+TBLOffset+040H]
        subps   xmm5, xmm1
        addps   xmm3, xmm4
        movaps  xmm1, xmm2
        movaps  xmm4, xmm0
        addps   xmm2, xmm3
        subps   xmm1, xmm3
        addps   xmm0, xmm5
        subps   xmm4, xmm5
        movaps  xmm3, xmm4
        addps   xmm4, xmm1
        subps   xmm1, xmm3
        mulps   xmm4, dqword ptr [ebp+TBLOffset+0A0H]
        mulps   xmm1, dqword ptr [ebp+TBLOffset+0A0H]
        // Rows at +80H and +100H are reused as scratch for intermediate
        // values that are reloaded further down (xmm registers are exhausted).
        movaps  dqword ptr [ecx + edi + 00000080H], xmm4
        movaps  dqword ptr [ecx + edi + 00000100H], xmm0
        movaps  xmm5, dqword ptr [ecx + edi + 00000140H]
        movaps  xmm3, dqword ptr [ecx + edi + 40H]
        movaps  xmm6, xmm5
        addps   xmm5, xmm3
        subps   xmm3, xmm6
        movaps  xmm0, dqword ptr [ecx + edi + 000000C0H]
        movaps  xmm4, dqword ptr [ecx + edi + 000001C0H]
        movaps  xmm6, xmm0
        mulps   xmm0, dqword ptr [ebp+TBLOffset+030H]
        movaps  xmm7, xmm4
        mulps   xmm4, dqword ptr [ebp+TBLOffset+030H]
        subps   xmm0, xmm7
        addps   xmm4, xmm6
        movaps  xmm6, xmm0
        addps   xmm0, xmm3
        subps   xmm3, xmm6
        movaps  xmm6, xmm4
        addps   xmm4, xmm5
        subps   xmm5, xmm6
        movaps  xmm6, xmm2
        addps   xmm2, xmm4
        subps   xmm4, xmm6
        movaps  xmm7, xmm1
        addps   xmm1, xmm3
        subps   xmm3, xmm7
        // Reload the intermediate parked at +100H above.
        movaps  xmm6, dqword ptr [ecx + edi + 00000100H]
        movaps  xmm7, xmm5
        subps   xmm5, xmm6
        addps   xmm7, xmm6
        movaps  dqword ptr [ecx + edi + 40H], xmm2
        movaps  dqword ptr [ecx + edi + 00000200H], xmm4
        movaps  dqword ptr [ecx + edi + 000000C0H], xmm1
        movaps  dqword ptr [ecx + edi + 00000180H], xmm3
        movaps  dqword ptr [ecx + edi + 00000140H], xmm5
        movaps  dqword ptr [ecx + edi + 00000100H], xmm7
        // Reload the intermediate parked at +80H above.
        movaps  xmm2, dqword ptr [ecx + edi + 00000080H]
        movaps  xmm3, xmm0
        addps   xmm0, xmm2
        subps   xmm3, xmm2
        movaps  dqword ptr [ecx + edi + 00000080H], xmm0
        movaps  dqword ptr [ecx + edi + 000001C0H], xmm3
        add     ecx, 16
        jne     @@016
        // Step past the 512 bytes just produced and continue at the address
        // stored in the context.
        add     edi, 512
        jmp     dword ptr [ebp + 000000FCH]
end;

// _IDCT_Init
// Selects the set of IDCT routines for the detected CPU level.
//   In:  eax - CPU level index; _JPG_InitDecoder calls this with 2, 3 or 4
//        ebp - decoder context
//   Out: [ebp + 0F8H] = address of a 4-entry table of IDCT routine pointers
//        (entries in order: 8x16, 16x8, 16x16, 8x8)
// Clobbers ecx.
procedure _IDCT_Init;
asm
        // The "- 8" bias makes eax = 2 select the first @TBL_IDCT entry.
        mov     ecx, dword ptr [@TBL_IDCT - 00000008H + eax*4]
        mov     dword ptr [ebp + 000000F8H], ecx
        ret
        // Padding before the inline data (presumably for alignment).
        nop; nop
// Level -> routine table: 2 = SSE set, 3 and 4 = SSE2 set; the remaining
// three slots are nil.
@TBL_IDCT:
        dd offset @TBL_IDCT_SSE
        dd offset @TBL_IDCT_SSE2
        dd offset @TBL_IDCT_SSE2
        dd 00000000H
        dd 00000000H
        dd 00000000H
@TBL_IDCT_SSE2:
        dd offset _IDCT_8x16_SSE2
        dd offset _IDCT_16x8_SSE2
        dd offset _IDCT_16x16_SSE2
        dd offset _IDCT_8x8_SSE2
@TBL_IDCT_SSE:
        dd offset _IDCT_8x16_SSE
        dd offset _IDCT_16x8_SSE
        dd offset _IDCT_16x16_SSE
        dd offset _IDCT_8x8_SSE
end;

// _MEM_Set
// Fills a memory range with a repeated pattern.
//   In:  edx - destination address
//        ecx - number of bytes to fill
//        mm1 - 8-byte fill pattern; single-byte stores use its lowest byte
//   Out: eax - address just past the last byte written
// Clobbers ecx and edx. Does not execute emms.
procedure _MEM_Set;
asm
        mov     eax, edx
        // An unaligned destination is first advanced byte by byte (@@007).
        test    dl, 07H
        jnz     @@007
        // Destination is 8-byte aligned: store whole 32-byte chunks.
@@002:  mov     edx, ecx
        and     ecx, 0FFFFFFE0H
        jz      @@004
        // Point eax past the chunked area and count ecx up from -size to 0.
        add     eax, ecx
        neg     ecx
@@003:  movq    qword ptr [ecx + eax], mm1
        movq    qword ptr [ecx + eax + 08H], mm1
        movq    qword ptr [ecx + eax + 10H], mm1
        movq    qword ptr [ecx + eax + 18H], mm1
        add     ecx, 32
        jl      @@003
        // Remaining 0..31 bytes are written one at a time.
@@004:  and     edx, 0000001FH
        jz      @@006
        add     eax, edx
        movd    ecx, mm1
        neg     edx
@@005:  mov     byte ptr [edx + eax], cl
        inc     edx
        jnz     @@005
@@006:  ret

        // Unaligned head: write bytes until eax is 8-byte aligned (then
        // continue at @@002 with the remaining count) or the count runs out.
@@007:  movd    edx, mm1
        jmp     @@009

@@008:  mov     byte ptr [eax], dl
        inc     eax
        test    al, 07H
        jz      @@002
@@009:  dec     ecx
        jge     @@008
        // Count exhausted before reaching alignment: fall through to return.
end;

// _HUF_GenTable
// Builds Huffman decoding tables from table-definition data.
//   In:  esi - pointer to the table data (one or more tables back to back)
//        ebx - number of data bytes to consume
//        ebp - decoder context
// Each table starts with one header byte, 16 per-length code counts and then
// the symbol bytes. The header selects one of the 704-byte table slots at
// [ebp + 510H + index*704], index = ((hdr shr 4) and 1) + (hdr and 3)*2.
// Slot layout, as written here and read back by _HUF_Decode:
//   +00H  16 dwords: pointer to the first symbol byte of each code length
//                    (these point into the source data at esi, not a copy)
//   +40H  16 dwords: first code of each length
//   +80H  16 dwords: last code of each length, or -1 when the length is unused
//   +0C0H 256 words: direct lookup indexed by the next 8 bits;
//                    entry = (length shl 8) or symbol, 0FFFFH = no short code
// Stack frame (20 bytes): [esp] bytes remaining, [esp+04H] saved ebp,
// [esp+08H] lookup entry, [esp+0CH] symbols left for this length,
// [esp+10H] current code value.
// Clobbers eax, ecx, edx, ebx, esi, edi and mm1; ebp is restored.
// NOTE(review): the per-length counts are not validated against the lookup
// table size before the fill at @@004 - confirm callers reject malformed data.
procedure _HUF_GenTable;
asm
        sub     esp, 20
        mov     dword ptr [esp], ebx
        // Header byte: pick the destination slot.
@@001:  movzx   eax, byte ptr [esi]
        inc     esi
        mov     edx, eax
        shr     eax, 4
        and     edx, 00000003H
        and     eax, 00000001H
        lea     eax, [eax + edx*2]
        imul    eax, eax, 704
        // Account for the header byte just consumed.
        dec     dword ptr [esp]
        lea     ebx, [eax + ebp + 00000510H]
        mov     dword ptr [esp + 04H], ebp
        // Preset all 256 lookup words to 0FFFFH (mm1 = all ones).
        lea     edx, [ebx + 000000C0H]
        mov     ecx, 512
        pcmpeqd mm1, mm1
        call    _MEM_Set
        // ebp = -16..-1 walks code lengths 1..16 (length = ebp + 17).
        mov     ebp, -16
        mov     dword ptr [esp + 10H], 0
        // edi -> first symbol byte, located after the 16 counts.
        lea     edi, [esi + 10H]
        // eax = count for this length minus 1; negative means unused length
        // (the -1 just stored stays as the "last code" marker).
@@002:  movzx   eax, byte ptr [ebp + esi + 10H]
        dec     eax
        mov     dword ptr [ebx + ebp*4 + 000000C0H], eax
        js      @@006
        mov     dword ptr [esp + 0CH], eax
        // Record the first code and the symbol pointer for this length.
        mov     ecx, dword ptr [esp + 10H]
        mov     dword ptr [ebx + ebp*4 + 00000080H], ecx
        mov     dword ptr [ebx + ebp*4 + 40H], edi
        // Lengths above 8 bits are not entered in the direct lookup.
        cmp     ebp, -8
        jge     @@005
        // Lookup entry for this symbol: (length shl 8) or symbol.
@@003:  lea     eax, [ebp + 11H]
        shl     eax, 8
        movzx   ecx, byte ptr [edi]
        or      eax, ecx
        mov     dword ptr [esp + 08H], eax
        // cl = 8 - length: the code occupies the top bits of the 8-bit index,
        // so it covers 1 shl cl consecutive entries starting at code shl cl.
        mov     ecx, -9
        sub     ecx, ebp
        mov     edx, dword ptr [esp + 10H]
        shl     edx, cl
        mov     eax, 1
        shl     eax, cl
        lea     edx, [ebx + edx*2 + 000000C0H]
        mov     cx, word ptr [esp + 08H]
@@004:  mov     word ptr [edx + eax*2 - 02H], cx
        dec     eax
        jnz     @@004
        // Next symbol and next code of the same length.
        inc     edi
        inc     dword ptr [esp + 10H]
        dec     dword ptr [esp + 0CH]
        jns     @@003
        // eax = -1 here, so @@005 computes "code - 1" and leaves edi as is.
        dec     eax
        // Store the last code of this length and step to the code after it.
        // On the long-code path eax = count - 1, which also skips edi over
        // this length's symbols.
@@005:  mov     edx, dword ptr [esp + 10H]
        add     edx, eax
        lea     edi, [eax + edi + 01H]
        mov     dword ptr [ebx + ebp*4 + 000000C0H], edx
        inc     edx
        mov     dword ptr [esp + 10H], edx
        // Moving to the next length appends one bit to the running code.
@@006:  shl     dword ptr [esp + 10H], 1
        inc     ebp
        jne     @@002
        mov     ebp, dword ptr [esp + 04H]
        // remaining -= (edi - esi), i.e. the counts plus symbols consumed.
        sub     dword ptr [esp], edi
        add     dword ptr [esp], esi
        // mov leaves the flags of the add above intact for the jg below.
        mov     esi, edi
        jg      @@001
        add     esp, 20
end;

// _BS_GetBitsECS
// Reads bits from the bitstream, handling 0FFH bytes specially while
// refilling (ECS presumably stands for "entropy-coded segment").
//   In:  eax - number of bits to read
//        ebp - decoder context
//        mm0 - bit buffer, next bit in the most significant position
//   Out: eax - the bits read; mm0 updated
//        CF  - set when the stream cannot supply the requested bits
// Context fields used: [ebp+30H] 8-byte staging buffer, [ebp+38H] number of
// valid bits in mm0, [ebp+40H] read pointer, [ebp+44H] bytes remaining,
// [ebp+TBL64Offset] table of 64-bit shift counts indexed by bit count
// (presumably entry n holds 64 - n - TODO confirm where the table is built).
// Clobbers ecx, edx, mm1-mm4.
procedure _BS_GetBitsECS;
asm
        // Fast path: the buffer already holds enough bits.
        sub     dword ptr [ebp + 38H], eax
        jl      @@006
        movd    mm1, eax
        movq    mm2, mm0
        psrlq   mm2, qword ptr [ebp+TBL64Offset + eax*8]
        psllq   mm0, mm1
        movd    eax, mm2
        ret
        // Refill. The quick 8-byte refill is used only when at least 8 bytes
        // remain, the pointer is 8-byte aligned and none of the next 8 bytes
        // equals 0FFH; anything else goes through the byte loop at @@007.
@@006:  mov     edx, dword ptr [ebp + 40H]
        sub     dword ptr [ebp + 44H], 8
        jl      @@007
        test    edx, 00000007H
        jnz     @@007
        pcmpeqd mm1, mm1
        pcmpeqb mm1, qword ptr [edx]
        pmovmskb ecx, mm1
        test    ecx, ecx
        jnz     @@007
        add     dword ptr [ebp + 40H], 8
        // Load 8 bytes as one big-endian 64-bit value in mm1 (copy in mm2).
        mov     ecx, dword ptr [edx]
        mov     edx, dword ptr [edx + 04H]
        bswap   ecx
        bswap   edx
        movd    mm1, ecx
        movd    mm2, edx
        psllq   mm1, 32
        mov     ecx, dword ptr [ebp + 38H]
        por     mm1, mm2
        add     ecx, 64
        movq    mm2, mm1
        mov     dword ptr [ebp + 38H], ecx
        movq    mm3, qword ptr [ebp+TBL64Offset + ecx*8]
        // ecx = number of bits that were left in mm0 before this request.
        lea     ecx, [eax + ecx - 40H]
        movd    mm4, ecx
        // Append the new data below the old bits, then extract the result.
        psrlq   mm1, mm4
        por     mm0, mm1
        psrlq   mm0, qword ptr [ebp+TBL64Offset + eax*8]
        movd    eax, mm0
        psllq   mm2, mm3
        // sub clears CF: success.
        sub     ecx, ecx
        movq    mm0, mm2
        ret

        // Slow refill through the staging buffer at [ebp+30H].
        // Frame: [esp] requested bits, [esp+04H] saved ebx,
        // [esp+08H] bits that were left in mm0 before this request.
@@007:  sub     esp, 16
        mov     dword ptr [esp + 04H], ebx
        mov     dword ptr [esp], eax
        mov     ecx, dword ptr [ebp + 38H]
        mov     ebx, 7
        add     eax, ecx
        mov     dword ptr [esp + 08H], eax
        // ecx = -(bytes to fetch for the missing bits);
        // ebx = -(bytes up to the next 8-byte boundary of the read pointer).
        // The sbb/and/add sequence keeps whichever of the two is larger in
        // magnitude, so the pointer tends to end up aligned again.
        sub     ecx, ebx
        and     ebx, edx
        sar     ecx, 3
        sub     ebx, 8
        sub     ebx, ecx
        sbb     eax, eax
        and     eax, ebx
        add     ecx, eax
        // Clear the staging buffer; [ecx + ebx] walks it as ecx counts up to 0.
        lea     ebx, [ebp + 30H]
        pxor    mm1, mm1
        movq    qword ptr [ebx], mm1
        sub     ebx, ecx
        // Undo the speculative "remaining -= 8" done at @@006.
        add     dword ptr [ebp + 44H], 8
@@008:  cmp     dword ptr [ebp + 44H], 0
        jz      @@010
        mov     al, byte ptr [edx]
        inc     edx
        mov     byte ptr [ecx + ebx], al
        dec     dword ptr [ebp + 44H]
        // A 0FFH byte followed by 00H is kept as data and the 00H is dropped;
        // 0FFH followed by anything else stops the refill (@@011).
        cmp     al, -1
        jnz     @@009
        cmp     dword ptr [ebp + 44H], 0
        jz      @@010
        mov     al, byte ptr [edx]
        test    al, al
        jnz     @@011
        inc     edx
        dec     dword ptr [ebp + 44H]
@@009:  add     dword ptr [ebp + 38H], 8
        inc     ecx
        jnz     @@008
        // Finish: restore ebx, zero the high dwords of the two stack qwords
        // (the one at [esp+08H] is used as a 64-bit shift count below) and
        // store the read pointer back.
@@010:  sub     ecx, ecx
        mov     ebx, dword ptr [esp + 04H]
        mov     dword ptr [esp + 04H], ecx
        mov     dword ptr [esp + 0CH], ecx
        mov     dword ptr [ebp + 40H], edx
        // Still short of bits after the refill: fail.
        cmp     dword ptr [ebp + 38H], ecx
        jl      @@012
        // Merge the staged bytes (big-endian) into the bit buffer.
        mov     eax, dword ptr [ebp + 30H]
        mov     ecx, dword ptr [ebp + 34H]
        bswap   eax
        bswap   ecx
        movd    mm1, eax
        movd    mm2, ecx
        psllq   mm1, 32
        por     mm1, mm2
        movq    mm2, mm1
        psrlq   mm1, qword ptr [esp + 08H]
        por     mm0, mm1
        mov     eax, dword ptr [esp]
        psrlq   mm0, qword ptr [ebp+TBL64Offset + eax*8]
        // mm3 = bits of the request that were taken from the new data.
        sub     eax, dword ptr [esp + 08H]
        movd    mm3, eax
        psllq   mm2, mm3
        movd    eax, mm0
        movq    mm0, mm2
        add     esp, 16
        // sub clears CF: success.
        sub     ecx, ecx
        ret

        // 0FFH followed by a non-zero byte: step back so the 0FFH is read
        // again later, and replace its staged copy with 0.
@@011:  dec     edx
        inc     dword ptr [ebp + 44H]
        mov     byte ptr [ecx + ebx], 0
        jmp     @@010

        // Failure: mark the bit buffer empty and return with CF set.
@@012:  mov     dword ptr [ebp + 38H], 0
        add     esp, 16
        stc
end;

// _HUF_Decode
// Decodes one Huffman symbol from the bitstream.
//   In:  eax - Huffman table slot as built by _HUF_GenTable
//              (+00H symbol pointers, +40H first codes, +80H last codes,
//               +0C0H 256-word direct lookup)
//        ebp - decoder context ([ebp+38H] = valid bits in mm0)
//        mm0 - bit buffer, next bit in the most significant position
//   Out: eax - decoded symbol (0..255)
//        CF  - set on failure: the bitstream ran out, or no code matched
//              within 16 bits (eax is then not a valid symbol)
// Clobbers ecx, edx and the MMX registers used by _BS_GetBitsECS.
procedure _HUF_Decode;
asm
        // Fast path: with at least 8 bits buffered, look the top 8 bits up
        // directly. 0FFFFH marks "no code of 8 bits or fewer starts here".
        cmp     dword ptr [ebp + 38H], 8
        jl      @@007
        movq    mm2, mm0
        psrlq   mm2, 56
        movd    ecx, mm2
        movzx   ecx, word ptr [eax + ecx*2 + 000000C0H]
        cmp     ecx, 65535
        jz      @@007
        // Entry = (length shl 8) or symbol: return the symbol and consume
        // "length" bits from the buffer.
        mov     eax, ecx
        shr     ecx, 8
        movd    mm1, ecx
        and     eax, 000000FFH
        sub     dword ptr [ebp + 38H], ecx
        psllq   mm0, mm1
        ret

        // Slow path: extend the code one bit at a time.
        // ebx = table slot, esi = code length - 1, edi = code read so far.
@@007:  push    ebx
        push    esi
        push    edi
        mov     ebx, eax
        mov     esi, -1
        sub     edi, edi
@@008:  inc     esi
        // Codes are at most 16 bits long and the per-length arrays have only
        // 16 entries. Without this bound a corrupt stream would keep indexing
        // past them and eventually dereference a garbage symbol pointer.
        cmp     esi, 16
        jae     @@010
        mov     eax, 1
        call    _BS_GetBitsECS
        jc      @@009
        lea     edi, [eax + edi*2]
        // Keep reading while the code exceeds the last code of this length
        // (unused lengths hold -1, so they never match).
        cmp     edi, dword ptr [ebx + esi*4 + 00000080H]
        jg      @@008
        // symbol = symbols_of_this_length[code - first_code_of_this_length]
        mov     eax, dword ptr [ebx + esi*4]
        sub     edi, dword ptr [ebx + esi*4 + 40H]
        movzx   eax, byte ptr [edi + eax]
        // pop does not modify flags, so CF reaches the caller unchanged.
@@009:  pop     edi
        pop     esi
        pop     ebx
        ret

        // Invalid code: report it like a bitstream underrun (CF = 1).
@@010:  stc
        jmp     @@009
end;

// _JPG_InitDecoder
// Detects the CPU's SIMD level and installs the matching routines.
//   In:  eax - value stored unchanged at [ebp + 28H]
//        ebp - decoder context
//   Out: on success [ebp + 2CH] = CPU level (2, 3 or 4) and eax is whatever
//        _RGB_Init leaves in it;
//        on failure byte [ebp + 1CH] = 3 and eax = 0.
// CPU level, from the CPUID leaf 1 feature flags (bit meanings per the Intel
// CPUID documentation): 4 = ECX bit 0 (SSE3), 3 = EDX bit 26 (SSE2),
// 2 = EDX bit 25 (SSE). Without any of these the level ends up 0 or 1
// (the value of EDX bit 24), which is rejected.
// Clobbers ecx and edx; ebx is preserved around cpuid.
procedure _JPG_InitDecoder;
asm
        mov     dword ptr [ebp + 28H], eax
        // cpuid overwrites ebx.
        push    ebx
        mov     eax, 1
        cpuid
        mov     eax, 4
        // CF = ECX bit 0.
        shr     ecx, 1
        jc      @000
        dec     eax
        // CF = EDX bit 26.
        shl     edx, 6
        jc      @000
        dec     eax
        // CF = EDX bit 25.
        shl     edx, 1
        jc      @000
        // CF = EDX bit 24; eax = 2 - 2 + CF.
        shl     edx, 1
        adc     eax, -2
@000:   pop     ebx
        cmp     eax, 2
        jl      @@002
        mov     dword ptr [ebp + 2CH], eax
        call    _IDCT_Init
        call    _RGB_Init
@@001:  ret

        // Required instruction set not available.
@@002:  mov     byte ptr [ebp + 1CH], 3
        sub     eax, eax
end;

// _BS_GetBits
// Reads bits from the bitstream without any special treatment of 0FFH bytes
// (compare _BS_GetBitsECS).
//   In:  eax - number of bits to read
//        ebp - decoder context
//        mm0 - bit buffer, next bit in the most significant position
//   Out: eax - the bits read; mm0 updated
//        CF  - set when the stream cannot supply the requested bits
// Context fields used: [ebp+30H] 8-byte staging buffer, [ebp+38H] number of
// valid bits in mm0, [ebp+40H] read pointer, [ebp+44H] bytes remaining,
// [ebp+48H] end-of-data pointer, [ebp+TBL64Offset] table of 64-bit shift
// counts indexed by bit count (presumably entry n holds 64 - n - TODO confirm).
// Clobbers ecx, edx, mm1-mm4.
procedure _BS_GetBits;
asm
        // Fast path: the buffer already holds enough bits.
        sub     dword ptr [ebp + 38H], eax
        jl      @@001
        movd    mm1, eax
        movq    mm2, mm0
        psrlq   mm2, qword ptr [ebp+TBL64Offset + eax*8]
        psllq   mm0, mm1
        movd    eax, mm2
        ret

        // Refill with 8 bytes when at least 8 remain; otherwise take the
        // end-of-data path at @@002.
@@001:  mov     edx, dword ptr [ebp + 40H]
        sub     dword ptr [ebp + 44H], 8
        jl      @@002
        add     dword ptr [ebp + 40H], 8
        // Load 8 bytes as one big-endian 64-bit value in mm1 (copy in mm2).
        mov     ecx, dword ptr [edx]
        mov     edx, dword ptr [edx + 04H]
        bswap   ecx
        bswap   edx
        movd    mm1, ecx
        movd    mm2, edx
        psllq   mm1, 32
        mov     ecx, dword ptr [ebp + 38H]
        por     mm1, mm2
        add     ecx, 64
        movq    mm2, mm1
        mov     dword ptr [ebp + 38H], ecx
        movq    mm3, qword ptr [ebp+TBL64Offset + ecx*8]
        // ecx = number of bits that were left in mm0 before this request.
        lea     ecx, [eax + ecx - 40H]
        movd    mm4, ecx
        // Append the new data below the old bits, then extract the result.
        psrlq   mm1, mm4
        por     mm0, mm1
        psrlq   mm0, qword ptr [ebp+TBL64Offset + eax*8]
        movd    eax, mm0
        psllq   mm2, mm3
        // sub clears CF: success.
        sub     ecx, ecx
        movq    mm0, mm2
        ret

        // Fewer than 8 bytes left. Frame (zeroed first so the qword at
        // [esp+08H] can serve as a 64-bit shift count): [esp] requested bits,
        // [esp+08H] bits that were left in mm0 before this request.
@@002:  sub     esp, 16
        pxor    mm1, mm1
        movq    qword ptr [esp], mm1
        movq    qword ptr [esp + 08H], mm1
        mov     dword ptr [esp], eax
        mov     ecx, dword ptr [ebp + 38H]
        add     ecx, eax
        mov     dword ptr [esp + 08H], ecx
        sub     ecx, ecx
        // Copy every byte up to the end-of-data pointer into the staging
        // buffer; ecx counts the bytes copied.
@@003:  cmp     edx, dword ptr [ebp + 48H]
        jnc     @@004
        mov     al, byte ptr [edx]
        mov     byte ptr [ecx + ebp + 30H], al
        inc     ecx
        inc     edx
        jmp     @@003

        // Input is now exhausted; credit the copied bytes as bits.
@@004:  mov     dword ptr [ebp + 44H], 0
        mov     dword ptr [ebp + 40H], edx
        shl     ecx, 3
        add     dword ptr [ebp + 38H], ecx
        // Still short of bits: fail.
        jl      @@005
        // Merge the staged bytes (big-endian) into the bit buffer.
        mov     eax, dword ptr [ebp + 30H]
        mov     ecx, dword ptr [ebp + 34H]
        bswap   eax
        bswap   ecx
        movd    mm1, eax
        movd    mm2, ecx
        psllq   mm1, 32
        por     mm1, mm2
        movq    mm2, mm1
        psrlq   mm1, qword ptr [esp + 08H]
        por     mm0, mm1
        mov     eax, dword ptr [esp]
        psrlq   mm0, qword ptr [ebp+TBL64Offset + eax*8]
        // mm3 = bits of the request that were taken from the new data.
        sub     eax, dword ptr [esp + 08H]
        movd    mm3, eax
        psllq   mm2, mm3
        movd    eax, mm0
        movq    mm0, mm2
        add     esp, 16
        // sub clears CF: success.
        sub     ecx, ecx
        ret

        // Failure: mark the bit buffer empty and return with CF set.
@@005:  mov     dword ptr [ebp + 38H], 0
        add     esp, 16
        stc
end;

// _JPG_GetMarker
// Scans the stream for the next marker: a 0FFH byte, any number of further
// 0FFH bytes, then a code byte above 0BFH.
//   In:  ebp - decoder context
//   Out: success: [ebp + 24H] = marker code, al = 1
//        stream exhausted: byte [ebp + 1CH] = 0, eax = 0
// The stored code is normalized: every code from 0E0H upwards is stored as
// 0E0H, and codes 0D0H..0D7H are stored as 0D0H. All compares below are
// unsigned; the immediates are merely written as signed bytes.
procedure _JPG_GetMarker;
asm
        // Skip bytes until a 0FFH is found.
        mov     eax, 8
        call    _BS_GetBits
        jc      @@007
        cmp     al, -1
        jnz     _JPG_GetMarker
        // Skip any further 0FFH bytes.
@@003:  mov     eax, 8
        call    _BS_GetBits
        jc      @@007
        cmp     al, -1
        jz      @@003
        // Codes up to 0BFH are not accepted: start scanning again.
        cmp     al, -65
        jbe     _JPG_GetMarker
        // 0E0H and above -> 0E0H.
        cmp     al, -32
        jc      @@004
        mov     al, -32
        // 0D0H..0D7H -> 0D0H.
@@004:  cmp     al, -48
        jc      @@005
        cmp     al, -41
        ja      @@005
        mov     al, -48
@@005:  mov     dword ptr [ebp + 24H], eax
        mov     al, 1
@@006:  ret

        // Ran out of data while scanning.
@@007:  mov     byte ptr [ebp + 1CH], 0
        sub     eax, eax
        jmp     @@006
end;

// _BS_Test
// Compares a prospective stream position against the end-of-data pointer.
//   In:  eax - byte count to test
//        ebp - decoder context
//   Out: flags of "cmp position, [ebp + 48H]", where
//        position = [ebp + 40H] - ([ebp + 38H] shr 3) + eax,
//        i.e. the read pointer minus the whole bytes still held in the bit
//        buffer, advanced by eax. No register result.
// Clobbers ecx and edx; eax is preserved.
procedure _BS_Test;
asm
        mov     edx, eax
        mov     ecx, dword ptr [ebp + 38H]
        add     edx, dword ptr [ebp + 40H]
        // Buffered bits -> whole bytes already fetched but not yet consumed.
        shr     ecx, 3
        sub     edx, ecx
        cmp     edx, dword ptr [ebp + 48H]
end;

// _JPG_TestLength
// Reads a 16-bit length field and tests whether the rest of that segment
// lies within the available data.
//   In:  ebp - decoder context
//   Out: eax = length - 2 (the 2 bytes of the length field itself), with the
//        flags produced by _BS_Test for that many bytes.
//        If _BS_GetBits fails (CF set), returns at once with the flags left
//        by _BS_GetBits.
procedure _JPG_TestLength;
asm
        mov     eax, 16
        call    _BS_GetBits
        jc      @@010
        sub     eax, 2
        call    _BS_Test
@@010:
end;

// _JPG_SOI
// Handler with nothing to do: only returns al = 1, the value other routines
// in this unit return on success.
procedure _JPG_SOI;
asm
        mov     al, 1
end;

procedure _JPG_SOF;
asm
        sub     esp, 4                                  
        call    _JPG_TestLength
        jg      @@017                                   
        mov     eax, dword ptr [ebp + 000000F0H]
        test    eax,eax
        jz @1
        call    ReleaseMem
        mov     dword ptr [ebp + 000000F0H], eax
@1:     mov     eax, 32
        mov     dword ptr [ebp + 10H], eax
        call    _BS_GetBits
        mov     ecx, eax                                
        mov     esi, eax                                
        shr     ecx, 24                                 
        shr     eax, 8
        and     esi, 000000FFH                          
        and     eax, 0000FFFFH                          
        je      @@015                                   
        mov     dword ptr [ebp + 04H], eax              
        sub     cl, 8                                   
        test    cl, 0FBH                                
        jne     @@015                                   
        add     ecx, 7                                  
        sub     eax, eax                                
        bts     eax, ecx                                
        cvtsi2ss xmm0, eax                              
        shufps  xmm0, xmm0, 0                           
        movaps  dqword ptr [ebp + 000000E0H], xmm0
        mov     eax, 16
        call    _BS_GetBits
        mov     edi, eax                                
        shl     esi, 8                                  
        and     edi, 00000003H                          
        shr     eax, 8                                  
        bt      edi, 0                                  
        jae     @@015                                   
        or      eax, esi                                
        je      @@015                                   
        mov     dword ptr [ebp], eax                    
        mov     dword ptr [ebp + 14H], edi              
        mov     eax, dword ptr [ebp + 00000100H]        
        mov     eax, dword ptr [eax + edi*4 - 04H]
        mov     dword ptr [ebp + 000000C4H], eax
        mov     dword ptr [esp], edi                    
        sub     edi, edi                                
        mov     ebx, edi                                
        lea     esi, [ebp + 50H]                        
@@011:  mov     eax, 24
        call    _BS_GetBits                           
        mov     ecx, eax
        mov     edx, eax                                
        and     eax, 00000003H                          
        shr     ecx, 8                                  
        shr     edx, 12                                 
        and     ecx, 0000000FH                          
        je      @@015                                   
        and     edx, 0000000FH                          
        je      @@015                                   
        mov     dword ptr [esi + 04H], ecx              
        mov     dword ptr [esi], edx                    
        cmp     ecx, ebx                                
        cmova   ebx, ecx                                
        cmp     edx, edi                                
        cmova   edi, edx                                
        shl     eax, 8
        lea     eax, [eax + ebp + 00000110H]            
        mov     dword ptr [esi + 14H], eax              
        imul    ecx, edx                                
        mov     dword ptr [esi + 08H], ecx              
        add     esi, 32
        dec     dword ptr [esp]                         
        jnz     @@011                                   
        mov     dword ptr [ebp + 000000B0H], edi        
        mov     dword ptr [ebp + 000000B4H], ebx        
        cmp     edi, 4
        ja      @@015                                   
        cmp     ebx, 4                                  
        ja      @@015                                   
        shl     edi, 3                                  
        mov     eax, dword ptr [ebp]
        sub     edx, edx                                
        lea     eax, [edi + eax - 01H]                  
        div     edi                                     
        mov     dword ptr [ebp + 000000CCH], eax
        mov     dword ptr [ebp + 000000D0H], eax        
        imul    edi                                     
        mov     dword ptr [ebp + 08H], eax              
        shl     ebx, 3                                  
        lea     ecx, [ebx*4 - 00000004H]                
        imul    eax, ecx                                
        mov     dword ptr [ebp + 000000D8H], eax        
        mov     eax, dword ptr [ebp + 04H]              
        lea     eax, [ebx + eax - 01H]                  
        div     ebx                                     
        imul    ebx                                     
        imul    edi, ebx                                
        imul    eax, dword ptr [ebp + 08H]              
        jo      @@015
        mov     ecx, eax
        shl     ecx, 2                                  
        je      @@015                                   
        div     edi                                     
        mov     dword ptr [ebp + 000000BCH], eax
        mov     eax, edi                                
        shl     edi, 2                                  
        mov     dword ptr [ebp + 000000B8H], edi        
        mov     ebx, dword ptr [ebp + 14H]              
        imul    edi, ebx                                
        lea     ebx, [ebx*4 + 00000004H]                
        shr     eax, 3                                  
        imul    ebx, eax
        lea     eax, [edi + ebx + 0FH]                  
        add     eax, ecx                                
        jo      @@015
        call    AllocMem
        test    eax,eax
        jz      @@016
        mov     dword ptr [ebp + 000000F0H], eax        
        and     eax, 0FFFFFFF0H                         
        mov     dword ptr [ebp + 000000F4H], eax        
        lea     ecx, [ebx + eax]
        add     ecx, edi                                
        mov     dword ptr [ebp + 0CH], ecx              
        mov     dword ptr [ebp + 000000C8H], ecx        
        mov     dword ptr [ebp + 000000D4H], ecx        
        add     edi, eax
        mov     ebx, eax                                
        mov     dword ptr [esp], 0                      
        lea     esi, [ebp + 50H]                        
@@012:  mov     eax, dword ptr [ebp + 000000B0H]        
        div     byte ptr [esi]
        bswap   eax                                     
        mov     ax, word ptr [ebp + 000000B4H]          
        div     byte ptr [esi + 04H]                    
        mov     ecx, 16777217                           
        mov     edx, 3                                  
        cmp     eax, ecx                                
        jz      @@013                                   
        shl     ecx, 1                                  
        dec     edx
        cmp     eax, ecx                                
        jz      @@013                                   
        dec     edx                                     
        dec     ecx                                     
        cmp     eax, ecx                                
        jz      @@013                                   
        dec     edx                                     
        bswap   ecx                                     
        cmp     eax, ecx                                
        jnz     @@015
@@013:  shl     edx, 2                                  
        mov     ecx, dword ptr [ebp + 000000F8H]        
        mov     ecx, dword ptr [edx + ecx]
        mov     dword ptr [esi + 1CH], ecx              
        push    0                                       
        call    _RGB_GenPointer
        add     ebx, dword ptr [ebp + 000000B8H]        
        add     edi, 4                                  
        add     esi, 32                                 
        mov     eax, dword ptr [esp]                    
        inc     eax                                     
        mov     dword ptr [esp], eax                    
        cmp     eax, dword ptr [ebp + 14H]              
        jnz     @@012
        mov     edx, 12                                 
        push    1
        call    _RGB_GenPointer
        mov     al, 1
@@014:  add     esp, 4
        ret

@@015:  mov     al, 5
        jmp     @@018

@@016:  mov     al, 1
        jmp     @@018

@@017:  mov     al, 0
@@018:  mov     byte ptr [ebp + 1CH], al
        sub     eax, eax
        jmp     @@014
end;

/// drop the bit cache and return the byte-exact read position in eax
// - expects ebp = decoder instance (JpegDecode loads it before any call)
// - [ebp+40H] is the read pointer and [ebp+44H] the count of bytes left
//   (both initialised in JpegDecode); [ebp+38H] is a bit count, which
//   _BS_Align sets to 8 x the number of bytes it copied into the cache
// - the whole bytes accounted for by that bit count are handed back to the
//   stream: the pointer moves back and the bytes-left counter moves up
// - returns the resulting pointer in eax; overwrites ecx
procedure _BS_GetPtr;
asm
        mov     ecx, dword ptr [ebp + 38H]              // ecx = bit count of the cache
        mov     eax, dword ptr [ebp + 40H]              // eax = current read pointer
        shr     ecx, 3                                  // bits -> whole bytes
        sub     eax, ecx                                // rewind the pointer by that many bytes
        mov     dword ptr [ebp + 38H], 0                // the cache is now empty
        add     dword ptr [ebp + 44H], ecx              // those bytes are available again
        mov     dword ptr [ebp + 40H], eax              // store the rewound pointer (also the result)
end;

/// byte lookup table stored in the code segment (never called as code)
// - entry k gives the index inside a 64-entry block at which the k-th value
//   read from the stream is stored: _JPG_DQT and _JPG_ECS both use it as
//   [dest + table[k]*4]
// - the 64 real entries are followed by 16 entries of 3FH: _JPG_ECS indexes
//   the table after adding a run length of up to 15 to its position, so
//   these extra entries keep such a read inside the table (index 63 is used)
procedure TBL_jpeg_natural_order;
asm
        // entries 0..63
        db 00H, 01H, 08H, 10H, 09H, 02H, 03H, 0AH
        db 11H, 18H, 20H, 19H, 12H, 0BH, 04H, 05H
        db 0CH, 13H, 1AH, 21H, 28H, 30H, 29H, 22H
        db 1BH, 14H, 0DH, 06H, 07H, 0EH, 15H, 1CH
        db 23H, 2AH, 31H, 38H, 39H, 32H, 2BH, 24H
        db 1DH, 16H, 0FH, 17H, 1EH, 25H, 2CH, 33H
        db 3AH, 3BH, 34H, 2DH, 26H, 1FH, 27H, 2EH
        db 35H, 3CH, 3DH, 36H, 2FH, 37H, 3EH, 3FH
        // entries 64..79: padding, all pointing at index 63
        db 3FH, 3FH, 3FH, 3FH, 3FH, 3FH, 3FH, 3FH
        db 3FH, 3FH, 3FH, 3FH, 3FH, 3FH, 3FH, 3FH
end;

/// decode a run of entropy-coded units into 64-dword coefficient blocks
// - expects ebp = decoder instance; called from _JPG_SOS and _JPG_RST
// - the number of units processed by one call is min([ebp+0C0H], [ebp+0BCH]),
//   or [ebp+0BCH] when [ebp+0C0H] is zero ([ebp+0C0H] is the value stored by
//   _JPG_DRI); [ebp+0BCH] is decreased by that number
// - for every unit, for each of the [ebp+14H] 32-byte records starting at
//   [ebp+50H], [record+08H] blocks are decoded: each block is built in a
//   16-byte aligned stack buffer and then handed to the routine whose address
//   is in [record+1CH]
// - returns al=1 on success; on a decoding failure (carry set by _HUF_Decode
//   or _BS_GetBitsECS) stores 0 into the status byte [ebp+1CH] and returns
//   eax=0
// - the stack frame is private to the call, so no global is used to save esp
procedure _JPG_ECS;
asm
        // Code Modification by Mark Griffiths to make this library thread safe
        mov     edi, esp                                // remember the caller's esp
        sub     esp, 4
        and     esp, 0FFFFFFF0H                         // round down to a 16 byte boundary
        add     esp, 4                                  // so that after the next push esp is aligned
        push    edi                                     // saved esp ends up at [frame + 120H]
        sub     esp, 120H                               // frame: 100H block + counters at 100H..10CH
        // end of new code - original code is commented out below.
{
        mov     dword ptr [espsav], esp
        sub     esp, 287
        and     esp, 0FFFFFFF0H
}
        sub     edx, edx
        mov     ecx, dword ptr [ebp + 000000BCH]        // ecx = units still to decode
        add     edx, dword ptr [ebp + 000000C0H]        // edx = value stored by _JPG_DRI
        jnz     @@033
        mov     edx, ecx                                // no interval set: take everything left
@@033:  sub     ecx, edx                                // carry set when fewer units are left than asked
        sbb     eax, eax                                // eax = -1 on carry, else 0
        and     eax, ecx                                // eax = (negative) difference on carry, else 0
        add     edx, eax                                // edx = min(edx, units left)
        je      @@040                                   // nothing to do: report success
        mov     dword ptr [esp + 00000108H], edx        // unit counter for this call
        sub     dword ptr [ebp + 000000BCH], edx
        call    _BS_GetPtr
        // NOTE(review): the routine reached through [esi+1CH] is presumably
        // expected to jump back through this stored address - confirm
        mov     dword ptr [ebp + 000000FCH], offset @RetIdct
@@034:  mov     eax, dword ptr [ebp + 14H]              // per unit: number of 32-byte records
        mov     dword ptr [esp + 00000100H], eax        // record counter
        lea     esi, [ebp + 50H]                        // esi = first record
        mov     edi, dword ptr [ebp + 000000F4H]
@@035:  mov     eax, dword ptr [esi + 08H]              // per record: number of blocks
        mov     dword ptr [esp + 00000104H], eax        // block counter
@@036:  lea     edx, [esp]                              // edx = block buffer (64 dwords)
        mov     ecx, 256
        pxor    mm1, mm1
        call    _MEM_Set                                // presumably fills the 256 bytes at edx from mm1 (zero)
        mov     eax, dword ptr [esi + 0CH]              // first table pointer stored by _JPG_SOS
        call    _HUF_Decode
        jb      @@042                                   // carry = failure
        mov     ebx, dword ptr [esi + 18H]              // running value kept per record (zeroed by _JPG_RST)
        test    eax, eax
        jz      @@037                                   // zero size: running value unchanged
        mov     dword ptr [esp + 0000010CH], eax        // keep the bit size
        call    _BS_GetBitsECS
        jb      @@042
        // sign extension of a cl-bit value: kept as is when its top bit is
        // set, otherwise (2^cl - 1) is subtracted from it
        mov     ecx, dword ptr [esp + 0000010CH]
        mov     edx, eax
        shr     eax, cl                                 // carry = top bit of the cl-bit value
        sbb     eax, eax                                // -1 when the top bit is set, else 0
        not     eax                                     // 0 when the top bit is set, else -1
        shl     eax, cl                                 // 0, or -(2^cl) with carry set
        adc     eax, edx                                // value, or value - 2^cl + 1
        add     ebx, eax                                // add the difference to the running value
        mov     dword ptr [esi + 18H], ebx
@@037:  mov     dword ptr [esp], ebx                    // entry 0 of the block
        mov     ebx, -62                                // position counter: table index is 3FH + ebx, so starts at 1
@@038:  mov     eax, dword ptr [esi + 10H]              // second table pointer stored by _JPG_SOS
        call    _HUF_Decode
        jb      @@042
        mov     dword ptr [esp + 0000010CH], eax
        shr     eax, 4                                  // high nibble = run of skipped positions
        add     ebx, eax
        and     dword ptr [esp + 0000010CH], 0000000FH  // low nibble = bit size
        jz      @@039                                   // size 0: eax still holds the run
        mov     eax, dword ptr [esp + 0000010CH]
        call    _BS_GetBitsECS
        jc      @@042
        mov     ecx, dword ptr [esp + 0000010CH]        // same sign extension as above
        mov     edx, eax
        shr     eax, cl
        sbb     eax, eax
        not     eax
        shl     eax, cl
        adc     eax, edx
        movzx   ecx, byte ptr [TBL_jpeg_natural_order + 0000003FH + ebx]
        mov     dword ptr [esp + ecx*4], eax            // store at the mapped index
        mov     eax, 15                                 // make the test below a no-op
@@039:  cmp     eax, 15                                 // only a size 0 symbol with run < 15 is below 15
        sbb     eax, eax
        not     eax                                     // 0 in that case, -1 otherwise
        and     ebx, eax                                // ... which forces ebx to 0: block finished
        inc     ebx
        jle     @@038                                   // loop while the position counter is <= 0
        mov     eax, dword ptr [esi + 14H]
        jmp     dword ptr [esi + 1CH]                   // per-record routine (pointer stored by the SOF handler)

@RetIdct:
        dec     dword ptr [esp + 00000104H]             // next block of this record
        jne     @@036
        add     esi, 32                                 // next record
        dec     dword ptr [esp + 00000100H]
        jne     @@035
        call    dword ptr [ebp + 000000C4H]             // per-unit routine held in the instance
        dec     dword ptr [esp + 00000108H]             // next unit
        jne     @@034
        call    _BS_GetPtr
@@040:  mov     al, 1
@@041:
        // Code Modification by Mark Griffiths to make this library thread safe
        add     esp, 120H
        pop     esp                                     // restore the caller's esp saved in the prologue
        // end of new code - original code is commented out below.
//        mov     esp, dword ptr [espsav]
        ret

@@042:  mov     byte ptr [ebp + 1CH], 0                 // status byte = 0
        sub     eax, eax                                // eax = 0 stops the marker loop
        jmp     @@041
end;

/// handle the segment dispatched for marker 0DAH, then decode the data after it
// - expects ebp = decoder instance
// - reads an 8-bit count which must equal [ebp+14H], otherwise the status
//   byte [ebp+1CH] is set to 5
// - for each of the count 32-byte records starting at [ebp+50H], reads 16
//   bits and stores two table addresses in [record+0CH] and [record+10H]
// - then skips 24 bits and returns the result of _JPG_ECS
// - when _JPG_TestLength (defined elsewhere) reports failure through the
//   flags tested by jg, the status byte is set to 0
// - returns eax=0 on failure
procedure _JPG_SOS;
asm
        call    _JPG_TestLength
        jg      @@021                                   
        mov     eax, 8                                  
        call    _BS_GetBits                             // eax = record count announced by the segment
        mov     esi, eax                                
        cmp     eax, dword ptr [ebp + 14H]              
        jnz     @@022
        // NOTE(review): if both the count read here and [ebp+14H] could be 0,
        // the dec/jnz loop below would wrap around - confirm that an earlier
        // stage rules this out
        lea     ebx, [ebp + 50H]                        // ebx = first record
@@019:  mov     eax, 16                                 
        call    _BS_GetBits                             // 16 bits; only bits 0..1 and 4..5 are used
        mov     ecx, eax                                
        shr     eax, 4                                  
        and     ecx, 00000003H                          // ecx = selector from bits 0..1
        and     eax, 00000003H                          // eax = selector from bits 4..5
        imul    eax, eax, 1408                          // each selector addresses a 1408-byte slot
        imul    ecx, ecx, 1408                          
        lea     eax, [eax + ebp + 00000510H]            // slot base for the bits 4..5 selector
        lea     ecx, [ecx + ebp + 000007D0H]            // slot base for the bits 0..1 selector
        mov     dword ptr [ebx + 0CH], eax              // used first by _JPG_ECS for each block
        mov     dword ptr [ebx + 10H], ecx              // used by _JPG_ECS for the rest of the block
        add     ebx, 32                                 
        dec     esi                                     
        jnz     @@019                                   
        mov     eax, 24
        call    _BS_GetBits                             // 24 more bits are read and ignored
        call    _JPG_ECS                                // its eax result is returned as is
@@020:  ret                                             

@@021:  mov     al, 0                                   
        jmp     @@023                                   

@@022:  mov     al, 5                                   
@@023:  mov     byte ptr [ebp + 1CH], al                // status byte
        sub     eax, eax                                
        jmp     @@020                                   
end;

/// handle the segment dispatched for marker 0DBH: load one or more 64-entry tables
// - expects ebp = decoder instance
// - each table starts with one byte: bits 0..1 select the destination slot
//   (256 bytes each, starting at [ebp+110H]) and bit 4 selects 8-bit (0) or
//   16-bit (1) entries
// - the 64 entries are converted to single precision floats and stored at
//   the index given by TBL_jpeg_natural_order
// - the loop repeats while the remaining length (eax after _JPG_TestLength,
//   kept on the stack) is still positive
// - returns al=1 on success; when _JPG_TestLength reports failure, sets the
//   status byte [ebp+1CH] to 0 and returns eax=0
procedure _JPG_DQT;
asm
        sub     esp, 4                                  // one local dword: remaining length
        call    _JPG_TestLength
        jg      @@027
        mov     dword ptr [esp], eax
@@024:  mov     eax, 8
        call    _BS_GetBits                             // table header byte
        mov     ebx, eax
        shr     eax, 4
        and     ebx, 00000003H                          // ebx = destination slot 0..3
        and     eax, 00000001H                          // eax = entry size flag
        shl     ebx, 8                                  // slot * 256 bytes (64 floats)
        lea     esi, [eax*8 + 00000008H]                // esi = 8 or 16 bits per entry
        mov     edi, -64                                // counts up to 0
        lea     ebx, [ebx + ebp + 00000110H]            // ebx = destination table
@@025:  mov     eax, esi                                
        call    _BS_GetBits                             // next entry
        movzx   ecx, byte ptr [TBL_jpeg_natural_order + 00000040H + edi]
        cvtsi2ss xmm0, eax                              // integer -> float
        inc     edi                                     // sets the flags tested by jnz below
        movss   dword ptr [ebx + ecx*4], xmm0           // does not alter the flags
        jnz     @@025                                   
        shl     esi, 3                                  // 64 or 128 = bytes taken by the entries
        inc     esi                                     // plus the header byte
        sub     dword ptr [esp], esi                    
        jg      @@024                                   // more tables in this segment
        mov     al, 1                                   
@@026:  add     esp, 4                                  
        ret                                             

@@027:  mov     al, 0                                   
        mov     byte ptr [ebp + 1CH], al                // status byte = 0
        sub     eax, eax                                
        jmp     @@026                                   
end;

/// refill the 8-byte cache at [ebp+30H] up to the next 8-byte aligned address
// - expects ebp = decoder instance
// - calls _BS_GetPtr first, then copies min(bytes up to the next multiple of
//   8 of the read pointer, bytes left) bytes into the zeroed cache
// - updates [ebp+38H] (bit count = 8 x bytes copied), [ebp+40H] (read
//   pointer) and [ebp+44H] (bytes left)
// - leaves the cache content in mm0, with [ebp+30H] byte-swapped into the
//   upper 32 bits and [ebp+34H] byte-swapped into the lower 32 bits
// - preserves esi, edi and ebx; overwrites eax, ecx, edx, mm0 and mm1
procedure _BS_Align;
asm
        push    esi                                     
        push    edi
        push    ebx
        call    _BS_GetPtr                              // eax = byte-exact read pointer
        mov     esi, eax                                // source of the copy
        lea     edi, [ebp + 30H]                        // destination = cache
        pxor    mm0, mm0
        movq    qword ptr [edi], mm0                    // clear the 8 cache bytes
        lea     ecx, [eax + 07H]
        and     ecx, 0FFFFFFF8H
        sub     ecx, eax                                // ecx = 0..7 bytes up to the next 8-byte boundary
        mov     edx, dword ptr [ebp + 44H]              // edx = bytes left
        sub     edx, ecx                                // carry when fewer bytes are left
        sbb     eax, eax
        and     eax, edx
        add     ecx, eax                                // ecx = min(ecx, bytes left)
        mov     eax, ecx
        shl     eax, 3                                  // eax = number of bits cached
        sub     dword ptr [ebp + 44H], ecx
        rep movsb                                       // copy; esi ends after the last copied byte
        mov     dword ptr [ebp + 38H], eax
        mov     dword ptr [ebp + 40H], esi              
        mov     eax, dword ptr [ebp + 30H]              
        mov     ecx, dword ptr [ebp + 34H]
        bswap   eax                                     // first stream byte becomes the most significant
        bswap   ecx                                     
        movd    mm0, eax                                
        movd    mm1, ecx                                
        psllq   mm0, 32
        por     mm0, mm1                                // mm0 = first dword (high) : second dword (low)
        pop     ebx
        pop     edi                                     
        pop     esi
end;

/// advance the read position by eax bytes and refill the cache
// - expects ebp = decoder instance and eax = number of bytes to skip
// - same bookkeeping as _BS_GetPtr (the cached bytes are handed back),
//   combined with the skip, followed by _BS_Align
// - NOTE(review): the skip count is not compared with the bytes left
//   ([ebp+44H]) here - presumably the caller (_JPG_TestLength) validates it;
//   confirm
procedure _BS_SkipBytes;
asm
        sub     dword ptr [ebp + 44H], eax              // bytes left -= skip count
        mov     ecx, dword ptr [ebp + 38H]              // ecx = bit count of the cache
        add     eax, dword ptr [ebp + 40H]              // eax = read pointer + skip count
        shr     ecx, 3                                  // cached bits -> bytes
        sub     eax, ecx                                // hand the cached bytes back
        mov     dword ptr [ebp + 38H], 0                // cache is empty
        mov     dword ptr [ebp + 40H], eax
        add     dword ptr [ebp + 44H], ecx
        call    _BS_Align                               // reload the cache at the new position
end;

/// handle the segment dispatched for marker 0C4H
// - expects ebp = decoder instance
// - records the start of the segment data (esi) and its length (ebx), moves
//   the stream past the segment, then calls _HUF_GenTable (defined
//   elsewhere; presumably it builds its tables from esi/ebx - confirm)
// - returns al=1 on success; when _JPG_TestLength reports failure, sets the
//   status byte [ebp+1CH] to 0 and returns eax=0
procedure _JPG_DHT;
asm
        call    _JPG_TestLength
        jg      @@029
        mov     ebx, eax                                // ebx = segment length
        call    _BS_GetPtr
        mov     esi, eax                                // esi = start of the segment data
        mov     eax, ebx
        call    _BS_SkipBytes                           // stream now points after the segment
        call    _HUF_GenTable
        mov     al, 1
@@028:  ret

@@029:  mov     byte ptr [ebp + 1CH], 0
        sub     eax, eax
end;

/// handle the segment dispatched for marker 0DDH
// - expects ebp = decoder instance
// - stores the 16-bit value read from the stream into [ebp+0C0H], which
//   _JPG_ECS uses as the number of units to decode per call
// - returns al=1 on success; when _JPG_TestLength reports failure, sets the
//   status byte [ebp+1CH] to 0 and returns eax=0
procedure _JPG_DRI;
asm
        call    _JPG_TestLength
        jg      @@031
        mov     eax, 16
        call    _BS_GetBits
        mov     dword ptr [ebp + 000000C0H], eax
        mov     al, 1
        ret
@@031:  mov     byte ptr [ebp + 1CH], 0
        sub     eax, eax
end;

/// handle the marker dispatched as 0D0H: reset per-record state and decode on
// - expects ebp = decoder instance
// - zeroes [record+18H] (the running value that _JPG_ECS accumulates) in
//   each of the [ebp+14H] 32-byte records starting at [ebp+50H]
// - returns whatever _JPG_ECS returns in eax
// - NOTE(review): the dec/jnz loop assumes [ebp+14H] is not 0 - confirm that
//   this handler cannot be reached before the count is set
procedure _JPG_RST;
asm
        sub     eax, eax
        lea     ecx, [ebp + 50H]                        // ecx = first record
        mov     edx, dword ptr [ebp + 14H]              // edx = record count
@@032:  mov     dword ptr [ecx + 18H], eax
        add     ecx, 32
        dec     edx
        jnz     @@032
        call    _JPG_ECS
end;

/// handle marker 0D9H: set the status byte [ebp+1CH] to 2
// - JpegDecode treats status 2 as the only successful outcome
// - returns eax=0, which ends the marker loop in JpegDecode
procedure _JPG_EOI;
asm
        mov     byte ptr [ebp + 1CH], 2
        sub     eax, eax
end;

/// skip a segment whose content is ignored (dispatched for marker 0E0H)
// - expects ebp = decoder instance
// - passes the eax left by _JPG_TestLength to _BS_SkipBytes as byte count
// - returns al=1 on success; when _JPG_TestLength reports failure, sets the
//   status byte [ebp+1CH] to 0 and returns eax=0
procedure _JPG_SKIP;
asm
        call    _JPG_TestLength
        jg      @@044
        call    _BS_SkipBytes
        mov     al, 1
        ret
@@044:  mov     byte ptr [ebp + 1CH], 0
        sub     eax, eax
end;

/// fallback handler: set the status byte [ebp+1CH] to 5
// - used by the 0FFH (match anything) entry of every state table in
//   _JPG_ProcessMarker
// - returns eax=0, which ends the marker loop in JpegDecode
procedure _JPG_UNSUPP;
asm
        mov     byte ptr [ebp + 1CH], 5
        sub     eax, eax
end;

/// dispatch the current marker through a small state machine
// - expects ebp = decoder instance; [ebp+20H] is the current state (index
//   into @TBL_TransList) and [ebp+24H] the marker value (presumably stored by
//   _JPG_GetMarker - confirm)
// - every table entry is 4 bytes: marker byte, next state, 1-based index
//   into @TBL_TransFunc, and a zero byte; a marker byte of 0FFH matches
//   anything and terminates each table
// - stores the next state into [ebp+20H] and calls the selected handler; the
//   handler's eax is returned to the caller
// - NOTE(review): only 0D0H and 0E0H appear in the tables; other markers of
//   those ranges are presumably folded onto them by _JPG_GetMarker - confirm
procedure _JPG_ProcessMarker;
asm
        mov     eax, dword ptr [ebp + 20H]              // eax = current state
        mov     eax, dword ptr [@TBL_TransList + eax*4] // eax = transition table of that state
        mov     edx, dword ptr [ebp + 24H]              // dl = marker to look up
@@008:  mov     ecx, dword ptr [eax]                    // ecx = one 4-byte entry
        cmp     cl, dl
        jz      @@009                                   // exact match
        cmp     cl, -1
        jz      @@009                                   // 0FFH entry: default case
        add     eax, 4
        jmp     @@008
        nop; nop; nop                                   // padding before the data tables
@TBL_Trans_DEAD:                                        // state 0
        db 0D8H, 01H, 09H, 00H, 0FFH, 00H, 0BH, 00H     // 0D8H -> state 1, _JPG_SOI; else _JPG_UNSUPP
@TBL_Trans_SOI:                                         // state 1
        db 0DAH, 02H, 02H, 00H, 0C0H, 01H, 01H, 00H     // 0DAH -> state 2, _JPG_SOS; 0C0H -> _JPG_SOF
        db 0C1H, 01H, 01H, 00H, 0DBH, 01H, 03H, 00H     // 0C1H -> _JPG_SOF; 0DBH -> _JPG_DQT
        db 0C4H, 01H, 04H, 00H, 0DDH, 01H, 05H, 00H     // 0C4H -> _JPG_DHT; 0DDH -> _JPG_DRI
        db 0E0H, 01H, 0AH, 00H, 0FFH, 00H, 0BH, 00H     // 0E0H -> _JPG_SKIP; else state 0, _JPG_UNSUPP
@TBL_Trans_ECS:                                         // state 2
        db 0D9H, 00H, 07H, 00H, 0D0H, 02H, 06H, 00H     // 0D9H -> state 0, _JPG_EOI; 0D0H -> _JPG_RST
        db 0FFH, 00H, 0BH, 00H                          // else state 0, _JPG_UNSUPP
@TBL_TransList:                                         // state -> transition table
        dd offset @TBL_Trans_DEAD
        dd offset @TBL_Trans_SOI
        dd offset @TBL_Trans_ECS
        dd 00000000H
@TBL_TransFunc:                                         // handler index 1..11
        dd offset _JPG_SOF
        dd offset _JPG_SOS
        dd offset _JPG_DQT
        dd offset _JPG_DHT
        dd offset _JPG_DRI
        dd offset _JPG_RST
        dd offset _JPG_EOI
        dd offset _JPG_ECS
        dd offset _JPG_SOI
        dd offset _JPG_SKIP
        dd offset _JPG_UNSUPP
@@009:  shr     ecx, 8                                  // cl = next state
        mov     byte ptr [ebp + 20H], cl
        shr     ecx, 8                                  // ecx = handler index (top byte of the entry is 0)
        call    dword ptr [@TBL_TransFunc - 00000004H + ecx*4] // index is 1-based, hence the -4
end;


/// decode a JPEG memory buffer into a newly allocated TJpegDecode instance
// - Buffer/BufferLen describe the input bytes
// - pImg is set to nil on entry, and to the decoder instance only when the
//   final status byte is 2 (the value stored by _JPG_EOI); in every other
//   case the picture memory (if any) and the instance are released here
// - the function result is the final status byte [instance+1CH], or 1 when
//   the instance itself could not be allocated
// - all registers are preserved (pushad/popad) and emms is executed before
//   returning
function JpegDecode(Buffer: pointer; BufferLen: cardinal;
  var pImg: PJpegDecode): TJpegDecodeError; stdcall;
asm     pop ebp  // delphi created a push ebp
        sub     eax, eax // upper bytes of the result stay 0
        pushad   // 32 bytes: parameters are now at [esp+24H..2CH]
        mov     eax, dword ptr [esp + 2CH] // pImg
        mov     dword ptr [eax], 0
        mov     eax,TBLOffset+TBLSize+TBL64Size
        call    AllocMem // SSE2 ops expect 16 bytes aligned data -> no GetMem()
        test    eax,eax
        jz      @@005
        mov     ebp, eax // ebp = decoder instance for every helper routine
        mov     dword ptr [eax + 18H], -1412571974 // magic
        lea edx,eax+TBLOffset
        mov eax,offset TBL
        mov ecx,TBLSize
        call move // move TBL content into pImg for 16 bytes align
        push ebp
        add ebp,TBLOffset
        lea eax,ebp+0B0H // first _TBL_MultRow8x8 dd offset
        sub ebp,offset TBL // ebp = distance between the copy and the original TBL
        mov ecx,16   // dd count for _TBL_MultRow8x8 and _TBL_MultRow16x16
@next:  add dword ptr [eax],ebp // dd offset correction TBL -> pImg
        add eax,4 // next dd
        dec ecx
        jnz @next
        pop ebp // ebp = decoder instance again
        lea eax,ebp+TBL64Offset // create TBL_64 from code
        mov ecx,040H
@64:    mov [eax],ecx // 64..1 stored in the low dword of each 8-byte (mmx sized) slot
        add eax,8     // next mmx register
        dec ecx
        jnz @64
@end:   xor eax,eax // no option is implemented for now
        call    _JPG_InitDecoder
        test    eax, eax
        jz      @@002
        mov     esi, dword ptr [esp + 24H] // Buffer
        mov     ecx, dword ptr [esp + 28H] // BufferLen
        push    esi
        mov     dword ptr [ebp + 40H], esi // read pointer
        mov     dword ptr [ebp + 44H], ecx // bytes left
        add     esi, ecx
        mov     dword ptr [ebp + 48H], esi // end of the buffer
        call    _BS_Align // load the first bytes into the bit cache
        pop     esi
@@001:  call    _JPG_GetMarker // main loop: one marker per iteration
        test    eax, eax
        jz      @@002
        call    _JPG_ProcessMarker
        test    eax, eax
        jnz     @@001 // a handler returning 0 ends the loop
@@002:  mov     bl, byte ptr [ebp + 1CH] // final status byte
        cmp     bl, 2
        jnz     @@004
        mov     ecx, dword ptr [esp + 2CH]
        mov     dword ptr [ecx], ebp       // update pImg var
@@003:  mov     byte ptr [esp + 1CH], bl   // error code will be popped to eax
        emms    // allow FPU usage in Delphi code
        popad
        ret     12 // stdcall: remove the 3 parameters

@@004:  mov     eax, dword ptr [ebp + 000000F0H]
        or      eax, eax
        jz      @@006
        call    ReleaseMem // release any bitmap area (to avoid memory leak)
@@006:  mov     eax,ebp
        call    ReleaseMem // release TJpegDecode instance
        jmp     @@003

@@005:  mov     bl, 1 // instance allocation failed
        jmp     @@003
end;

/// decode a JPEG memory buffer into a newly created TBitmap
// - returns nil if Buffer is nil, if BufferLen is not positive, or if the
//   low-level decoder does not return JPEG_SUCCESS
// - the returned bitmap is owned by the caller
// - the intermediate TJpegDecode instance is always released
function JpegDecode(Buffer: pointer; BufferLen: integer): TBitmap;
var pImg: PJpegDecode;
begin
  result := nil;
  // the low-level decoder takes the length as a cardinal: a negative value
  // would be interpreted as a huge buffer, so reject invalid input here
  if (Buffer=nil) or (BufferLen<=0) then
    exit;
  if JpegDecode(Buffer,cardinal(BufferLen),pImg)=JPEG_SUCCESS then
  try
    result := pImg^.ToBitmap;
  finally
    pImg^.Free;
  end;
end;

/// decode a JPEG memory buffer and draw it on Canvas at position (X,Y)
// - does nothing if Buffer is nil, if BufferLen is not positive, or if the
//   low-level decoder does not return JPEG_SUCCESS
// - the intermediate TJpegDecode instance is always released
procedure JpegDraw(Buffer: pointer; BufferLen: integer; Canvas: TCanvas; X,Y: integer);
var pImg: PJpegDecode;
begin
  // the low-level decoder takes the length as a cardinal: a negative value
  // would be interpreted as a huge buffer, so reject invalid input here
  if (Buffer=nil) or (BufferLen<=0) then
    exit;
  if JpegDecode(Buffer,cardinal(BufferLen),pImg)=JPEG_SUCCESS then
  try
    pImg^.DrawTo(Canvas,X,Y);
  finally
    pImg^.Free;
  end;
end;


{ TJpegDecode }

/// release the picture memory block and the TJpegDecode instance itself
// - does nothing when self is nil or when the magic value at offset 18H does
//   not match the one written by JpegDecode
// - the instance must not be used after a successful call
// - NOTE(review): the boolean result is not consistent: eax is 0 (false)
//   after a release and for a nil self, but on a magic mismatch AL is left
//   as the low byte of the self pointer - callers in this unit ignore it
// - NOTE(review): [self+0F0H] is passed to ReleaseMem without the zero test
//   that JpegDecode performs - confirm that ReleaseMem accepts nil or that
//   the block is always allocated for a valid instance
function TJpegDecode.Free: boolean;
asm
        test    eax, eax // eax = self
        jz      @z
        cmp     dword ptr [eax + 18H], -1412571974  // magic
        jnz     @z
        push    eax
        mov     eax, dword ptr [eax + 000000F0H] // main allocated memory block
        call    ReleaseMem
        pop     eax
        call    ReleaseMem // TJpegDecode instance
        xor     eax,eax
@z:
end;

/// fill BMI with the DIB description of the decoded picture
// - BMI is always zeroed first; it is left zeroed when self is nil
// - the DIB width is the scan line length and the height is negated
procedure TJpegDecode.ToBMI(var BMI: TBitmapInfo);
begin
  fillchar(BMI,sizeof(BMI),0);
  if @self<>nil then
    with BMI.bmiHeader do begin
      biSize := sizeof(BMI.bmiHeader);
      biWidth := scanlength;
      biHeight := -height;
      biPlanes := 1;
      biBitCount := bitsPixel;
      biCompression := BI_RGB;
    end;
end;

/// create a 24-bit TBitmap holding a copy of the decoded picture
// - returns nil when self is nil or when SetDIBits does not copy all the
//   scan lines
// - the returned bitmap is owned by the caller
// - the screen DC is always released, and the bitmap is freed if an
//   exception is raised while it is being filled
function TJpegDecode.ToBitmap: TBitmap;
var BMI: TBitmapInfo;
    DC: HDC;
    Copied: integer;
begin
  result := nil;
  if @self=nil then
    exit;
  result := TBitmap.Create;
  try
    result.PixelFormat := pf24bit;
    result.Width := width;
    result.Height := height;
    ToBMI(BMI);
    DC := GetDC(0);
    try
      // result.Handle may raise (e.g. out of GDI resources): the DC must be
      // given back in that case too
      Copied := SetDIBits(DC,result.Handle,0,height,pRGB,BMI,DIB_RGB_COLORS);
    finally
      ReleaseDC(0,DC);
    end;
    if Copied<>height then
      FreeAndNil(result);
  except
    // do not leak the partially initialised bitmap
    FreeAndNil(result);
    raise;
  end;
end;

/// draw the whole decoded picture on Canvas, unscaled, at position (X,Y)
// - does nothing when self is nil
procedure TJpegDecode.DrawTo(Canvas: TCanvas; X, Y: integer);
var Info: TBitmapInfo;
begin
  if @self<>nil then begin
    ToBMI(Info);
    // source and destination rectangles have the same size: no scaling
    StretchDIBits(Canvas.Handle,
      X,Y,width,height,
      0,0,width,height,
      pRGB,Info,DIB_RGB_COLORS,SrcCopy);
  end;
end;

/// draw the whole decoded picture on Canvas, stretched to the Dest rectangle
// - does nothing when self is nil
procedure TJpegDecode.DrawTo(Canvas: TCanvas; const Dest: TRect);
var Info: TBitmapInfo;
    DestWidth, DestHeight: integer;
begin
  if @self<>nil then begin
    ToBMI(Info);
    DestWidth := Dest.Right-Dest.Left;
    DestHeight := Dest.Bottom-Dest.Top;
    StretchDIBits(Canvas.Handle,
      Dest.Left,Dest.Top,DestWidth,DestHeight,
      0,0,width,height,
      pRGB,Info,DIB_RGB_COLORS,SrcCopy);
  end;
end;

/// draw the Source rectangle of the decoded picture on Canvas, stretched to
// the Dest rectangle
// - does nothing when self is nil
procedure TJpegDecode.DrawTo(Canvas: TCanvas; const Source, Dest: TRect);
var Info: TBitmapInfo;
    DestWidth, DestHeight, SrcWidth, SrcHeight: integer;
begin
  if @self<>nil then begin
    ToBMI(Info);
    DestWidth := Dest.Right-Dest.Left;
    DestHeight := Dest.Bottom-Dest.Top;
    SrcWidth := Source.Right-Source.Left;
    SrcHeight := Source.Bottom-Source.Top;
    StretchDIBits(Canvas.Handle,
      Dest.Left,Dest.Top,DestWidth,DestHeight,
      Source.Left,Source.Top,SrcWidth,SrcHeight,
      pRGB,Info,DIB_RGB_COLORS,SrcCopy);
  end;
end;

{$ENDIF}

end.
