mirror of
https://github.com/mainflux/mainflux.git
synced 2025-05-09 19:29:29 +08:00

* Return Auth service Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update Compose to run with SpiceDB and Auth svc Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update auth gRPC API Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Remove Users' policies Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Move Groups to internal Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Use shared groups in Users Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Remove unused code Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Use pkg Groups in Things Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Remove Things groups Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Make imports consistent Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update Groups networking Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Remove things groups-specific API Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Move Things Clients to the root Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Move Clients to Users root Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Temporarily remove tracing Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Fix imports Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add buffer config for gRPC Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update auth type for Things Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Use Auth for login Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add temporary solution for refresh token Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update Tokenizer interface Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Updade tokens issuing Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Fix token issuing Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update JWT validator and refactor Tokenizer Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Rename access timeout Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Rename login to authenticate Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update Identify to use SubjectID Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add Auth to Groups Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Use the Auth service for Groups Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update auth schema Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Fix Auth for Groups Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add auth for addons (#14) Signed-off-by: Arvindh <arvindh91@gmail.com> Speparate Login and Refresh tokens Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Merge authN and authZ requests for things Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add connect and disconnect Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update sharing Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Fix policies addition and removal Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Update relation with roels Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Add gRPC to Things Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Assign and Unassign members to group and Listing of Group members (#15) * add auth for addons Signed-off-by: Arvindh <arvindh91@gmail.com> * add assign and unassign to group Signed-off-by: Arvindh <arvindh91@gmail.com> * add group incomplete repo implementation Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: 
Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Move coap mqtt and ws policies to spicedb (#16) Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Remove old policies Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> NOISSUE - Things authorize to return thingID (#18) This commit modifies the authorize endpoint to the grpc endpoint to return thingID. The authorize endpoint allows adapters to get the publisher of the message. Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Add Groups to users service (#17) * add assign and unassign to group Signed-off-by: Arvindh <arvindh91@gmail.com> * add group incomplete repo implementation Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users stable 1 Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users stable 2 Signed-off-by: Arvindh <arvindh91@gmail.com> * groups for users & things Signed-off-by: Arvindh <arvindh91@gmail.com> * Amend signature Signed-off-by: Arvindh <arvindh91@gmail.com> * fix merge error Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Fix es code (#21) Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Fix Bugs (#20) * fix bugs Signed-off-by: Arvindh <arvindh91@gmail.com> * fix bugs Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Test e2e (#19) * fix: connect method Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * fix: e2e Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * fix changes in sdk and e2e Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(docker): remove unnecessary port mapping Remove the port mapping for MQTT broker in the docker-compose.yml file. Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * Enable group listing Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(responses): update ChannelsPage struct The ChannelsPage struct in the responses.go file has been updated. The "Channels" field has been renamed to "Groups" to provide more accurate naming. This change ensures consistency and clarity in the codebase. Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(things): add UpdateClientSecret method Add the UpdateClientSecret method to the things service. This method allows updating the client secret for a specific client identified by the provided token, id, and key parameters. 
Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> --------- Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Use smaller buffers for gRPC Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Clean up tests (#22) Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Add Connect Disconnect endpoints (#23) * fix bugs Signed-off-by: Arvindh <arvindh91@gmail.com> * fix bugs Signed-off-by: Arvindh <arvindh91@gmail.com> * fix list of things in a channel and Add connect disconnect endpoint Signed-off-by: Arvindh <arvindh91@gmail.com> * fix list of things in a channel and Add connect disconnect endpoint Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Add: Things share with users (#25) * fix list of things in a channel and Add connect disconnect endpoint Signed-off-by: Arvindh <arvindh91@gmail.com> * add: things share with other users Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Rename gRPC Services (#24) * Rename things and users auth service Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * docs: add authorization docs for gRPC services Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * Rename things and users grpc services Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * Remove mainflux.env package Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> --------- Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Add: Listing of things, channels, groups, users (#26) * add: listing of channels, users, groups, things Signed-off-by: Arvindh <arvindh91@gmail.com> * add: listing of channels, users, groups, things Signed-off-by: Arvindh <arvindh91@gmail.com> * add: listing of channels, users, groups, things Signed-off-by: Arvindh <arvindh91@gmail.com> * add: listing of channels, users, groups, things Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Clean Up Users (#27) * feat(groups): rename redis package to events - Renamed the `redis` package to `events` in the `internal/groups` directory. - Updated the file paths and names accordingly. - This change reflects the more accurate purpose of the package and improves code organization. Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(auth): Modify identity method Change request and response of identity method Add accessToken and refreshToken to Token response Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * clean up users, remove dead code Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(users): add unit tests for user service This commit adds unit tests for the user service in the `users` package. The tests cover various scenarios and ensure the correct behavior of the service. 
Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> --------- Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Add: List of user groups & removed repeating code in groups (#29) * removed repeating code in list groups Signed-off-by: Arvindh <arvindh91@gmail.com> * add: list of user group Signed-off-by: Arvindh <arvindh91@gmail.com> * fix: otel handler operator name for endpoints Signed-off-by: Arvindh <arvindh91@gmail.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Clean Up Things Service (#28) * Rework things service Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * add tests Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> --------- Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Clean Up Auth Service (#30) * clean up auth service Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> * feat(auth): remove unused import Remove the unused import of `emptypb` in `auth.pb.go`. This import is not being used in the codebase and can be safely removed. Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> --------- Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * NOISSUE - Update API docs (#31) Signed-off-by: rodneyosodo <blackd0t@protonmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Remove TODO comments and cleanup the code Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> * Update dependenices Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> --------- Signed-off-by: Arvindh <arvindh91@gmail.com> Signed-off-by: dusanb94 <dusan.borovcanin@mainflux.com> Signed-off-by: Rodney Osodo <28790446+rodneyosodo@users.noreply.github.com> Signed-off-by: rodneyosodo <blackd0t@protonmail.com> Co-authored-by: b1ackd0t <28790446+rodneyosodo@users.noreply.github.com> Co-authored-by: Arvindh <30824765+arvindh123@users.noreply.github.com>
4176 lines
82 KiB
ArmAsm
4176 lines
82 KiB
ArmAsm
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
|
|
|
|
//go:build !appengine && !noasm && gc && !noasm
|
|
|
|
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
// Requires: CMOV
|
|
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
|
|
MOVQ br+8(FP), CX
|
|
MOVQ 24(CX), DX
|
|
MOVBQZX 32(CX), BX
|
|
MOVQ (CX), AX
|
|
MOVQ 8(CX), SI
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
MOVQ 104(AX), R10
|
|
MOVQ s+0(FP), AX
|
|
MOVQ 144(AX), R11
|
|
MOVQ 152(AX), R12
|
|
MOVQ 160(AX), R13
|
|
|
|
sequenceDecs_decode_amd64_main_loop:
|
|
MOVQ (SP), R14
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decode_amd64_fill_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R14
|
|
MOVQ (R14), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decode_amd64_fill_end
|
|
|
|
sequenceDecs_decode_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decode_amd64_fill_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decode_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R14
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R14), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decode_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decode_amd64_fill_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_amd64_fill_end:
|
|
// Update offset
|
|
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_amd64_of_update_zero:
|
|
MOVQ AX, 16(R10)
|
|
|
|
// Update match length
|
|
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_amd64_ml_update_zero:
|
|
MOVQ AX, 8(R10)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R14
|
|
MOVQ (R14), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decode_amd64_fill_2_end
|
|
|
|
sequenceDecs_decode_amd64_fill_2_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decode_amd64_fill_2_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decode_amd64_fill_2_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R14
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R14), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decode_amd64_fill_2_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_amd64_fill_2_end:
|
|
// Update literal length
|
|
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_amd64_ll_update_zero:
|
|
MOVQ AX, (R10)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R14, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decode_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
MOVBQZX DI, R14
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R14
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R14
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decode_amd64_skip_update:
|
|
// Adjust offset
|
|
MOVQ 16(R10), CX
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
|
|
MOVQ R12, R13
|
|
MOVQ R11, R12
|
|
MOVQ CX, R11
|
|
JMP sequenceDecs_decode_amd64_after_adjust
|
|
|
|
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ (R10), $0x00000000
|
|
JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
|
|
INCQ CX
|
|
JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decode_amd64_adjust_offset_maybezero:
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
|
|
MOVQ R11, CX
|
|
JMP sequenceDecs_decode_amd64_after_adjust
|
|
|
|
sequenceDecs_decode_amd64_adjust_offset_nonzero:
|
|
CMPQ CX, $0x01
|
|
JB sequenceDecs_decode_amd64_adjust_zero
|
|
JEQ sequenceDecs_decode_amd64_adjust_one
|
|
CMPQ CX, $0x02
|
|
JA sequenceDecs_decode_amd64_adjust_three
|
|
JMP sequenceDecs_decode_amd64_adjust_two
|
|
|
|
sequenceDecs_decode_amd64_adjust_zero:
|
|
MOVQ R11, AX
|
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_amd64_adjust_one:
|
|
MOVQ R12, AX
|
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_amd64_adjust_two:
|
|
MOVQ R13, AX
|
|
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_amd64_adjust_three:
|
|
LEAQ -1(R11), AX
|
|
|
|
sequenceDecs_decode_amd64_adjust_test_temp_valid:
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decode_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, AX
|
|
|
|
sequenceDecs_decode_amd64_adjust_temp_valid:
|
|
CMPQ CX, $0x01
|
|
CMOVQNE R12, R13
|
|
MOVQ R11, R12
|
|
MOVQ AX, R11
|
|
MOVQ AX, CX
|
|
|
|
sequenceDecs_decode_amd64_after_adjust:
|
|
MOVQ CX, 16(R10)
|
|
|
|
// Check values
|
|
MOVQ 8(R10), AX
|
|
MOVQ (R10), R14
|
|
LEAQ (AX)(R14*1), R15
|
|
MOVQ s+0(FP), BP
|
|
ADDQ R15, 256(BP)
|
|
MOVQ ctx+16(FP), R15
|
|
SUBQ R14, 128(R15)
|
|
JS error_not_enough_literals
|
|
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decode_amd64_error_match_len_too_big
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decode_amd64_match_len_ofs_ok:
|
|
ADDQ $0x18, R10
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decode_amd64_main_loop
|
|
MOVQ s+0(FP), AX
|
|
MOVQ R11, 144(AX)
|
|
MOVQ R12, 152(AX)
|
|
MOVQ R13, 160(AX)
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVB BL, 32(AX)
|
|
MOVQ SI, 8(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decode_amd64_error_match_len_too_big:
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
// Requires: CMOV
|
|
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
|
|
MOVQ br+8(FP), CX
|
|
MOVQ 24(CX), DX
|
|
MOVBQZX 32(CX), BX
|
|
MOVQ (CX), AX
|
|
MOVQ 8(CX), SI
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
MOVQ 104(AX), R10
|
|
MOVQ s+0(FP), AX
|
|
MOVQ 144(AX), R11
|
|
MOVQ 152(AX), R12
|
|
MOVQ 160(AX), R13
|
|
|
|
sequenceDecs_decode_56_amd64_main_loop:
|
|
MOVQ (SP), R14
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R14
|
|
MOVQ (R14), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decode_56_amd64_fill_end
|
|
|
|
sequenceDecs_decode_56_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decode_56_amd64_fill_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decode_56_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R14
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R14), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decode_56_amd64_fill_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_56_amd64_fill_end:
|
|
// Update offset
|
|
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_56_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_56_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_56_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_56_amd64_of_update_zero:
|
|
MOVQ AX, 16(R10)
|
|
|
|
// Update match length
|
|
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_56_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_56_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_56_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_56_amd64_ml_update_zero:
|
|
MOVQ AX, 8(R10)
|
|
|
|
// Update literal length
|
|
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R15
|
|
SHLQ CL, R15
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decode_56_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decode_56_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decode_56_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R15
|
|
ADDQ R15, AX
|
|
|
|
sequenceDecs_decode_56_amd64_ll_update_zero:
|
|
MOVQ AX, (R10)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R14, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decode_56_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
MOVBQZX DI, R14
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R14
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R14
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R14*1), CX
|
|
MOVQ DX, R15
|
|
MOVQ CX, BX
|
|
ROLQ CL, R15
|
|
MOVL $0x00000001, BP
|
|
MOVB R14, CL
|
|
SHLL CL, BP
|
|
DECL BP
|
|
ANDQ BP, R15
|
|
ADDQ R15, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decode_56_amd64_skip_update:
|
|
// Adjust offset
|
|
MOVQ 16(R10), CX
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
|
|
MOVQ R12, R13
|
|
MOVQ R11, R12
|
|
MOVQ CX, R11
|
|
JMP sequenceDecs_decode_56_amd64_after_adjust
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ (R10), $0x00000000
|
|
JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
|
|
INCQ CX
|
|
JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
|
|
MOVQ R11, CX
|
|
JMP sequenceDecs_decode_56_amd64_after_adjust
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
|
|
CMPQ CX, $0x01
|
|
JB sequenceDecs_decode_56_amd64_adjust_zero
|
|
JEQ sequenceDecs_decode_56_amd64_adjust_one
|
|
CMPQ CX, $0x02
|
|
JA sequenceDecs_decode_56_amd64_adjust_three
|
|
JMP sequenceDecs_decode_56_amd64_adjust_two
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_zero:
|
|
MOVQ R11, AX
|
|
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_one:
|
|
MOVQ R12, AX
|
|
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_two:
|
|
MOVQ R13, AX
|
|
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_three:
|
|
LEAQ -1(R11), AX
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, AX
|
|
|
|
sequenceDecs_decode_56_amd64_adjust_temp_valid:
|
|
CMPQ CX, $0x01
|
|
CMOVQNE R12, R13
|
|
MOVQ R11, R12
|
|
MOVQ AX, R11
|
|
MOVQ AX, CX
|
|
|
|
sequenceDecs_decode_56_amd64_after_adjust:
|
|
MOVQ CX, 16(R10)
|
|
|
|
// Check values
|
|
MOVQ 8(R10), AX
|
|
MOVQ (R10), R14
|
|
LEAQ (AX)(R14*1), R15
|
|
MOVQ s+0(FP), BP
|
|
ADDQ R15, 256(BP)
|
|
MOVQ ctx+16(FP), R15
|
|
SUBQ R14, 128(R15)
|
|
JS error_not_enough_literals
|
|
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decode_56_amd64_error_match_len_too_big
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decode_56_amd64_match_len_ofs_ok:
|
|
ADDQ $0x18, R10
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decode_56_amd64_main_loop
|
|
MOVQ s+0(FP), AX
|
|
MOVQ R11, 144(AX)
|
|
MOVQ R12, 152(AX)
|
|
MOVQ R13, 160(AX)
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVB BL, 32(AX)
|
|
MOVQ SI, 8(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decode_56_amd64_error_match_len_too_big:
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
// Requires: BMI, BMI2, CMOV
|
|
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
|
|
MOVQ br+8(FP), BX
|
|
MOVQ 24(BX), AX
|
|
MOVBQZX 32(BX), DX
|
|
MOVQ (BX), CX
|
|
MOVQ 8(BX), BX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
MOVQ 104(CX), R9
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 144(CX), R10
|
|
MOVQ 152(CX), R11
|
|
MOVQ 160(CX), R12
|
|
|
|
sequenceDecs_decode_bmi2_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decode_bmi2_fill_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R13
|
|
MOVQ (R13), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decode_bmi2_fill_end
|
|
|
|
sequenceDecs_decode_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decode_bmi2_fill_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decode_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R13), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decode_bmi2_fill_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_bmi2_fill_end:
|
|
// Update offset
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, 16(R9)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, 8(R9)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R13
|
|
MOVQ (R13), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decode_bmi2_fill_2_end
|
|
|
|
sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decode_bmi2_fill_2_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decode_bmi2_fill_2_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R13), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decode_bmi2_fill_2_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_bmi2_fill_2_end:
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, (R9)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R13, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decode_bmi2_skip_update
|
|
LEAQ (SI)(DI*1), R14
|
|
ADDQ R8, R14
|
|
MOVBQZX R14, R14
|
|
LEAQ (DX)(R14*1), CX
|
|
MOVQ AX, R15
|
|
MOVQ CX, DX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
|
|
// Update Offset State
|
|
BZHIQ R8, R15, CX
|
|
SHRXQ R8, R15, R15
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R15, CX
|
|
SHRXQ DI, R15, R15
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R15, CX
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decode_bmi2_skip_update:
|
|
// Adjust offset
|
|
MOVQ 16(R9), CX
|
|
CMPQ R13, $0x01
|
|
JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
|
|
MOVQ R11, R12
|
|
MOVQ R10, R11
|
|
MOVQ CX, R10
|
|
JMP sequenceDecs_decode_bmi2_after_adjust
|
|
|
|
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ (R9), $0x00000000
|
|
JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
|
|
INCQ CX
|
|
JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decode_bmi2_adjust_offset_maybezero:
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
|
|
MOVQ R10, CX
|
|
JMP sequenceDecs_decode_bmi2_after_adjust
|
|
|
|
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
|
|
CMPQ CX, $0x01
|
|
JB sequenceDecs_decode_bmi2_adjust_zero
|
|
JEQ sequenceDecs_decode_bmi2_adjust_one
|
|
CMPQ CX, $0x02
|
|
JA sequenceDecs_decode_bmi2_adjust_three
|
|
JMP sequenceDecs_decode_bmi2_adjust_two
|
|
|
|
sequenceDecs_decode_bmi2_adjust_zero:
|
|
MOVQ R10, R13
|
|
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_bmi2_adjust_one:
|
|
MOVQ R11, R13
|
|
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_bmi2_adjust_two:
|
|
MOVQ R12, R13
|
|
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_bmi2_adjust_three:
|
|
LEAQ -1(R10), R13
|
|
|
|
sequenceDecs_decode_bmi2_adjust_test_temp_valid:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R13
|
|
|
|
sequenceDecs_decode_bmi2_adjust_temp_valid:
|
|
CMPQ CX, $0x01
|
|
CMOVQNE R11, R12
|
|
MOVQ R10, R11
|
|
MOVQ R13, R10
|
|
MOVQ R13, CX
|
|
|
|
sequenceDecs_decode_bmi2_after_adjust:
|
|
MOVQ CX, 16(R9)
|
|
|
|
// Check values
|
|
MOVQ 8(R9), R13
|
|
MOVQ (R9), R14
|
|
LEAQ (R13)(R14*1), R15
|
|
MOVQ s+0(FP), BP
|
|
ADDQ R15, 256(BP)
|
|
MOVQ ctx+16(FP), R15
|
|
SUBQ R14, 128(R15)
|
|
JS error_not_enough_literals
|
|
CMPQ R13, $0x00020002
|
|
JA sequenceDecs_decode_bmi2_error_match_len_too_big
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decode_bmi2_match_len_ofs_ok:
|
|
ADDQ $0x18, R9
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decode_bmi2_main_loop
|
|
MOVQ s+0(FP), CX
|
|
MOVQ R10, 144(CX)
|
|
MOVQ R11, 152(CX)
|
|
MOVQ R12, 160(CX)
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 24(CX)
|
|
MOVB DL, 32(CX)
|
|
MOVQ BX, 8(CX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decode_bmi2_error_match_len_too_big:
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
|
|
// Requires: BMI, BMI2, CMOV
|
|
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
|
|
MOVQ br+8(FP), BX
|
|
MOVQ 24(BX), AX
|
|
MOVBQZX 32(BX), DX
|
|
MOVQ (BX), CX
|
|
MOVQ 8(BX), BX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
MOVQ 104(CX), R9
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 144(CX), R10
|
|
MOVQ 152(CX), R11
|
|
MOVQ 160(CX), R12
|
|
|
|
sequenceDecs_decode_56_bmi2_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R13
|
|
MOVQ (R13), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decode_56_bmi2_fill_end
|
|
|
|
sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decode_56_bmi2_fill_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decode_56_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R13), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decode_56_bmi2_fill_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decode_56_bmi2_fill_end:
|
|
// Update offset
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, 16(R9)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, 8(R9)
|
|
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R14
|
|
MOVQ AX, R15
|
|
LEAQ (DX)(R14*1), CX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R15, CX
|
|
MOVQ CX, (R9)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R13, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decode_56_bmi2_skip_update
|
|
LEAQ (SI)(DI*1), R14
|
|
ADDQ R8, R14
|
|
MOVBQZX R14, R14
|
|
LEAQ (DX)(R14*1), CX
|
|
MOVQ AX, R15
|
|
MOVQ CX, DX
|
|
ROLQ CL, R15
|
|
BZHIQ R14, R15, R15
|
|
|
|
// Update Offset State
|
|
BZHIQ R8, R15, CX
|
|
SHRXQ R8, R15, R15
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R15, CX
|
|
SHRXQ DI, R15, R15
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R15, CX
|
|
MOVQ $0x00001010, R14
|
|
BEXTRQ R14, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decode_56_bmi2_skip_update:
|
|
// Adjust offset
|
|
MOVQ 16(R9), CX
|
|
CMPQ R13, $0x01
|
|
JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
|
|
MOVQ R11, R12
|
|
MOVQ R10, R11
|
|
MOVQ CX, R10
|
|
JMP sequenceDecs_decode_56_bmi2_after_adjust
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ (R9), $0x00000000
|
|
JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
|
|
INCQ CX
|
|
JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
|
|
MOVQ R10, CX
|
|
JMP sequenceDecs_decode_56_bmi2_after_adjust
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
|
|
CMPQ CX, $0x01
|
|
JB sequenceDecs_decode_56_bmi2_adjust_zero
|
|
JEQ sequenceDecs_decode_56_bmi2_adjust_one
|
|
CMPQ CX, $0x02
|
|
JA sequenceDecs_decode_56_bmi2_adjust_three
|
|
JMP sequenceDecs_decode_56_bmi2_adjust_two
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_zero:
|
|
MOVQ R10, R13
|
|
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_one:
|
|
MOVQ R11, R13
|
|
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_two:
|
|
MOVQ R12, R13
|
|
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_three:
|
|
LEAQ -1(R10), R13
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R13
|
|
|
|
sequenceDecs_decode_56_bmi2_adjust_temp_valid:
|
|
CMPQ CX, $0x01
|
|
CMOVQNE R11, R12
|
|
MOVQ R10, R11
|
|
MOVQ R13, R10
|
|
MOVQ R13, CX
|
|
|
|
sequenceDecs_decode_56_bmi2_after_adjust:
|
|
MOVQ CX, 16(R9)
|
|
|
|
// Check values
|
|
MOVQ 8(R9), R13
|
|
MOVQ (R9), R14
|
|
LEAQ (R13)(R14*1), R15
|
|
MOVQ s+0(FP), BP
|
|
ADDQ R15, 256(BP)
|
|
MOVQ ctx+16(FP), R15
|
|
SUBQ R14, 128(R15)
|
|
JS error_not_enough_literals
|
|
CMPQ R13, $0x00020002
|
|
JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
|
|
ADDQ $0x18, R9
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decode_56_bmi2_main_loop
|
|
MOVQ s+0(FP), CX
|
|
MOVQ R10, 144(CX)
|
|
MOVQ R11, 152(CX)
|
|
MOVQ R12, 160(CX)
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 24(CX)
|
|
MOVB DL, 32(CX)
|
|
MOVQ BX, 8(CX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
|
|
// Requires: SSE
|
|
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
|
|
MOVQ ctx+0(FP), R10
|
|
MOVQ 8(R10), CX
|
|
TESTQ CX, CX
|
|
JZ empty_seqs
|
|
MOVQ (R10), AX
|
|
MOVQ 24(R10), DX
|
|
MOVQ 32(R10), BX
|
|
MOVQ 80(R10), SI
|
|
MOVQ 104(R10), DI
|
|
MOVQ 120(R10), R8
|
|
MOVQ 56(R10), R9
|
|
MOVQ 64(R10), R10
|
|
ADDQ R10, R9
|
|
|
|
// seqsBase += 24 * seqIndex
|
|
LEAQ (DX)(DX*2), R11
|
|
SHLQ $0x03, R11
|
|
ADDQ R11, AX
|
|
|
|
// outBase += outPosition
|
|
ADDQ DI, BX
|
|
|
|
main_loop:
|
|
MOVQ (AX), R11
|
|
MOVQ 16(AX), R12
|
|
MOVQ 8(AX), R13
|
|
|
|
// Copy literals
|
|
TESTQ R11, R11
|
|
JZ check_offset
|
|
XORQ R14, R14
|
|
|
|
copy_1:
|
|
MOVUPS (SI)(R14*1), X0
|
|
MOVUPS X0, (BX)(R14*1)
|
|
ADDQ $0x10, R14
|
|
CMPQ R14, R11
|
|
JB copy_1
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
ADDQ R11, DI
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
check_offset:
|
|
LEAQ (DI)(R10*1), R11
|
|
CMPQ R12, R11
|
|
JG error_match_off_too_big
|
|
CMPQ R12, R8
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, R11
|
|
SUBQ DI, R11
|
|
JLS copy_match
|
|
MOVQ R9, R14
|
|
SUBQ R11, R14
|
|
CMPQ R13, R11
|
|
JG copy_all_from_history
|
|
MOVQ R13, R11
|
|
SUBQ $0x10, R11
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R11
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(R11*1), R14
|
|
LEAQ 16(BX)(R11*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), R11
|
|
MOVB 2(R14), R12
|
|
MOVW R11, (BX)
|
|
MOVB R12, 2(BX)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), R11
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL R11, (BX)
|
|
MOVL R12, -4(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), R11
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ R11, (BX)
|
|
MOVQ R12, -8(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
|
|
copy_4_end:
|
|
ADDQ R13, DI
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ R11, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(BX)(R15*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ R11, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ R11, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(R11*1), BP
|
|
MOVB R15, (BX)
|
|
MOVB BP, -1(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (BX)
|
|
MOVB BP, 2(BX)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(R11*1), BP
|
|
MOVL R15, (BX)
|
|
MOVL BP, -4(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(R11*1), BP
|
|
MOVQ R15, (BX)
|
|
MOVQ BP, -8(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
|
|
copy_5_end:
|
|
ADDQ R11, DI
|
|
SUBQ R11, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ BX, R11
|
|
SUBQ R12, R11
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, DI
|
|
MOVQ BX, R12
|
|
ADDQ R13, BX
|
|
|
|
copy_2:
|
|
MOVUPS (R11), X0
|
|
MOVUPS X0, (R12)
|
|
ADDQ $0x10, R11
|
|
ADDQ $0x10, R12
|
|
SUBQ $0x10, R13
|
|
JHI copy_2
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, DI
|
|
|
|
copy_slow_3:
|
|
MOVB (R11), R12
|
|
MOVB R12, (BX)
|
|
INCQ R11
|
|
INCQ BX
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
|
|
loop_finished:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
|
|
// Update the context
|
|
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
error_match_off_too_big:
|
|
// Return value
|
|
MOVB $0x00, ret+8(FP)
|
|
|
|
// Update the context
|
|
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
empty_seqs:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
|
|
// Requires: SSE
|
|
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
|
|
MOVQ ctx+0(FP), R10
|
|
MOVQ 8(R10), CX
|
|
TESTQ CX, CX
|
|
JZ empty_seqs
|
|
MOVQ (R10), AX
|
|
MOVQ 24(R10), DX
|
|
MOVQ 32(R10), BX
|
|
MOVQ 80(R10), SI
|
|
MOVQ 104(R10), DI
|
|
MOVQ 120(R10), R8
|
|
MOVQ 56(R10), R9
|
|
MOVQ 64(R10), R10
|
|
ADDQ R10, R9
|
|
|
|
// seqsBase += 24 * seqIndex
|
|
LEAQ (DX)(DX*2), R11
|
|
SHLQ $0x03, R11
|
|
ADDQ R11, AX
|
|
|
|
// outBase += outPosition
|
|
ADDQ DI, BX
|
|
|
|
main_loop:
|
|
MOVQ (AX), R11
|
|
MOVQ 16(AX), R12
|
|
MOVQ 8(AX), R13
|
|
|
|
// Copy literals
|
|
TESTQ R11, R11
|
|
JZ check_offset
|
|
MOVQ R11, R14
|
|
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (SI), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, SI
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
LEAQ 16(SI)(R14*1), SI
|
|
LEAQ 16(BX)(R14*1), BX
|
|
MOVUPS -16(SI), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ R11, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ R11, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
copy_1_move_1or2:
|
|
MOVB (SI), R14
|
|
MOVB -1(SI)(R11*1), R15
|
|
MOVB R14, (BX)
|
|
MOVB R15, -1(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (SI), R14
|
|
MOVB 2(SI), R15
|
|
MOVW R14, (BX)
|
|
MOVB R15, 2(BX)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_4through7:
|
|
MOVL (SI), R14
|
|
MOVL -4(SI)(R11*1), R15
|
|
MOVL R14, (BX)
|
|
MOVL R15, -4(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_8through16:
|
|
MOVQ (SI), R14
|
|
MOVQ -8(SI)(R11*1), R15
|
|
MOVQ R14, (BX)
|
|
MOVQ R15, -8(BX)(R11*1)
|
|
ADDQ R11, SI
|
|
ADDQ R11, BX
|
|
|
|
copy_1_end:
|
|
ADDQ R11, DI
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
|
|
check_offset:
|
|
LEAQ (DI)(R10*1), R11
|
|
CMPQ R12, R11
|
|
JG error_match_off_too_big
|
|
CMPQ R12, R8
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, R11
|
|
SUBQ DI, R11
|
|
JLS copy_match
|
|
MOVQ R9, R14
|
|
SUBQ R11, R14
|
|
CMPQ R13, R11
|
|
JG copy_all_from_history
|
|
MOVQ R13, R11
|
|
SUBQ $0x10, R11
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R11
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(R11*1), R14
|
|
LEAQ 16(BX)(R11*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), R11
|
|
MOVB 2(R14), R12
|
|
MOVW R11, (BX)
|
|
MOVB R12, 2(BX)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), R11
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL R11, (BX)
|
|
MOVL R12, -4(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), R11
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ R11, (BX)
|
|
MOVQ R12, -8(BX)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, BX
|
|
|
|
copy_4_end:
|
|
ADDQ R13, DI
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ R11, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(BX)(R15*1), BX
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ R11, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ R11, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(R11*1), BP
|
|
MOVB R15, (BX)
|
|
MOVB BP, -1(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (BX)
|
|
MOVB BP, 2(BX)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(R11*1), BP
|
|
MOVL R15, (BX)
|
|
MOVL BP, -4(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(R11*1), BP
|
|
MOVQ R15, (BX)
|
|
MOVQ BP, -8(BX)(R11*1)
|
|
ADDQ R11, R14
|
|
ADDQ R11, BX
|
|
|
|
copy_5_end:
|
|
ADDQ R11, DI
|
|
SUBQ R11, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ BX, R11
|
|
SUBQ R12, R11
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, DI
|
|
MOVQ R13, R12
|
|
SUBQ $0x10, R12
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (R11), X0
|
|
MOVUPS X0, (BX)
|
|
ADDQ $0x10, R11
|
|
ADDQ $0x10, BX
|
|
SUBQ $0x10, R12
|
|
JAE copy_2_loop
|
|
LEAQ 16(R11)(R12*1), R11
|
|
LEAQ 16(BX)(R12*1), BX
|
|
MOVUPS -16(R11), X0
|
|
MOVUPS X0, -16(BX)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (R11), R12
|
|
MOVB -1(R11)(R13*1), R14
|
|
MOVB R12, (BX)
|
|
MOVB R14, -1(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (R11), R12
|
|
MOVB 2(R11), R14
|
|
MOVW R12, (BX)
|
|
MOVB R14, 2(BX)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (R11), R12
|
|
MOVL -4(R11)(R13*1), R14
|
|
MOVL R12, (BX)
|
|
MOVL R14, -4(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (R11), R12
|
|
MOVQ -8(R11)(R13*1), R14
|
|
MOVQ R12, (BX)
|
|
MOVQ R14, -8(BX)(R13*1)
|
|
ADDQ R13, R11
|
|
ADDQ R13, BX
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, DI
|
|
|
|
copy_slow_3:
|
|
MOVB (R11), R12
|
|
MOVB R12, (BX)
|
|
INCQ R11
|
|
INCQ BX
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
ADDQ $0x18, AX
|
|
INCQ DX
|
|
CMPQ DX, CX
|
|
JB main_loop
|
|
|
|
loop_finished:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
|
|
// Update the context
|
|
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
error_match_off_too_big:
|
|
// Return value
|
|
MOVB $0x00, ret+8(FP)
|
|
|
|
// Update the context
|
|
MOVQ ctx+0(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVQ DI, 104(AX)
|
|
SUBQ 80(AX), SI
|
|
MOVQ SI, 112(AX)
|
|
RET
|
|
|
|
empty_seqs:
|
|
// Return value
|
|
MOVB $0x01, ret+8(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
|
|
// Requires: CMOV, SSE
|
|
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
|
|
MOVQ br+8(FP), CX
|
|
MOVQ 24(CX), DX
|
|
MOVBQZX 32(CX), BX
|
|
MOVQ (CX), AX
|
|
MOVQ 8(CX), SI
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
XORQ CX, CX
|
|
MOVQ CX, 8(SP)
|
|
MOVQ CX, 16(SP)
|
|
MOVQ CX, 24(SP)
|
|
MOVQ 112(AX), R10
|
|
MOVQ 128(AX), CX
|
|
MOVQ CX, 32(SP)
|
|
MOVQ 144(AX), R11
|
|
MOVQ 136(AX), R12
|
|
MOVQ 200(AX), CX
|
|
MOVQ CX, 56(SP)
|
|
MOVQ 176(AX), CX
|
|
MOVQ CX, 48(SP)
|
|
MOVQ 184(AX), AX
|
|
MOVQ AX, 40(SP)
|
|
MOVQ 40(SP), AX
|
|
ADDQ AX, 48(SP)
|
|
|
|
// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R10, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R12, R10
|
|
|
|
sequenceDecs_decodeSync_amd64_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_end
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_amd64_fill_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_end:
|
|
// Update offset
|
|
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_of_update_zero:
|
|
MOVQ AX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_ml_update_zero:
|
|
MOVQ AX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_amd64_fill_2_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_2_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_amd64_fill_2_end:
|
|
// Update literal length
|
|
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_amd64_ll_update_zero:
|
|
MOVQ AX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R13, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
MOVBQZX DI, R13
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R13
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R13
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decodeSync_amd64_skip_update:
|
|
// Adjust offset
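// zstd keeps the three most recently used offsets. When the offset code reads
// more than one bit, the decoded value becomes the new offset and the history
// shifts down (the MOVUPS/MOVQ pair below). With zero or one bit it is a
// repeat code: the value selects one of the stored offsets, shifted by one
// position when the literal length is zero, and the combination "value 3,
// literal length 0" means "most recent offset minus one", clamped to at
// least 1 for corrupted input.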
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
|
|
MOVQ R13, AX
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, AX
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(AX*8), R14
|
|
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_amd64_adjust_skip
|
|
MOVQ 152(CX), AX
|
|
MOVQ AX, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_amd64_adjust_skip:
|
|
MOVQ 144(CX), AX
|
|
MOVQ AX, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_amd64_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
MOVQ 16(SP), AX
|
|
MOVQ 24(SP), CX
|
|
LEAQ (AX)(CX*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ CX, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
|
|
MOVQ 24(SP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (AX)(R13*1), R14
|
|
ADDQ R10, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
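// The fast path below copies literals in whole 16-byte MOVUPS chunks, so it
// can store up to 15 bytes past the copied length; the caller is expected to
// provide that slack. The _safe_ variants further down do exact tail handling
// instead.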
|
|
TESTQ AX, AX
|
|
JZ check_offset
|
|
XORQ R14, R14
|
|
|
|
copy_1:
|
|
MOVUPS (R11)(R14*1), X0
|
|
MOVUPS X0, (R10)(R14*1)
|
|
ADDQ $0x10, R14
|
|
CMPQ R14, AX
|
|
JB copy_1
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
ADDQ AX, R12
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R12, AX
|
|
ADDQ 40(SP), AX
|
|
CMPQ CX, AX
|
|
JG error_match_off_too_big
|
|
CMPQ CX, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ CX, AX
|
|
SUBQ R12, AX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ AX, R14
|
|
CMPQ R13, AX
|
|
JG copy_all_from_history
|
|
MOVQ R13, AX
|
|
SUBQ $0x10, AX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, AX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(AX*1), R14
|
|
LEAQ 16(R10)(AX*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), AX
|
|
MOVB 2(R14), CL
|
|
MOVW AX, (R10)
|
|
MOVB CL, 2(R10)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), AX
|
|
MOVL -4(R14)(R13*1), CX
|
|
MOVL AX, (R10)
|
|
MOVL CX, -4(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), AX
|
|
MOVQ -8(R14)(R13*1), CX
|
|
MOVQ AX, (R10)
|
|
MOVQ CX, -8(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R12
|
|
JMP handle_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ AX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R10)(R15*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(AX*1), BP
|
|
MOVB R15, (R10)
|
|
MOVB BP, -1(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R10)
|
|
MOVB BP, 2(R10)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(AX*1), BP
|
|
MOVL R15, (R10)
|
|
MOVL BP, -4(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(AX*1), BP
|
|
MOVQ R15, (R10)
|
|
MOVQ BP, -8(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
|
|
copy_5_end:
|
|
ADDQ AX, R12
|
|
SUBQ AX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R10, AX
|
|
SUBQ CX, AX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, CX
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R12
|
|
MOVQ R10, CX
|
|
ADDQ R13, R10
|
|
|
|
copy_2:
|
|
MOVUPS (AX), X0
|
|
MOVUPS X0, (CX)
|
|
ADDQ $0x10, AX
|
|
ADDQ $0x10, CX
|
|
SUBQ $0x10, R13
|
|
JHI copy_2
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
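// The match offset is smaller than the match length here, so source and
// destination overlap: the copy has to advance one byte at a time because
// bytes written earlier in this match are read again as input later on.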
|
|
copy_overlapping_match:
|
|
ADDQ R13, R12
|
|
|
|
copy_slow_3:
|
|
MOVB (AX), CL
|
|
MOVB CL, (R10)
|
|
INCQ AX
|
|
INCQ R10
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decodeSync_amd64_main_loop
|
|
|
|
loop_finished:
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVB BL, 32(AX)
|
|
MOVQ SI, 8(AX)
|
|
|
|
// Update the context
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ R12, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R11
|
|
MOVQ R11, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
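// Same flow as sequenceDecs_decodeSync_amd64, with the shift/mask sequences
// replaced by BMI/BMI2 bit-field instructions: BEXTRQ with the $0x0808
// immediate extracts the 8-bit "bits to read" field starting at bit 8 of a
// packed FSE state, BZHIQ clears everything above a given bit index, and
// SHRXQ shifts by a register count without touching CL or the flags.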
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
|
|
MOVQ br+8(FP), BX
|
|
MOVQ 24(BX), AX
|
|
MOVBQZX 32(BX), DX
|
|
MOVQ (BX), CX
|
|
MOVQ 8(BX), BX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
XORQ R9, R9
|
|
MOVQ R9, 8(SP)
|
|
MOVQ R9, 16(SP)
|
|
MOVQ R9, 24(SP)
|
|
MOVQ 112(CX), R9
|
|
MOVQ 128(CX), R10
|
|
MOVQ R10, 32(SP)
|
|
MOVQ 144(CX), R10
|
|
MOVQ 136(CX), R11
|
|
MOVQ 200(CX), R12
|
|
MOVQ R12, 56(SP)
|
|
MOVQ 176(CX), R12
|
|
MOVQ R12, 48(SP)
|
|
MOVQ 184(CX), CX
|
|
MOVQ CX, 40(SP)
|
|
MOVQ 40(SP), CX
|
|
ADDQ CX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R9, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R11, R9
|
|
|
|
sequenceDecs_decodeSync_bmi2_main_loop:
|
|
MOVQ (SP), R12
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_end
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_end:
|
|
// Update offset
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining (literal length)
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_bmi2_fill_2_end:
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R12, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R12
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_bmi2_skip_update
|
|
LEAQ (SI)(DI*1), R13
|
|
ADDQ R8, R13
|
|
MOVBQZX R13, R13
|
|
LEAQ (DX)(R13*1), CX
|
|
MOVQ AX, R14
|
|
MOVQ CX, DX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
|
|
// Update Offset State
|
|
BZHIQ R8, R14, CX
|
|
SHRXQ R8, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R14, CX
|
|
SHRXQ DI, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R14, CX
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decodeSync_bmi2_skip_update:
|
|
// Adjust offset
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ R12, $0x01
|
|
JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
|
|
MOVQ R13, R12
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, R12
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(R12*8), R14
|
|
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_bmi2_adjust_skip
|
|
MOVQ 152(CX), R12
|
|
MOVQ R12, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_bmi2_adjust_skip:
|
|
MOVQ 144(CX), R12
|
|
MOVQ R12, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_bmi2_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
MOVQ 16(SP), CX
|
|
MOVQ 24(SP), R12
|
|
LEAQ (CX)(R12*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ R12, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ CX, $0x00020002
|
|
JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
|
|
MOVQ 24(SP), CX
|
|
MOVQ 8(SP), R12
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (CX)(R13*1), R14
|
|
ADDQ R9, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ CX, CX
|
|
JZ check_offset
|
|
XORQ R14, R14
|
|
|
|
copy_1:
|
|
MOVUPS (R10)(R14*1), X0
|
|
MOVUPS X0, (R9)(R14*1)
|
|
ADDQ $0x10, R14
|
|
CMPQ R14, CX
|
|
JB copy_1
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
ADDQ CX, R11
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R11, CX
|
|
ADDQ 40(SP), CX
|
|
CMPQ R12, CX
|
|
JG error_match_off_too_big
|
|
CMPQ R12, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, CX
|
|
SUBQ R11, CX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ CX, R14
|
|
CMPQ R13, CX
|
|
JG copy_all_from_history
|
|
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, CX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(CX*1), R14
|
|
LEAQ 16(R9)(CX*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), CX
|
|
MOVB 2(R14), R12
|
|
MOVW CX, (R9)
|
|
MOVB R12, 2(R9)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), CX
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL CX, (R9)
|
|
MOVL R12, -4(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), CX
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ CX, (R9)
|
|
MOVQ R12, -8(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R11
|
|
JMP handle_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ CX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R9)(R15*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(CX*1), BP
|
|
MOVB R15, (R9)
|
|
MOVB BP, -1(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R9)
|
|
MOVB BP, 2(R9)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(CX*1), BP
|
|
MOVL R15, (R9)
|
|
MOVL BP, -4(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(CX*1), BP
|
|
MOVQ R15, (R9)
|
|
MOVQ BP, -8(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
|
|
copy_5_end:
|
|
ADDQ CX, R11
|
|
SUBQ CX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R9, CX
|
|
SUBQ R12, CX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R11
|
|
MOVQ R9, R12
|
|
ADDQ R13, R9
|
|
|
|
copy_2:
|
|
MOVUPS (CX), X0
|
|
MOVUPS X0, (R12)
|
|
ADDQ $0x10, CX
|
|
ADDQ $0x10, R12
|
|
SUBQ $0x10, R13
|
|
JHI copy_2
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, R11
|
|
|
|
copy_slow_3:
|
|
MOVB (CX), R12
|
|
MOVB R12, (R9)
|
|
INCQ CX
|
|
INCQ R9
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decodeSync_bmi2_main_loop
|
|
|
|
loop_finished:
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 24(CX)
|
|
MOVB DL, 32(CX)
|
|
MOVQ BX, 8(CX)
|
|
|
|
// Update the context
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ R11, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R10
|
|
MOVQ R10, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
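// "Safe" variant of sequenceDecs_decodeSync_amd64: literal and match copies
// finish with exact-length moves (1-2, 3, 4-7 and 8-16 byte cases) instead of
// rounding up to full 16-byte MOVUPS stores, so nothing is written past the
// requested length. It is meant for callers that cannot guarantee the extra
// slack assumed by the fast variant.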
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
|
|
MOVQ br+8(FP), CX
|
|
MOVQ 24(CX), DX
|
|
MOVBQZX 32(CX), BX
|
|
MOVQ (CX), AX
|
|
MOVQ 8(CX), SI
|
|
ADDQ SI, AX
|
|
MOVQ AX, (SP)
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 72(AX), DI
|
|
MOVQ 80(AX), R8
|
|
MOVQ 88(AX), R9
|
|
XORQ CX, CX
|
|
MOVQ CX, 8(SP)
|
|
MOVQ CX, 16(SP)
|
|
MOVQ CX, 24(SP)
|
|
MOVQ 112(AX), R10
|
|
MOVQ 128(AX), CX
|
|
MOVQ CX, 32(SP)
|
|
MOVQ 144(AX), R11
|
|
MOVQ 136(AX), R12
|
|
MOVQ 200(AX), CX
|
|
MOVQ CX, 56(SP)
|
|
MOVQ 176(AX), CX
|
|
MOVQ CX, 48(SP)
|
|
MOVQ 184(AX), AX
|
|
MOVQ AX, 40(SP)
|
|
MOVQ 40(SP), AX
|
|
ADDQ AX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R10, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R12, R10
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_main_loop:
|
|
MOVQ (SP), R13
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_end:
|
|
// Update offset
|
|
MOVQ R9, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_of_update_zero:
|
|
MOVQ AX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ R8, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
|
|
MOVQ AX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining (literal length)
|
|
CMPQ SI, $0x08
|
|
JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
MOVQ BX, AX
|
|
SHRQ $0x03, AX
|
|
SUBQ AX, R13
|
|
MOVQ (R13), DX
|
|
SUBQ AX, SI
|
|
ANDQ $0x07, BX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
|
|
CMPQ SI, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
|
|
CMPQ BX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
|
|
SHLQ $0x08, DX
|
|
SUBQ $0x01, R13
|
|
SUBQ $0x01, SI
|
|
SUBQ $0x08, BX
|
|
MOVBQZX (R13), AX
|
|
ORQ AX, DX
|
|
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
|
|
CMPQ BX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
|
|
// Update literal length
|
|
MOVQ DI, AX
|
|
MOVQ BX, CX
|
|
MOVQ DX, R14
|
|
SHLQ CL, R14
|
|
MOVB AH, CL
|
|
SHRQ $0x20, AX
|
|
TESTQ CX, CX
|
|
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
ADDQ CX, BX
|
|
CMPQ BX, $0x40
|
|
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
CMPQ CX, $0x40
|
|
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
|
|
NEGQ CX
|
|
SHRQ CL, R14
|
|
ADDQ R14, AX
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
|
|
MOVQ AX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R13, (SP)
|
|
MOVQ R9, AX
|
|
SHRQ $0x08, AX
|
|
MOVBQZX AL, AX
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_safe_amd64_skip_update
|
|
|
|
// Update Literal Length State
|
|
MOVBQZX DI, R13
|
|
SHRQ $0x10, DI
|
|
MOVWQZX DI, DI
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, DI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Match Length State
|
|
MOVBQZX R8, R13
|
|
SHRQ $0x10, R8
|
|
MOVWQZX R8, R8
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R8
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Offset State
|
|
MOVBQZX R9, R13
|
|
SHRQ $0x10, R9
|
|
MOVWQZX R9, R9
|
|
LEAQ (BX)(R13*1), CX
|
|
MOVQ DX, R14
|
|
MOVQ CX, BX
|
|
ROLQ CL, R14
|
|
MOVL $0x00000001, R15
|
|
MOVB R13, CL
|
|
SHLL CL, R15
|
|
DECL R15
|
|
ANDQ R15, R14
|
|
ADDQ R14, R9
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R9*8), R9
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_skip_update:
|
|
// Adjust offset
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ AX, $0x01
|
|
JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
|
|
MOVQ R13, AX
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, AX
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(AX*8), R14
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
|
|
MOVQ 152(CX), AX
|
|
MOVQ AX, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_adjust_skip:
|
|
MOVQ 144(CX), AX
|
|
MOVQ AX, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
MOVQ 16(SP), AX
|
|
MOVQ 24(SP), CX
|
|
LEAQ (AX)(CX*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ CX, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ AX, $0x00020002
|
|
JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
|
|
TESTQ AX, AX
|
|
JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
|
|
MOVQ 24(SP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (AX)(R13*1), R14
|
|
ADDQ R10, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ AX, AX
|
|
JZ check_offset
|
|
MOVQ AX, R14
|
|
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (R11), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R11
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
LEAQ 16(R11)(R14*1), R11
|
|
LEAQ 16(R10)(R14*1), R10
|
|
MOVUPS -16(R11), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
copy_1_move_1or2:
|
|
MOVB (R11), R14
|
|
MOVB -1(R11)(AX*1), R15
|
|
MOVB R14, (R10)
|
|
MOVB R15, -1(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (R11), R14
|
|
MOVB 2(R11), R15
|
|
MOVW R14, (R10)
|
|
MOVB R15, 2(R10)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_4through7:
|
|
MOVL (R11), R14
|
|
MOVL -4(R11)(AX*1), R15
|
|
MOVL R14, (R10)
|
|
MOVL R15, -4(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_8through16:
|
|
MOVQ (R11), R14
|
|
MOVQ -8(R11)(AX*1), R15
|
|
MOVQ R14, (R10)
|
|
MOVQ R15, -8(R10)(AX*1)
|
|
ADDQ AX, R11
|
|
ADDQ AX, R10
|
|
|
|
copy_1_end:
|
|
ADDQ AX, R12
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R12, AX
|
|
ADDQ 40(SP), AX
|
|
CMPQ CX, AX
|
|
JG error_match_off_too_big
|
|
CMPQ CX, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ CX, AX
|
|
SUBQ R12, AX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ AX, R14
|
|
CMPQ R13, AX
|
|
JG copy_all_from_history
|
|
MOVQ R13, AX
|
|
SUBQ $0x10, AX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, AX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(AX*1), R14
|
|
LEAQ 16(R10)(AX*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), AX
|
|
MOVB 2(R14), CL
|
|
MOVW AX, (R10)
|
|
MOVB CL, 2(R10)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), AX
|
|
MOVL -4(R14)(R13*1), CX
|
|
MOVL AX, (R10)
|
|
MOVL CX, -4(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), AX
|
|
MOVQ -8(R14)(R13*1), CX
|
|
MOVQ AX, (R10)
|
|
MOVQ CX, -8(R10)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R10
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R12
|
|
JMP handle_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ AX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R10)(R15*1), R10
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ AX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ AX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(AX*1), BP
|
|
MOVB R15, (R10)
|
|
MOVB BP, -1(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R10)
|
|
MOVB BP, 2(R10)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(AX*1), BP
|
|
MOVL R15, (R10)
|
|
MOVL BP, -4(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(AX*1), BP
|
|
MOVQ R15, (R10)
|
|
MOVQ BP, -8(R10)(AX*1)
|
|
ADDQ AX, R14
|
|
ADDQ AX, R10
|
|
|
|
copy_5_end:
|
|
ADDQ AX, R12
|
|
SUBQ AX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R10, AX
|
|
SUBQ CX, AX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, CX
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R12
|
|
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (AX), X0
|
|
MOVUPS X0, (R10)
|
|
ADDQ $0x10, AX
|
|
ADDQ $0x10, R10
|
|
SUBQ $0x10, CX
|
|
JAE copy_2_loop
|
|
LEAQ 16(AX)(CX*1), AX
|
|
LEAQ 16(R10)(CX*1), R10
|
|
MOVUPS -16(AX), X0
|
|
MOVUPS X0, -16(R10)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (AX), CL
|
|
MOVB -1(AX)(R13*1), R14
|
|
MOVB CL, (R10)
|
|
MOVB R14, -1(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (AX), CX
|
|
MOVB 2(AX), R14
|
|
MOVW CX, (R10)
|
|
MOVB R14, 2(R10)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (AX), CX
|
|
MOVL -4(AX)(R13*1), R14
|
|
MOVL CX, (R10)
|
|
MOVL R14, -4(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (AX), CX
|
|
MOVQ -8(AX)(R13*1), R14
|
|
MOVQ CX, (R10)
|
|
MOVQ R14, -8(R10)(R13*1)
|
|
ADDQ R13, AX
|
|
ADDQ R13, R10
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, R12
|
|
|
|
copy_slow_3:
|
|
MOVB (AX), CL
|
|
MOVB CL, (R10)
|
|
INCQ AX
|
|
INCQ R10
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
MOVQ ctx+16(FP), AX
|
|
DECQ 96(AX)
|
|
JNS sequenceDecs_decodeSync_safe_amd64_main_loop
|
|
|
|
loop_finished:
|
|
MOVQ br+8(FP), AX
|
|
MOVQ DX, 24(AX)
|
|
MOVB BL, 32(AX)
|
|
MOVQ SI, 8(AX)
|
|
|
|
// Update the context
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ R12, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R11
|
|
MOVQ R11, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R12, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|
|
|
|
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
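// BMI2 build of the safe variant: exact-length copies as in
// sequenceDecs_decodeSync_safe_amd64 combined with the BEXTR/BZHI/SHRX bit
// handling of sequenceDecs_decodeSync_bmi2.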
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
|
|
MOVQ br+8(FP), BX
|
|
MOVQ 24(BX), AX
|
|
MOVBQZX 32(BX), DX
|
|
MOVQ (BX), CX
|
|
MOVQ 8(BX), BX
|
|
ADDQ BX, CX
|
|
MOVQ CX, (SP)
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 72(CX), SI
|
|
MOVQ 80(CX), DI
|
|
MOVQ 88(CX), R8
|
|
XORQ R9, R9
|
|
MOVQ R9, 8(SP)
|
|
MOVQ R9, 16(SP)
|
|
MOVQ R9, 24(SP)
|
|
MOVQ 112(CX), R9
|
|
MOVQ 128(CX), R10
|
|
MOVQ R10, 32(SP)
|
|
MOVQ 144(CX), R10
|
|
MOVQ 136(CX), R11
|
|
MOVQ 200(CX), R12
|
|
MOVQ R12, 56(SP)
|
|
MOVQ 176(CX), R12
|
|
MOVQ R12, 48(SP)
|
|
MOVQ 184(CX), CX
|
|
MOVQ CX, 40(SP)
|
|
MOVQ 40(SP), CX
|
|
ADDQ CX, 48(SP)
|
|
|
|
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
|
|
ADDQ R9, 32(SP)
|
|
|
|
// outBase += outPosition
|
|
ADDQ R11, R9
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_main_loop:
|
|
MOVQ (SP), R12
|
|
|
|
// Fill bitreader to have enough for the offset and match length.
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_end:
|
|
// Update offset
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ R8, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 8(SP)
|
|
|
|
// Update match length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, DI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ DI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 16(SP)
|
|
|
|
// Fill bitreader to have enough for the remaining (literal length)
|
|
CMPQ BX, $0x08
|
|
JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
|
|
MOVQ DX, CX
|
|
SHRQ $0x03, CX
|
|
SUBQ CX, R12
|
|
MOVQ (R12), AX
|
|
SUBQ CX, BX
|
|
ANDQ $0x07, DX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
|
|
CMPQ BX, $0x00
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
|
|
CMPQ DX, $0x07
|
|
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
|
|
SHLQ $0x08, AX
|
|
SUBQ $0x01, R12
|
|
SUBQ $0x01, BX
|
|
SUBQ $0x08, DX
|
|
MOVBQZX (R12), CX
|
|
ORQ CX, AX
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
|
|
CMPQ DX, $0x40
|
|
JA error_overread
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
|
|
// Update literal length
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, SI, R13
|
|
MOVQ AX, R14
|
|
LEAQ (DX)(R13*1), CX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
MOVQ CX, DX
|
|
MOVQ SI, CX
|
|
SHRQ $0x20, CX
|
|
ADDQ R14, CX
|
|
MOVQ CX, 24(SP)
|
|
|
|
// Fill bitreader for state updates
|
|
MOVQ R12, (SP)
|
|
MOVQ $0x00000808, CX
|
|
BEXTRQ CX, R8, R12
|
|
MOVQ ctx+16(FP), CX
|
|
CMPQ 96(CX), $0x00
|
|
JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
|
|
LEAQ (SI)(DI*1), R13
|
|
ADDQ R8, R13
|
|
MOVBQZX R13, R13
|
|
LEAQ (DX)(R13*1), CX
|
|
MOVQ AX, R14
|
|
MOVQ CX, DX
|
|
ROLQ CL, R14
|
|
BZHIQ R13, R14, R14
|
|
|
|
// Update Offset State
|
|
BZHIQ R8, R14, CX
|
|
SHRXQ R8, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, R8, R8
|
|
ADDQ CX, R8
|
|
|
|
// Load ctx.ofTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 48(CX), CX
|
|
MOVQ (CX)(R8*8), R8
|
|
|
|
// Update Match Length State
|
|
BZHIQ DI, R14, CX
|
|
SHRXQ DI, R14, R14
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, DI, DI
|
|
ADDQ CX, DI
|
|
|
|
// Load ctx.mlTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ 24(CX), CX
|
|
MOVQ (CX)(DI*8), DI
|
|
|
|
// Update Literal Length State
|
|
BZHIQ SI, R14, CX
|
|
MOVQ $0x00001010, R13
|
|
BEXTRQ R13, SI, SI
|
|
ADDQ CX, SI
|
|
|
|
// Load ctx.llTable
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ (CX), CX
|
|
MOVQ (CX)(SI*8), SI
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_skip_update:
|
|
// Adjust offset
|
|
MOVQ s+0(FP), CX
|
|
MOVQ 8(SP), R13
|
|
CMPQ R12, $0x01
|
|
JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
|
|
MOVUPS 144(CX), X0
|
|
MOVQ R13, 144(CX)
|
|
MOVUPS X0, 152(CX)
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
|
|
CMPQ 24(SP), $0x00000000
|
|
JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
|
|
INCQ R13
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
|
|
MOVQ 144(CX), R13
|
|
JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
|
|
MOVQ R13, R12
|
|
XORQ R14, R14
|
|
MOVQ $-1, R15
|
|
CMPQ R13, $0x03
|
|
CMOVQEQ R14, R12
|
|
CMOVQEQ R15, R14
|
|
ADDQ 144(CX)(R12*8), R14
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
|
|
MOVQ $0x00000001, R14
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
|
|
CMPQ R13, $0x01
|
|
JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
|
|
MOVQ 152(CX), R12
|
|
MOVQ R12, 160(CX)
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
|
|
MOVQ 144(CX), R12
|
|
MOVQ R12, 152(CX)
|
|
MOVQ R14, 144(CX)
|
|
MOVQ R14, R13
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_after_adjust:
|
|
MOVQ R13, 8(SP)
|
|
|
|
// Check values
|
|
MOVQ 16(SP), CX
|
|
MOVQ 24(SP), R12
|
|
LEAQ (CX)(R12*1), R14
|
|
MOVQ s+0(FP), R15
|
|
ADDQ R14, 256(R15)
|
|
MOVQ ctx+16(FP), R14
|
|
SUBQ R12, 104(R14)
|
|
JS error_not_enough_literals
|
|
CMPQ CX, $0x00020002
|
|
JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
|
|
TESTQ R13, R13
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
|
|
TESTQ CX, CX
|
|
JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
|
|
|
|
sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
|
|
MOVQ 24(SP), CX
|
|
MOVQ 8(SP), R12
|
|
MOVQ 16(SP), R13
|
|
|
|
// Check if we have enough space in s.out
|
|
LEAQ (CX)(R13*1), R14
|
|
ADDQ R9, R14
|
|
CMPQ R14, 32(SP)
|
|
JA error_not_enough_space
|
|
|
|
// Copy literals
|
|
TESTQ CX, CX
|
|
JZ check_offset
|
|
MOVQ CX, R14
|
|
SUBQ $0x10, R14
|
|
JB copy_1_small
|
|
|
|
copy_1_loop:
|
|
MOVUPS (R10), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R10
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R14
|
|
JAE copy_1_loop
|
|
LEAQ 16(R10)(R14*1), R10
|
|
LEAQ 16(R9)(R14*1), R9
|
|
MOVUPS -16(R10), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_1_end
|
|
|
|
copy_1_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_1_move_3
|
|
JB copy_1_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_1_move_4through7
|
|
JMP copy_1_move_8through16
|
|
|
|
copy_1_move_1or2:
|
|
MOVB (R10), R14
|
|
MOVB -1(R10)(CX*1), R15
|
|
MOVB R14, (R9)
|
|
MOVB R15, -1(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_3:
|
|
MOVW (R10), R14
|
|
MOVB 2(R10), R15
|
|
MOVW R14, (R9)
|
|
MOVB R15, 2(R9)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_4through7:
|
|
MOVL (R10), R14
|
|
MOVL -4(R10)(CX*1), R15
|
|
MOVL R14, (R9)
|
|
MOVL R15, -4(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
JMP copy_1_end
|
|
|
|
copy_1_move_8through16:
|
|
MOVQ (R10), R14
|
|
MOVQ -8(R10)(CX*1), R15
|
|
MOVQ R14, (R9)
|
|
MOVQ R15, -8(R9)(CX*1)
|
|
ADDQ CX, R10
|
|
ADDQ CX, R9
|
|
|
|
copy_1_end:
|
|
ADDQ CX, R11
|
|
|
|
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
|
|
check_offset:
|
|
MOVQ R11, CX
|
|
ADDQ 40(SP), CX
|
|
CMPQ R12, CX
|
|
JG error_match_off_too_big
|
|
CMPQ R12, 56(SP)
|
|
JG error_match_off_too_big
|
|
|
|
// Copy match from history
|
|
MOVQ R12, CX
|
|
SUBQ R11, CX
|
|
JLS copy_match
|
|
MOVQ 48(SP), R14
|
|
SUBQ CX, R14
|
|
CMPQ R13, CX
|
|
JG copy_all_from_history
|
|
MOVQ R13, CX
|
|
SUBQ $0x10, CX
|
|
JB copy_4_small
|
|
|
|
copy_4_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, CX
|
|
JAE copy_4_loop
|
|
LEAQ 16(R14)(CX*1), R14
|
|
LEAQ 16(R9)(CX*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_4_end
|
|
|
|
copy_4_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_4_move_3
|
|
CMPQ R13, $0x08
|
|
JB copy_4_move_4through7
|
|
JMP copy_4_move_8through16
|
|
|
|
copy_4_move_3:
|
|
MOVW (R14), CX
|
|
MOVB 2(R14), R12
|
|
MOVW CX, (R9)
|
|
MOVB R12, 2(R9)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_4through7:
|
|
MOVL (R14), CX
|
|
MOVL -4(R14)(R13*1), R12
|
|
MOVL CX, (R9)
|
|
MOVL R12, -4(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
JMP copy_4_end
|
|
|
|
copy_4_move_8through16:
|
|
MOVQ (R14), CX
|
|
MOVQ -8(R14)(R13*1), R12
|
|
MOVQ CX, (R9)
|
|
MOVQ R12, -8(R9)(R13*1)
|
|
ADDQ R13, R14
|
|
ADDQ R13, R9
|
|
|
|
copy_4_end:
|
|
ADDQ R13, R11
|
|
JMP handle_loop
|
|
JMP loop_finished
|
|
|
|
copy_all_from_history:
|
|
MOVQ CX, R15
|
|
SUBQ $0x10, R15
|
|
JB copy_5_small
|
|
|
|
copy_5_loop:
|
|
MOVUPS (R14), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, R14
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R15
|
|
JAE copy_5_loop
|
|
LEAQ 16(R14)(R15*1), R14
|
|
LEAQ 16(R9)(R15*1), R9
|
|
MOVUPS -16(R14), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_5_end
|
|
|
|
copy_5_small:
|
|
CMPQ CX, $0x03
|
|
JE copy_5_move_3
|
|
JB copy_5_move_1or2
|
|
CMPQ CX, $0x08
|
|
JB copy_5_move_4through7
|
|
JMP copy_5_move_8through16
|
|
|
|
copy_5_move_1or2:
|
|
MOVB (R14), R15
|
|
MOVB -1(R14)(CX*1), BP
|
|
MOVB R15, (R9)
|
|
MOVB BP, -1(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_3:
|
|
MOVW (R14), R15
|
|
MOVB 2(R14), BP
|
|
MOVW R15, (R9)
|
|
MOVB BP, 2(R9)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_4through7:
|
|
MOVL (R14), R15
|
|
MOVL -4(R14)(CX*1), BP
|
|
MOVL R15, (R9)
|
|
MOVL BP, -4(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
JMP copy_5_end
|
|
|
|
copy_5_move_8through16:
|
|
MOVQ (R14), R15
|
|
MOVQ -8(R14)(CX*1), BP
|
|
MOVQ R15, (R9)
|
|
MOVQ BP, -8(R9)(CX*1)
|
|
ADDQ CX, R14
|
|
ADDQ CX, R9
|
|
|
|
copy_5_end:
|
|
ADDQ CX, R11
|
|
SUBQ CX, R13
|
|
|
|
// Copy match from the current buffer
|
|
copy_match:
|
|
MOVQ R9, CX
|
|
SUBQ R12, CX
|
|
|
|
// ml <= mo
|
|
CMPQ R13, R12
|
|
JA copy_overlapping_match
|
|
|
|
// Copy non-overlapping match
|
|
ADDQ R13, R11
|
|
MOVQ R13, R12
|
|
SUBQ $0x10, R12
|
|
JB copy_2_small
|
|
|
|
copy_2_loop:
|
|
MOVUPS (CX), X0
|
|
MOVUPS X0, (R9)
|
|
ADDQ $0x10, CX
|
|
ADDQ $0x10, R9
|
|
SUBQ $0x10, R12
|
|
JAE copy_2_loop
|
|
LEAQ 16(CX)(R12*1), CX
|
|
LEAQ 16(R9)(R12*1), R9
|
|
MOVUPS -16(CX), X0
|
|
MOVUPS X0, -16(R9)
|
|
JMP copy_2_end
|
|
|
|
copy_2_small:
|
|
CMPQ R13, $0x03
|
|
JE copy_2_move_3
|
|
JB copy_2_move_1or2
|
|
CMPQ R13, $0x08
|
|
JB copy_2_move_4through7
|
|
JMP copy_2_move_8through16
|
|
|
|
copy_2_move_1or2:
|
|
MOVB (CX), R12
|
|
MOVB -1(CX)(R13*1), R14
|
|
MOVB R12, (R9)
|
|
MOVB R14, -1(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_3:
|
|
MOVW (CX), R12
|
|
MOVB 2(CX), R14
|
|
MOVW R12, (R9)
|
|
MOVB R14, 2(R9)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_4through7:
|
|
MOVL (CX), R12
|
|
MOVL -4(CX)(R13*1), R14
|
|
MOVL R12, (R9)
|
|
MOVL R14, -4(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
JMP copy_2_end
|
|
|
|
copy_2_move_8through16:
|
|
MOVQ (CX), R12
|
|
MOVQ -8(CX)(R13*1), R14
|
|
MOVQ R12, (R9)
|
|
MOVQ R14, -8(R9)(R13*1)
|
|
ADDQ R13, CX
|
|
ADDQ R13, R9
|
|
|
|
copy_2_end:
|
|
JMP handle_loop
|
|
|
|
// Copy overlapping match
|
|
copy_overlapping_match:
|
|
ADDQ R13, R11
|
|
|
|
copy_slow_3:
|
|
MOVB (CX), R12
|
|
MOVB R12, (R9)
|
|
INCQ CX
|
|
INCQ R9
|
|
DECQ R13
|
|
JNZ copy_slow_3
|
|
|
|
handle_loop:
|
|
MOVQ ctx+16(FP), CX
|
|
DECQ 96(CX)
|
|
JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
|
|
|
|
loop_finished:
|
|
MOVQ br+8(FP), CX
|
|
MOVQ AX, 24(CX)
|
|
MOVB DL, 32(CX)
|
|
MOVQ BX, 8(CX)
|
|
|
|
// Update the context
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ R11, 136(AX)
|
|
MOVQ 144(AX), CX
|
|
SUBQ CX, R10
|
|
MOVQ R10, 168(AX)
|
|
|
|
// Return success
|
|
MOVQ $0x00000000, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match length error
|
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
|
|
MOVQ 16(SP), AX
|
|
MOVQ ctx+16(FP), CX
|
|
MOVQ AX, 216(CX)
|
|
MOVQ $0x00000001, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match too long error
|
|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ $0x00000002, ret+24(FP)
|
|
RET
|
|
|
|
// Return with match offset too long error
|
|
error_match_off_too_big:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 8(SP), CX
|
|
MOVQ CX, 224(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000003, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough literals error
|
|
error_not_enough_literals:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ $0x00000004, ret+24(FP)
|
|
RET
|
|
|
|
// Return with overread error
|
|
error_overread:
|
|
MOVQ $0x00000006, ret+24(FP)
|
|
RET
|
|
|
|
// Return with not enough output space error
|
|
error_not_enough_space:
|
|
MOVQ ctx+16(FP), AX
|
|
MOVQ 24(SP), CX
|
|
MOVQ CX, 208(AX)
|
|
MOVQ 16(SP), CX
|
|
MOVQ CX, 216(AX)
|
|
MOVQ R11, 136(AX)
|
|
MOVQ $0x00000005, ret+24(FP)
|
|
RET
|