diff --git a/api/api.go b/api/api.go index 9918bf5..2124bf9 100644 --- a/api/api.go +++ b/api/api.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" + "git.dubyatp.xyz/chat-api-server/db" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/go-chi/docgen" @@ -14,6 +15,10 @@ import ( var routes = flag.Bool("routes", false, "Generate API route documentation") func Start() { + + db.InitScyllaDB() + defer db.CloseScyllaDB() + flag.Parse() r := chi.NewRouter() diff --git a/api/db.go b/api/db.go index 82e8de5..a3c2ec3 100644 --- a/api/db.go +++ b/api/db.go @@ -3,275 +3,168 @@ package api import ( "errors" "fmt" - "time" - "git.dubyatp.xyz/chat-api-server/fake_db" + "git.dubyatp.xyz/chat-api-server/db" + "github.com/gocql/gocql" ) func dbGetUser(id string) (*User, error) { - data := fake_db.ExecDB("users") - if data == nil { - return nil, errors.New("failed to load users database") + query := `SELECT id, name, password FROM users WHERE id = ?` + var user User + err := db.Session.Query(query, id).Scan(&user.ID, &user.Name, &user.Password) + if err == gocql.ErrNotFound { + return nil, errors.New("User not found") + } else if err != nil { + return nil, fmt.Errorf("failed to query user: %v", err) } - - users := data["users"].([]interface{}) - for _, u := range users { - user := u.(map[string]interface{}) - if user["ID"].(string) == id { - return &User{ - ID: user["ID"].(string), - Name: user["Name"].(string), - Password: user["Password"].(string), - }, nil - } - } - return nil, errors.New("User not found") + return &user, nil } func dbGetUserByName(username string) (*User, error) { - data := fake_db.ExecDB("users") - if data == nil { - return nil, errors.New("failed to load users database") + + query := `SELECT id, name, password FROM users WHERE name = ? 
ALLOW FILTERING` + var user User + err := db.Session.Query(query, username).Scan(&user.ID, &user.Name, &user.Password) + if err == gocql.ErrNotFound { + return nil, errors.New("User not found") + } else if err != nil { + return nil, fmt.Errorf("failed to query user: %v", err) } - users := data["users"].([]interface{}) - for _, u := range users { - user := u.(map[string]interface{}) - if user["Name"].(string) == username { - return &User{ - ID: user["ID"].(string), - Name: user["Name"].(string), - Password: user["Password"].(string), - }, nil - } - } - return nil, errors.New("User not found") + return &user, nil } func dbGetAllUsers() ([]*User, error) { - data := fake_db.ExecDB("users") - if data == nil { - return nil, errors.New("failed to load users database") + query := `SELECT id, name, password FROM users` + iter := db.Session.Query(query).Iter() + defer iter.Close() + + var users []*User + for { + user := &User{} + if !iter.Scan(&user.ID, &user.Name, &user.Password) { + break + } + users = append(users, user) } - users := data["users"].([]interface{}) - var result []*User - for _, u := range users { - user := u.(map[string]interface{}) - result = append(result, &User{ - ID: user["ID"].(string), - Name: user["Name"].(string), - }) + if err := iter.Close(); err != nil { + return nil, fmt.Errorf("failed to iterate users: %v", err) } - if len(result) == 0 { + + if len(users) == 0 { return nil, errors.New("no users found") } - return result, nil + + return users, nil } func dbGetMessage(id string) (*Message, error) { - data := fake_db.ExecDB("messages") - if data == nil { - return nil, errors.New("failed to load messages database") + query := `SELECT id, body, edited, timestamp, userid FROM messages WHERE id = ?` + var message Message + err := db.Session.Query(query, id).Scan( + &message.ID, + &message.Body, + &message.Edited, + &message.Timestamp, + &message.UserID) + if err == gocql.ErrNotFound { + return nil, errors.New("Message not found") + } else if err != nil { + return nil, fmt.Errorf("failed to query message: %v", err) } - messages := data["messages"].([]interface{}) - for _, m := range messages { - message := m.(map[string]interface{}) - if message["ID"].(string) == id { - timestamp, err := time.Parse(time.RFC3339, message["Timestamp"].(string)) - if err != nil { - return nil, fmt.Errorf("failed to parse timestamp: %v", err) - } - editedStr, ok := message["Edited"].(string) - var edited time.Time - if ok && editedStr != "" { - var err error - edited, err = time.Parse(time.RFC3339, editedStr) - if err != nil { - return nil, fmt.Errorf("failed to parse edited timestamp: %v", err) - } - } - return &Message{ - ID: message["ID"].(string), - UserID: message["UserID"].(string), - Body: message["Body"].(string), - Timestamp: timestamp, - Edited: edited, - }, nil - } - } - return nil, errors.New("Message not found") + return &message, nil } func dbGetAllMessages() ([]*Message, error) { - data := fake_db.ExecDB("messages") - //println(data) - if data == nil { - return nil, errors.New("failed to load messages database") + query := `SELECT id, body, edited, timestamp, userid FROM messages` + iter := db.Session.Query(query).Iter() + defer iter.Close() + + var messages []*Message + for { + message := &Message{} + if !iter.Scan( + &message.ID, + &message.Body, + &message.Edited, + &message.Timestamp, + &message.UserID) { + break + } + messages = append(messages, message) } - messages := data["messages"].([]interface{}) - var result []*Message - for _, m := range messages { - message := 
m.(map[string]interface{}) - timestamp, err := time.Parse(time.RFC3339, message["Timestamp"].(string)) - if err != nil { - return nil, fmt.Errorf("failed to parse timestamp: %v", err) - } - editedStr, ok := message["Edited"].(string) - var edited time.Time - if ok && editedStr != "" { - var err error - edited, err = time.Parse(time.RFC3339, editedStr) - if err != nil { - return nil, fmt.Errorf("failed to parse edited timestamp: %v", err) - } - } - result = append(result, &Message{ - ID: message["ID"].(string), - UserID: message["UserID"].(string), - Body: message["Body"].(string), - Timestamp: timestamp, - Edited: edited, - }) + if err := iter.Close(); err != nil { + return nil, fmt.Errorf("failed to iterate messages: %v", err) } - if len(result) == 0 { + + if len(messages) == 0 { return nil, errors.New("no messages found") } - return result, nil + + return messages, nil } func dbAddUser(user *User) error { - currentData := fake_db.ExecDB("users") - if currentData == nil { - return fmt.Errorf("error reading users database") + query := `INSERT INTO users (id, name, password) VALUES (?, ?, ?)` + err := db.Session.Query(query, user.ID, user.Name, user.Password).Exec() + if err != nil { + return fmt.Errorf("failed to add user: %v", err) } - - users, ok := currentData["users"].([]interface{}) - if !ok { - return fmt.Errorf("users data is in an unexpected format") - } - - dbUser := map[string]interface{}{ - "ID": user.ID, - "Name": user.Name, - "Password": user.Password, - } - - users = append(users, dbUser) - return fake_db.WriteDB("users", users) + return nil } func dbAddMessage(message *Message) error { - currentData := fake_db.ExecDB("messages") - if currentData == nil { - return fmt.Errorf("error reading messages database") + query := `INSERT INTO messages (id, body, edited, timestamp, userid) + VALUES (?, ?, ?, ?, ?)` + err := db.Session.Query(query, + message.ID, + message.Body, + nil, + message.Timestamp, + message.UserID).Exec() + if err != nil { + return fmt.Errorf("failed to add message: %v", err) } - - messages, ok := currentData["messages"].([]interface{}) - if !ok { - return fmt.Errorf("messages data is in an unexpected format") - } - - var edited interface{} - if message.Edited.IsZero() { - edited = nil // Set to nil if Edited is the zero value - } else { - edited = message.Edited.Format(time.RFC3339) - } - - dbMessage := map[string]interface{}{ - "ID": message.ID, - "UserID": message.UserID, // JSON numbers are float64 - "Body": message.Body, - "Timestamp": message.Timestamp.Format(time.RFC3339), - "Edited": edited, - } - - messages = append(messages, dbMessage) - return fake_db.WriteDB("messages", messages) + return nil } func dbUpdateMessage(updatedMessage *Message) error { - currentData := fake_db.ExecDB("messages") - if currentData == nil { - return fmt.Errorf("error reading messages database") + var edited interface{} + if updatedMessage.Edited.IsZero() { + edited = nil + } else { + edited = updatedMessage.Edited } - messages, ok := currentData["messages"].([]interface{}) - if !ok { - return fmt.Errorf("messages data is in an unexpected format") + query := `UPDATE messages + SET body = ?, edited = ?, timestamp = ? 
+ WHERE ID = ?` + + err := db.Session.Query(query, + updatedMessage.Body, + edited, + updatedMessage.Timestamp, + updatedMessage.ID).Exec() + + if err != nil { + return fmt.Errorf("failed to update message: %v", err) } - var updatedMessages []interface{} - found := false + return nil - for _, m := range messages { - message, ok := m.(map[string]interface{}) - if !ok { - continue - } - - if messageID, ok := message["ID"].(string); ok && messageID == updatedMessage.ID { - found = true - - var edited interface{} - if updatedMessage.Edited.IsZero() { - edited = nil // Set to nil if Edited is the zero value - } else { - edited = updatedMessage.Edited.Format(time.RFC3339) - } - - message = map[string]interface{}{ - "ID": updatedMessage.ID, - "UserID": updatedMessage.UserID, - "Body": updatedMessage.Body, - "Timestamp": updatedMessage.Timestamp.Format(time.RFC3339), - "Edited": edited, - } - } - updatedMessages = append(updatedMessages, message) - } - - if !found { - return fmt.Errorf("message with ID %s not found", updatedMessage.ID) - } - - return fake_db.WriteDB("messages", updatedMessages) } func dbDeleteMessage(id string) error { - currentData := fake_db.ExecDB("messages") - if currentData == nil { - return fmt.Errorf("error reading messages database") + query := `DELETE FROM messages WHERE ID = ?` + + err := db.Session.Query(query, id).Exec() + + if err != nil { + return fmt.Errorf("failed to delete message: %v", err) } - messages, ok := currentData["messages"].([]interface{}) - if !ok { - return fmt.Errorf("messages data is in an unexpected format") - } - - var updatedMessages []interface{} - found := false - - for _, m := range messages { - message, ok := m.(map[string]interface{}) - if !ok { - continue - } - - if messageID, ok := message["ID"].(string); ok && messageID == id { - found = true - continue - } - - updatedMessages = append(updatedMessages, message) - } - - if !found { - return fmt.Errorf("message with ID %s not found", id) - } - - return fake_db.WriteDB("messages", updatedMessages) + return nil } diff --git a/api/message.go b/api/message.go index 5bb07e2..bbc0a19 100644 --- a/api/message.go +++ b/api/message.go @@ -65,7 +65,8 @@ func EditMessage(w http.ResponseWriter, r *http.Request) { } message.Body = body - message.Edited = time.Now() + editedTime := time.Now() + message.Edited = &editedTime err = dbUpdateMessage(message) if err != nil { @@ -85,7 +86,7 @@ func DeleteMessage(w http.ResponseWriter, r *http.Request) { render.Render(w, r, ErrNotFound) return } - dbDeleteMessage(message.ID) + dbDeleteMessage(message.ID.String()) if err := render.Render(w, r, NewMessageResponse(message)); err != nil { render.Render(w, r, ErrRender(err)) return @@ -104,8 +105,8 @@ func ListMessages(w http.ResponseWriter, r *http.Request) { } } -func newMessageID() string { - return "msg_" + uuid.New().String() +func newMessageID() uuid.UUID { + return uuid.New() } func NewMessage(w http.ResponseWriter, r *http.Request) { @@ -135,7 +136,6 @@ func NewMessage(w http.ResponseWriter, r *http.Request) { UserID: user.ID, Body: body, Timestamp: time.Now(), - Edited: time.Time{}, } err = dbAddMessage(&msg) @@ -150,11 +150,11 @@ func NewMessage(w http.ResponseWriter, r *http.Request) { type messageKey struct{} type Message struct { - ID string `json:"id"` - UserID string `json:"user_id"` - Body string `json:"body"` - Timestamp time.Time `json:"timestamp"` - Edited time.Time `json:"edited"` + ID uuid.UUID `json:"id"` + UserID uuid.UUID `json:"user_id"` + Body string `json:"body"` + Timestamp time.Time 
`json:"timestamp"` + Edited *time.Time `json:"edited"` } type MessageRequest struct { @@ -175,8 +175,8 @@ type MessageResponse struct { func (m MessageResponse) MarshalJSON() ([]byte, error) { type OrderedMessageResponse struct { - ID string `json:"id"` - UserID string `json:"user_id"` + ID uuid.UUID `json:"id"` + UserID uuid.UUID `json:"user_id"` Body string `json:"body"` Timestamp string `json:"timestamp"` Edited *string `json:"edited,omitempty"` // Use a pointer to allow null values @@ -185,7 +185,7 @@ func (m MessageResponse) MarshalJSON() ([]byte, error) { } var edited *string - if !m.Message.Edited.IsZero() { // Check if Edited is not the zero value + if m.Message.Edited != nil { // Check if Edited is not the zero value editedStr := m.Message.Edited.Format(time.RFC3339) edited = &editedStr } diff --git a/api/response.go b/api/response.go index b48c715..b94c583 100644 --- a/api/response.go +++ b/api/response.go @@ -10,7 +10,7 @@ func NewMessageResponse(message *Message) *MessageResponse { resp := &MessageResponse{Message: message} if resp.User == nil { - if user, _ := dbGetUser(resp.UserID); user != nil { + if user, _ := dbGetUser(resp.UserID.String()); user != nil { resp.User = NewUserPayloadResponse(user) } } diff --git a/api/user.go b/api/user.go index 7841215..f859441 100644 --- a/api/user.go +++ b/api/user.go @@ -89,8 +89,8 @@ func ListUsers(w http.ResponseWriter, r *http.Request) { } } -func newUserID() string { - return "user_" + uuid.New().String() +func newUserID() uuid.UUID { + return uuid.New() } func NewUser(w http.ResponseWriter, r *http.Request) { @@ -130,9 +130,9 @@ func NewUser(w http.ResponseWriter, r *http.Request) { type userKey struct{} type User struct { - ID string `json:"id"` - Name string `json:"name"` - Password string `json:"-"` + ID uuid.UUID `json:"id"` + Name string `json:"name"` + Password string `json:"-"` } type UserPayload struct { diff --git a/db/scylla.go b/db/scylla.go new file mode 100644 index 0000000..9163854 --- /dev/null +++ b/db/scylla.go @@ -0,0 +1,28 @@ +package db + +import ( + "log" + + "github.com/gocql/gocql" +) + +var Session *gocql.Session + +func InitScyllaDB() { + cluster := gocql.NewCluster("127.0.0.1") // Replace with your ScyllaDB cluster IPs + cluster.Keyspace = "chatservice" // Replace with your keyspace + cluster.Consistency = gocql.Quorum + + session, err := cluster.CreateSession() + if err != nil { + log.Fatalf("Failed to connect to ScyllaDB: %v", err) + } + Session = session + log.Println("Connected to ScyllaDB") +} + +func CloseScyllaDB() { + if Session != nil { + Session.Close() + } +} diff --git a/fake_db/fake_db.go b/fake_db/fake_db.go deleted file mode 100644 index 6fec138..0000000 --- a/fake_db/fake_db.go +++ /dev/null @@ -1,73 +0,0 @@ -package fake_db - -import ( - "encoding/json" - "fmt" - "io" - "os" -) - -func ExecDB(db_name string) map[string]interface{} { - var result map[string]interface{} - - if db_name == "users" { - users_db, err := os.Open("./test_data/users.json") - if err != nil { - fmt.Println(err) - return nil - } - fmt.Println("Successfully opened Users DB") - defer users_db.Close() - - byteValue, _ := io.ReadAll(users_db) - var users []interface{} - json.Unmarshal(byteValue, &users) - result = map[string]interface{}{"users": users} - - } else if db_name == "messages" { - messages_db, err := os.Open("./test_data/messages.json") - if err != nil { - fmt.Println(err) - return nil - } - fmt.Println("Successfully opened Messages DB") - defer messages_db.Close() - - byteValue, _ := io.ReadAll(messages_db) - 
var messages []interface{} - json.Unmarshal(byteValue, &messages) - result = map[string]interface{}{"messages": messages} - - } else { - fmt.Println("Invalid DB name") - return nil - } - - return result -} - -func WriteDB(db_name string, data interface{}) error { - var filePath string - - switch db_name { - case "users": - filePath = "./test_data/users.json" - case "messages": - filePath = "./test_data/messages.json" - default: - return fmt.Errorf("invalid database name: %s", db_name) - } - - jsonData, err := json.MarshalIndent(data, "", " ") - if err != nil { - return fmt.Errorf("error marshaling data to JSON: %v", err) - } - - err = os.WriteFile(filePath, jsonData, 0644) - if err != nil { - return fmt.Errorf("error writing to file: %v", err) - } - - fmt.Printf("Successfully wrote to %s DB\n", db_name) - return nil -} diff --git a/flake.nix b/flake.nix index d5bd632..7252997 100644 --- a/flake.nix +++ b/flake.nix @@ -1,7 +1,9 @@ { description = "Unnamed Chat Server API"; - inputs.nixpkgs.url = "nixpkgs/nixos-unstable"; + inputs = { + nixpkgs.url = "nixpkgs/nixos-unstable"; + }; outputs = { self, nixpkgs }: let diff --git a/go.mod b/go.mod index 7d4d653..ac01431 100644 --- a/go.mod +++ b/go.mod @@ -8,10 +8,19 @@ require ( github.com/go-chi/chi/v5 v5.2.0 github.com/go-chi/docgen v1.3.0 github.com/go-chi/render v1.0.3 + github.com/gocql/gocql v1.7.0 github.com/google/uuid v1.6.0 ) +require ( + github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect + github.com/klauspost/compress v1.17.9 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect +) + require ( github.com/ajg/form v1.5.1 // indirect golang.org/x/crypto v0.36.0 ) + +replace github.com/gocql/gocql => github.com/scylladb/gocql v1.14.5 diff --git a/go.sum b/go.sum index 65458ed..d74d725 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,9 @@ github.com/ajg/form v1.5.1 h1:t9c7v8JUKu/XxOGBU0yjNpaMloxGEJhUkqFRq0ibGeU= github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k= +github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-chi/chi/v5 v5.0.1/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= github.com/go-chi/chi/v5 v5.2.0 h1:Aj1EtB0qR2Rdo2dG4O94RIU35w2lvQSj6BRA4+qwFL0= github.com/go-chi/chi/v5 v5.2.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= @@ -8,9 +12,40 @@ github.com/go-chi/docgen v1.3.0/go.mod h1:G9W0G551cs2BFMSn/cnGwX+JBHEloAgo17MBhy github.com/go-chi/render v1.0.1/go.mod h1:pq4Rr7HbnsdaeHagklXub+p6Wd16Af5l9koip1OvJns= github.com/go-chi/render v1.0.3 h1:AsXqd2a1/INaIfUSKq3G5uA8weYx20FOsM7uSoCyyt4= github.com/go-chi/render v1.0.3/go.mod h1:/gr3hVkmYR0YlEy3LxCuVRFzEu9Ruok+gFqbIofjao0= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= +github.com/klauspost/compress v1.17.9 
h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/scylladb/gocql v1.14.5 h1:lyJKf0m/Vate+8MGiVeRhQNpLVVsL21gvp89zEZdltI= +github.com/scylladb/gocql v1.14.5/go.mod h1:1efi3H0Gr72WCR0W+i+d63FmwmJhDL/zfAC0gMJHVlM= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/net v0.0.0-20220526153639-5463443f8c37/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/test_data/messages.json b/test_data/messages.json deleted file mode 100644 index c1e4f81..0000000 --- a/test_data/messages.json +++ /dev/null @@ -1,100 +0,0 @@ -[ - { - "Body": "hello", - "Edited": null, - "ID": "1", - "Timestamp": "2024-12-25T05:00:40Z", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "world", - "Edited": null, - "ID": "2", - "Timestamp": "2024-12-25T05:00:43Z", - "UserID": "user_63dac6ad-f255-4af8-a057-4b064a982a84" - }, - { - "Body": "abababa", - "Edited": null, - "ID": "3", - "Timestamp": "2024-12-25T05:01:20Z", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "bitch", - "Edited": null, - 
"ID": "4", - "Timestamp": "2024-12-25T05:05:55Z", - "UserID": "user_63dac6ad-f255-4af8-a057-4b064a982a84" - }, - { - "Body": "NIBBA", - "Edited": null, - "ID": "5", - "Timestamp": "2025-03-24T14:48:28.249221047-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "nibby", - "Edited": null, - "ID": "6", - "Timestamp": "2025-03-24T14:49:03.246929039-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "aaaaababananana", - "Edited": null, - "ID": "msg_60f70a47-3be2-4315-869a-d6f151ec262a", - "Timestamp": "2025-03-24T15:01:07.14371835-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "ababa abbott", - "Edited": null, - "ID": "msg_94cbc26d-9098-4fa9-bd21-794516c2263d", - "Timestamp": "2025-03-24T20:34:57.198849367-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "AAAAAA", - "Edited": null, - "ID": "msg_ca8483db-e823-45c4-882c-fe0930610ba9", - "Timestamp": "2025-03-24T21:17:04.350827576-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "i am a femboiiiii", - "Edited": null, - "ID": "msg_fcdbb48a-4ea5-4fb3-b925-3a15eb7c291c", - "Timestamp": "2025-03-24T21:27:48.565290147-04:00", - "UserID": "user_63dac6ad-f255-4af8-a057-4b064a982a84" - }, - { - "Body": "i love soap", - "Edited": "2025-03-27T14:49:14-04:00", - "ID": "msg_59851eb1-2e63-46c1-b496-55566c414e33", - "Timestamp": "2025-03-27T14:40:26-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "I'd just like to interject for a moment. What you're referring to as Linux, is in fact, GNU/Linux, or as I've recently taken to calling it, GNU plus Linux. Linux is not an operating system unto itself, but rather another free component of a fully functioning GNU system made useful by the GNU corelibs, shell utilities and vital system components comprising a full OS as defined by POSIX. Many computer users run a modified version of the GNU system every day, without realizing it. Through a peculiar turn of events, the version of GNU which is widely used today is often called “Linux,” and many of its users are not aware that it is basically the GNU system, developed by the GNU Project. There really is a Linux, and these people are using it, but it is just a part of the system they use.\n\nLinux is the kernel: the program in the system that allocates the machine's resources to the other programs that you run. The kernel is an essential part of an operating system, but useless by itself; it can only function in the context of a complete operating system. Linux is normally used in combination with the GNU operating system: the whole system is basically GNU with Linux added, or GNU/Linux. 
All the so-called “Linux” distributions are really distributions of GNU/Linux.", - "Edited": "2025-03-27T20:35:33-04:00", - "ID": "msg_d77f8e0f-5c23-4c10-984f-b07559e7c5ed", - "Timestamp": "2025-03-27T18:56:27-04:00", - "UserID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54" - }, - { - "Body": "oh \n\n\nok", - "Edited": null, - "ID": "msg_8d0d8e24-2c1d-4337-afdb-06d1a121e486", - "Timestamp": "2025-03-27T18:57:52-04:00", - "UserID": "user_63dac6ad-f255-4af8-a057-4b064a982a84" - }, - { - "Body": "we shall ATTACK at the edge of propaganda", - "Edited": null, - "ID": "msg_dc55edfd-e0f7-4923-b686-df90ad4bb108", - "Timestamp": "2025-03-27T19:00:17-04:00", - "UserID": "user_63dac6ad-f255-4af8-a057-4b064a982a84" - } -] \ No newline at end of file diff --git a/test_data/users.json b/test_data/users.json deleted file mode 100644 index 97ed1f9..0000000 --- a/test_data/users.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "ID": "user_8d7cd2ed-0aa2-4810-a172-42dd58563a54", - "Name": "duby", - "Password": "$2a$10$fYKgHJRgR6hJl9VAAu4HPeeyTbDP3UCxiAxZMMKDL8A0ya0Sdg.pq" - }, - { - "ID": "user_63dac6ad-f255-4af8-a057-4b064a982a84", - "Name": "astolfo", - "Password": "$2a$10$ryzbb6l/hkZH6wwtdLdbYew3R1ug4O3tdHi4581WQHui8JKSPFqSu" - } -] \ No newline at end of file diff --git a/vendor/github.com/gocql/gocql/.gitignore b/vendor/github.com/gocql/gocql/.gitignore new file mode 100644 index 0000000..bce6cf5 --- /dev/null +++ b/vendor/github.com/gocql/gocql/.gitignore @@ -0,0 +1,5 @@ +gocql-fuzz +fuzz-corpus +fuzz-work +gocql.test +.idea diff --git a/vendor/github.com/gocql/gocql/AUTHORS b/vendor/github.com/gocql/gocql/AUTHORS new file mode 100644 index 0000000..ac589e1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/AUTHORS @@ -0,0 +1,148 @@ +# This source file refers to The gocql Authors for copyright purposes. 
+ +Christoph Hack +Jonathan Rudenberg +Thorsten von Eicken +Matt Robenolt +Phillip Couto +Niklas Korz +Nimi Wariboko Jr +Ghais Issa +Sasha Klizhentas +Konstantin Cherkasov +Ben Hood <0x6e6562@gmail.com> +Pete Hopkins +Chris Bannister +Maxim Bublis +Alex Zorin +Kasper Middelboe Petersen +Harpreet Sawhney +Charlie Andrews +Stanislavs Koikovs +Dan Forest +Miguel Serrano +Stefan Radomski +Josh Wright +Jacob Rhoden +Ben Frye +Fred McCann +Dan Simmons +Muir Manders +Sankar P +Julien Da Silva +Dan Kennedy +Nick Dhupia +Yasuharu Goto +Jeremy Schlatter +Matthias Kadenbach +Dean Elbaz +Mike Berman +Dmitriy Fedorenko +Zach Marcantel +James Maloney +Ashwin Purohit +Dan Kinder +Oliver Beattie +Justin Corpron +Miles Delahunty +Zach Badgett +Maciek Sakrejda +Jeff Mitchell +Baptiste Fontaine +Matt Heath +Jamie Cuthill +Adrian Casajus +John Weldon +Adrien Bustany +Andrey Smirnov +Adam Weiner +Daniel Cannon +Johnny Bergström +Adriano Orioli +Claudiu Raveica +Artem Chernyshev +Ference Fu +LOVOO +nikandfor +Anthony Woods +Alexander Inozemtsev +Rob McColl ; +Viktor Tönköl +Ian Lozinski +Michael Highstead +Sarah Brown +Caleb Doxsey +Frederic Hemery +Pekka Enberg +Mark M +Bartosz Burclaf +Marcus King +Andrew de Andrade +Robert Nix +Nathan Youngman +Charles Law ; +Nathan Davies +Bo Blanton +Vincent Rischmann +Jesse Claven +Derrick Wippler +Leigh McCulloch +Ron Kuris +Raphael Gavache +Yasser Abdolmaleki +Krishnanand Thommandra +Blake Atkinson +Dharmendra Parsaila +Nayef Ghattas +Michał Matczuk +Ben Krebsbach +Vivian Mathews +Sascha Steinbiss +Seth Rosenblum +Javier Zunzunegui +Luke Hines +Zhixin Wen +Chang Liu +Ingo Oeser +Luke Hines +Jacob Greenleaf +Alex Lourie ; +Marco Cadetg +Karl Matthias +Thomas Meson +Martin Sucha ; +Pavel Buchinchik +Rintaro Okamura +Ivan Boyarkin ; +Yura Sokolov ; +Jorge Bay +Dmitriy Kozlov +Alexey Romanovsky +Jaume Marhuenda Beltran +Piotr Dulikowski +Árni Dagur +Tushar Das +Maxim Vladimirskiy +Bogdan-Ciprian Rusu +Yuto Doi +Krishna Vadali +Jens-W. Schicke-Uffmann +Ondrej Polakovič +Sergei Karetnikov +Stefan Miklosovic +Adam Burk +Valerii Ponomarov +Neal Turett +Doug Schaapveld +Steven Seidman +Wojciech Przytuła +João Reis +Lauro Ramos Venancio +Dmitry Kropachev +Oliver Boyle +Jackson Fleming +Sylwia Szunejko +Karol Baryła +Marcin Mazurek +Moguchev Leonid Alekseevich +Julien Lefevre \ No newline at end of file diff --git a/vendor/github.com/gocql/gocql/CONTRIBUTING.md b/vendor/github.com/gocql/gocql/CONTRIBUTING.md new file mode 100644 index 0000000..8c2df74 --- /dev/null +++ b/vendor/github.com/gocql/gocql/CONTRIBUTING.md @@ -0,0 +1,78 @@ +# Contributing to gocql + +**TL;DR** - this manifesto sets out the bare minimum requirements for submitting a patch to gocql. + +This guide outlines the process of landing patches in gocql and the general approach to maintaining the code base. + +## Background + +The goal of the gocql project is to provide a stable and robust CQL driver for Go. gocql is a community driven project that is coordinated by a small team of core developers. 
+ +## Minimum Requirement Checklist + +The following is a check list of requirements that need to be satisfied in order for us to merge your patch: + +* You should raise a pull request to gocql/gocql on Github +* The pull request has a title that clearly summarizes the purpose of the patch +* The motivation behind the patch is clearly defined in the pull request summary +* Your name and email have been added to the `AUTHORS` file (for copyright purposes) +* The patch will merge cleanly +* The test coverage does not fall below the critical threshold (currently 64%) +* The merge commit passes the regression test suite on Travis +* `go fmt` has been applied to the submitted code +* Notable changes (i.e. new features or changed behavior, bugfixes) are appropriately documented in CHANGELOG.md, functional changes also in godoc + +If there are any requirements that can't be reasonably satisfied, please state this either on the pull request or as part of discussion on the mailing list. Where appropriate, the core team may apply discretion and make an exception to these requirements. + +## Beyond The Checklist + +In addition to stating the hard requirements, there are a bunch of things that we consider when assessing changes to the library. These soft requirements are helpful pointers of how to get a patch landed quicker and with less fuss. + +### General QA Approach + +The gocql team needs to consider the ongoing maintainability of the library at all times. Patches that look like they will introduce maintenance issues for the team will not be accepted. + +Your patch will get merged quicker if you have decent test cases that provide test coverage for the new behavior you wish to introduce. + +Unit tests are good, integration tests are even better. An example of a unit test is `marshal_test.go` - this tests the serialization code in isolation. `cassandra_test.go` is an integration test suite that is executed against every version of Cassandra that gocql supports as part of the CI process on Travis. + +That said, the point of writing tests is to provide a safety net to catch regressions, so there is no need to go overboard with tests. Remember that the more tests you write, the more code we will have to maintain. So there's a balance to strike there. + +### When It's Too Difficult To Automate Testing + +There are legitimate examples of where it is infeasible to write a regression test for a change. Never fear, we will still consider the patch and quite possibly accept the change without a test. The gocql team takes a pragmatic approach to testing. At the end of the day, you could be addressing an issue that is too difficult to reproduce in a test suite, but still occurs in a real production app. In this case, your production app is the test case, and we will have to trust that your change is good. + +Examples of pull requests that have been accepted without tests include: + +* https://github.com/gocql/gocql/pull/181 - this patch would otherwise require a multi-node cluster to be booted as part of the CI build +* https://github.com/gocql/gocql/pull/179 - this bug can only be reproduced under heavy load in certain circumstances + +### Sign Off Procedure + +Generally speaking, a pull request can get merged by any one of the core gocql team. If your change is minor, chances are that one team member will just go ahead and merge it there and then. As stated earlier, suitable test coverage will increase the likelihood that a single reviewer will assess and merge your change. 
If your change has no test coverage, or looks like it may have wider implications for the health and stability of the library, the reviewer may elect to refer the change to another team member to achieve consensus before proceeding. Therefore, the tighter and cleaner your patch is, the quicker it will go through the review process. + +### Supported Features + +gocql is a low level wire driver for Cassandra CQL. By and large, we would like to keep the functional scope of the library as narrow as possible. We think that gocql should be tight and focused, and we will be naturally skeptical of things that could just as easily be implemented in a higher layer. Inevitably you will come across something that could be implemented in a higher layer, save for a minor change to the core API. In this instance, please strike up a conversation with the gocql team. Chances are we will understand what you are trying to achieve and will try to accommodate this in a maintainable way. + +### Longer Term Evolution + +There are some long term plans for gocql that have to be taken into account when assessing changes. That said, gocql is ultimately a community driven project and we don't have a massive development budget, so sometimes the long term view might need to be de-prioritized ahead of short term changes. + +## Officially Supported Server Versions + +Currently, the officially supported versions of the Cassandra server include: + +* 1.2.18 +* 2.0.9 + +Chances are that gocql will work with many other versions. If you would like us to support a particular version of Cassandra, please start a conversation about what version you'd like us to consider. We are more likely to accept a new version if you help out by extending the regression suite to cover the new version to be supported. + +## The Core Dev Team + +The core development team includes: + +* tux21b +* phillipCouto +* Zariel +* 0x6e6562 diff --git a/vendor/github.com/gocql/gocql/LICENSE b/vendor/github.com/gocql/gocql/LICENSE new file mode 100644 index 0000000..3836494 --- /dev/null +++ b/vendor/github.com/gocql/gocql/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2016, The Gocql authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/gocql/gocql/Makefile b/vendor/github.com/gocql/gocql/Makefile new file mode 100644 index 0000000..65dae37 --- /dev/null +++ b/vendor/github.com/gocql/gocql/Makefile @@ -0,0 +1,5 @@ +# Makefile to run the Docker cleanup script + +clean-old-temporary-docker-images: + @echo "Running Docker Hub image cleanup script..." + python ci/clean-old-temporary-docker-images.py diff --git a/vendor/github.com/gocql/gocql/README.md b/vendor/github.com/gocql/gocql/README.md new file mode 100644 index 0000000..62d8995 --- /dev/null +++ b/vendor/github.com/gocql/gocql/README.md @@ -0,0 +1,242 @@ +
+![Build Passing](https://github.com/scylladb/gocql/workflows/Build/badge.svg)
+[![Read the Fork Driver Docs](https://img.shields.io/badge/Read_the_Docs-pkg_go-blue)](https://pkg.go.dev/github.com/scylladb/gocql#section-documentation)
+[![Protocol Specs](https://img.shields.io/badge/Protocol_Specs-ScyllaDB_Docs-blue)](https://github.com/scylladb/scylladb/blob/master/docs/dev/protocol-extensions.md)
+
+# Scylla Shard-Aware Fork of [apache/cassandra-gocql-driver](https://github.com/apache/cassandra-gocql-driver)
+
+This is a fork of the [apache/cassandra-gocql-driver](https://github.com/apache/cassandra-gocql-driver) package that we created at Scylla.
+It contains extensions to tokenAwareHostPolicy supported by Scylla 2.3 and onwards.
+It allows the driver to select a connection to a particular shard on a host based on the token.
+This eliminates passing data between shards and significantly reduces latency.
+
+There are open pull requests to merge the functionality into the upstream project:
+
+* [gocql/gocql#1210](https://github.com/gocql/gocql/pull/1210)
+* [gocql/gocql#1211](https://github.com/gocql/gocql/pull/1211)
+
+It also provides support for shard-aware ports, a faster way to connect to all shards; details are available in [this blog post](https://www.scylladb.com/2021/04/27/connect-faster-to-scylla-with-a-shard-aware-port/).
+
+---
+
+### Table of Contents
+
+- [1. Sunsetting Model](#1-sunsetting-model)
+- [2. Installation](#2-installation)
+- [3. Quick Start](#3-quick-start)
+- [4. Data Types](#4-data-types)
+- [5. Configuration](#5-configuration)
+  - [5.1 Shard-aware port](#51-shard-aware-port)
+  - [5.2 Iterator](#52-iterator)
+- [6. Contributing](#6-contributing)
+
+## 1. Sunsetting Model
+
+> [!WARNING]
+> In general, the gocql team will focus on supporting the current and previous versions of Go. gocql may still work with older versions of Go, but official support for these versions will have been sunset.
+
+## 2. Installation
+
+This is a drop-in replacement for gocql; it reuses the `github.com/gocql/gocql` import path.
+
+Add the following line to your project's `go.mod` file:
+
+```mod
+replace github.com/gocql/gocql => github.com/scylladb/gocql latest
+```
+
+and run
+
+```sh
+go mod tidy
+```
+
+to resolve `latest` to a concrete tag.
+
+Your project now uses the Scylla driver fork. Make sure you are using the `TokenAwareHostPolicy` to enable shard-awareness; continue reading for details.
+
+## 3. Quick Start
+
+Spawn a ScyllaDB instance using the Docker `run` command:
+
+```sh
+docker run --name node1 --network your-network -p "9042:9042" -d scylladb/scylla:6.1.2 \
+    --overprovisioned 1 \
+    --smp 1
+```
+
+Then create a new connection using ScyllaDB GoCQL, following the example below:
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/gocql/gocql"
+)
+
+func main() {
+	var cluster = gocql.NewCluster("localhost:9042")
+
+	var session, err = cluster.CreateSession()
+	if err != nil {
+		panic("Failed to connect to cluster")
+	}
+
+	defer session.Close()
+
+	var query = session.Query("SELECT * FROM system.clients")
+
+	if rows, err := query.Iter().SliceMap(); err == nil {
+		for _, row := range rows {
+			fmt.Printf("%v\n", row)
+		}
+	} else {
+		panic("Query error: " + err.Error())
+	}
+}
+```
+
+## 4. Data Types
+
+Here's a list of all ScyllaDB types as reflected in the GoCQL environment:
+
+| ScyllaDB Type | Go Type            |
+| ------------- | ------------------ |
+| `ascii`       | `string`           |
+| `bigint`      | `int64`            |
+| `blob`        | `[]byte`           |
+| `boolean`     | `bool`             |
+| `date`        | `time.Time`        |
+| `decimal`     | `inf.Dec`          |
+| `double`      | `float64`          |
+| `duration`    | `gocql.Duration`   |
+| `float`       | `float32`          |
+| `uuid`        | `gocql.UUID`       |
+| `int`         | `int32`            |
+| `inet`        | `string`           |
+| `list`        | `[]int32`          |
+| `map`         | `map[int32]string` |
+| `set`         | `[]int32`          |
+| `smallint`    | `int16`            |
+| `text`        | `string`           |
+| `time`        | `time.Duration`    |
+| `timestamp`   | `time.Time`        |
+| `timeuuid`    | `gocql.UUID`       |
+| `tinyint`     | `int8`             |
+| `varchar`     | `string`           |
+| `varint`      | `int64`            |
+
+## 5. Configuration
+
+In order to make shard-awareness work, the token-aware host selection policy has to be enabled.
+Please make sure that the gocql configuration has `PoolConfig.HostSelectionPolicy` properly set, as in the example below.
+
+__When working with a Scylla cluster, the `PoolConfig.NumConns` option has no effect: the driver opens one connection for each shard and completely ignores this option.__
+
+```go
+c := gocql.NewCluster(hosts...)
+
+// Enable token aware host selection policy, if using multi-dc cluster set a local DC.
+fallback := gocql.RoundRobinHostPolicy()
+if localDC != "" {
+	fallback = gocql.DCAwareRoundRobinPolicy(localDC)
+}
+c.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy(fallback)
+
+// If using multi-dc cluster use the "local" consistency levels.
+if localDC != "" {
+	c.Consistency = gocql.LocalQuorum
+}
+
+// When working with a Scylla cluster the driver always opens one connection per shard, so `NumConns` is ignored.
+// c.NumConns = 4
+```
+
+### 5.1 Shard-aware port
+
+This version of gocql supports a more robust method of establishing a connection for each shard by using the _shard-aware port_ for the native transport.
+It greatly reduces the time and the number of connections needed to establish a connection per shard in some cases, e.g. when many clients connect at once, or when there are non-shard-aware clients connected to the same cluster.
+
+If you are using a custom Dialer and your nodes expose the shard-aware port, it is highly recommended to update it so that it uses a specific source port when connecting.
+
+* If you are using a custom `net.Dialer`, you can make your dialer honor the source port by wrapping it in a `gocql.ScyllaShardAwareDialer`:
+
+  ```go
+  oldDialer := net.Dialer{...}
+  clusterConfig.Dialer = &gocql.ScyllaShardAwareDialer{oldDialer}
+  ```
+
+* If you are using a custom type implementing `gocql.Dialer`, you can get the source port by using the `gocql.ScyllaGetSourcePort` function.
+  An example:
+
+  ```go
+  func (d *myDialer) DialContext(ctx context.Context, network, addr string) (net.Conn, error) {
+  	sourcePort := gocql.ScyllaGetSourcePort(ctx)
+  	localAddr, err := net.ResolveTCPAddr(network, fmt.Sprintf(":%d", sourcePort))
+  	if err != nil {
+  		return nil, err
+  	}
+  	d := &net.Dialer{LocalAddr: localAddr}
+  	return d.DialContext(ctx, network, addr)
+  }
+  ```
+
+  The source port might already be bound by another connection on your system.
+  In such a case, you should return an appropriate error so that the driver can retry with a different port suitable for the shard it tries to connect to.
+
+  * If you are using `net.Dialer.DialContext`, this function will return an error in case the source port is unavailable, and you can just return that error from your custom `Dialer`.
+  * Otherwise, if you detect that the source port is unavailable, you can either return `gocql.ErrScyllaSourcePortAlreadyInUse` or `syscall.EADDRINUSE`.
+
+For this feature to work correctly, you need to make sure the following conditions are met:
+
+* Your cluster nodes are configured to listen on the shard-aware port (`native_shard_aware_transport_port` option),
+* Your cluster nodes are not behind a NAT which changes source ports,
+* If you have a custom Dialer, it connects from the correct source port (see the guide above).
+
+The feature is designed to gracefully fall back to using the non-shard-aware port when it detects that some of the above conditions are not met.
+The driver will print a warning about misconfigured address translation if it detects it.
+Issues with the shard-aware port not being reachable are not reported in non-debug mode, because there is no way to detect them without false positives.
+
+If you suspect that this feature is causing you problems, you can completely disable it by setting the `ClusterConfig.DisableShardAwarePort` flag to true.
+
+### 5.2 Iterator
+
+Paging is a way to parse large result sets in smaller chunks.
+The driver provides an iterator to simplify this process.
+
+Use `Query.Iter()` to obtain an iterator:
+
+```go
+iter := session.Query("SELECT id, value FROM my_table WHERE id > 100 AND id < 10000").Iter()
+var results []int
+
+var id, value int
+for iter.Scan(&id, &value) {
+	if id%2 == 0 {
+		results = append(results, value)
+	}
+}
+
+if err := iter.Close(); err != nil {
+	// handle error
+}
+```
+
+In the case of range and `ALLOW FILTERING` queries, the server can send empty responses for some pages.
+That is why you should never treat an empty response as the end of the result set.
+Always check the `iter.Scan()` result to know if there are more results, or `Iter.LastPage()` to know if the last page was reached.
+
+## 6. Contributing
+
+If you are interested in contributing to this GoCQL fork, please read [CONTRIBUTING.md](CONTRIBUTING.md) before opening any issue or pull request.
diff --git a/vendor/github.com/gocql/gocql/address_translators.go b/vendor/github.com/gocql/gocql/address_translators.go
new file mode 100644
index 0000000..6638bca
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/address_translators.go
@@ -0,0 +1,26 @@
+package gocql
+
+import "net"
+
+// AddressTranslator provides a way to translate node addresses (and ports) that are
+// discovered or received as a node event. This can be useful in an ec2 environment,
+// for instance, to translate public IPs to private IPs.
+type AddressTranslator interface {
+	// Translate will translate the provided address and/or port to another
+	// address and/or port. If no translation is possible, Translate will return the
+	// address and port provided to it.
+	Translate(addr net.IP, port int) (net.IP, int)
+}
+
+type AddressTranslatorFunc func(addr net.IP, port int) (net.IP, int)
+
+func (fn AddressTranslatorFunc) Translate(addr net.IP, port int) (net.IP, int) {
+	return fn(addr, port)
+}
+
+// IdentityTranslator will do nothing but return what it was provided. It is essentially a no-op.
+func IdentityTranslator() AddressTranslator {
+	return AddressTranslatorFunc(func(addr net.IP, port int) (net.IP, int) {
+		return addr, port
+	})
+}
diff --git a/vendor/github.com/gocql/gocql/cluster.go b/vendor/github.com/gocql/gocql/cluster.go
new file mode 100644
index 0000000..d54a4d5
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/cluster.go
@@ -0,0 +1,541 @@
+// Copyright (c) 2012 The gocql Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gocql
+
+import (
+	"context"
+	"crypto/tls"
+	"crypto/x509"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"net"
+	"sync/atomic"
+	"time"
+)
+
+const defaultDriverName = "ScyllaDB GoCQL Driver"
+
+// PoolConfig configures the connection pool used by the driver. It defaults to
+// using a round-robin host selection policy and a round-robin connection selection
+// policy for each host.
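+//
+// A minimal sketch of enabling the shard-aware, token-aware policy described
+// in the README above (the host address is a placeholder):
+//
+//	cfg := gocql.NewCluster("192.168.1.1")
+//	cfg.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy(gocql.RoundRobinHostPolicy())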
+type PoolConfig struct { + // HostSelectionPolicy sets the policy for selecting which host to use for a + // given query (default: RoundRobinHostPolicy()) + // It is not supported to use a single HostSelectionPolicy in multiple sessions + // (even if you close the old session before using in a new session). + HostSelectionPolicy HostSelectionPolicy +} + +func (p PoolConfig) buildPool(session *Session) *policyConnPool { + return newPolicyConnPool(session) +} + +// ClusterConfig is a struct to configure the default cluster implementation +// of gocql. It has a variety of attributes that can be used to modify the +// behavior to fit the most common use cases. Applications that require a +// different setup must implement their own cluster. +type ClusterConfig struct { + // addresses for the initial connections. It is recommended to use the value set in + // the Cassandra config for broadcast_address or listen_address, an IP address not + // a domain name. This is because events from Cassandra will use the configured IP + // address, which is used to index connected hosts. If the domain name specified + // resolves to more than 1 IP address then the driver may connect multiple times to + // the same host, and will not mark the node being down or up from events. + Hosts []string + + // CQL version (default: 3.0.0) + CQLVersion string + + // ProtoVersion sets the version of the native protocol to use, this will + // enable features in the driver for specific protocol versions, generally this + // should be set to a known version (2,3,4) for the cluster being connected to. + // + // If it is 0 or unset (the default) then the driver will attempt to discover the + // highest supported protocol for the cluster. In clusters with nodes of different + // versions the protocol selected is not defined (ie, it can be any of the supported in the cluster) + ProtoVersion int + + // Timeout limits the time spent on the client side while executing a query. + // Specifically, query or batch execution will return an error if the client does not receive a response + // from the server within the Timeout period. + // Timeout is also used to configure the read timeout on the underlying network connection. + // Client Timeout should always be higher than the request timeouts configured on the server, + // so that retries don't overload the server. + // Timeout has a default value of 11 seconds, which is higher than default server timeout for most query types. + // Timeout is not applied to requests during initial connection setup, see ConnectTimeout. + Timeout time.Duration + + // ConnectTimeout limits the time spent during connection setup. + // During initial connection setup, internal queries, AUTH requests will return an error if the client + // does not receive a response within the ConnectTimeout period. + // ConnectTimeout is applied to the connection setup queries independently. + // ConnectTimeout also limits the duration of dialing a new TCP connection + // in case there is no Dialer nor HostDialer configured. + // ConnectTimeout has a default value of 11 seconds. + ConnectTimeout time.Duration + + // WriteTimeout limits the time the driver waits to write a request to a network connection. + // WriteTimeout should be lower than or equal to Timeout. + // WriteTimeout defaults to the value of Timeout. + WriteTimeout time.Duration + + // Port used when dialing. + // Default: 9042 + Port int + + // Initial keyspace. Optional. + Keyspace string + + // The size of the connection pool for each host. 
+	// The pool filling runs in a separate goroutine during the session initialization phase.
+	// gocql will always try to get 1 connection on each host pool
+	// during session initialization AND it will attempt
+	// to fill each pool afterward asynchronously if NumConns > 1.
+	// Notice: There is no guarantee that pool filling will be finished in the initialization phase.
+	// Also, it describes a maximum number of connections at the same time.
+	// Default: 2
+	NumConns int
+
+	// Maximum number of inflight requests allowed per connection.
+	// Default: 32768 for CQL v3 and newer
+	// Default: 128 for older CQL versions
+	MaxRequestsPerConn int
+
+	// Default consistency level.
+	// Default: Quorum
+	Consistency Consistency
+
+	// Compression algorithm.
+	// Default: nil
+	Compressor Compressor
+
+	// Default: nil
+	Authenticator Authenticator
+
+	WarningsHandlerBuilder WarningHandlerBuilder
+
+	// An Authenticator factory. Can be used to create alternative authenticators.
+	// Default: nil
+	AuthProvider func(h *HostInfo) (Authenticator, error)
+
+	// Default retry policy to use for queries.
+	// Default: no retries.
+	RetryPolicy RetryPolicy
+
+	// ConvictionPolicy decides whether to mark host as down based on the error and host info.
+	// Default: SimpleConvictionPolicy
+	ConvictionPolicy ConvictionPolicy
+
+	// Default reconnection policy to use for reconnecting before trying to mark host as down.
+	ReconnectionPolicy ReconnectionPolicy
+
+	// A reconnection policy to use for reconnecting when connecting to the cluster first time.
+	InitialReconnectionPolicy ReconnectionPolicy
+
+	// The keepalive period to use, enabled if > 0 (default: 15 seconds)
+	// SocketKeepalive is used to set up the default dialer and is ignored if Dialer or HostDialer is provided.
+	SocketKeepalive time.Duration
+
+	// Maximum cache size for prepared statements globally for gocql.
+	// Default: 1000
+	MaxPreparedStmts int
+
+	// Maximum cache size for query info about statements for each session.
+	// Default: 1000
+	MaxRoutingKeyInfo int
+
+	// Default page size to use for created sessions.
+	// Default: 5000
+	PageSize int
+
+	// Consistency for the serial part of queries, values can be either SERIAL or LOCAL_SERIAL.
+	// Default: unset
+	SerialConsistency SerialConsistency
+
+	// SslOpts configures TLS use when HostDialer is not set.
+	// SslOpts is ignored if HostDialer is set.
+	SslOpts       *SslOptions
+	actualSslOpts atomic.Value
+
+	// Sends a client side timestamp for all requests which overrides the timestamp at which it arrives at the server.
+	// Default: true, only enabled for protocol 3 and above.
+	DefaultTimestamp bool
+
+	// The name of the driver that is going to be reported to the server.
+	// Default: "ScyllaDB GoCQL Driver"
+	DriverName string
+
+	// The version of the driver that is going to be reported to the server.
+	// Defaults to the current library version.
+	DriverVersion string
+
+	// PoolConfig configures the underlying connection pool, allowing the
+	// configuration of host selection and connection selection policies.
+	PoolConfig PoolConfig
+
+	// If not zero, gocql attempts to reconnect known DOWN nodes every ReconnectInterval.
+	ReconnectInterval time.Duration
+
+	// The maximum amount of time to wait for schema agreement in a cluster after
+	// receiving a schema change frame. (default: 60s)
+	MaxWaitSchemaAgreement time.Duration
+
+	// HostFilter will filter all incoming events for host, any which don't pass
+	// the filter will be ignored. If set, it will take precedence over any options set
+	// via Discovery.
+	HostFilter HostFilter
+
+	// AddressTranslator will translate addresses found on peer discovery and/or
+	// node change events.
+	AddressTranslator AddressTranslator
+
+	// If IgnorePeerAddr is true and the address in system.peers does not match
+	// the supplied host by either initial hosts or discovered via events then the
+	// host will be replaced with the supplied address.
+	//
+	// For example if an event comes in with host=10.0.0.1 but when looking up that
+	// address in system.local or system.peers returns 127.0.0.1, the peer will be
+	// set to 10.0.0.1 which is what will be used to connect to.
+	IgnorePeerAddr bool
+
+	// If DisableInitialHostLookup then the driver will not attempt to get host info
+	// from the system.peers table, this will mean that the driver will connect to
+	// hosts supplied and will not attempt to lookup the hosts information, this will
+	// mean that data_centre, rack and token information will not be available and as
+	// such host filtering and token aware query routing will not be available.
+	DisableInitialHostLookup bool
+
+	// Configure events the driver will register for
+	Events struct {
+		// disable registering for status events (node up/down)
+		DisableNodeStatusEvents bool
+		// disable registering for topology events (node added/removed/moved)
+		DisableTopologyEvents bool
+		// disable registering for schema events (keyspace/table/function removed/created/updated)
+		DisableSchemaEvents bool
+	}
+
+	// DisableSkipMetadata will override the internal result metadata cache so that the driver does not
+	// send skip_metadata for queries, this means that the result will always contain
+	// the metadata to parse the rows and will not reuse the metadata from the prepared
+	// statement.
+	//
+	// See https://issues.apache.org/jira/browse/CASSANDRA-10786
+	// See https://github.com/scylladb/scylladb/issues/20860
+	//
+	// Default: true
+	DisableSkipMetadata bool
+
+	// QueryObserver will set the provided query observer on all queries created from this session.
+	// Use it to collect metrics / stats from queries by providing an implementation of QueryObserver.
+	QueryObserver QueryObserver
+
+	// BatchObserver will set the provided batch observer on all queries created from this session.
+	// Use it to collect metrics / stats from batch queries by providing an implementation of BatchObserver.
+	BatchObserver BatchObserver
+
+	// ConnectObserver will set the provided connect observer on all queries
+	// created from this session.
+	ConnectObserver ConnectObserver
+
+	// FrameHeaderObserver will set the provided frame header observer on all frames' headers created from this session.
+	// Use it to collect metrics / stats from frames by providing an implementation of FrameHeaderObserver.
+	FrameHeaderObserver FrameHeaderObserver
+
+	// StreamObserver will be notified of stream state changes.
+	// This can be used to track in-flight protocol requests and responses.
+	StreamObserver StreamObserver
+
+	// Default idempotence for queries
+	DefaultIdempotence bool
+
+	// The time to wait for frames before flushing the frames connection to Cassandra.
+	// Can help reduce syscall overhead by making less calls to write. Set to 0 to
+	// disable.
+	//
+	// (default: 200 microseconds)
+	WriteCoalesceWaitTime time.Duration
+
+	// Dialer will be used to establish all connections created for this Cluster.
+	// If not provided, a default dialer configured with ConnectTimeout will be used.
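+	//
+	// A minimal sketch of supplying a custom dialer (the timeout value is a
+	// placeholder; a *net.Dialer satisfies this interface via its DialContext method):
+	//
+	//	cfg.Dialer = &net.Dialer{Timeout: 5 * time.Second}
+	//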
+ // Dialer is ignored if HostDialer is provided. + Dialer Dialer + + // HostDialer will be used to establish all connections for this Cluster. + // Unlike Dialer, HostDialer is responsible for setting up the entire connection, including the TLS session. + // To support shard-aware port, HostDialer should implement ShardDialer. + // If not provided, Dialer will be used instead. + HostDialer HostDialer + + // DisableShardAwarePort will prevent the driver from connecting to Scylla's shard-aware port, + // even if there are nodes in the cluster that support it. + // + // It is generally recommended to leave this option turned off because gocql can use + // the shard-aware port to make the process of establishing more robust. + // However, if you have a cluster with nodes which expose shard-aware port + // but the port is unreachable due to network configuration issues, you can use + // this option to work around the issue. Set it to true only if you neither can fix + // your network nor disable shard-aware port on your nodes. + DisableShardAwarePort bool + + // Logger for this ClusterConfig. + // If not specified, defaults to the global gocql.Logger. + Logger StdLogger + + // The timeout for the requests to the schema tables. (default: 60s) + MetadataSchemaRequestTimeout time.Duration + + // internal config for testing + disableControlConn bool + disableInit bool +} + +type Dialer interface { + DialContext(ctx context.Context, network, addr string) (net.Conn, error) +} + +// NewCluster generates a new config for the default cluster implementation. +// +// The supplied hosts are used to initially connect to the cluster then the rest of +// the ring will be automatically discovered. It is recommended to use the value set in +// the Cassandra config for broadcast_address or listen_address, an IP address not +// a domain name. This is because events from Cassandra will use the configured IP +// address, which is used to index connected hosts. If the domain name specified +// resolves to more than 1 IP address then the driver may connect multiple times to +// the same host, and will not mark the node being down or up from events. +func NewCluster(hosts ...string) *ClusterConfig { + cfg := &ClusterConfig{ + Hosts: hosts, + CQLVersion: "3.0.0", + Timeout: 11 * time.Second, + ConnectTimeout: 11 * time.Second, + Port: 9042, + NumConns: 2, + Consistency: Quorum, + MaxPreparedStmts: defaultMaxPreparedStmts, + MaxRoutingKeyInfo: 1000, + PageSize: 5000, + DefaultTimestamp: true, + DriverName: defaultDriverName, + DriverVersion: defaultDriverVersion, + MaxWaitSchemaAgreement: 60 * time.Second, + ReconnectInterval: 60 * time.Second, + ConvictionPolicy: &SimpleConvictionPolicy{}, + ReconnectionPolicy: &ConstantReconnectionPolicy{MaxRetries: 3, Interval: 1 * time.Second}, + InitialReconnectionPolicy: &NoReconnectionPolicy{}, + SocketKeepalive: 15 * time.Second, + WriteCoalesceWaitTime: 200 * time.Microsecond, + MetadataSchemaRequestTimeout: 60 * time.Second, + DisableSkipMetadata: true, + WarningsHandlerBuilder: DefaultWarningHandlerBuilder, + } + + return cfg +} + +func (cfg *ClusterConfig) logger() StdLogger { + if cfg.Logger == nil { + return Logger + } + return cfg.Logger +} + +// CreateSession initializes the cluster based on this config and returns a +// session object that can be used to interact with the database. 
+func (cfg *ClusterConfig) CreateSession() (*Session, error) {
+	return NewSession(*cfg)
+}
+
+func (cfg *ClusterConfig) CreateSessionNonBlocking() (*Session, error) {
+	return NewSessionNonBlocking(*cfg)
+}
+
+// translateAddressPort is a helper method that will use the given AddressTranslator,
+// if defined, to translate the given address and port into a possibly new address
+// and port. If no AddressTranslator is set, or if an error occurs, the given address
+// and port will be returned.
+func (cfg *ClusterConfig) translateAddressPort(addr net.IP, port int) (net.IP, int) {
+	if cfg.AddressTranslator == nil || len(addr) == 0 {
+		return addr, port
+	}
+	newAddr, newPort := cfg.AddressTranslator.Translate(addr, port)
+	if gocqlDebug {
+		cfg.logger().Printf("gocql: translating address '%v:%d' to '%v:%d'", addr, port, newAddr, newPort)
+	}
+	return newAddr, newPort
+}
+
+func (cfg *ClusterConfig) filterHost(host *HostInfo) bool {
+	return !(cfg.HostFilter == nil || cfg.HostFilter.Accept(host))
+}
+
+func (cfg *ClusterConfig) ValidateAndInitSSL() error {
+	if cfg.SslOpts == nil {
+		return nil
+	}
+	actualTLSConfig, err := setupTLSConfig(cfg.SslOpts)
+	if err != nil {
+		return fmt.Errorf("failed to initialize ssl configuration: %s", err.Error())
+	}
+
+	cfg.actualSslOpts.Store(actualTLSConfig)
+	return nil
+}
+
+func (cfg *ClusterConfig) getActualTLSConfig() *tls.Config {
+	val, ok := cfg.actualSslOpts.Load().(*tls.Config)
+	if !ok {
+		return nil
+	}
+	return val.Clone()
+}
+
+func (cfg *ClusterConfig) Validate() error {
+	if len(cfg.Hosts) == 0 {
+		return ErrNoHosts
+	}
+
+	if cfg.Authenticator != nil && cfg.AuthProvider != nil {
+		return errors.New("Can't use both Authenticator and AuthProvider in cluster config.")
+	}
+
+	if cfg.InitialReconnectionPolicy == nil {
+		return errors.New("InitialReconnectionPolicy is nil")
+	}
+
+	if cfg.InitialReconnectionPolicy.GetMaxRetries() <= 0 {
+		return errors.New("InitialReconnectionPolicy.GetMaxRetries returns a non-positive number")
+	}
+
+	if cfg.ReconnectionPolicy == nil {
+		return errors.New("ReconnectionPolicy is nil")
+	}
+
+	if cfg.ReconnectionPolicy.GetMaxRetries() <= 0 {
+		return errors.New("ReconnectionPolicy.GetMaxRetries returns a non-positive number")
+	}
+
+	if cfg.PageSize < 0 {
+		return errors.New("PageSize should be a positive number or zero")
+	}
+
+	if cfg.MaxRoutingKeyInfo < 0 {
+		return errors.New("MaxRoutingKeyInfo should be a positive number or zero")
+	}
+
+	if cfg.MaxPreparedStmts < 0 {
+		return errors.New("MaxPreparedStmts should be a positive number or zero")
+	}
+
+	if cfg.SocketKeepalive < 0 {
+		return errors.New("SocketKeepalive should be a positive time.Duration or zero")
+	}
+
+	if cfg.MaxRequestsPerConn < 0 {
+		return errors.New("MaxRequestsPerConn should be a positive number or zero")
+	}
+
+	if cfg.NumConns < 0 {
+		return errors.New("NumConns should be a positive number or zero")
+	}
+
+	if cfg.Port <= 0 || cfg.Port > 65535 {
+		return errors.New("Port should be a valid port number: a number between 1 and 65535")
+	}
+
+	if cfg.WriteTimeout < 0 {
+		return errors.New("WriteTimeout should be a positive time.Duration or zero")
+	}
+
+	if cfg.Timeout < 0 {
+		return errors.New("Timeout should be a positive time.Duration or zero")
+	}
+
+	if cfg.ConnectTimeout < 0 {
+		return errors.New("ConnectTimeout should be a positive time.Duration or zero")
+	}
+
+	if cfg.MetadataSchemaRequestTimeout < 0 {
+		return errors.New("MetadataSchemaRequestTimeout should be a positive time.Duration or zero")
+	}
+
+	if cfg.WriteCoalesceWaitTime < 0 {
+		return errors.New("WriteCoalesceWaitTime should be a positive time.Duration or zero")
+	}
+
+	if cfg.ReconnectInterval < 0 {
+		return errors.New("ReconnectInterval should be a positive time.Duration or zero")
+	}
+
+	if cfg.MaxWaitSchemaAgreement < 0 {
+		return errors.New("MaxWaitSchemaAgreement should be a positive time.Duration or zero")
+	}
+
+	if cfg.ProtoVersion < 0 {
+		return errors.New("ProtoVersion should be a positive number or zero")
+	}
+
+	if !cfg.DisableSkipMetadata {
+		Logger.Println("warning: enabling skipping metadata can lead to unpredictable results when executing a query while altering the columns involved in the query.")
+	}
+
+	return cfg.ValidateAndInitSSL()
+}
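+
+// An illustrative sketch of application code exercising the functions above;
+// the host address and keyspace are placeholders, and the Keyspace field is
+// assumed from the portion of ClusterConfig not shown in this hunk:
+//
+//	cfg := NewCluster("192.0.2.10")
+//	cfg.Keyspace = "chat"
+//	if err := cfg.Validate(); err != nil {
+//		log.Fatalf("invalid cluster config: %v", err)
+//	}
+//	session, err := cfg.CreateSession()
+//	if err != nil {
+//		log.Fatalf("failed to create session: %v", err)
+//	}
+//	defer session.Close()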
errors.New("WriteCoalesceWaitTime should be positive time.Duration or zero") + } + + if cfg.ReconnectInterval < 0 { + return errors.New("ReconnectInterval should be positive time.Duration or zero") + } + + if cfg.MaxWaitSchemaAgreement < 0 { + return errors.New("MaxWaitSchemaAgreement should be positive time.Duration or zero") + } + + if cfg.ProtoVersion < 0 { + return errors.New("ProtoVersion should be positive number or zero") + } + + if !cfg.DisableSkipMetadata { + Logger.Println("warning: enabling skipping metadata can lead to unpredictible results when executing query and altering columns involved in the query.") + } + + return cfg.ValidateAndInitSSL() +} + +var ( + ErrNoHosts = errors.New("no hosts provided") + ErrNoConnectionsStarted = errors.New("no connections were made when creating the session") + ErrHostQueryFailed = errors.New("unable to populate Hosts") +) + +func setupTLSConfig(sslOpts *SslOptions) (*tls.Config, error) { + // Config.InsecureSkipVerify | EnableHostVerification | Result + // Config is nil | true | verify host + // Config is nil | false | do not verify host + // false | false | verify host + // true | false | do not verify host + // false | true | verify host + // true | true | verify host + var tlsConfig *tls.Config + if sslOpts.Config == nil { + tlsConfig = &tls.Config{ + InsecureSkipVerify: !sslOpts.EnableHostVerification, + } + } else { + // use clone to avoid race. + tlsConfig = sslOpts.Config.Clone() + } + + if tlsConfig.InsecureSkipVerify && sslOpts.EnableHostVerification { + tlsConfig.InsecureSkipVerify = false + } + + // ca cert is optional + if sslOpts.CaPath != "" { + if tlsConfig.RootCAs == nil { + tlsConfig.RootCAs = x509.NewCertPool() + } + + pem, err := ioutil.ReadFile(sslOpts.CaPath) + if err != nil { + return nil, fmt.Errorf("unable to open CA certs: %v", err) + } + + if !tlsConfig.RootCAs.AppendCertsFromPEM(pem) { + return nil, errors.New("failed parsing or CA certs") + } + } + + if sslOpts.CertPath != "" || sslOpts.KeyPath != "" { + mycert, err := tls.LoadX509KeyPair(sslOpts.CertPath, sslOpts.KeyPath) + if err != nil { + return nil, fmt.Errorf("unable to load X509 key pair: %v", err) + } + tlsConfig.Certificates = append(tlsConfig.Certificates, mycert) + } + + return tlsConfig, nil +} diff --git a/vendor/github.com/gocql/gocql/compressor.go b/vendor/github.com/gocql/gocql/compressor.go new file mode 100644 index 0000000..120441b --- /dev/null +++ b/vendor/github.com/gocql/gocql/compressor.go @@ -0,0 +1,29 @@ +package gocql + +import ( + "github.com/klauspost/compress/s2" +) + +type Compressor interface { + Name() string + Encode(data []byte) ([]byte, error) + Decode(data []byte) ([]byte, error) +} + +// SnappyCompressor implements the Compressor interface and can be used to +// compress incoming and outgoing frames. It uses S2 compression algorithm +// that is compatible with snappy and aims for high throughput, which is why +// it features concurrent compression for bigger payloads. +type SnappyCompressor struct{} + +func (s SnappyCompressor) Name() string { + return "snappy" +} + +func (s SnappyCompressor) Encode(data []byte) ([]byte, error) { + return s2.EncodeSnappy(nil, data), nil +} + +func (s SnappyCompressor) Decode(data []byte) ([]byte, error) { + return s2.Decode(nil, data) +} diff --git a/vendor/github.com/gocql/gocql/conn.go b/vendor/github.com/gocql/gocql/conn.go new file mode 100644 index 0000000..f8635cb --- /dev/null +++ b/vendor/github.com/gocql/gocql/conn.go @@ -0,0 +1,1964 @@ +// Copyright (c) 2012 The gocql Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "bufio" + "context" + "crypto/tls" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/gocql/gocql/internal/lru" + "github.com/gocql/gocql/internal/streams" +) + +// approve the authenticator with the list of allowed authenticators. If the provided list is empty, +// the given authenticator is allowed. +func approve(authenticator string, approvedAuthenticators []string) bool { + if len(approvedAuthenticators) == 0 { + return true + } + for _, s := range approvedAuthenticators { + if authenticator == s { + return true + } + } + return false +} + +// JoinHostPort is a utility to return an address string that can be used +// by `gocql.Conn` to form a connection with a host. +func JoinHostPort(addr string, port int) string { + addr = strings.TrimSpace(addr) + if _, _, err := net.SplitHostPort(addr); err != nil { + addr = net.JoinHostPort(addr, strconv.Itoa(port)) + } + return addr +} + +type Authenticator interface { + Challenge(req []byte) (resp []byte, auth Authenticator, err error) + Success(data []byte) error +} + +type WarningHandlerBuilder func(session *Session) WarningHandler + +type WarningHandler interface { + HandleWarnings(qry ExecutableQuery, host *HostInfo, warnings []string) +} + +// PasswordAuthenticator specifies credentials to be used when authenticating. +// It can be configured with an "allow list" of authenticator class names to avoid +// attempting to authenticate with Cassandra if it doesn't provide an expected authenticator. +type PasswordAuthenticator struct { + Username string + Password string + // Setting this to nil or empty will allow authenticating with any authenticator + // provided by the server. This is the default behavior of most other driver + // implementations. + AllowedAuthenticators []string +} + +func (p PasswordAuthenticator) Challenge(req []byte) ([]byte, Authenticator, error) { + if !approve(string(req), p.AllowedAuthenticators) { + return nil, nil, fmt.Errorf("unexpected authenticator %q", req) + } + resp := make([]byte, 2+len(p.Username)+len(p.Password)) + resp[0] = 0 + copy(resp[1:], p.Username) + resp[len(p.Username)+1] = 0 + copy(resp[2+len(p.Username):], p.Password) + return resp, nil, nil +} + +func (p PasswordAuthenticator) Success(data []byte) error { + return nil +} + +// SslOptions configures TLS use. +// +// Warning: Due to historical reasons, the SslOptions is insecure by default, so you need to set EnableHostVerification +// to true if no Config is set. Most users should set SslOptions.Config to a *tls.Config. +// SslOptions and Config.InsecureSkipVerify interact as follows: +// +// Config.InsecureSkipVerify | EnableHostVerification | Result +// Config is nil | false | do not verify host +// Config is nil | true | verify host +// false | false | verify host +// true | false | do not verify host +// false | true | verify host +// true | true | verify host +type SslOptions struct { + *tls.Config + + // CertPath and KeyPath are optional depending on server + // config, but both fields must be omitted to avoid using a + // client certificate + CertPath string + KeyPath string + CaPath string //optional depending on server config + // If you want to verify the hostname and server cert (like a wildcard for cass cluster) then you should turn this + // on. 
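+	//
+	// An illustrative sketch from application code (the paths are placeholders):
+	//
+	//	cluster.SslOpts = &gocql.SslOptions{
+	//		CaPath:                 "/etc/scylla/ca.pem",
+	//		CertPath:               "/etc/scylla/client.crt",
+	//		KeyPath:                "/etc/scylla/client.key",
+	//		EnableHostVerification: true,
+	//	}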
+ // This option is basically the inverse of tls.Config.InsecureSkipVerify. + // See InsecureSkipVerify in http://golang.org/pkg/crypto/tls/ for more info. + // + // See SslOptions documentation to see how EnableHostVerification interacts with the provided tls.Config. + EnableHostVerification bool +} + +type ConnConfig struct { + ProtoVersion int + CQLVersion string + Timeout time.Duration + WriteTimeout time.Duration + ConnectTimeout time.Duration + Dialer Dialer + HostDialer HostDialer + Compressor Compressor + Authenticator Authenticator + AuthProvider func(h *HostInfo) (Authenticator, error) + Keepalive time.Duration + Logger StdLogger + + tlsConfig *tls.Config + disableCoalesce bool +} + +func (c *ConnConfig) logger() StdLogger { + if c.Logger == nil { + return Logger + } + return c.Logger +} + +type ConnErrorHandler interface { + HandleError(conn *Conn, err error, closed bool) +} + +type connErrorHandlerFn func(conn *Conn, err error, closed bool) + +func (fn connErrorHandlerFn) HandleError(conn *Conn, err error, closed bool) { + fn(conn, err, closed) +} + +// If not zero, how many timeouts we will allow to occur before the connection is closed +// and restarted. This is to prevent a single query timeout from killing a connection +// which may be serving more queries just fine. +// Default is 0, should not be changed concurrently with queries. +// +// Deprecated. +var TimeoutLimit int64 = 0 + +type ConnInterface interface { + Close() + exec(ctx context.Context, req frameBuilder, tracer Tracer) (*framer, error) + awaitSchemaAgreement(ctx context.Context) error + executeQuery(ctx context.Context, qry *Query) *Iter + querySystem(ctx context.Context, query string) *Iter + getIsSchemaV2() bool + setSchemaV2(s bool) + query(ctx context.Context, statement string, values ...interface{}) (iter *Iter) + getScyllaSupported() scyllaSupported +} + +// Conn is a single connection to a Cassandra node. It can be used to execute +// queries, but users are usually advised to use a more reliable, higher +// level API. +type Conn struct { + conn net.Conn + r *bufio.Reader + w contextWriter + + timeout time.Duration + writeTimeout time.Duration + cfg *ConnConfig + frameObserver FrameHeaderObserver + streamObserver StreamObserver + + headerBuf [maxFrameHeaderSize]byte + + streams *streams.IDGenerator + mu sync.Mutex + // calls stores a map from stream ID to callReq. + // This map is protected by mu. + // calls should not be used when closed is true, calls is set to nil when closed=true. + calls map[int]*callReq + + errorHandler ConnErrorHandler + compressor Compressor + auth Authenticator + addr string + + version uint8 + currentKeyspace string + host *HostInfo + supported map[string][]string + scyllaSupported scyllaSupported + cqlProtoExts []cqlProtocolExtension + isSchemaV2 bool + + session *Session + + // true if connection close process for the connection started. + // closed is protected by mu. + closed bool + ctx context.Context + cancel context.CancelFunc + + timeouts int64 + + logger StdLogger + tabletsRoutingV1 int32 +} + +func (c *Conn) getIsSchemaV2() bool { + return c.isSchemaV2 +} + +func (c *Conn) setSchemaV2(s bool) { + c.isSchemaV2 = s +} + +func (c *Conn) getScyllaSupported() scyllaSupported { + return c.scyllaSupported +} + +// connect establishes a connection to a Cassandra node using session's connection config. 
+func (s *Session) connect(ctx context.Context, host *HostInfo, errorHandler ConnErrorHandler) (*Conn, error) { + return s.dial(ctx, host, s.connCfg, errorHandler) +} + +// connectShard establishes a connection to a shard. +// If nrShards is zero, shard-aware dialing is disabled. +func (s *Session) connectShard(ctx context.Context, host *HostInfo, errorHandler ConnErrorHandler, + shardID, nrShards int) (*Conn, error) { + return s.dialShard(ctx, host, s.connCfg, errorHandler, shardID, nrShards) +} + +// dial establishes a connection to a Cassandra node and notifies the session's connectObserver. +func (s *Session) dial(ctx context.Context, host *HostInfo, connConfig *ConnConfig, errorHandler ConnErrorHandler) (*Conn, error) { + return s.dialShard(ctx, host, connConfig, errorHandler, 0, 0) +} + +// dialShard establishes a connection to a host/shard and notifies the session's connectObserver. +// If nrShards is zero, shard-aware dialing is disabled. +func (s *Session) dialShard(ctx context.Context, host *HostInfo, connConfig *ConnConfig, errorHandler ConnErrorHandler, + shardID, nrShards int) (*Conn, error) { + var obs ObservedConnect + if s.connectObserver != nil { + obs.Host = host + obs.Start = time.Now() + } + + conn, err := s.dialWithoutObserver(ctx, host, connConfig, errorHandler, shardID, nrShards) + + if s.connectObserver != nil { + obs.End = time.Now() + obs.Err = err + s.connectObserver.ObserveConnect(obs) + } + + return conn, err +} + +// dialWithoutObserver establishes connection to a Cassandra node. +// +// dialWithoutObserver does not notify the connection observer, so you most probably want to call dial() instead. +// +// If nrShards is zero, shard-aware dialing is disabled. +func (s *Session) dialWithoutObserver(ctx context.Context, host *HostInfo, cfg *ConnConfig, errorHandler ConnErrorHandler, + shardID, nrShards int) (*Conn, error) { + + shardDialer, ok := cfg.HostDialer.(ShardDialer) + var ( + dialedHost *DialedHost + err error + ) + if ok && nrShards > 0 { + dialedHost, err = shardDialer.DialShard(ctx, host, shardID, nrShards) + } else { + dialedHost, err = cfg.HostDialer.DialHost(ctx, host) + } + + if err != nil { + return nil, err + } + + writeTimeout := cfg.Timeout + if cfg.WriteTimeout > 0 { + writeTimeout = cfg.WriteTimeout + } + + ctx, cancel := context.WithCancel(ctx) + c := &Conn{ + conn: dialedHost.Conn, + r: bufio.NewReader(dialedHost.Conn), + cfg: cfg, + calls: make(map[int]*callReq), + version: uint8(cfg.ProtoVersion), + addr: dialedHost.Conn.RemoteAddr().String(), + errorHandler: errorHandler, + compressor: cfg.Compressor, + session: s, + streams: s.streamIDGenerator(cfg.ProtoVersion), + host: host, + isSchemaV2: true, // Try using "system.peers_v2" until proven otherwise + frameObserver: s.frameObserver, + w: &deadlineContextWriter{ + w: dialedHost.Conn, + timeout: writeTimeout, + semaphore: make(chan struct{}, 1), + quit: make(chan struct{}), + }, + ctx: ctx, + cancel: cancel, + logger: cfg.logger(), + streamObserver: s.streamObserver, + writeTimeout: writeTimeout, + } + + if err := c.init(ctx, dialedHost); err != nil { + cancel() + c.Close() + return nil, err + } + + return c, nil +} + +func (s *Session) streamIDGenerator(protocol int) *streams.IDGenerator { + if s.cfg.MaxRequestsPerConn > 0 { + return streams.NewLimited(s.cfg.MaxRequestsPerConn) + } + return streams.New(protocol) +} + +func (c *Conn) init(ctx context.Context, dialedHost *DialedHost) error { + if c.session.cfg.AuthProvider != nil { + var err error + c.auth, err = c.cfg.AuthProvider(c.host) 
+ if err != nil { + return err + } + } else { + c.auth = c.cfg.Authenticator + } + + startup := &startupCoordinator{ + frameTicker: make(chan struct{}), + conn: c, + } + + c.timeout = c.cfg.ConnectTimeout + if err := startup.setupConn(ctx); err != nil { + return err + } + + c.timeout = c.cfg.Timeout + + // dont coalesce startup frames + if c.session.cfg.WriteCoalesceWaitTime > 0 && !c.cfg.disableCoalesce && !dialedHost.DisableCoalesce { + c.w = newWriteCoalescer(c.conn, c.writeTimeout, c.session.cfg.WriteCoalesceWaitTime, ctx.Done()) + } + + if c.isScyllaConn() { // ScyllaDB does not support system.peers_v2 + c.setSchemaV2(false) + } + + go c.serve(ctx) + go c.heartBeat(ctx) + + return nil +} + +func (c *Conn) Write(p []byte) (n int, err error) { + return c.w.writeContext(context.Background(), p) +} + +func (c *Conn) Read(p []byte) (n int, err error) { + const maxAttempts = 5 + + for i := 0; i < maxAttempts; i++ { + var nn int + if c.timeout > 0 { + c.conn.SetReadDeadline(time.Now().Add(c.timeout)) + } + + nn, err = io.ReadFull(c.r, p[n:]) + n += nn + if err == nil { + break + } + + if verr, ok := err.(net.Error); !ok || !verr.Temporary() { + break + } + } + + return +} + +type startupCoordinator struct { + conn *Conn + frameTicker chan struct{} +} + +func (s *startupCoordinator) setupConn(ctx context.Context) error { + var cancel context.CancelFunc + if s.conn.timeout > 0 { + ctx, cancel = context.WithTimeout(ctx, s.conn.timeout) + } else { + ctx, cancel = context.WithCancel(ctx) + } + defer cancel() + + startupErr := make(chan error) + go func() { + for range s.frameTicker { + err := s.conn.recv(ctx) + if err != nil { + select { + case startupErr <- err: + case <-ctx.Done(): + } + + return + } + } + }() + + go func() { + defer close(s.frameTicker) + err := s.options(ctx) + select { + case startupErr <- err: + case <-ctx.Done(): + } + }() + + select { + case err := <-startupErr: + if err != nil { + return err + } + case <-ctx.Done(): + return errors.New("gocql: no response to connection startup within timeout") + } + + return nil +} + +func (s *startupCoordinator) write(ctx context.Context, frame frameBuilder) (frame, error) { + select { + case s.frameTicker <- struct{}{}: + case <-ctx.Done(): + return nil, ctx.Err() + } + + framer, err := s.conn.exec(ctx, frame, nil) + if err != nil { + return nil, err + } + + return framer.parseFrame() +} + +func (s *startupCoordinator) options(ctx context.Context) error { + frame, err := s.write(ctx, &writeOptionsFrame{}) + if err != nil { + return err + } + + v, ok := frame.(*supportedFrame) + if !ok { + return NewErrProtocol("Unknown type of response to startup frame: %T", frame) + } + // Keep raw supported multimap for debug purposes + s.conn.supported = v.supported + s.conn.scyllaSupported = parseSupported(s.conn.supported) + s.conn.host.setScyllaSupported(s.conn.scyllaSupported) + s.conn.cqlProtoExts = parseCQLProtocolExtensions(s.conn.supported) + + return s.startup(ctx) +} + +func (s *startupCoordinator) startup(ctx context.Context) error { + m := map[string]string{ + "CQL_VERSION": s.conn.cfg.CQLVersion, + "DRIVER_NAME": s.conn.session.cfg.DriverName, + "DRIVER_VERSION": s.conn.session.cfg.DriverVersion, + } + + if s.conn.compressor != nil { + comp := s.conn.supported["COMPRESSION"] + name := s.conn.compressor.Name() + for _, compressor := range comp { + if compressor == name { + m["COMPRESSION"] = compressor + break + } + } + + if _, ok := m["COMPRESSION"]; !ok { + s.conn.compressor = nil + } + } + + for _, ext := range s.conn.cqlProtoExts { + 
serialized := ext.serialize() + for k, v := range serialized { + m[k] = v + } + } + + frame, err := s.write(ctx, &writeStartupFrame{opts: m}) + if err != nil { + return err + } + + switch v := frame.(type) { + case error: + return v + case *readyFrame: + return nil + case *authenticateFrame: + return s.authenticateHandshake(ctx, v) + default: + return NewErrProtocol("Unknown type of response to startup frame: %s", v) + } +} + +func (s *startupCoordinator) authenticateHandshake(ctx context.Context, authFrame *authenticateFrame) error { + if s.conn.auth == nil { + return fmt.Errorf("authentication required (using %q)", authFrame.class) + } + + resp, challenger, err := s.conn.auth.Challenge([]byte(authFrame.class)) + if err != nil { + return err + } + + req := &writeAuthResponseFrame{data: resp} + for { + frame, err := s.write(ctx, req) + if err != nil { + return err + } + + switch v := frame.(type) { + case error: + return v + case *authSuccessFrame: + if challenger != nil { + return challenger.Success(v.data) + } + return nil + case *authChallengeFrame: + resp, challenger, err = challenger.Challenge(v.data) + if err != nil { + return err + } + + req = &writeAuthResponseFrame{ + data: resp, + } + default: + return fmt.Errorf("unknown frame response during authentication: %v", v) + } + } +} + +func (c *Conn) closeWithError(err error) { + if c == nil { + return + } + + c.mu.Lock() + if c.closed { + c.mu.Unlock() + return + } + c.closed = true + + var callsToClose map[int]*callReq + + // We should attempt to deliver the error back to the caller if it + // exists. However, don't block c.mu while we are delivering the + // error to outstanding calls. + if err != nil { + callsToClose = c.calls + // It is safe to change c.calls to nil. Nobody should use it after c.closed is set to true. + c.calls = nil + } + c.mu.Unlock() + + for _, req := range callsToClose { + // we need to send the error to all waiting queries. + select { + case req.resp <- callResp{err: err}: + case <-req.timeout: + } + if req.streamObserverContext != nil { + req.streamObserverEndOnce.Do(func() { + req.streamObserverContext.StreamAbandoned(ObservedStream{ + Host: c.host, + }) + }) + } + } + + // if error was nil then unblock the quit channel + c.cancel() + cerr := c.close() + + if err != nil { + c.errorHandler.HandleError(c, err, true) + } else if cerr != nil { + // TODO(zariel): is it a good idea to do this? + c.errorHandler.HandleError(c, cerr, true) + } +} + +func (c *Conn) isTabletSupported() bool { + return atomic.LoadInt32(&c.tabletsRoutingV1) == 1 +} + +func (c *Conn) setTabletSupported(val bool) { + intVal := int32(0) + if val { + intVal = 1 + } + atomic.StoreInt32(&c.tabletsRoutingV1, intVal) +} + +func (c *Conn) close() error { + return c.conn.Close() +} + +func (c *Conn) Close() { + c.closeWithError(nil) +} + +// Serve starts the stream multiplexer for this connection, which is required +// to execute any queries. This method runs as long as the connection is +// open and is therefore usually called in a separate goroutine. 
+func (c *Conn) serve(ctx context.Context) {
+	var err error
+	for err == nil {
+		err = c.recv(ctx)
+	}
+
+	c.closeWithError(err)
+}
+
+func (c *Conn) discardFrame(head frameHeader) error {
+	_, err := io.CopyN(ioutil.Discard, c, int64(head.length))
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+type protocolError struct {
+	frame frame
+}
+
+func (p *protocolError) Error() string {
+	if err, ok := p.frame.(error); ok {
+		return err.Error()
+	}
+	return fmt.Sprintf("gocql: received unexpected frame on stream %d: %v", p.frame.Header().stream, p.frame)
+}
+
+func (c *Conn) heartBeat(ctx context.Context) {
+	sleepTime := 1 * time.Second
+	timer := time.NewTimer(sleepTime)
+	defer timer.Stop()
+
+	var failures int
+
+	for {
+		if failures > 5 {
+			c.closeWithError(fmt.Errorf("gocql: heartbeat failed"))
+			return
+		}
+
+		timer.Reset(sleepTime)
+
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+		}
+
+		framer, err := c.exec(context.Background(), &writeOptionsFrame{}, nil)
+		if err != nil {
+			failures++
+			continue
+		}
+
+		resp, err := framer.parseFrame()
+		if err != nil {
+			// invalid frame
+			failures++
+			continue
+		}
+
+		switch resp.(type) {
+		case *supportedFrame:
+			// Everything ok
+			sleepTime = 30 * time.Second
+			failures = 0
+		case error:
+			// TODO: should we do something here?
+		default:
+			panic(fmt.Sprintf("gocql: unknown frame in response to options: %T", resp))
+		}
+	}
+}
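+
+// recv reads a single frame off the wire and dispatches on its stream ID:
+// stream -1 carries server-pushed events, other non-positive streams are
+// reserved and surface as protocol errors, and positive streams are matched
+// against the pending calls registered in c.calls.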
+func (c *Conn) recv(ctx context.Context) error {
+	// not safe for concurrent reads
+
+	// read a full header, ignore timeouts, as this is being run in a loop
+	// TODO: TCP level deadlines? or just query level deadlines?
+	if c.timeout > 0 {
+		c.conn.SetReadDeadline(time.Time{})
+	}
+
+	headStartTime := time.Now()
+	// we're just reading headers over and over and copying bodies
+	head, err := readHeader(c.r, c.headerBuf[:])
+	headEndTime := time.Now()
+	if err != nil {
+		return err
+	}
+
+	if c.frameObserver != nil {
+		c.frameObserver.ObserveFrameHeader(context.Background(), ObservedFrameHeader{
+			Version: protoVersion(head.version),
+			Flags:   head.flags,
+			Stream:  int16(head.stream),
+			Opcode:  frameOp(head.op),
+			Length:  int32(head.length),
+			Start:   headStartTime,
+			End:     headEndTime,
+			Host:    c.host,
+		})
+	}
+
+	if head.stream > c.streams.NumStreams {
+		return fmt.Errorf("gocql: frame header stream is beyond call expected bounds: %d", head.stream)
+	} else if head.stream == -1 {
+		// TODO: handle cassandra event frames, we shouldn't get any currently
+		framer := newFramerWithExts(c.compressor, c.version, c.cqlProtoExts)
+		c.setTabletSupported(framer.tabletsRoutingV1)
+		if err := framer.readFrame(c, &head); err != nil {
+			return err
+		}
+		go c.session.handleEvent(framer)
+		return nil
+	} else if head.stream <= 0 {
+		// reserved stream that we don't use, probably due to a protocol error
+		// or a bug in Cassandra; this should be an error, parse it and return.
+		framer := newFramerWithExts(c.compressor, c.version, c.cqlProtoExts)
+		c.setTabletSupported(framer.tabletsRoutingV1)
+		if err := framer.readFrame(c, &head); err != nil {
+			return err
+		}
+
+		frame, err := framer.parseFrame()
+		if err != nil {
+			return err
+		}
+
+		return &protocolError{
+			frame: frame,
+		}
+	}
+
+	c.mu.Lock()
+	if c.closed {
+		c.mu.Unlock()
+		return ErrConnectionClosed
+	}
+	call, ok := c.calls[head.stream]
+	delete(c.calls, head.stream)
+	c.mu.Unlock()
+	if call == nil || !ok {
+		c.logger.Printf("gocql: received response for stream which has no handler: header=%v\n", head)
+		return c.discardFrame(head)
+	} else if head.stream != call.streamID {
+		panic(fmt.Sprintf("call has incorrect streamID: got %d expected %d", call.streamID, head.stream))
+	}
+
+	framer := newFramerWithExts(c.compressor, c.version, c.cqlProtoExts)
+
+	err = framer.readFrame(c, &head)
+	if err != nil {
+		// only net errors should cause the connection to be closed. Though
+		// cassandra returning corrupt frames will be returned here as well.
+		if _, ok := err.(net.Error); ok {
+			return err
+		}
+	}
+
+	// We either return a response to the caller, the caller times out, or the
+	// connection has closed. Either way we should never block indefinitely here.
+	select {
+	case call.resp <- callResp{framer: framer, err: err}:
+	case <-call.timeout:
+		c.releaseStream(call)
+	case <-ctx.Done():
+	}
+
+	return nil
+}
+
+func (c *Conn) releaseStream(call *callReq) {
+	if call.timer != nil {
+		call.timer.Stop()
+	}
+
+	c.streams.Clear(call.streamID)
+
+	if call.streamObserverContext != nil {
+		call.streamObserverEndOnce.Do(func() {
+			call.streamObserverContext.StreamFinished(ObservedStream{
+				Host: c.host,
+			})
+		})
+	}
+}
+
+func (c *Conn) handleTimeout() {
+	if TimeoutLimit > 0 && atomic.AddInt64(&c.timeouts, 1) > TimeoutLimit {
+		c.closeWithError(ErrTooManyTimeouts)
+	}
+}
+
+type callReq struct {
+	// resp will receive the frame that was sent as a response to this stream.
+	resp     chan callResp
+	timeout  chan struct{} // indicates to recv() that a call has timed out
+	streamID int           // current stream in use
+
+	timer *time.Timer
+
+	// streamObserverContext is notified about events regarding this stream
+	streamObserverContext StreamObserverContext
+
+	// streamObserverEndOnce ensures that either StreamAbandoned or StreamFinished is called,
+	// but not both.
+	streamObserverEndOnce sync.Once
+}
+
+type callResp struct {
+	// framer is the response frame.
+	// May be nil if err is not nil.
+	framer *framer
+	// err is the error encountered, if any.
+	err error
+}
+
+// contextWriter is like io.Writer, but takes a context as well.
+type contextWriter interface {
+	// writeContext writes p to the connection.
+	//
+	// If ctx is canceled before we start writing p (e.g. during waiting while another write is currently in progress),
+	// p is not written and ctx.Err() is returned. Context is ignored after we start writing p (i.e. we don't interrupt
+	// blocked writes that are in progress) so that we always either write the full frame or not write it at all.
+	//
+	// It returns the number of bytes written from p (0 <= n <= len(p)) and any error that caused the write to stop
+	// early. writeContext must return a non-nil error if it returns n < len(p). writeContext must not modify the
+	// data in p, even temporarily.
+ writeContext(ctx context.Context, p []byte) (n int, err error) +} + +type deadlineWriter interface { + SetWriteDeadline(time.Time) error + io.Writer +} + +type deadlineContextWriter struct { + w deadlineWriter + timeout time.Duration + // semaphore protects critical section for SetWriteDeadline/Write. + // It is a channel with capacity 1. + semaphore chan struct{} + + // quit closed once the connection is closed. + quit chan struct{} +} + +// writeContext implements contextWriter. +func (c *deadlineContextWriter) writeContext(ctx context.Context, p []byte) (int, error) { + select { + case <-ctx.Done(): + return 0, ctx.Err() + case <-c.quit: + return 0, ErrConnectionClosed + case c.semaphore <- struct{}{}: + // acquired + } + + defer func() { + // release + <-c.semaphore + }() + + if c.timeout > 0 { + err := c.w.SetWriteDeadline(time.Now().Add(c.timeout)) + if err != nil { + return 0, err + } + } + return c.w.Write(p) +} + +func newWriteCoalescer(conn deadlineWriter, writeTimeout, coalesceDuration time.Duration, + quit <-chan struct{}) *writeCoalescer { + wc := &writeCoalescer{ + writeCh: make(chan writeRequest), + c: conn, + quit: quit, + timeout: writeTimeout, + } + go wc.writeFlusher(coalesceDuration) + return wc +} + +type writeCoalescer struct { + c deadlineWriter + + mu sync.Mutex + + quit <-chan struct{} + writeCh chan writeRequest + + timeout time.Duration + + testEnqueuedHook func() + testFlushedHook func() +} + +type writeRequest struct { + // resultChan is a channel (with buffer size 1) where to send results of the write. + resultChan chan<- writeResult + // data to write. + data []byte +} + +type writeResult struct { + n int + err error +} + +// writeContext implements contextWriter. +func (w *writeCoalescer) writeContext(ctx context.Context, p []byte) (int, error) { + resultChan := make(chan writeResult, 1) + wr := writeRequest{ + resultChan: resultChan, + data: p, + } + + select { + case <-ctx.Done(): + return 0, ctx.Err() + case <-w.quit: + return 0, io.EOF // TODO: better error here? + case w.writeCh <- wr: + // enqueued for writing + } + + if w.testEnqueuedHook != nil { + w.testEnqueuedHook() + } + + result := <-resultChan + return result.n, result.err +} + +func (w *writeCoalescer) writeFlusher(interval time.Duration) { + timer := time.NewTimer(interval) + defer timer.Stop() + + if !timer.Stop() { + <-timer.C + } + + w.writeFlusherImpl(timer.C, func() { timer.Reset(interval) }) +} + +func (w *writeCoalescer) writeFlusherImpl(timerC <-chan time.Time, resetTimer func()) { + running := false + + var buffers net.Buffers + var resultChans []chan<- writeResult + + for { + select { + case req := <-w.writeCh: + buffers = append(buffers, req.data) + resultChans = append(resultChans, req.resultChan) + if !running { + // Start timer on first write. + resetTimer() + running = true + } + case <-w.quit: + result := writeResult{ + n: 0, + err: io.EOF, // TODO: better error here? + } + // Unblock whoever was waiting. + for _, resultChan := range resultChans { + // resultChan has capacity 1, so it does not block. + resultChan <- result + } + return + case <-timerC: + running = false + w.flush(resultChans, buffers) + buffers = nil + resultChans = nil + if w.testFlushedHook != nil { + w.testFlushedHook() + } + } + } +} + +func (w *writeCoalescer) flush(resultChans []chan<- writeResult, buffers net.Buffers) { + // Flush everything we have so far. 
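+	// A failed SetWriteDeadline fails every queued request up front; otherwise
+	// the coalesced buffers go out in a single vectored write and the result is
+	// apportioned back to the waiting callers below.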
+ if w.timeout > 0 { + err := w.c.SetWriteDeadline(time.Now().Add(w.timeout)) + if err != nil { + for i := range resultChans { + resultChans[i] <- writeResult{ + n: 0, + err: err, + } + } + return + } + } + // Copy buffers because WriteTo modifies buffers in-place. + buffers2 := make(net.Buffers, len(buffers)) + copy(buffers2, buffers) + n, err := buffers2.WriteTo(w.c) + // Writes of bytes before n succeeded, writes of bytes starting from n failed with err. + // Use n as remaining byte counter. + for i := range buffers { + if int64(len(buffers[i])) <= n { + // this buffer was fully written. + resultChans[i] <- writeResult{ + n: len(buffers[i]), + err: nil, + } + n -= int64(len(buffers[i])) + } else { + // this buffer was not (fully) written. + resultChans[i] <- writeResult{ + n: int(n), + err: err, + } + n = 0 + } + } +} + +// addCall attempts to add a call to c.calls. +// It fails with error if the connection already started closing or if a call for the given stream +// already exists. +func (c *Conn) addCall(call *callReq) error { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return ErrConnectionClosed + } + existingCall := c.calls[call.streamID] + if existingCall != nil { + return fmt.Errorf("attempting to use stream already in use: %d -> %d", call.streamID, + existingCall.streamID) + } + c.calls[call.streamID] = call + return nil +} + +func (c *Conn) exec(ctx context.Context, req frameBuilder, tracer Tracer) (*framer, error) { + if ctxErr := ctx.Err(); ctxErr != nil { + return nil, &QueryError{err: ctxErr, potentiallyExecuted: false} + } + + // TODO: move tracer onto conn + stream, ok := c.streams.GetStream() + if !ok { + return nil, &QueryError{err: ErrNoStreams, potentiallyExecuted: false} + } + + // resp is basically a waiting semaphore protecting the framer + framer := newFramerWithExts(c.compressor, c.version, c.cqlProtoExts) + c.setTabletSupported(framer.tabletsRoutingV1) + + call := &callReq{ + timeout: make(chan struct{}), + streamID: stream, + resp: make(chan callResp), + } + + if c.streamObserver != nil { + call.streamObserverContext = c.streamObserver.StreamContext(ctx) + } + + if err := c.addCall(call); err != nil { + return nil, &QueryError{err: err, potentiallyExecuted: false} + } + + // After this point, we need to either read from call.resp or close(call.timeout) + // since closeWithError can try to write a connection close error to call.resp. + // If we don't close(call.timeout) or read from call.resp, closeWithError can deadlock. + + if tracer != nil { + framer.trace() + } + + if call.streamObserverContext != nil { + call.streamObserverContext.StreamStarted(ObservedStream{ + Host: c.host, + }) + } + + err := req.buildFrame(framer, stream) + if err != nil { + // closeWithError will block waiting for this stream to either receive a response + // or for us to timeout. + close(call.timeout) + // We failed to serialize the frame into a buffer. + // This should not affect the connection as we didn't write anything. We just free the current call. + c.mu.Lock() + if !c.closed { + delete(c.calls, call.streamID) + } + c.mu.Unlock() + // We need to release the stream after we remove the call from c.calls, otherwise the existingCall != nil + // check above could fail. + c.releaseStream(call) + return nil, &QueryError{err: err, potentiallyExecuted: false} + } + + n, err := c.w.writeContext(ctx, framer.buf) + if err != nil { + // closeWithError will block waiting for this stream to either receive a response + // or for us to timeout, close the timeout chan here. 
Im not entirely sure + // but we should not get a response after an error on the write side. + close(call.timeout) + if (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)) && n == 0 { + // We have not started to write this frame. + // Release the stream as no response can come from the server on the stream. + c.mu.Lock() + if !c.closed { + delete(c.calls, call.streamID) + } + c.mu.Unlock() + // We need to release the stream after we remove the call from c.calls, otherwise the existingCall != nil + // check above could fail. + c.releaseStream(call) + } else { + // I think this is the correct thing to do, im not entirely sure. It is not + // ideal as readers might still get some data, but they probably wont. + // Here we need to be careful as the stream is not available and if all + // writes just timeout or fail then the pool might use this connection to + // send a frame on, with all the streams used up and not returned. + c.closeWithError(err) + } + return nil, &QueryError{err: err, potentiallyExecuted: true} + } + + var timeoutCh <-chan time.Time + if c.timeout > 0 { + if call.timer == nil { + call.timer = time.NewTimer(0) + <-call.timer.C + } else { + if !call.timer.Stop() { + select { + case <-call.timer.C: + default: + } + } + } + + call.timer.Reset(c.timeout) + timeoutCh = call.timer.C + } + + var ctxDone <-chan struct{} + if ctx != nil { + ctxDone = ctx.Done() + } + + select { + case resp := <-call.resp: + close(call.timeout) + if resp.err != nil { + if !c.Closed() { + // if the connection is closed then we cant release the stream, + // this is because the request is still outstanding and we have + // been handed another error from another stream which caused the + // connection to close. + c.releaseStream(call) + } + return nil, &QueryError{err: resp.err, potentiallyExecuted: true} + } + // dont release the stream if detect a timeout as another request can reuse + // that stream and get a response for the old request, which we have no + // easy way of detecting. + // + // Ensure that the stream is not released if there are potentially outstanding + // requests on the stream to prevent nil pointer dereferences in recv(). + defer c.releaseStream(call) + + if v := resp.framer.header.version.version(); v != c.version { + return nil, &QueryError{err: NewErrProtocol("unexpected protocol version in response: got %d expected %d", v, c.version), potentiallyExecuted: true} + } + + return resp.framer, nil + case <-timeoutCh: + close(call.timeout) + c.handleTimeout() + return nil, &QueryError{err: ErrTimeoutNoResponse, potentiallyExecuted: true} + case <-ctxDone: + close(call.timeout) + return nil, &QueryError{err: ctx.Err(), potentiallyExecuted: true} + case <-c.ctx.Done(): + close(call.timeout) + return nil, &QueryError{err: ErrConnectionClosed, potentiallyExecuted: true} + } +} + +// ObservedStream observes a single request/response stream. +type ObservedStream struct { + // Host of the connection used to send the stream. + Host *HostInfo +} + +// StreamObserver is notified about request/response pairs. +// Streams are created for executing queries/batches or +// internal requests to the database and might live longer than +// execution of the query - the stream is still tracked until +// response arrives so that stream IDs are not reused. +type StreamObserver interface { + // StreamContext is called before creating a new stream. 
+ // ctx is context passed to Session.Query / Session.Batch, + // but might also be an internal context (for example + // for internal requests that use control connection). + // StreamContext might return nil if it is not interested + // in the details of this stream. + // StreamContext is called before the stream is created + // and the returned StreamObserverContext might be discarded + // without any methods called on the StreamObserverContext if + // creation of the stream fails. + // Note that if you don't need to track per-stream data, + // you can always return the same StreamObserverContext. + StreamContext(ctx context.Context) StreamObserverContext +} + +// StreamObserverContext is notified about state of a stream. +// A stream is started every time a request is written to the server +// and is finished when a response is received. +// It is abandoned when the underlying network connection is closed +// before receiving a response. +type StreamObserverContext interface { + // StreamStarted is called when the stream is started. + // This happens just before a request is written to the wire. + StreamStarted(observedStream ObservedStream) + + // StreamAbandoned is called when we stop waiting for response. + // This happens when the underlying network connection is closed. + // StreamFinished won't be called if StreamAbandoned is. + StreamAbandoned(observedStream ObservedStream) + + // StreamFinished is called when we receive a response for the stream. + StreamFinished(observedStream ObservedStream) +} + +type preparedStatment struct { + id []byte + request preparedMetadata + response resultMetadata +} + +type inflightPrepare struct { + done chan struct{} + err error + + preparedStatment *preparedStatment +} + +func (c *Conn) prepareStatement(ctx context.Context, stmt string, tracer Tracer) (*preparedStatment, error) { + stmtCacheKey := c.session.stmtsLRU.keyFor(c.host.HostID(), c.currentKeyspace, stmt) + flight, ok := c.session.stmtsLRU.execIfMissing(stmtCacheKey, func(lru *lru.Cache) *inflightPrepare { + flight := &inflightPrepare{ + done: make(chan struct{}), + } + lru.Add(stmtCacheKey, flight) + return flight + }) + + if !ok { + go func() { + defer close(flight.done) + + prep := &writePrepareFrame{ + statement: stmt, + } + if c.version > protoVersion4 { + prep.keyspace = c.currentKeyspace + } + + // we won the race to do the load, if our context is canceled we shouldnt + // stop the load as other callers are waiting for it but this caller should get + // their context cancelled error. + framer, err := c.exec(c.ctx, prep, tracer) + if err != nil { + flight.err = err + c.session.stmtsLRU.remove(stmtCacheKey) + return + } + + frame, err := framer.parseFrame() + if err != nil { + flight.err = err + c.session.stmtsLRU.remove(stmtCacheKey) + return + } + + // TODO(zariel): tidy this up, simplify handling of frame parsing so its not duplicated + // everytime we need to parse a frame. + if len(framer.traceID) > 0 && tracer != nil { + tracer.Trace(framer.traceID) + } + + switch x := frame.(type) { + case *resultPreparedFrame: + flight.preparedStatment = &preparedStatment{ + // defensively copy as we will recycle the underlying buffer after we + // return. + id: copyBytes(x.preparedID), + // the type info's should _not_ have a reference to the framers read buffer, + // therefore we can just copy them directly. 
+ request: x.reqMeta, + response: x.respMeta, + } + case error: + flight.err = x + default: + flight.err = NewErrProtocol("Unknown type in response to prepare frame: %s", x) + } + + if flight.err != nil { + c.session.stmtsLRU.remove(stmtCacheKey) + } + }() + } + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-flight.done: + return flight.preparedStatment, flight.err + } +} + +func marshalQueryValue(typ TypeInfo, value interface{}, dst *queryValues) error { + if named, ok := value.(*namedValue); ok { + dst.name = named.name + value = named.value + } + + if _, ok := value.(unsetColumn); !ok { + val, err := Marshal(typ, value) + if err != nil { + return err + } + + dst.value = val + } else { + dst.isUnset = true + } + + return nil +} + +func (c *Conn) executeQuery(ctx context.Context, qry *Query) (iter *Iter) { + defer func() { + if iter == nil || c.session == nil { + return + } + warnings := iter.Warnings() + if len(warnings) > 0 && c.session.warningHandler != nil { + c.session.warningHandler.HandleWarnings(qry, iter.host, warnings) + } + }() + params := queryParams{ + consistency: qry.cons, + } + + // frame checks that it is not 0 + params.serialConsistency = qry.serialCons + params.defaultTimestamp = qry.defaultTimestamp + params.defaultTimestampValue = qry.defaultTimestampValue + + if len(qry.pageState) > 0 { + params.pagingState = qry.pageState + } + if qry.pageSize > 0 { + params.pageSize = qry.pageSize + } + if c.version > protoVersion4 { + params.keyspace = c.currentKeyspace + } + + var ( + frame frameBuilder + info *preparedStatment + ) + + if !qry.skipPrepare && qry.shouldPrepare() { + // Prepare all DML queries. Other queries can not be prepared. + var err error + info, err = c.prepareStatement(ctx, qry.stmt, qry.trace) + if err != nil { + return &Iter{err: err} + } + + values := qry.values + if qry.binding != nil { + values, err = qry.binding(&QueryInfo{ + Id: info.id, + Args: info.request.columns, + Rval: info.response.columns, + PKeyColumns: info.request.pkeyColumns, + }) + + if err != nil { + return &Iter{err: err} + } + } + + if len(values) != info.request.actualColCount { + return &Iter{err: fmt.Errorf("gocql: expected %d values send got %d", info.request.actualColCount, len(values))} + } + + params.values = make([]queryValues, len(values)) + for i := 0; i < len(values); i++ { + v := ¶ms.values[i] + value := values[i] + typ := info.request.columns[i].TypeInfo + if err := marshalQueryValue(typ, value, v); err != nil { + return &Iter{err: err} + } + } + + params.skipMeta = !(c.session.cfg.DisableSkipMetadata || qry.disableSkipMetadata) + + frame = &writeExecuteFrame{ + preparedID: info.id, + params: params, + customPayload: qry.customPayload, + } + + // Set "lwt", keyspace", "table" property in the query if it is present in preparedMetadata + qry.routingInfo.mu.Lock() + qry.routingInfo.lwt = info.request.lwt + qry.routingInfo.keyspace = info.request.keyspace + qry.routingInfo.table = info.request.table + qry.routingInfo.mu.Unlock() + } else { + frame = &writeQueryFrame{ + statement: qry.stmt, + params: params, + customPayload: qry.customPayload, + } + } + + framer, err := c.exec(ctx, frame, qry.trace) + if err != nil { + return &Iter{err: err} + } + + resp, err := framer.parseFrame() + if err != nil { + return &Iter{err: err} + } + + if len(framer.customPayload) > 0 { + if tabletInfo, ok := framer.customPayload["tablets-routing-v1"]; ok { + var firstToken string + var lastToken string + var replicas [][]interface{} + tabletInfoValue := 
[]interface{}{&firstToken, &lastToken, &replicas} + Unmarshal(TupleTypeInfo{ + NativeType: NativeType{proto: c.version, typ: TypeTuple}, + Elems: []TypeInfo{ + NativeType{typ: TypeBigInt}, + NativeType{typ: TypeBigInt}, + CollectionType{ + NativeType: NativeType{proto: c.version, typ: TypeList}, + Elem: TupleTypeInfo{ + NativeType: NativeType{proto: c.version, typ: TypeTuple}, + Elems: []TypeInfo{ + NativeType{proto: c.version, typ: TypeUUID}, + NativeType{proto: c.version, typ: TypeInt}, + }}, + }, + }, + }, tabletInfo, tabletInfoValue) + + tablet := TabletInfo{} + tablet.firstToken, err = strconv.ParseInt(firstToken, 10, 64) + if err != nil { + return &Iter{err: err} + } + tablet.lastToken, err = strconv.ParseInt(lastToken, 10, 64) + if err != nil { + return &Iter{err: err} + } + + tabletReplicas := make([]ReplicaInfo, 0, len(replicas)) + for _, replica := range replicas { + if len(replica) != 2 { + return &Iter{err: err} + } + if hostId, ok := replica[0].(UUID); ok { + if shardId, ok := replica[1].(int); ok { + repInfo := ReplicaInfo{hostId, shardId} + tabletReplicas = append(tabletReplicas, repInfo) + } else { + return &Iter{err: err} + } + } else { + return &Iter{err: err} + } + } + tablet.replicas = tabletReplicas + tablet.keyspaceName = qry.routingInfo.keyspace + tablet.tableName = qry.routingInfo.table + + c.session.metadataDescriber.addTablet(&tablet) + } + } + + if len(framer.traceID) > 0 && qry.trace != nil { + qry.trace.Trace(framer.traceID) + } + + switch x := resp.(type) { + case *resultVoidFrame: + return &Iter{framer: framer} + case *resultRowsFrame: + iter := &Iter{ + meta: x.meta, + framer: framer, + numRows: x.numRows, + } + + if params.skipMeta { + if info != nil { + iter.meta = info.response + iter.meta.pagingState = copyBytes(x.meta.pagingState) + } else { + return &Iter{framer: framer, err: errors.New("gocql: did not receive metadata but prepared info is nil")} + } + } else { + iter.meta = x.meta + } + + if x.meta.morePages() && !qry.disableAutoPage { + newQry := new(Query) + *newQry = *qry + newQry.pageState = copyBytes(x.meta.pagingState) + newQry.metrics = &queryMetrics{m: make(map[string]*hostMetrics)} + + iter.next = &nextIter{ + qry: newQry, + pos: int((1 - qry.prefetch) * float64(x.numRows)), + } + + if iter.next.pos < 1 { + iter.next.pos = 1 + } + } + + return iter + case *resultKeyspaceFrame: + return &Iter{framer: framer} + case *schemaChangeKeyspace, *schemaChangeTable, *schemaChangeFunction, *schemaChangeAggregate, *schemaChangeType: + iter := &Iter{framer: framer} + if err := c.awaitSchemaAgreement(ctx); err != nil { + // TODO: should have this behind a flag + c.logger.Println(err) + } + // dont return an error from this, might be a good idea to give a warning + // though. The impact of this returning an error would be that the cluster + // is not consistent with regards to its schema. 
+ return iter + case *RequestErrUnprepared: + stmtCacheKey := c.session.stmtsLRU.keyFor(c.host.HostID(), c.currentKeyspace, qry.stmt) + c.session.stmtsLRU.evictPreparedID(stmtCacheKey, x.StatementId) + return c.executeQuery(ctx, qry) + case error: + return &Iter{err: x, framer: framer} + default: + return &Iter{ + err: NewErrProtocol("Unknown type in response to execute query (%T): %s", x, x), + framer: framer, + } + } +} + +func (c *Conn) Pick(qry *Query) *Conn { + if c.Closed() { + return nil + } + return c +} + +func (c *Conn) Closed() bool { + c.mu.Lock() + defer c.mu.Unlock() + return c.closed +} + +func (c *Conn) Address() string { + return c.addr +} + +func (c *Conn) AvailableStreams() int { + return c.streams.Available() +} + +func (c *Conn) UseKeyspace(keyspace string) error { + q := &writeQueryFrame{statement: `USE "` + keyspace + `"`} + q.params.consistency = c.session.cons + + framer, err := c.exec(c.ctx, q, nil) + if err != nil { + return err + } + + resp, err := framer.parseFrame() + if err != nil { + return err + } + + switch x := resp.(type) { + case *resultKeyspaceFrame: + case error: + return x + default: + return NewErrProtocol("unknown frame in response to USE: %v", x) + } + + c.currentKeyspace = keyspace + + return nil +} + +func (c *Conn) executeBatch(ctx context.Context, batch *Batch) (iter *Iter) { + defer func() { + if iter == nil || c.session == nil { + return + } + warnings := iter.Warnings() + if len(warnings) > 0 && c.session.warningHandler != nil { + c.session.warningHandler.HandleWarnings(batch, iter.host, warnings) + } + }() + + if c.version == protoVersion1 { + return &Iter{err: ErrUnsupported} + } + + n := len(batch.Entries) + req := &writeBatchFrame{ + typ: batch.Type, + statements: make([]batchStatment, n), + consistency: batch.Cons, + serialConsistency: batch.serialCons, + defaultTimestamp: batch.defaultTimestamp, + defaultTimestampValue: batch.defaultTimestampValue, + customPayload: batch.CustomPayload, + } + + stmts := make(map[string]string, len(batch.Entries)) + + hasLwtEntries := false + + for i := 0; i < n; i++ { + entry := &batch.Entries[i] + b := &req.statements[i] + + if len(entry.Args) > 0 || entry.binding != nil { + info, err := c.prepareStatement(batch.Context(), entry.Stmt, batch.trace) + if err != nil { + return &Iter{err: err} + } + + var values []interface{} + if entry.binding == nil { + values = entry.Args + } else { + values, err = entry.binding(&QueryInfo{ + Id: info.id, + Args: info.request.columns, + Rval: info.response.columns, + PKeyColumns: info.request.pkeyColumns, + }) + if err != nil { + return &Iter{err: err} + } + } + + if len(values) != info.request.actualColCount { + return &Iter{err: fmt.Errorf("gocql: batch statement %d expected %d values send got %d", i, info.request.actualColCount, len(values))} + } + + b.preparedID = info.id + stmts[string(info.id)] = entry.Stmt + + b.values = make([]queryValues, info.request.actualColCount) + + for j := 0; j < info.request.actualColCount; j++ { + v := &b.values[j] + value := values[j] + typ := info.request.columns[j].TypeInfo + if err := marshalQueryValue(typ, value, v); err != nil { + return &Iter{err: err} + } + } + + if !hasLwtEntries && info.request.lwt { + hasLwtEntries = true + } + } else { + b.statement = entry.Stmt + } + } + + // The batch is considered to be conditional if even one of the + // statements is conditional. + batch.routingInfo.mu.Lock() + batch.routingInfo.lwt = hasLwtEntries + batch.routingInfo.mu.Unlock() + + // TODO: should batch support tracing? 
+ framer, err := c.exec(batch.Context(), req, batch.trace) + if err != nil { + return &Iter{err: err} + } + + resp, err := framer.parseFrame() + if err != nil { + return &Iter{err: err, framer: framer} + } + + if len(framer.traceID) > 0 && batch.trace != nil { + batch.trace.Trace(framer.traceID) + } + + switch x := resp.(type) { + case *resultVoidFrame: + return &Iter{} + case *RequestErrUnprepared: + stmt, found := stmts[string(x.StatementId)] + if found { + key := c.session.stmtsLRU.keyFor(c.host.HostID(), c.currentKeyspace, stmt) + c.session.stmtsLRU.evictPreparedID(key, x.StatementId) + } + return c.executeBatch(ctx, batch) + case *resultRowsFrame: + iter := &Iter{ + meta: x.meta, + framer: framer, + numRows: x.numRows, + } + + return iter + case error: + return &Iter{err: x, framer: framer} + default: + return &Iter{err: NewErrProtocol("Unknown type in response to batch statement: %s", x), framer: framer} + } +} + +func (c *Conn) query(ctx context.Context, statement string, values ...interface{}) (iter *Iter) { + q := c.session.Query(statement, values...).Consistency(One).Trace(nil) + q.skipPrepare = true + q.disableSkipMetadata = true + // we want to keep the query on this connection + q.conn = c + return c.executeQuery(ctx, q) +} + +func (c *Conn) querySystem(ctx context.Context, query string) *Iter { + usingClause := "" + if c.session.control != nil { + usingClause = c.session.usingTimeoutClause + } + queryStmt := query + usingClause + return c.query(ctx, queryStmt) +} + +const qrySystemPeers = "SELECT * FROM system.peers" +const qrySystemPeersV2 = "SELECT * FROM system.peers_2" + +const qrySystemLocal = "SELECT * FROM system.local WHERE key='local'" + +func getSchemaAgreement(queryLocalSchemasRows []string, querySystemPeersRows []map[string]interface{}, connectAddress net.IP, port int, translateAddressPort func(addr net.IP, port int) (net.IP, int), logger StdLogger) (err error) { + versions := make(map[string]struct{}) + + for _, row := range querySystemPeersRows { + var host *HostInfo + host, err = hostInfoFromMap(row, &HostInfo{connectAddress: connectAddress, port: port}, translateAddressPort) + if err != nil { + return err + } + if !isValidPeer(host) || host.schemaVersion == "" { + logger.Printf("invalid peer or peer with empty schema_version: peer=%q", host) + continue + } + + versions[host.schemaVersion] = struct{}{} + } + + for _, schemaVersion := range queryLocalSchemasRows { + versions[schemaVersion] = struct{}{} + schemaVersion = "" + } + + if len(versions) > 1 { + schemas := make([]string, 0, len(versions)) + for schema := range versions { + schemas = append(schemas, schema) + } + + return &ErrSchemaMismatch{schemas: schemas} + } + + return nil +} + +func (c *Conn) awaitSchemaAgreement(ctx context.Context) error { + var localSchemas = "SELECT schema_version FROM system.local WHERE key='local'" + + var schemaVersion string + + endDeadline := time.Now().Add(c.session.cfg.MaxWaitSchemaAgreement) + + var err error + ticker := time.NewTicker(200 * time.Millisecond) // Create a ticker that ticks every 200ms + defer ticker.Stop() + + waitForNextTick := func() error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + return nil + } + } + + for time.Now().Before(endDeadline) { + var iter *Iter + if c.getIsSchemaV2() { + iter = c.querySystem(ctx, qrySystemPeersV2) + } else { + iter = c.querySystem(ctx, qrySystemPeers) + } + var systemPeersRows []map[string]interface{} + systemPeersRows, err = iter.SliceMap() + if err != nil { + return err + } + if err = 
iter.Close(); err != nil { + return err + } + + schemaVersions := []string{} + + iter = c.querySystem(ctx, localSchemas) + for iter.Scan(&schemaVersion) { + schemaVersions = append(schemaVersions, schemaVersion) + schemaVersion = "" + } + + if err = iter.Close(); err != nil { + return err + } + err = getSchemaAgreement(schemaVersions, systemPeersRows, c.host.ConnectAddress(), c.session.cfg.Port, c.session.cfg.translateAddressPort, c.logger) + + if err == ErrConnectionClosed || err == nil { + return err + } + + if tickerErr := waitForNextTick(); tickerErr != nil { + return tickerErr + } + } + + return err +} + +var ( + ErrQueryArgLength = errors.New("gocql: query argument length mismatch") + ErrTimeoutNoResponse = errors.New("gocql: no response received from cassandra within timeout period") + ErrTooManyTimeouts = errors.New("gocql: too many query timeouts on the connection") + ErrConnectionClosed = errors.New("gocql: connection closed waiting for response") + ErrNoStreams = errors.New("gocql: no streams available on connection") + ErrHostDown = errors.New("gocql: host is nil or down") + ErrNoPool = errors.New("gocql: host does not have a pool") + ErrNoConnectionsInPool = errors.New("gocql: host pool does not have connections") +) + +type ErrSchemaMismatch struct { + schemas []string +} + +func (e *ErrSchemaMismatch) Error() string { + return fmt.Sprintf("gocql: cluster schema versions not consistent: %+v", e.schemas) +} + +type QueryError struct { + err error + potentiallyExecuted bool + isIdempotent bool +} + +func (e *QueryError) IsIdempotent() bool { + return e.isIdempotent +} + +func (e *QueryError) PotentiallyExecuted() bool { + return e.potentiallyExecuted +} + +func (e *QueryError) Error() string { + return fmt.Sprintf("%s (potentially executed: %v)", e.err.Error(), e.potentiallyExecuted) +} + +func (e *QueryError) Unwrap() error { + return e.err +} diff --git a/vendor/github.com/gocql/gocql/connectionpool.go b/vendor/github.com/gocql/gocql/connectionpool.go new file mode 100644 index 0000000..5f30b72 --- /dev/null +++ b/vendor/github.com/gocql/gocql/connectionpool.go @@ -0,0 +1,562 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gocql + +import ( + "fmt" + "math/rand" + "net" + "sync" + "time" + + "github.com/gocql/gocql/debounce" +) + +// interface to implement to receive the host information +type SetHosts interface { + SetHosts(hosts []*HostInfo) +} + +// interface to implement to receive the partitioner value +type SetPartitioner interface { + SetPartitioner(partitioner string) +} + +// interface to implement to receive the tablets value +type SetTablets interface { + SetTablets(tablets TabletInfoList) +} + +type policyConnPool struct { + session *Session + + port int + numConns int + keyspace string + + mu sync.RWMutex + hostConnPools map[string]*hostConnPool +} + +func connConfig(cfg *ClusterConfig) (*ConnConfig, error) { + hostDialer := cfg.HostDialer + + if hostDialer == nil { + dialer := cfg.Dialer + if dialer == nil { + d := net.Dialer{ + Timeout: cfg.ConnectTimeout, + } + if cfg.SocketKeepalive > 0 { + d.KeepAlive = cfg.SocketKeepalive + } + dialer = &ScyllaShardAwareDialer{d} + } + + hostDialer = &scyllaDialer{ + dialer: dialer, + logger: cfg.logger(), + tlsConfig: cfg.getActualTLSConfig(), + cfg: cfg, + } + } + + return &ConnConfig{ + ProtoVersion: cfg.ProtoVersion, + CQLVersion: cfg.CQLVersion, + Timeout: cfg.Timeout, + WriteTimeout: cfg.WriteTimeout, + ConnectTimeout: cfg.ConnectTimeout, + Dialer: cfg.Dialer, + HostDialer: hostDialer, + Compressor: cfg.Compressor, + Authenticator: cfg.Authenticator, + AuthProvider: cfg.AuthProvider, + Keepalive: cfg.SocketKeepalive, + Logger: cfg.logger(), + tlsConfig: cfg.getActualTLSConfig(), + }, nil +} + +func newPolicyConnPool(session *Session) *policyConnPool { + // create the pool + pool := &policyConnPool{ + session: session, + port: session.cfg.Port, + numConns: session.cfg.NumConns, + keyspace: session.cfg.Keyspace, + hostConnPools: map[string]*hostConnPool{}, + } + + return pool +} + +func (p *policyConnPool) SetHosts(hosts []*HostInfo) { + p.mu.Lock() + defer p.mu.Unlock() + + toRemove := make(map[string]struct{}) + for hostID := range p.hostConnPools { + toRemove[hostID] = struct{}{} + } + + pools := make(chan *hostConnPool) + createCount := 0 + for _, host := range hosts { + if !host.IsUp() { + // don't create a connection pool for a down host + continue + } + hostID := host.HostID() + if _, exists := p.hostConnPools[hostID]; exists { + // still have this host, so don't remove it + delete(toRemove, hostID) + continue + } + + createCount++ + go func(host *HostInfo) { + // create a connection pool for the host + pools <- newHostConnPool( + p.session, + host, + p.port, + p.numConns, + p.keyspace, + ) + }(host) + } + + // add created pools + for createCount > 0 { + pool := <-pools + createCount-- + if pool.Size() > 0 { + // add pool only if there a connections available + p.hostConnPools[pool.host.HostID()] = pool + } + } + + for addr := range toRemove { + pool := p.hostConnPools[addr] + delete(p.hostConnPools, addr) + go pool.Close() + } +} + +func (p *policyConnPool) InFlight() int { + p.mu.RLock() + count := 0 + for _, pool := range p.hostConnPools { + count += pool.InFlight() + } + p.mu.RUnlock() + + return count +} + +func (p *policyConnPool) Size() int { + p.mu.RLock() + count := 0 + for _, pool := range p.hostConnPools { + count += pool.Size() + } + p.mu.RUnlock() + + return count +} + +func (p *policyConnPool) getPool(host *HostInfo) (pool *hostConnPool, ok bool) { + hostID := host.HostID() + p.mu.RLock() + pool, ok = p.hostConnPools[hostID] + p.mu.RUnlock() + return +} + +func (p *policyConnPool) Close() { + p.mu.Lock() + defer 
p.mu.Unlock() + + // close the pools + for addr, pool := range p.hostConnPools { + delete(p.hostConnPools, addr) + pool.Close() + } +} + +func (p *policyConnPool) addHost(host *HostInfo) { + hostID := host.HostID() + p.mu.Lock() + pool, ok := p.hostConnPools[hostID] + if !ok { + pool = newHostConnPool( + p.session, + host, + host.Port(), // TODO: if port == 0 use pool.port? + p.numConns, + p.keyspace, + ) + + p.hostConnPools[hostID] = pool + } + p.mu.Unlock() + + pool.fill_debounce() +} + +func (p *policyConnPool) removeHost(hostID string) { + p.mu.Lock() + pool, ok := p.hostConnPools[hostID] + if !ok { + p.mu.Unlock() + return + } + + delete(p.hostConnPools, hostID) + p.mu.Unlock() + + go pool.Close() +} + +// hostConnPool is a connection pool for a single host. +// Connection selection is based on a provided ConnSelectionPolicy +type hostConnPool struct { + session *Session + host *HostInfo + size int + keyspace string + // protection for connPicker, closed, filling + mu sync.RWMutex + connPicker ConnPicker + closed bool + filling bool + debouncer *debounce.SimpleDebouncer + + logger StdLogger +} + +func (h *hostConnPool) String() string { + h.mu.RLock() + defer h.mu.RUnlock() + size, _ := h.connPicker.Size() + return fmt.Sprintf("[filling=%v closed=%v conns=%v size=%v host=%v]", + h.filling, h.closed, size, h.size, h.host) +} + +func newHostConnPool(session *Session, host *HostInfo, port, size int, + keyspace string) *hostConnPool { + + pool := &hostConnPool{ + session: session, + host: host, + size: size, + keyspace: keyspace, + connPicker: nopConnPicker{}, + filling: false, + closed: false, + logger: session.logger, + debouncer: debounce.NewSimpleDebouncer(), + } + + // the pool is not filled or connected + return pool +} + +// Pick a connection from this connection pool for the given query. 
+func (pool *hostConnPool) Pick(token Token, qry ExecutableQuery) *Conn {
+	pool.mu.RLock()
+	defer pool.mu.RUnlock()
+
+	if pool.closed {
+		return nil
+	}
+
+	size, missing := pool.connPicker.Size()
+	if missing > 0 {
+		// try to fill the pool
+		go pool.fill_debounce()
+
+		if size == 0 {
+			return nil
+		}
+	}
+
+	return pool.connPicker.Pick(token, qry)
+}
+
+// Size returns the number of connections currently active in the pool
+func (pool *hostConnPool) Size() int {
+	pool.mu.RLock()
+	defer pool.mu.RUnlock()
+
+	size, _ := pool.connPicker.Size()
+	return size
+}
+
+// InFlight returns the number of in-flight requests across all connections in the pool
+func (pool *hostConnPool) InFlight() int {
+	pool.mu.RLock()
+	defer pool.mu.RUnlock()
+
+	size := pool.connPicker.InFlight()
+	return size
+}
+
+// Close the connection pool
+func (pool *hostConnPool) Close() {
+	pool.mu.Lock()
+	defer pool.mu.Unlock()
+
+	if !pool.closed {
+		pool.connPicker.Close()
+	}
+	pool.closed = true
+}
+
+// Fill the connection pool
+func (pool *hostConnPool) fill() {
+	pool.mu.RLock()
+	// avoid filling a closed pool, or concurrent filling
+	if pool.closed || pool.filling {
+		pool.mu.RUnlock()
+		return
+	}
+
+	// determine the filling work to be done
+	startCount, fillCount := pool.connPicker.Size()
+
+	// avoid filling a full (or overfull) pool
+	if fillCount <= 0 {
+		pool.mu.RUnlock()
+		return
+	}
+
+	// switch from read to write lock
+	pool.mu.RUnlock()
+	pool.mu.Lock()
+
+	startCount, fillCount = pool.connPicker.Size()
+	if pool.closed || pool.filling || fillCount <= 0 {
+		// looks like another goroutine already beat this
+		// goroutine to the filling
+		pool.mu.Unlock()
+		return
+	}
+
+	// ok fill the pool
+	pool.filling = true
+
+	// allow others to access the pool while filling
+	pool.mu.Unlock()
+	// only this goroutine should make calls to fill/empty the pool at this
+	// point until after this routine or its subordinates calls
+	// fillingStopped
+
+	// fill only the first connection synchronously
+	if startCount == 0 {
+		err := pool.connect()
+		pool.logConnectErr(err)
+
+		if err != nil {
+			// probably unreachable host
+			pool.fillingStopped(err)
+			return
+		}
+		// notify the session that this node is connected
+		go pool.session.handleNodeConnected(pool.host)
+
+		// filled one, let's reload it to see if it has changed
+		pool.mu.RLock()
+		_, fillCount = pool.connPicker.Size()
+		pool.mu.RUnlock()
+	}
+
+	// fill the rest of the pool asynchronously
+	go func() {
+		err := pool.connectMany(fillCount)
+
+		// mark the end of filling
+		pool.fillingStopped(err)
+
+		if err == nil && startCount > 0 {
+			// notify the session that this node is connected again
+			go pool.session.handleNodeConnected(pool.host)
+		}
+	}()
+}
+
+func (pool *hostConnPool) fill_debounce() {
+	pool.debouncer.Debounce(pool.fill)
+}
+
+func (pool *hostConnPool) logConnectErr(err error) {
+	if opErr, ok := err.(*net.OpError); ok && (opErr.Op == "dial" || opErr.Op == "read") {
+		// connection refused
+		// these are typical during a node outage so avoid log spam.
+		if gocqlDebug {
+			pool.logger.Printf("unable to dial %q: %v\n", pool.host, err)
+		}
+	} else if err != nil {
+		// unexpected error
+		pool.logger.Printf("error: failed to connect to %q due to error: %v", pool.host, err)
+	}
+}
+
+// transition back to a not-filling state.
+func (pool *hostConnPool) fillingStopped(err error) {
+	if err != nil {
+		if gocqlDebug {
+			pool.logger.Printf("gocql: filling stopped %q: %v\n", pool.host.ConnectAddress(), err)
+		}
+		// wait for some time to avoid back-to-back filling
+		// this provides some time between failed attempts
+		// to fill the pool for the host to recover
+		time.Sleep(time.Duration(rand.Int31n(100)+31) * time.Millisecond)
+	}
+
+	pool.mu.Lock()
+	pool.filling = false
+	count, _ := pool.connPicker.Size()
+	host := pool.host
+	port := pool.host.Port()
+	pool.mu.Unlock()
+
+	// if we errored and the size is now zero, make sure the host is marked as down
+	// see https://github.com/gocql/gocql/issues/1614
+	if gocqlDebug {
+		pool.logger.Printf("gocql: conns of pool after stopped %q: %v\n", host.ConnectAddress(), count)
+	}
+	if err != nil && count == 0 {
+		if pool.session.cfg.ConvictionPolicy.AddFailure(err, host) {
+			pool.session.handleNodeDown(host.ConnectAddress(), port)
+		}
+	}
+}
+
+// connectMany creates new connections concurrently.
+func (pool *hostConnPool) connectMany(count int) error {
+	if count == 0 {
+		return nil
+	}
+	var (
+		wg         sync.WaitGroup
+		mu         sync.Mutex
+		connectErr error
+	)
+	wg.Add(count)
+	for i := 0; i < count; i++ {
+		go func() {
+			defer wg.Done()
+			err := pool.connect()
+			pool.logConnectErr(err)
+			if err != nil {
+				mu.Lock()
+				connectErr = err
+				mu.Unlock()
+			}
+		}()
+	}
+	// wait until all connection attempts have completed
+	wg.Wait()
+
+	return connectErr
+}
+
+// create a new connection to the host and add it to the pool
+func (pool *hostConnPool) connect() (err error) {
+	pool.mu.Lock()
+	shardID, nrShards := pool.connPicker.NextShard()
+	pool.mu.Unlock()
+
+	// TODO: provide a more robust connection retry mechanism, we should also
+	// be able to detect hosts that come up by trying to connect to downed ones.
+ // try to connect + var conn *Conn + reconnectionPolicy := pool.session.cfg.ReconnectionPolicy + for i := 0; i < reconnectionPolicy.GetMaxRetries(); i++ { + conn, err = pool.session.connectShard(pool.session.ctx, pool.host, pool, shardID, nrShards) + if err == nil { + break + } + if opErr, isOpErr := err.(*net.OpError); isOpErr { + // if the error is not a temporary error (ex: network unreachable) don't + // retry + if !opErr.Temporary() { + break + } + } + if gocqlDebug { + pool.logger.Printf("gocql: connection failed %q: %v, reconnecting with %T\n", + pool.host.ConnectAddress(), err, reconnectionPolicy) + } + time.Sleep(reconnectionPolicy.GetInterval(i)) + } + + if err != nil { + return err + } + + if pool.keyspace != "" { + // set the keyspace + if err = conn.UseKeyspace(pool.keyspace); err != nil { + conn.Close() + return err + } + } + + // add the Conn to the pool + pool.mu.Lock() + defer pool.mu.Unlock() + + if pool.closed { + conn.Close() + return nil + } + + // lazily initialize the connPicker when we know the required type + pool.initConnPicker(conn) + pool.connPicker.Put(conn) + + return nil +} + +func (pool *hostConnPool) initConnPicker(conn *Conn) { + if _, ok := pool.connPicker.(nopConnPicker); !ok { + return + } + + if conn.isScyllaConn() { + pool.connPicker = newScyllaConnPicker(conn) + return + } + + pool.connPicker = newDefaultConnPicker(pool.size) +} + +// handle any error from a Conn +func (pool *hostConnPool) HandleError(conn *Conn, err error, closed bool) { + if !closed { + // still an open connection, so continue using it + return + } + + // TODO: track the number of errors per host and detect when a host is dead, + // then also have something which can detect when a host comes back. + pool.mu.Lock() + defer pool.mu.Unlock() + + if pool.closed { + // pool closed + return + } + + if gocqlDebug { + pool.logger.Printf("gocql: pool connection error %q: %v\n", conn.addr, err) + } + + pool.connPicker.Remove(conn) + go pool.fill_debounce() +} diff --git a/vendor/github.com/gocql/gocql/connpicker.go b/vendor/github.com/gocql/gocql/connpicker.go new file mode 100644 index 0000000..d84fc33 --- /dev/null +++ b/vendor/github.com/gocql/gocql/connpicker.go @@ -0,0 +1,140 @@ +package gocql + +import ( + "fmt" + "sync" + "sync/atomic" +) + +type ConnPicker interface { + Pick(Token, ExecutableQuery) *Conn + Put(*Conn) + Remove(conn *Conn) + InFlight() int + Size() (int, int) + Close() + + // NextShard returns the shardID to connect to. + // nrShard specifies how many shards the host has. + // If nrShards is zero, the caller shouldn't use shard-aware port. 
+ NextShard() (shardID, nrShards int) +} + +type defaultConnPicker struct { + conns []*Conn + pos uint32 + size int + mu sync.RWMutex +} + +func newDefaultConnPicker(size int) *defaultConnPicker { + if size <= 0 { + panic(fmt.Sprintf("invalid pool size %d", size)) + } + return &defaultConnPicker{ + size: size, + } +} + +func (p *defaultConnPicker) Remove(conn *Conn) { + p.mu.Lock() + defer p.mu.Unlock() + + for i, candidate := range p.conns { + if candidate == conn { + last := len(p.conns) - 1 + p.conns[i], p.conns = p.conns[last], p.conns[:last] + break + } + } +} + +func (p *defaultConnPicker) Close() { + p.mu.Lock() + defer p.mu.Unlock() + + conns := p.conns + p.conns = nil + for _, conn := range conns { + if conn != nil { + conn.Close() + } + } +} + +func (p *defaultConnPicker) InFlight() int { + size := len(p.conns) + return size +} + +func (p *defaultConnPicker) Size() (int, int) { + size := len(p.conns) + return size, p.size - size +} + +func (p *defaultConnPicker) Pick(Token, ExecutableQuery) *Conn { + pos := int(atomic.AddUint32(&p.pos, 1) - 1) + size := len(p.conns) + + var ( + leastBusyConn *Conn + streamsAvailable int + ) + + // find the conn which has the most available streams, this is racy + for i := 0; i < size; i++ { + conn := p.conns[(pos+i)%size] + if conn == nil { + continue + } + if streams := conn.AvailableStreams(); streams > streamsAvailable { + leastBusyConn = conn + streamsAvailable = streams + } + } + + return leastBusyConn +} + +func (p *defaultConnPicker) Put(conn *Conn) { + p.mu.Lock() + p.conns = append(p.conns, conn) + p.mu.Unlock() +} + +func (*defaultConnPicker) NextShard() (shardID, nrShards int) { + return 0, 0 +} + +// nopConnPicker is a no-operation implementation of ConnPicker, it's used when +// hostConnPool is created to allow deferring creation of the actual ConnPicker +// to the point where we have first connection. +type nopConnPicker struct{} + +func (nopConnPicker) Pick(Token, ExecutableQuery) *Conn { + return nil +} + +func (nopConnPicker) Put(*Conn) { +} + +func (nopConnPicker) Remove(conn *Conn) { +} + +func (nopConnPicker) InFlight() int { + return 0 +} + +func (nopConnPicker) Size() (int, int) { + // Return 1 to make hostConnPool to try to establish a connection. + // When first connection is established hostConnPool replaces nopConnPicker + // with a different ConnPicker implementation. 
+ return 0, 1 +} + +func (nopConnPicker) Close() { +} + +func (nopConnPicker) NextShard() (shardID, nrShards int) { + return 0, 0 +} diff --git a/vendor/github.com/gocql/gocql/control.go b/vendor/github.com/gocql/gocql/control.go new file mode 100644 index 0000000..c88d7f4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/control.go @@ -0,0 +1,517 @@ +package gocql + +import ( + "context" + crand "crypto/rand" + "errors" + "fmt" + "math/rand" + "net" + "os" + "regexp" + "strconv" + "sync" + "sync/atomic" + "time" +) + +var ( + randr *rand.Rand + mutRandr sync.Mutex +) + +func init() { + b := make([]byte, 4) + if _, err := crand.Read(b); err != nil { + panic(fmt.Sprintf("unable to seed random number generator: %v", err)) + } + + randr = rand.New(rand.NewSource(int64(readInt(b)))) +} + +const ( + controlConnStarting = 0 + controlConnStarted = 1 + controlConnClosing = -1 +) + +type controlConnection interface { + getConn() *connHost + awaitSchemaAgreement() error + query(statement string, values ...interface{}) (iter *Iter) + discoverProtocol(hosts []*HostInfo) (int, error) + connect(hosts []*HostInfo) error + close() + getSession() *Session +} + +// Ensure that the atomic variable is aligned to a 64bit boundary +// so that atomic operations can be applied on 32bit architectures. +type controlConn struct { + state int32 + reconnecting int32 + + session *Session + conn atomic.Value + + retry RetryPolicy + + quit chan struct{} +} + +func (c *controlConn) getSession() *Session { + return c.session +} + +func createControlConn(session *Session) *controlConn { + + control := &controlConn{ + session: session, + quit: make(chan struct{}), + retry: &SimpleRetryPolicy{NumRetries: 3}, + } + + control.conn.Store((*connHost)(nil)) + + return control +} + +func (c *controlConn) heartBeat() { + if !atomic.CompareAndSwapInt32(&c.state, controlConnStarting, controlConnStarted) { + return + } + + sleepTime := 1 * time.Second + timer := time.NewTimer(sleepTime) + defer timer.Stop() + + for { + timer.Reset(sleepTime) + + select { + case <-c.quit: + return + case <-timer.C: + } + + resp, err := c.writeFrame(&writeOptionsFrame{}) + if err != nil { + goto reconn + } + + switch resp.(type) { + case *supportedFrame: + // Everything ok + sleepTime = 30 * time.Second + continue + case error: + goto reconn + default: + panic(fmt.Sprintf("gocql: unknown frame in response to options: %T", resp)) + } + + reconn: + // try to connect a bit faster + sleepTime = 1 * time.Second + c.reconnect() + continue + } +} + +var hostLookupPreferV4 = os.Getenv("GOCQL_HOST_LOOKUP_PREFER_V4") == "true" + +func hostInfo(addr string, defaultPort int) ([]*HostInfo, error) { + var port int + host, portStr, err := net.SplitHostPort(addr) + if err != nil { + host = addr + port = defaultPort + } else { + port, err = strconv.Atoi(portStr) + if err != nil { + return nil, err + } + } + + var hosts []*HostInfo + + // Check if host is a literal IP address + if ip := net.ParseIP(host); ip != nil { + hosts = append(hosts, &HostInfo{hostname: host, connectAddress: ip, port: port}) + return hosts, nil + } + + // Look up host in DNS + ips, err := LookupIP(host) + if err != nil { + return nil, err + } else if len(ips) == 0 { + return nil, fmt.Errorf("no IP's returned from DNS lookup for %q", addr) + } + + // Filter to v4 addresses if any present + if hostLookupPreferV4 { + var preferredIPs []net.IP + for _, v := range ips { + if v4 := v.To4(); v4 != nil { + preferredIPs = append(preferredIPs, v4) + } + } + if len(preferredIPs) != 0 { + ips = preferredIPs + 
		}
+	}
+
+	for _, ip := range ips {
+		hosts = append(hosts, &HostInfo{hostname: host, connectAddress: ip, port: port})
+	}
+
+	return hosts, nil
+}
+
+func shuffleHosts(hosts []*HostInfo) []*HostInfo {
+	shuffled := make([]*HostInfo, len(hosts))
+	copy(shuffled, hosts)
+
+	mutRandr.Lock()
+	randr.Shuffle(len(hosts), func(i, j int) {
+		shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
+	})
+	mutRandr.Unlock()
+
+	return shuffled
+}
+
+// this is going to be version dependent and a nightmare to maintain :(
+var protocolSupportRe = regexp.MustCompile(`the lowest supported version is \d+ and the greatest is (\d+)$`)
+
+func parseProtocolFromError(err error) int {
+	// I really wish this had the actual info in the error frame...
+	matches := protocolSupportRe.FindAllStringSubmatch(err.Error(), -1)
+	if len(matches) != 1 || len(matches[0]) != 2 {
+		if verr, ok := err.(*protocolError); ok {
+			return int(verr.frame.Header().version.version())
+		}
+		return 0
+	}
+
+	max, err := strconv.Atoi(matches[0][1])
+	if err != nil {
+		return 0
+	}
+
+	return max
+}
+
+func (c *controlConn) discoverProtocol(hosts []*HostInfo) (int, error) {
+	hosts = shuffleHosts(hosts)
+
+	connCfg := *c.session.connCfg
+	connCfg.ProtoVersion = 4 // TODO: define maxProtocol
+
+	handler := connErrorHandlerFn(func(c *Conn, err error, closed bool) {
+		// we should never get here, but if we do it means we connected to a
+		// host successfully which means our attempted protocol version worked
+		if !closed {
+			c.Close()
+		}
+	})
+
+	var err error
+	for _, host := range hosts {
+		var conn *Conn
+		conn, err = c.session.dial(c.session.ctx, host, &connCfg, handler)
+		if conn != nil {
+			conn.Close()
+		}
+
+		if err == nil {
+			return connCfg.ProtoVersion, nil
+		}
+
+		if proto := parseProtocolFromError(err); proto > 0 {
+			return proto, nil
+		}
+	}
+
+	return 0, err
+}
+
+func (c *controlConn) connect(hosts []*HostInfo) error {
+	if len(hosts) == 0 {
+		return errors.New("control: no endpoints specified")
+	}
+
+	// shuffle endpoints so not all drivers will connect to the same initial
+	// node.
+	hosts = shuffleHosts(hosts)
+
+	cfg := *c.session.connCfg
+	cfg.disableCoalesce = true
+
+	var conn *Conn
+	var err error
+	for _, host := range hosts {
+		conn, err = c.session.dial(c.session.ctx, host, &cfg, c)
+		if err != nil {
+			c.session.logger.Printf("gocql: unable to dial control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err)
+			continue
+		}
+		err = c.setupConn(conn)
+		if err == nil {
+			break
+		}
+		c.session.logger.Printf("gocql: unable to set up control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err)
+		conn.Close()
+		conn = nil
+	}
+	if conn == nil {
+		return fmt.Errorf("unable to connect to initial hosts: %v", err)
+	}
+
+	// We could fetch the initial ring here and update initial host data, so that
+	// when we return from here we have a ring topology ready to go.
+ + go c.heartBeat() + + return nil +} + +type connHost struct { + conn ConnInterface + host *HostInfo +} + +func (c *controlConn) setupConn(conn *Conn) error { + // we need up-to-date host info for the filterHost call below + iter := conn.querySystem(context.TODO(), qrySystemLocal) + defaultPort := 9042 + if tcpAddr, ok := conn.conn.RemoteAddr().(*net.TCPAddr); ok { + defaultPort = tcpAddr.Port + } + host, err := hostInfoFromIter(iter, conn.host.connectAddress, defaultPort, c.session.cfg.translateAddressPort) + if err != nil { + return err + } + + host = c.session.hostSource.addOrUpdate(host) + + if c.session.cfg.filterHost(host) { + return fmt.Errorf("host was filtered: %v", host.ConnectAddress()) + } + + if err := c.registerEvents(conn); err != nil { + return fmt.Errorf("register events: %v", err) + } + + ch := &connHost{ + conn: conn, + host: host, + } + + c.conn.Store(ch) + if c.session.initialized() { + // We connected to control conn, so add the connect the host in pool as well. + // Notify session we can start trying to connect to the node. + // We can't start the fill before the session is initialized, otherwise the fill would interfere + // with the fill called by Session.init. Session.init needs to wait for its fill to finish and that + // would return immediately if we started the fill here. + // TODO(martin-sucha): Trigger pool refill for all hosts, like in reconnectDownedHosts? + go c.session.startPoolFill(host) + } + return nil +} + +func (c *controlConn) registerEvents(conn *Conn) error { + var events []string + + if !c.session.cfg.Events.DisableTopologyEvents { + events = append(events, "TOPOLOGY_CHANGE") + } + if !c.session.cfg.Events.DisableNodeStatusEvents { + events = append(events, "STATUS_CHANGE") + } + if !c.session.cfg.Events.DisableSchemaEvents { + events = append(events, "SCHEMA_CHANGE") + } + + if len(events) == 0 { + return nil + } + + framer, err := conn.exec(context.Background(), + &writeRegisterFrame{ + events: events, + }, nil) + if err != nil { + return err + } + + frame, err := framer.parseFrame() + if err != nil { + return err + } else if _, ok := frame.(*readyFrame); !ok { + return fmt.Errorf("unexpected frame in response to register: got %T: %v\n", frame, frame) + } + + return nil +} + +func (c *controlConn) reconnect() { + if atomic.LoadInt32(&c.state) == controlConnClosing { + return + } + if !atomic.CompareAndSwapInt32(&c.reconnecting, 0, 1) { + return + } + defer atomic.StoreInt32(&c.reconnecting, 0) + + conn, err := c.attemptReconnect() + + if conn == nil { + c.session.logger.Printf("gocql: unable to reconnect control connection: %v\n", err) + return + } + + err = c.session.refreshRingNow() + if err != nil { + c.session.logger.Printf("gocql: unable to refresh ring: %v\n", err) + } + + err = c.session.metadataDescriber.refreshAllSchema() + if err != nil { + c.session.logger.Printf("gocql: unable to refresh the schema: %v\n", err) + } +} + +func (c *controlConn) attemptReconnect() (*Conn, error) { + hosts := c.session.hostSource.getHostsList() + hosts = shuffleHosts(hosts) + + // keep the old behavior of connecting to the old host first by moving it to + // the front of the slice + ch := c.getConn() + if ch != nil { + for i := range hosts { + if hosts[i].Equal(ch.host) { + hosts[0], hosts[i] = hosts[i], hosts[0] + break + } + } + ch.conn.Close() + } + + conn, err := c.attemptReconnectToAnyOfHosts(hosts) + + if conn != nil { + return conn, err + } + + c.session.logger.Printf("gocql: unable to connect to any ring node: %v\n", err) + 
c.session.logger.Printf("gocql: control falling back to initial contact points.\n") + // Fallback to initial contact points, as it may be the case that all known initialHosts + // changed their IPs while keeping the same hostname(s). + initialHosts, resolvErr := addrsToHosts(c.session.cfg.Hosts, c.session.cfg.Port, c.session.logger) + if resolvErr != nil { + return nil, fmt.Errorf("resolve contact points' hostnames: %v", resolvErr) + } + + return c.attemptReconnectToAnyOfHosts(initialHosts) +} + +func (c *controlConn) attemptReconnectToAnyOfHosts(hosts []*HostInfo) (*Conn, error) { + var conn *Conn + var err error + for _, host := range hosts { + conn, err = c.session.connect(c.session.ctx, host, c) + if err != nil { + if c.session.cfg.ConvictionPolicy.AddFailure(err, host) { + c.session.handleNodeDown(host.ConnectAddress(), host.Port()) + } + c.session.logger.Printf("gocql: unable to dial control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err) + continue + } + err = c.setupConn(conn) + if err == nil { + break + } + c.session.logger.Printf("gocql: unable setup control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err) + conn.Close() + conn = nil + } + return conn, err +} + +func (c *controlConn) HandleError(conn *Conn, err error, closed bool) { + if !closed { + return + } + + oldConn := c.getConn() + + // If connection has long gone, and not been attempted for awhile, + // it's possible to have oldConn as nil here (#1297). + if oldConn != nil && oldConn.conn != conn { + return + } + + c.reconnect() +} + +func (c *controlConn) getConn() *connHost { + return c.conn.Load().(*connHost) +} + +func (c *controlConn) writeFrame(w frameBuilder) (frame, error) { + ch := c.getConn() + if ch == nil { + return nil, errNoControl + } + + framer, err := ch.conn.exec(context.Background(), w, nil) + if err != nil { + return nil, err + } + + return framer.parseFrame() +} + +// query will return nil if the connection is closed or nil +func (c *controlConn) query(statement string, values ...interface{}) (iter *Iter) { + q := c.session.Query(statement, values...).Consistency(One).RoutingKey([]byte{}).Trace(nil) + + for { + ch := c.getConn() + q.conn = ch.conn.(*Conn) + iter = ch.conn.executeQuery(context.TODO(), q) + + if gocqlDebug && iter.err != nil { + c.session.logger.Printf("control: error executing %q: %v\n", statement, iter.err) + } + + q.AddAttempts(1, c.getConn().host) + if iter.err == nil || !c.retry.Attempt(q) { + break + } + } + + return +} + +func (c *controlConn) awaitSchemaAgreement() error { + ch := c.getConn() + return (&Iter{err: ch.conn.awaitSchemaAgreement(context.TODO())}).err +} + +func (c *controlConn) close() { + if atomic.CompareAndSwapInt32(&c.state, controlConnStarted, controlConnClosing) { + c.quit <- struct{}{} + } + + ch := c.getConn() + if ch != nil { + ch.conn.Close() + } +} + +var errNoControl = errors.New("gocql: no control connection available") diff --git a/vendor/github.com/gocql/gocql/cqltypes.go b/vendor/github.com/gocql/gocql/cqltypes.go new file mode 100644 index 0000000..e465e94 --- /dev/null +++ b/vendor/github.com/gocql/gocql/cqltypes.go @@ -0,0 +1,11 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gocql + +type Duration struct { + Months int32 + Days int32 + Nanoseconds int64 +} diff --git a/vendor/github.com/gocql/gocql/debounce/refresh_deboucer.go b/vendor/github.com/gocql/gocql/debounce/refresh_deboucer.go new file mode 100644 index 0000000..7a3bd3f --- /dev/null +++ b/vendor/github.com/gocql/gocql/debounce/refresh_deboucer.go @@ -0,0 +1,164 @@ +package debounce + +import ( + "sync" + "time" +) + +const ( + RingRefreshDebounceTime = 1 * time.Second +) + +// debounces requests to call a refresh function (currently used for ring refresh). It also supports triggering a refresh immediately. +type RefreshDebouncer struct { + mu sync.Mutex + stopped bool + broadcaster *errorBroadcaster + interval time.Duration + timer *time.Timer + refreshNowCh chan struct{} + quit chan struct{} + refreshFn func() error +} + +func NewRefreshDebouncer(interval time.Duration, refreshFn func() error) *RefreshDebouncer { + d := &RefreshDebouncer{ + stopped: false, + broadcaster: nil, + refreshNowCh: make(chan struct{}, 1), + quit: make(chan struct{}), + interval: interval, + timer: time.NewTimer(interval), + refreshFn: refreshFn, + } + d.timer.Stop() + go d.flusher() + return d +} + +// debounces a request to call the refresh function +func (d *RefreshDebouncer) Debounce() { + d.mu.Lock() + defer d.mu.Unlock() + if d.stopped { + return + } + d.timer.Reset(d.interval) +} + +// requests an immediate refresh which will cancel pending refresh requests +func (d *RefreshDebouncer) RefreshNow() <-chan error { + d.mu.Lock() + defer d.mu.Unlock() + if d.broadcaster == nil { + d.broadcaster = newErrorBroadcaster() + select { + case d.refreshNowCh <- struct{}{}: + default: + // already a refresh pending + } + } + return d.broadcaster.newListener() +} + +func (d *RefreshDebouncer) flusher() { + for { + select { + case <-d.refreshNowCh: + case <-d.timer.C: + case <-d.quit: + } + d.mu.Lock() + if d.stopped { + if d.broadcaster != nil { + d.broadcaster.stop() + d.broadcaster = nil + } + d.timer.Stop() + d.mu.Unlock() + return + } + + // make sure both request channels are cleared before we refresh + select { + case <-d.refreshNowCh: + default: + } + + d.timer.Stop() + select { + case <-d.timer.C: + default: + } + + curBroadcaster := d.broadcaster + d.broadcaster = nil + d.mu.Unlock() + + err := d.refreshFn() + if curBroadcaster != nil { + curBroadcaster.broadcast(err) + } + } +} + +func (d *RefreshDebouncer) Stop() { + d.mu.Lock() + if d.stopped { + d.mu.Unlock() + return + } + d.stopped = true + d.mu.Unlock() + d.quit <- struct{}{} // sync with flusher + close(d.quit) +} + +// broadcasts an error to multiple channels (listeners) +type errorBroadcaster struct { + listeners []chan<- error + mu sync.Mutex +} + +func newErrorBroadcaster() *errorBroadcaster { + return &errorBroadcaster{ + listeners: nil, + mu: sync.Mutex{}, + } +} + +func (b *errorBroadcaster) newListener() <-chan error { + ch := make(chan error, 1) + b.mu.Lock() + defer b.mu.Unlock() + b.listeners = append(b.listeners, ch) + return ch +} + +func (b *errorBroadcaster) broadcast(err error) { + b.mu.Lock() + defer b.mu.Unlock() + curListeners := b.listeners + if len(curListeners) > 0 { + b.listeners = nil + } else { + return + } + + for _, listener := range curListeners { + listener <- err + close(listener) + } +} + +func (b *errorBroadcaster) stop() { + b.mu.Lock() + defer b.mu.Unlock() + if len(b.listeners) == 0 { + return + } + for _, listener := range b.listeners { + close(listener) + } + b.listeners = nil +} diff --git 
a/vendor/github.com/gocql/gocql/debounce/simple_debouncer.go b/vendor/github.com/gocql/gocql/debounce/simple_debouncer.go
new file mode 100644
index 0000000..6ab08ea
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/debounce/simple_debouncer.go
@@ -0,0 +1,34 @@
+package debounce

+import (
+	"sync"
+	"sync/atomic"
+)

+// SimpleDebouncer is a tool for coalescing calls to a function. It provides:
+// 1. Blocking of simultaneous calls
+// 2. If there is no running call and no waiting call, the current call goes through
+// 3. If there is a running call and no waiting call, the current call waits
+// 4. If there is a running call and a waiting call, the current call is dropped
+type SimpleDebouncer struct {
+	m     sync.Mutex
+	count atomic.Int32
+}

+// NewSimpleDebouncer creates a new SimpleDebouncer.
+func NewSimpleDebouncer() *SimpleDebouncer {
+	return &SimpleDebouncer{}
+}

+// Debounce attempts to execute the function if the logic of the SimpleDebouncer allows it.
+func (d *SimpleDebouncer) Debounce(fn func()) bool {
+	if d.count.Add(1) > 2 {
+		d.count.Add(-1)
+		return false
+	}
+	d.m.Lock()
+	fn()
+	d.count.Add(-1)
+	d.m.Unlock()
+	return true
+}
diff --git a/vendor/github.com/gocql/gocql/debug_off.go b/vendor/github.com/gocql/gocql/debug_off.go
new file mode 100644
index 0000000..31e6225
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/debug_off.go
@@ -0,0 +1,6 @@
+//go:build !gocql_debug
+// +build !gocql_debug

+package gocql

+const gocqlDebug = false
diff --git a/vendor/github.com/gocql/gocql/debug_on.go b/vendor/github.com/gocql/gocql/debug_on.go
new file mode 100644
index 0000000..b3bdfab
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/debug_on.go
@@ -0,0 +1,6 @@
+//go:build gocql_debug
+// +build gocql_debug

+package gocql

+const gocqlDebug = true
diff --git a/vendor/github.com/gocql/gocql/dial.go b/vendor/github.com/gocql/gocql/dial.go
new file mode 100644
index 0000000..0613ceb
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/dial.go
@@ -0,0 +1,91 @@
+package gocql

+import (
+	"context"
+	"crypto/tls"
+	"fmt"
+	"net"
+	"strings"
+)

+// HostDialer allows customizing connection to cluster nodes.
+type HostDialer interface {
+	// DialHost establishes a connection to the host.
+	// The returned connection must be directly usable for CQL protocol,
+	// specifically DialHost is responsible also for setting up the TLS session if needed.
+	// DialHost should disable write coalescing if the returned net.Conn does not support writev.
+	// As of Go 1.18, only plain TCP connections support writev; TLS sessions should disable coalescing.
+	// You can use the WrapTLS helper function if you don't need to override the TLS setup.
+	DialHost(ctx context.Context, host *HostInfo) (*DialedHost, error)
+}

+// DialedHost contains information about established connection to a host.
+type DialedHost struct {
+	// Conn used to communicate with the server.
+	Conn net.Conn

+	// DisableCoalesce disables write coalescing for the Conn.
+	// If true, the effect is the same as if WriteCoalesceWaitTime was configured to 0.
+	DisableCoalesce bool
+}

+// defaultHostDialer dials host in a default way.
+type defaultHostDialer struct { + dialer Dialer + tlsConfig *tls.Config +} + +func (hd *defaultHostDialer) DialHost(ctx context.Context, host *HostInfo) (*DialedHost, error) { + ip := host.ConnectAddress() + port := host.Port() + + if !validIpAddr(ip) { + return nil, fmt.Errorf("host missing connect ip address: %v", ip) + } else if port == 0 { + return nil, fmt.Errorf("host missing port: %v", port) + } + + connAddr := host.ConnectAddressAndPort() + conn, err := hd.dialer.DialContext(ctx, "tcp", connAddr) + if err != nil { + return nil, err + } + addr := host.HostnameAndPort() + return WrapTLS(ctx, conn, addr, hd.tlsConfig) +} + +func tlsConfigForAddr(tlsConfig *tls.Config, addr string) *tls.Config { + // the TLS config is safe to be reused by connections but it must not + // be modified after being used. + if !tlsConfig.InsecureSkipVerify && tlsConfig.ServerName == "" { + colonPos := strings.LastIndex(addr, ":") + if colonPos == -1 { + colonPos = len(addr) + } + hostname := addr[:colonPos] + // clone config to avoid modifying the shared one. + tlsConfig = tlsConfig.Clone() + tlsConfig.ServerName = hostname + } + return tlsConfig +} + +// WrapTLS optionally wraps a net.Conn connected to addr with the given tlsConfig. +// If the tlsConfig is nil, conn is not wrapped into a TLS session, so is insecure. +// If the tlsConfig does not have server name set, it is updated based on the default gocql rules. +func WrapTLS(ctx context.Context, conn net.Conn, addr string, tlsConfig *tls.Config) (*DialedHost, error) { + if tlsConfig != nil { + tlsConfig := tlsConfigForAddr(tlsConfig, addr) + tconn := tls.Client(conn, tlsConfig) + if err := tconn.HandshakeContext(ctx); err != nil { + conn.Close() + return nil, err + } + conn = tconn + } + + return &DialedHost{ + Conn: conn, + DisableCoalesce: tlsConfig != nil, // write coalescing can't use writev when the connection is wrapped. + }, nil +} diff --git a/vendor/github.com/gocql/gocql/doc.go b/vendor/github.com/gocql/gocql/doc.go new file mode 100644 index 0000000..5f7d676 --- /dev/null +++ b/vendor/github.com/gocql/gocql/doc.go @@ -0,0 +1,375 @@ +// Copyright (c) 2012-2015 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package gocql implements a fast and robust Cassandra driver for the +// Go programming language. +// +// # Connecting to the cluster +// +// Pass a list of initial node IP addresses to NewCluster to create a new cluster configuration: +// +// cluster := gocql.NewCluster("192.168.1.1", "192.168.1.2", "192.168.1.3") +// +// Port can be specified as part of the address, the above is equivalent to: +// +// cluster := gocql.NewCluster("192.168.1.1:9042", "192.168.1.2:9042", "192.168.1.3:9042") +// +// It is recommended to use the value set in the Cassandra config for broadcast_address or listen_address, +// an IP address not a domain name. This is because events from Cassandra will use the configured IP +// address, which is used to index connected hosts. If the domain name specified resolves to more than 1 IP address +// then the driver may connect multiple times to the same host, and will not mark the node being down or up from events. 
+//
+// Then you can customize more options (see ClusterConfig):
+//
+//	cluster.Keyspace = "example"
+//	cluster.Consistency = gocql.Quorum
+//	cluster.ProtoVersion = 4
+//
+// The driver tries to automatically detect the protocol version to use if not set, but you might want to set the
+// protocol version explicitly, as it's not defined which version will be used in certain situations (for example
+// during upgrade of the cluster when some of the nodes support a different set of protocol versions than other nodes).
+//
+// The driver advertises the module name and version in the STARTUP message, so servers are able to detect the version.
+// If you use a replace directive in go.mod, the driver will send information about the replacement module instead.
+//
+// When ready, create a session from the configuration. Don't forget to Close the session once you are done with it:
+//
+//	session, err := cluster.CreateSession()
+//	if err != nil {
+//		return err
+//	}
+//	defer session.Close()
+//
+// # Authentication
+//
+// CQL protocol uses a SASL-based authentication mechanism and so consists of an exchange of server challenges and
+// client response pairs. The details of the exchanged messages depend on the authenticator used.
+//
+// To use authentication, set ClusterConfig.Authenticator or ClusterConfig.AuthProvider.
+//
+// PasswordAuthenticator is provided to use for username/password authentication:
+//
+//	cluster := gocql.NewCluster("192.168.1.1", "192.168.1.2", "192.168.1.3")
+//	cluster.Authenticator = gocql.PasswordAuthenticator{
+//		Username: "user",
+//		Password: "password",
+//	}
+//	session, err := cluster.CreateSession()
+//	if err != nil {
+//		return err
+//	}
+//	defer session.Close()
+//
+// By default, PasswordAuthenticator will attempt to authenticate regardless of what implementation the server returns
+// in its AUTHENTICATE message as its authenticator (e.g. org.apache.cassandra.auth.PasswordAuthenticator). If you
+// wish to restrict this you may use PasswordAuthenticator.AllowedAuthenticators:
+//
+//	cluster.Authenticator = gocql.PasswordAuthenticator{
+//		Username: "user",
+//		Password: "password",
+//		AllowedAuthenticators: []string{"org.apache.cassandra.auth.PasswordAuthenticator"},
+//	}
+//
+// # Transport layer security
+//
+// It is possible to secure traffic between the client and server with TLS.
+//
+// To use TLS, set the ClusterConfig.SslOpts field. SslOptions embeds *tls.Config so you can set that directly.
+// There are also helpers to load keys/certificates from files.
+//
+// Warning: Due to historical reasons, the SslOptions is insecure by default, so you need to set EnableHostVerification
+// to true if no Config is set. Most users should set SslOptions.Config to a *tls.Config.
+// SslOptions and Config.InsecureSkipVerify interact as follows:
+//
+//	Config.InsecureSkipVerify | EnableHostVerification | Result
+//	Config is nil             | false                  | do not verify host
+//	Config is nil             | true                   | verify host
+//	false                     | false                  | verify host
+//	true                      | false                  | do not verify host
+//	false                     | true                   | verify host
+//	true                      | true                   | verify host
+//
+// For example:
+//
+//	cluster := gocql.NewCluster("192.168.1.1", "192.168.1.2", "192.168.1.3")
+//	cluster.SslOpts = &gocql.SslOptions{
+//		EnableHostVerification: true,
+//	}
+//	session, err := cluster.CreateSession()
+//	if err != nil {
+//		return err
+//	}
+//	defer session.Close()
+//
+// # Data-center awareness and query routing
+//
+// To route queries to the local DC first, use DCAwareRoundRobinPolicy. For example, if the datacenter you
+// want to primarily connect to is called dc1 (as configured in the database):
+//
+//	cluster := gocql.NewCluster("192.168.1.1", "192.168.1.2", "192.168.1.3")
+//	cluster.PoolConfig.HostSelectionPolicy = gocql.DCAwareRoundRobinPolicy("dc1")
+//
+// The driver can route queries to nodes that hold data replicas based on partition key (preferring local DC).
+//
+//	cluster := gocql.NewCluster("192.168.1.1", "192.168.1.2", "192.168.1.3")
+//	cluster.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy(gocql.DCAwareRoundRobinPolicy("dc1"))
+//
+// Note that TokenAwareHostPolicy can take options such as gocql.ShuffleReplicas and gocql.NonLocalReplicasFallback.
+//
+// We recommend running with a token aware host policy in production for maximum performance.
+//
+// The driver can only use token-aware routing for queries where all partition key columns are query parameters.
+// For example, instead of
+//
+//	session.Query("select value from mytable where pk1 = 'abc' AND pk2 = ?", "def")
+//
+// use
+//
+//	session.Query("select value from mytable where pk1 = ? AND pk2 = ?", "abc", "def")
+//
+// # Rack-level awareness
+//
+// The DCAwareRoundRobinPolicy can be replaced with RackAwareRoundRobinPolicy, which takes two parameters, datacenter and rack.
+//
+// Instead of dividing hosts with two tiers (local datacenter and remote datacenters) it divides hosts into three
+// (the local rack, the rest of the local datacenter, and everything else).
+//
+// RackAwareRoundRobinPolicy can be combined with TokenAwareHostPolicy in the same way as DCAwareRoundRobinPolicy.
+//
+// # Executing queries
+//
+// Create queries with Session.Query. Query values must not be reused between different executions and must not be
+// modified after starting execution of the query.
+//
+// To execute a query without reading results, use Query.Exec:
+//
+//	err := session.Query(`INSERT INTO tweet (timeline, id, text) VALUES (?, ?, ?)`,
+//		"me", gocql.TimeUUID(), "hello world").WithContext(ctx).Exec()
+//
+// A single row can be read by calling Query.Scan:
+//
+//	err := session.Query(`SELECT id, text FROM tweet WHERE timeline = ? LIMIT 1`,
+//		"me").WithContext(ctx).Consistency(gocql.One).Scan(&id, &text)
+//
+// Multiple rows can be read using Iter.Scanner:
+//
+//	scanner := session.Query(`SELECT id, text FROM tweet WHERE timeline = ?`,
+//		"me").WithContext(ctx).Iter().Scanner()
+//	for scanner.Next() {
+//		var (
+//			id   gocql.UUID
+//			text string
+//		)
+//		err = scanner.Scan(&id, &text)
+//		if err != nil {
+//			log.Fatal(err)
+//		}
+//		fmt.Println("Tweet:", id, text)
+//	}
+//	// scanner.Err() closes the iterator, so neither scanner nor iter should be used afterwards.
+//	if err := scanner.Err(); err != nil {
+//		log.Fatal(err)
+//	}
+//
+// See Example for a complete example.
+//
+// # Prepared statements
+//
+// The driver automatically prepares DML queries (SELECT/INSERT/UPDATE/DELETE/BATCH statements) and maintains a cache
+// of prepared statements.
+// CQL protocol does not support preparing other query types.
+//
+// When using CQL protocol >= 4, it is possible to use gocql.UnsetValue as the bound value of a column.
+// This will cause the database to ignore writing the column.
+// The main advantage is the ability to keep the same prepared statement even when you don't
+// want to update some fields, where before you needed to make another prepared statement.
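+//
+// As a minimal sketch (the tweet table, the newText variable, and ctx are
+// illustrative, not part of the driver API), binding gocql.UnsetValue keeps a
+// single prepared INSERT while leaving the text column unwritten when no new
+// value is supplied:
+//
+//	textValue := interface{}(gocql.UnsetValue)
+//	if newText != "" {
+//		textValue = newText
+//	}
+//	err := session.Query(`INSERT INTO tweet (timeline, id, text) VALUES (?, ?, ?)`,
+//		"me", gocql.TimeUUID(), textValue).WithContext(ctx).Exec()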
+//
+// # Executing multiple queries concurrently
+//
+// Session is safe to use from multiple goroutines, so to execute multiple concurrent queries, just execute them
+// from several worker goroutines. Gocql provides a synchronous-looking API (as recommended for Go APIs) while the queries
+// are executed asynchronously at the protocol level.
+//
+//	results := make(chan error, 2)
+//	go func() {
+//		results <- session.Query(`INSERT INTO tweet (timeline, id, text) VALUES (?, ?, ?)`,
+//			"me", gocql.TimeUUID(), "hello world 1").Exec()
+//	}()
+//	go func() {
+//		results <- session.Query(`INSERT INTO tweet (timeline, id, text) VALUES (?, ?, ?)`,
+//			"me", gocql.TimeUUID(), "hello world 2").Exec()
+//	}()
+//
+// # Nulls
+//
+// Null values are unmarshalled as the zero value of the type. If you need to distinguish, for example, between a text
+// column being null and an empty string, you can unmarshal into a *string variable instead of a string.
+//
+//	var text *string
+//	err := scanner.Scan(&text)
+//	if err != nil {
+//		// handle error
+//	}
+//	if text != nil {
+//		// not null
+//	} else {
+//		// null
+//	}
+//
+// See Example_nulls for a full example.
+//
+// # Reusing slices
+//
+// The driver reuses backing memory of slices when unmarshalling. This is an optimization so that a buffer does not
+// need to be allocated for every processed row. However, you need to be careful when storing the slices to other
+// memory structures.
+//
+//	scanner := session.Query(`SELECT myints FROM table WHERE pk = ?`, "key").WithContext(ctx).Iter().Scanner()
+//	var myInts []int
+//	for scanner.Next() {
+//		// This scan reuses backing store of myInts for each row.
+//		err = scanner.Scan(&myInts)
+//		if err != nil {
+//			log.Fatal(err)
+//		}
+//	}
+//
+// When you want to save the data for later use, pass a new slice every time. A common pattern is to declare the
+// slice variable within the scanner loop:
+//
+//	scanner := session.Query(`SELECT myints FROM table WHERE pk = ?`, "key").WithContext(ctx).Iter().Scanner()
+//	for scanner.Next() {
+//		var myInts []int
+//		// This scan always gets a pointer to a fresh myInts slice, so it does not reuse memory.
+//		err = scanner.Scan(&myInts)
+//		if err != nil {
+//			log.Fatal(err)
+//		}
+//	}
+//
+// # Paging
+//
+// The driver supports paging of results with automatic prefetch, see ClusterConfig.PageSize, Session.SetPrefetch,
+// Query.PageSize, and Query.Prefetch.
+//
+// It is also possible to control the paging manually with Query.PageState (this disables automatic prefetch).
+// Manual paging is useful if you want to store the page state externally, for example in a URL to allow users
+// to browse pages in a result. You might want to sign/encrypt the paging state when exposing it externally since
+// it contains data from primary keys.
+//
+// Paging state is specific to the CQL protocol version and the exact query used. It is meant as opaque state that
+// should not be modified. If you send paging state from a different query or protocol version, then the behaviour
+// is not defined (you might get unexpected results or an error from the server). For example, do not send paging state
+// returned by a node using protocol version 3 to a node using protocol version 4. Also, when using protocol version 4,
+// paging state between Cassandra 2.2 and 3.0 is incompatible (https://issues.apache.org/jira/browse/CASSANDRA-10880).
+//
+// The driver does not check whether the paging state is from the same protocol version/statement.
+// You might want to validate this yourself as it could be a problem if you store paging state externally.
+// For example, if you store paging state in a URL, the URLs might become broken when you upgrade your cluster.
+//
+// Call Query.PageState(nil) to fetch just the first page of the query results. Pass the page state returned by
+// Iter.PageState to Query.PageState of a subsequent query to get the next page. If the length of the slice returned
+// by Iter.PageState is zero, there are no more pages available (or an error occurred).
+//
+// Using very low values of PageSize will negatively affect performance; a value below 100 is probably too low.
+// While Cassandra currently returns exactly PageSize items (except for the last page) in a page, the protocol authors
+// explicitly reserved the right to return a smaller or larger number of items in a page for performance reasons, so don't
+// rely on the page having the exact count of items.
+//
+// See Example_paging for an example of manual paging.
+//
+// # Dynamic list of columns
+//
+// There are certain situations when you don't know the list of columns in advance, mainly when the query is supplied
+// by the user. Iter.Columns, Iter.RowData, Iter.MapScan and Iter.SliceMap can be used to handle this case.
+//
+// See Example_dynamicColumns.
+//
+// # Batches
+//
+// The CQL protocol supports sending batches of DML statements (INSERT/UPDATE/DELETE) and so does gocql.
+// Use Session.Batch to create a new batch and then fill in the details of individual queries.
+// Then execute the batch with Session.ExecuteBatch.
+//
+// Logged batches ensure atomicity, either all or none of the operations in the batch will succeed, but they have
+// overhead to ensure this property.
+// Unlogged batches don't have the overhead of logged batches, but don't guarantee atomicity.
+// Updates of counters are handled specially by Cassandra, so batches of counter updates have to use the CounterBatch type.
+// A counter batch can only contain statements to update counters.
+//
+// For unlogged batches it is recommended to send only single-partition batches (i.e. all statements in the batch should
+// involve only a single partition).
+// A multi-partition batch needs to be split by the coordinator node and re-sent to the
+// correct nodes.
+// With single-partition batches you can send the batch directly to the node for the partition without incurring the
+// additional network hop.
+//
+// It is also possible to pass an entire BEGIN BATCH .. APPLY BATCH statement to Query.Exec.
+// There are differences in how those are executed.
+// A BEGIN BATCH statement passed to Query.Exec is prepared as a whole in a single statement.
+// Session.ExecuteBatch prepares individual statements in the batch.
+// If you have variable-length batches using the same statement, using Session.ExecuteBatch is more efficient.
+//
+// See Example_batch for an example.
+//
+// # Lightweight transactions
+//
+// Query.ScanCAS or Query.MapScanCAS can be used to execute a single-statement lightweight transaction (an
+// INSERT/UPDATE .. IF statement) and read its result. See example for Query.MapScanCAS.
+//
+// Multiple-statement lightweight transactions can be executed as a logged batch that contains at least one conditional
+// statement. All the conditions must return true for the batch to be applied. You can use Session.ExecuteBatchCAS and
+// Session.MapExecuteBatchCAS when executing the batch to learn about the result of the LWT. See example for
+// Session.MapExecuteBatchCAS.
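+//
+// As a sketch (the users table and its name column are illustrative), a
+// single-statement LWT executed and checked with Query.ScanCAS might look like:
+//
+//	var existingName string
+//	applied, err := session.Query(`INSERT INTO users (name) VALUES (?) IF NOT EXISTS`,
+//		"alice").ScanCAS(&existingName)
+//	if err != nil {
+//		// handle error
+//	} else if !applied {
+//		// the row already existed; existingName now holds the stored value
+//	}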
+//
+// # Retries and speculative execution
+//
+// Queries can be marked as idempotent. Marking the query as idempotent tells the driver that the query can be executed
+// multiple times without affecting its result. Non-idempotent queries are not eligible for retries or speculative
+// execution.
+//
+// Idempotent queries are retried in case of errors based on the configured RetryPolicy.
+// If the query is an LWT and the configured RetryPolicy additionally implements the LWTRetryPolicy
+// interface, then the policy will be cast to LWTRetryPolicy and used that way.
+//
+// Queries can be retried even before they fail by setting a SpeculativeExecutionPolicy. The policy can
+// cause the driver to retry on a different node if the query is taking longer than a specified delay even before the
+// driver receives an error or timeout from the server. When a query is speculatively executed, the original execution
+// is still running. The two parallel executions of the query race to return a result; the first result received is
+// returned.
+//
+// # User-defined types
+//
+// UDTs can be marshaled and unmarshaled from/to a map[string]interface{} or a Go struct (or a type implementing the
+// UDTUnmarshaler, UDTMarshaler, Unmarshaler or Marshaler interfaces).
+//
+// For structs, the cql tag can be used to specify the CQL field name to be mapped to a struct field:
+//
+// type MyUDT struct {
+// FieldA int32 `cql:"a"`
+// FieldB string `cql:"b"`
+// }
+//
+// See Example_userDefinedTypesMap, Example_userDefinedTypesStruct, ExampleUDTMarshaler, ExampleUDTUnmarshaler.
+//
+// # Metrics and tracing
+//
+// It is possible to provide observer implementations that could be used to gather metrics (a sketch of one appears at
+// the end of this section):
+//
+// - QueryObserver for monitoring individual queries.
+// - BatchObserver for monitoring batch queries.
+// - ConnectObserver for monitoring new connections from the driver to the database.
+// - FrameHeaderObserver for monitoring individual protocol frames.
+//
+// The CQL protocol also supports tracing of queries. When enabled, the database will write information about the
+// internal events that happened during execution of the query. You can use Query.Trace to request tracing and receive
+// the session ID that the database used to store the trace information in the system_traces.sessions and
+// system_traces.events tables. NewTraceWriter returns an implementation of Tracer that writes the events to a writer.
+// Gathering trace information might be essential for debugging and optimizing queries, but writing traces has overhead,
+// so this feature should not be used on production systems with very high load unless you know what you are doing.
+// There is also a newer implementation of Tracer, TracerEnhanced, that is intended to be more reliable and convenient
+// to use. It can check whether a trace is ready to be extracted and only fetches it when requested, which reduces the
+// performance impact.
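+//
+// As a rough sketch (slowQueryLogger and its threshold are illustrative names, not part of this package), a
+// QueryObserver that logs slow queries could look like:
+//
+// type slowQueryLogger struct{ threshold time.Duration }
+//
+// func (o slowQueryLogger) ObserveQuery(ctx context.Context, q gocql.ObservedQuery) {
+// // ObservedQuery carries the start and end time of the query execution.
+// if d := q.End.Sub(q.Start); d > o.threshold {
+// log.Printf("slow query took %s: %s", d, q.Statement)
+// }
+// }
+//
+// cluster.QueryObserver = slowQueryLogger{threshold: 100 * time.Millisecond}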
+package gocql // import "github.com/gocql/gocql" diff --git a/vendor/github.com/gocql/gocql/docker-compose.yml b/vendor/github.com/gocql/gocql/docker-compose.yml new file mode 100644 index 0000000..5a8ef97 --- /dev/null +++ b/vendor/github.com/gocql/gocql/docker-compose.yml @@ -0,0 +1,90 @@ +version: "3.7" + +services: + node_1: + image: ${SCYLLA_IMAGE} + privileged: true + command: | + --smp 2 + --memory 768M + --seeds 192.168.100.11 + --overprovisioned 1 + --experimental-features udf + --enable-user-defined-functions true + networks: + public: + ipv4_address: 192.168.100.11 + volumes: + - /tmp/scylla:/var/lib/scylla/ + - type: bind + source: ./testdata/config/scylla.yaml + target: /etc/scylla/scylla.yaml + - type: bind + source: ./testdata/pki/ca.crt + target: /etc/scylla/ca.crt + - type: bind + source: ./testdata/pki/cassandra.crt + target: /etc/scylla/db.crt + - type: bind + source: ./testdata/pki/cassandra.key + target: /etc/scylla/db.key + healthcheck: + test: [ "CMD", "cqlsh", "-e", "select * from system.local" ] + interval: 5s + timeout: 5s + retries: 18 + node_2: + image: ${SCYLLA_IMAGE} + command: | + --smp 2 + --memory 1G + --seeds 192.168.100.12 + networks: + public: + ipv4_address: 192.168.100.12 + healthcheck: + test: [ "CMD", "cqlsh", "192.168.100.12", "-e", "select * from system.local" ] + interval: 5s + timeout: 5s + retries: 18 + node_3: + image: ${SCYLLA_IMAGE} + command: | + --smp 2 + --memory 1G + --seeds 192.168.100.12 + networks: + public: + ipv4_address: 192.168.100.13 + healthcheck: + test: [ "CMD", "cqlsh", "192.168.100.13", "-e", "select * from system.local" ] + interval: 5s + timeout: 5s + retries: 18 + depends_on: + node_2: + condition: service_healthy + node_4: + image: ${SCYLLA_IMAGE} + command: | + --smp 2 + --memory 1G + --seeds 192.168.100.12 + networks: + public: + ipv4_address: 192.168.100.14 + healthcheck: + test: [ "CMD", "cqlsh", "192.168.100.14", "-e", "select * from system.local" ] + interval: 5s + timeout: 5s + retries: 18 + depends_on: + node_3: + condition: service_healthy +networks: + public: + driver: bridge + ipam: + driver: default + config: + - subnet: 192.168.100.0/24 diff --git a/vendor/github.com/gocql/gocql/errors.go b/vendor/github.com/gocql/gocql/errors.go new file mode 100644 index 0000000..bafaab6 --- /dev/null +++ b/vendor/github.com/gocql/gocql/errors.go @@ -0,0 +1,227 @@ +package gocql + +import "fmt" + +// See CQL Binary Protocol v5, section 8 for more details. +// https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec +const ( + // ErrCodeServer indicates unexpected error on server-side. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1246-L1247 + ErrCodeServer = 0x0000 + // ErrCodeProtocol indicates a protocol violation by some client message. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1248-L1250 + ErrCodeProtocol = 0x000A + // ErrCodeCredentials indicates missing required authentication. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1251-L1254 + ErrCodeCredentials = 0x0100 + // ErrCodeUnavailable indicates unavailable error. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1255-L1265 + ErrCodeUnavailable = 0x1000 + // ErrCodeOverloaded returned in case of request on overloaded node coordinator. 
+ // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1266-L1267 + ErrCodeOverloaded = 0x1001 + // ErrCodeBootstrapping returned from the coordinator node in bootstrapping phase. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1268-L1269 + ErrCodeBootstrapping = 0x1002 + // ErrCodeTruncate indicates truncation exception. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1270 + ErrCodeTruncate = 0x1003 + // ErrCodeWriteTimeout returned in case of timeout during the request write. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1271-L1304 + ErrCodeWriteTimeout = 0x1100 + // ErrCodeReadTimeout returned in case of timeout during the request read. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1305-L1321 + ErrCodeReadTimeout = 0x1200 + // ErrCodeReadFailure indicates request read error which is not covered by ErrCodeReadTimeout. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1322-L1340 + ErrCodeReadFailure = 0x1300 + // ErrCodeFunctionFailure indicates an error in user-defined function. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1341-L1347 + ErrCodeFunctionFailure = 0x1400 + // ErrCodeWriteFailure indicates request write error which is not covered by ErrCodeWriteTimeout. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1348-L1385 + ErrCodeWriteFailure = 0x1500 + // ErrCodeCDCWriteFailure is defined, but not yet documented in CQLv5 protocol. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1386 + ErrCodeCDCWriteFailure = 0x1600 + // ErrCodeCASWriteUnknown indicates only partially completed CAS operation. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1387-L1397 + ErrCodeCASWriteUnknown = 0x1700 + // ErrCodeSyntax indicates the syntax error in the query. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1399 + ErrCodeSyntax = 0x2000 + // ErrCodeUnauthorized indicates access rights violation by user on performed operation. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1400-L1401 + ErrCodeUnauthorized = 0x2100 + // ErrCodeInvalid indicates invalid query error which is not covered by ErrCodeSyntax. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1402 + ErrCodeInvalid = 0x2200 + // ErrCodeConfig indicates the configuration error. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1403 + ErrCodeConfig = 0x2300 + // ErrCodeAlreadyExists is returned for the requests creating the existing keyspace/table. + // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1404-L1413 + ErrCodeAlreadyExists = 0x2400 + // ErrCodeUnprepared returned from the host for prepared statement which is unknown. 
+ // + // See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1414-L1417 + ErrCodeUnprepared = 0x2500 +) + +type RequestError interface { + Code() int + Message() string + Error() string +} + +type errorFrame struct { + frameHeader + + code int + message string +} + +func (e errorFrame) Code() int { + return e.code +} + +func (e errorFrame) Message() string { + return e.message +} + +func (e errorFrame) Error() string { + return e.Message() +} + +func (e errorFrame) String() string { + return fmt.Sprintf("[error code=%x message=%q]", e.code, e.message) +} + +type RequestErrUnavailable struct { + errorFrame + Consistency Consistency + Required int + Alive int +} + +func (e *RequestErrUnavailable) String() string { + return fmt.Sprintf("[request_error_unavailable consistency=%s required=%d alive=%d]", e.Consistency, e.Required, e.Alive) +} + +type ErrorMap map[string]uint16 + +type RequestErrWriteTimeout struct { + errorFrame + Consistency Consistency + Received int + BlockFor int + WriteType string +} + +type RequestErrWriteFailure struct { + errorFrame + Consistency Consistency + Received int + BlockFor int + NumFailures int + WriteType string + ErrorMap ErrorMap +} + +type RequestErrCDCWriteFailure struct { + errorFrame +} + +type RequestErrReadTimeout struct { + errorFrame + Consistency Consistency + Received int + BlockFor int + DataPresent byte +} + +type RequestErrAlreadyExists struct { + errorFrame + Keyspace string + Table string +} + +type RequestErrUnprepared struct { + errorFrame + StatementId []byte +} + +type RequestErrReadFailure struct { + errorFrame + Consistency Consistency + Received int + BlockFor int + NumFailures int + DataPresent bool + ErrorMap ErrorMap +} + +type RequestErrFunctionFailure struct { + errorFrame + Keyspace string + Function string + ArgTypes []string +} + +// RequestErrCASWriteUnknown is distinct error for ErrCodeCasWriteUnknown. 
+// +// See https://github.com/apache/cassandra/blob/7337fc0/doc/native_protocol_v5.spec#L1387-L1397 +type RequestErrCASWriteUnknown struct { + errorFrame + Consistency Consistency + Received int + BlockFor int +} + +type UnknownServerError struct { + errorFrame +} + +type OpType uint8 + +const ( + OpTypeRead OpType = 0 + OpTypeWrite OpType = 1 +) + +type RequestErrRateLimitReached struct { + errorFrame + OpType OpType + RejectedByCoordinator bool +} + +func (e *RequestErrRateLimitReached) String() string { + var opType string + if e.OpType == OpTypeRead { + opType = "Read" + } else if e.OpType == OpTypeWrite { + opType = "Write" + } else { + opType = "Other" + } + return fmt.Sprintf("[request_error_rate_limit_reached OpType=%s RejectedByCoordinator=%t]", opType, e.RejectedByCoordinator) +} diff --git a/vendor/github.com/gocql/gocql/events.go b/vendor/github.com/gocql/gocql/events.go new file mode 100644 index 0000000..833af82 --- /dev/null +++ b/vendor/github.com/gocql/gocql/events.go @@ -0,0 +1,256 @@ +package gocql + +import ( + "net" + "sync" + "time" +) + +type eventDebouncer struct { + name string + timer *time.Timer + mu sync.Mutex + events []frame + + callback func([]frame) + quit chan struct{} + + logger StdLogger +} + +func newEventDebouncer(name string, eventHandler func([]frame), logger StdLogger) *eventDebouncer { + e := &eventDebouncer{ + name: name, + quit: make(chan struct{}), + timer: time.NewTimer(eventDebounceTime), + callback: eventHandler, + logger: logger, + } + e.timer.Stop() + go e.flusher() + + return e +} + +func (e *eventDebouncer) stop() { + e.quit <- struct{}{} // sync with flusher + close(e.quit) +} + +func (e *eventDebouncer) flusher() { + for { + select { + case <-e.timer.C: + e.mu.Lock() + e.flush() + e.mu.Unlock() + case <-e.quit: + return + } + } +} + +const ( + eventBufferSize = 1000 + eventDebounceTime = 1 * time.Second +) + +// flush must be called with mu locked +func (e *eventDebouncer) flush() { + if len(e.events) == 0 { + return + } + + // if the flush interval is faster than the callback then we will end up calling + // the callback multiple times, probably a bad idea. In this case we could drop + // frames? 
+ go e.callback(e.events) + e.events = make([]frame, 0, eventBufferSize) +} + +func (e *eventDebouncer) debounce(frame frame) { + e.mu.Lock() + e.timer.Reset(eventDebounceTime) + + // TODO: probably need a warning to track if this threshold is too low + if len(e.events) < eventBufferSize { + e.events = append(e.events, frame) + } else { + e.logger.Printf("%s: buffer full, dropping event frame: %s", e.name, frame) + } + + e.mu.Unlock() +} + +func (s *Session) handleEvent(framer *framer) { + frame, err := framer.parseFrame() + if err != nil { + s.logger.Printf("gocql: unable to parse event frame: %v\n", err) + return + } + + if gocqlDebug { + s.logger.Printf("gocql: handling frame: %v\n", frame) + } + + switch f := frame.(type) { + case *schemaChangeKeyspace, *schemaChangeFunction, + *schemaChangeTable, *schemaChangeAggregate, *schemaChangeType: + + s.schemaEvents.debounce(frame) + case *topologyChangeEventFrame, *statusChangeEventFrame: + s.nodeEvents.debounce(frame) + default: + s.logger.Printf("gocql: invalid event frame (%T): %v\n", f, f) + } +} + +func (s *Session) handleSchemaEvent(frames []frame) { + // TODO: debounce events + for _, frame := range frames { + switch f := frame.(type) { + case *schemaChangeKeyspace: + s.metadataDescriber.clearSchema(f.keyspace) + s.handleKeyspaceChange(f.keyspace, f.change) + case *schemaChangeTable: + s.metadataDescriber.clearSchema(f.keyspace) + s.handleTableChange(f.keyspace, f.object, f.change) + case *schemaChangeAggregate: + s.metadataDescriber.clearSchema(f.keyspace) + case *schemaChangeFunction: + s.metadataDescriber.clearSchema(f.keyspace) + case *schemaChangeType: + s.metadataDescriber.clearSchema(f.keyspace) + } + } +} + +func (s *Session) handleKeyspaceChange(keyspace, change string) { + s.control.awaitSchemaAgreement() + if change == "DROPPED" || change == "UPDATED" { + s.metadataDescriber.removeTabletsWithKeyspace(keyspace) + } + s.policy.KeyspaceChanged(KeyspaceUpdateEvent{Keyspace: keyspace, Change: change}) +} + +func (s *Session) handleTableChange(keyspace, table, change string) { + if change == "DROPPED" || change == "UPDATED" { + s.metadataDescriber.removeTabletsWithTable(keyspace, table) + } +} + +// handleNodeEvent handles inbound status and topology change events. +// +// Status events are debounced by host IP; only the latest event is processed. +// +// Topology events are debounced by performing a single full topology refresh +// whenever any topology event comes in. +// +// Processing topology change events before status change events ensures +// that a NEW_NODE event is not dropped in favor of a newer UP event (which +// would itself be dropped/ignored, as the node is not yet known). 
+func (s *Session) handleNodeEvent(frames []frame) { + type nodeEvent struct { + change string + host net.IP + port int + } + + topologyEventReceived := false + // status change events + sEvents := make(map[string]*nodeEvent) + + for _, frame := range frames { + switch f := frame.(type) { + case *topologyChangeEventFrame: + topologyEventReceived = true + case *statusChangeEventFrame: + event, ok := sEvents[f.host.String()] + if !ok { + event = &nodeEvent{change: f.change, host: f.host, port: f.port} + sEvents[f.host.String()] = event + } + event.change = f.change + } + } + + if topologyEventReceived && !s.cfg.Events.DisableTopologyEvents { + s.debounceRingRefresh() + } + + for _, f := range sEvents { + if gocqlDebug { + s.logger.Printf("gocql: dispatching status change event: %+v\n", f) + } + + // ignore events we received if they were disabled + // see https://github.com/gocql/gocql/issues/1591 + switch f.change { + case "UP": + if !s.cfg.Events.DisableNodeStatusEvents { + s.handleNodeUp(f.host, f.port) + } + case "DOWN": + if !s.cfg.Events.DisableNodeStatusEvents { + s.handleNodeDown(f.host, f.port) + } + } + } +} + +func (s *Session) handleNodeUp(eventIp net.IP, eventPort int) { + if gocqlDebug { + s.logger.Printf("gocql: Session.handleNodeUp: %s:%d\n", eventIp.String(), eventPort) + } + + host, ok := s.hostSource.getHostByIP(eventIp.String()) + if !ok { + s.debounceRingRefresh() + return + } + + if s.cfg.filterHost(host) { + return + } + + if d := host.Version().nodeUpDelay(); d > 0 { + time.Sleep(d) + } + s.startPoolFill(host) +} + +func (s *Session) startPoolFill(host *HostInfo) { + // we let the pool call handleNodeConnected to change the host state + s.pool.addHost(host) + s.policy.AddHost(host) +} + +func (s *Session) handleNodeConnected(host *HostInfo) { + if gocqlDebug { + s.logger.Printf("gocql: Session.handleNodeConnected: %s:%d\n", host.ConnectAddress(), host.Port()) + } + + host.setState(NodeUp) + + if !s.cfg.filterHost(host) { + s.policy.HostUp(host) + } +} + +func (s *Session) handleNodeDown(ip net.IP, port int) { + if gocqlDebug { + s.logger.Printf("gocql: Session.handleNodeDown: %s:%d\n", ip.String(), port) + } + + host, ok := s.hostSource.getHostByIP(ip.String()) + if ok { + host.setState(NodeDown) + if s.cfg.filterHost(host) { + return + } + + s.policy.HostDown(host) + hostID := host.HostID() + s.pool.removeHost(hostID) + } +} diff --git a/vendor/github.com/gocql/gocql/exec.go b/vendor/github.com/gocql/gocql/exec.go new file mode 100644 index 0000000..26e483f --- /dev/null +++ b/vendor/github.com/gocql/gocql/exec.go @@ -0,0 +1,110 @@ +package gocql + +import ( + "fmt" +) + +// SingleHostQueryExecutor allows to quickly execute diagnostic queries while +// connected to only a single node. +// The executor opens only a single connection to a node and does not use +// connection pools. +// Consistency level used is ONE. +// Retry policy is applied, attempts are visible in query metrics but query +// observer is not notified. +type SingleHostQueryExecutor struct { + session *Session + control *controlConn +} + +// Exec executes the query without returning any rows. +func (e SingleHostQueryExecutor) Exec(stmt string, values ...interface{}) error { + return e.control.query(stmt, values...).Close() +} + +// Iter executes the query and returns an iterator capable of iterating +// over all results. +func (e SingleHostQueryExecutor) Iter(stmt string, values ...interface{}) *Iter { + return e.control.query(stmt, values...) 
+} + +func (e SingleHostQueryExecutor) Close() { + if e.control != nil { + e.control.close() + } + if e.session != nil { + e.session.Close() + } +} + +// NewSingleHostQueryExecutor creates a SingleHostQueryExecutor by connecting +// to one of the hosts specified in the ClusterConfig. +// If ProtoVersion is not specified version 4 is used. +// Caller is responsible for closing the executor after use. +func NewSingleHostQueryExecutor(cfg *ClusterConfig) (e SingleHostQueryExecutor, err error) { + // Check that hosts in the ClusterConfig is not empty + if len(cfg.Hosts) < 1 { + err = ErrNoHosts + return + } + + c := *cfg + + // If protocol version not set assume 4 and skip discovery + if c.ProtoVersion == 0 { + c.ProtoVersion = 4 + } + + // Close in case of error + defer func() { + if err != nil { + e.Close() + } + }() + + // Create uninitialised session + c.disableInit = true + if e.session, err = NewSession(c); err != nil { + err = fmt.Errorf("new session: %w", err) + return + } + + var hosts []*HostInfo + if hosts, err = addrsToHosts(c.Hosts, c.Port, c.Logger); err != nil { + err = fmt.Errorf("addrs to hosts: %w", err) + return + } + + // Create control connection to one of the hosts + e.control = createControlConn(e.session) + + // shuffle endpoints so not all drivers will connect to the same initial + // node. + hosts = shuffleHosts(hosts) + + conncfg := *e.control.session.connCfg + conncfg.disableCoalesce = true + + var conn *Conn + + for _, host := range hosts { + conn, err = e.control.session.dial(e.control.session.ctx, host, &conncfg, e.control) + if err != nil { + e.control.session.logger.Printf("gocql: unable to dial control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err) + continue + } + err = e.control.setupConn(conn) + if err == nil { + break + } + e.control.session.logger.Printf("gocql: unable setup control conn %v:%v: %v\n", host.ConnectAddress(), host.Port(), err) + conn.Close() + conn = nil + } + + if conn == nil { + err = fmt.Errorf("setup: %w", err) + return + } + + return +} diff --git a/vendor/github.com/gocql/gocql/filters.go b/vendor/github.com/gocql/gocql/filters.go new file mode 100644 index 0000000..ecd9c77 --- /dev/null +++ b/vendor/github.com/gocql/gocql/filters.go @@ -0,0 +1,57 @@ +package gocql + +import "fmt" + +// HostFilter interface is used when a host is discovered via server sent events. +type HostFilter interface { + // Called when a new host is discovered, returning true will cause the host + // to be added to the pools. + Accept(host *HostInfo) bool +} + +// HostFilterFunc converts a func(host HostInfo) bool into a HostFilter +type HostFilterFunc func(host *HostInfo) bool + +func (fn HostFilterFunc) Accept(host *HostInfo) bool { + return fn(host) +} + +// AcceptAllFilter will accept all hosts +func AcceptAllFilter() HostFilter { + return HostFilterFunc(func(host *HostInfo) bool { + return true + }) +} + +func DenyAllFilter() HostFilter { + return HostFilterFunc(func(host *HostInfo) bool { + return false + }) +} + +// DataCentreHostFilter filters all hosts such that they are in the same data centre +// as the supplied data centre. +func DataCentreHostFilter(dataCentre string) HostFilter { + return HostFilterFunc(func(host *HostInfo) bool { + return host.DataCenter() == dataCentre + }) +} + +// WhiteListHostFilter filters incoming hosts by checking that their address is +// in the initial hosts whitelist. 
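+//
+// A minimal usage sketch (the addresses are assumptions):
+//
+// cluster := gocql.NewCluster("192.168.100.11", "192.168.100.12")
+// cluster.HostFilter = gocql.WhiteListHostFilter("192.168.100.11", "192.168.100.12")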
+func WhiteListHostFilter(hosts ...string) HostFilter { + hostInfos, err := addrsToHosts(hosts, 9042, nopLogger{}) + if err != nil { + // dont want to panic here, but rather not break the API + panic(fmt.Errorf("unable to lookup host info from address: %v", err)) + } + + m := make(map[string]bool, len(hostInfos)) + for _, host := range hostInfos { + m[host.ConnectAddress().String()] = true + } + + return HostFilterFunc(func(host *HostInfo) bool { + return m[host.ConnectAddress().String()] + }) +} diff --git a/vendor/github.com/gocql/gocql/frame.go b/vendor/github.com/gocql/gocql/frame.go new file mode 100644 index 0000000..1f1a027 --- /dev/null +++ b/vendor/github.com/gocql/gocql/frame.go @@ -0,0 +1,2119 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "runtime" + "strings" + "time" +) + +type unsetColumn struct{} + +// UnsetValue represents a value used in a query binding that will be ignored by Cassandra. +// +// By setting a field to the unset value Cassandra will ignore the write completely. +// The main advantage is the ability to keep the same prepared statement even when you don't +// want to update some fields, where before you needed to make another prepared statement. +// +// UnsetValue is only available when using the version 4 of the protocol. +var UnsetValue = unsetColumn{} + +type namedValue struct { + name string + value interface{} +} + +// NamedValue produce a value which will bind to the named parameter in a query +func NamedValue(name string, value interface{}) interface{} { + return &namedValue{ + name: name, + value: value, + } +} + +const ( + protoDirectionMask = 0x80 + protoVersionMask = 0x7F + protoVersion1 = 0x01 + protoVersion2 = 0x02 + protoVersion3 = 0x03 + protoVersion4 = 0x04 + protoVersion5 = 0x05 + + maxFrameSize = 256 * 1024 * 1024 +) + +type protoVersion byte + +func (p protoVersion) request() bool { + return p&protoDirectionMask == 0x00 +} + +func (p protoVersion) response() bool { + return p&protoDirectionMask == 0x80 +} + +func (p protoVersion) version() byte { + return byte(p) & protoVersionMask +} + +func (p protoVersion) String() string { + dir := "REQ" + if p.response() { + dir = "RESP" + } + + return fmt.Sprintf("[version=%d direction=%s]", p.version(), dir) +} + +type frameOp byte + +const ( + // header ops + opError frameOp = 0x00 + opStartup frameOp = 0x01 + opReady frameOp = 0x02 + opAuthenticate frameOp = 0x03 + opOptions frameOp = 0x05 + opSupported frameOp = 0x06 + opQuery frameOp = 0x07 + opResult frameOp = 0x08 + opPrepare frameOp = 0x09 + opExecute frameOp = 0x0A + opRegister frameOp = 0x0B + opEvent frameOp = 0x0C + opBatch frameOp = 0x0D + opAuthChallenge frameOp = 0x0E + opAuthResponse frameOp = 0x0F + opAuthSuccess frameOp = 0x10 +) + +func (f frameOp) String() string { + switch f { + case opError: + return "ERROR" + case opStartup: + return "STARTUP" + case opReady: + return "READY" + case opAuthenticate: + return "AUTHENTICATE" + case opOptions: + return "OPTIONS" + case opSupported: + return "SUPPORTED" + case opQuery: + return "QUERY" + case opResult: + return "RESULT" + case opPrepare: + return "PREPARE" + case opExecute: + return "EXECUTE" + case opRegister: + return "REGISTER" + case opEvent: + return "EVENT" + case opBatch: + return "BATCH" + case opAuthChallenge: + return "AUTH_CHALLENGE" + case opAuthResponse: + 
return "AUTH_RESPONSE" + case opAuthSuccess: + return "AUTH_SUCCESS" + default: + return fmt.Sprintf("UNKNOWN_OP_%d", f) + } +} + +const ( + // result kind + resultKindVoid = 1 + resultKindRows = 2 + resultKindKeyspace = 3 + resultKindPrepared = 4 + resultKindSchemaChanged = 5 + + // rows flags + flagGlobalTableSpec int = 0x01 + flagHasMorePages int = 0x02 + flagNoMetaData int = 0x04 + + // query flags + flagValues byte = 0x01 + flagSkipMetaData byte = 0x02 + flagPageSize byte = 0x04 + flagWithPagingState byte = 0x08 + flagWithSerialConsistency byte = 0x10 + flagDefaultTimestamp byte = 0x20 + flagWithNameValues byte = 0x40 + flagWithKeyspace byte = 0x80 + + // prepare flags + flagWithPreparedKeyspace uint32 = 0x01 + + // header flags + flagCompress byte = 0x01 + flagTracing byte = 0x02 + flagCustomPayload byte = 0x04 + flagWarning byte = 0x08 + flagBetaProtocol byte = 0x10 +) + +// DEPRECATED use Consistency type, SerialConsistency is now an alias for backwards compatibility. +type SerialConsistency = Consistency + +type Consistency uint16 + +const ( + Any Consistency = 0x00 + One Consistency = 0x01 + Two Consistency = 0x02 + Three Consistency = 0x03 + Quorum Consistency = 0x04 + All Consistency = 0x05 + LocalQuorum Consistency = 0x06 + EachQuorum Consistency = 0x07 + Serial Consistency = 0x08 + LocalSerial Consistency = 0x09 + LocalOne Consistency = 0x0A +) + +func (c Consistency) String() string { + switch c { + case Any: + return "ANY" + case One: + return "ONE" + case Two: + return "TWO" + case Three: + return "THREE" + case Quorum: + return "QUORUM" + case All: + return "ALL" + case LocalQuorum: + return "LOCAL_QUORUM" + case EachQuorum: + return "EACH_QUORUM" + case Serial: + return "SERIAL" + case LocalSerial: + return "LOCAL_SERIAL" + case LocalOne: + return "LOCAL_ONE" + default: + return fmt.Sprintf("UNKNOWN_CONS_0x%x", uint16(c)) + } +} + +func (c Consistency) IsSerial() bool { + return c == Serial || c == LocalSerial +} + +func (c Consistency) MarshalText() (text []byte, err error) { + return []byte(c.String()), nil +} + +func (c *Consistency) UnmarshalText(text []byte) error { + switch string(text) { + case "ANY": + *c = Any + case "ONE": + *c = One + case "TWO": + *c = Two + case "THREE": + *c = Three + case "QUORUM": + *c = Quorum + case "ALL": + *c = All + case "LOCAL_QUORUM": + *c = LocalQuorum + case "EACH_QUORUM": + *c = EachQuorum + case "SERIAL": + *c = Serial + case "LOCAL_SERIAL": + *c = LocalSerial + case "LOCAL_ONE": + *c = LocalOne + default: + return fmt.Errorf("invalid consistency %q", string(text)) + } + + return nil +} + +func ParseConsistency(s string) Consistency { + var c Consistency + if err := c.UnmarshalText([]byte(strings.ToUpper(s))); err != nil { + panic(err) + } + return c +} + +// ParseConsistencyWrapper wraps gocql.ParseConsistency to provide an err +// return instead of a panic +func ParseConsistencyWrapper(s string) (consistency Consistency, err error) { + err = consistency.UnmarshalText([]byte(strings.ToUpper(s))) + return +} + +// MustParseConsistency is the same as ParseConsistency except it returns +// an error (never). It is kept here since breaking changes are not good. +// DEPRECATED: use ParseConsistency if you want a panic on parse error. +func MustParseConsistency(s string) (Consistency, error) { + c, err := ParseConsistencyWrapper(s) + if err != nil { + panic(err) + } + return c, nil +} + +const ( + apacheCassandraTypePrefix = "org.apache.cassandra.db.marshal." 
+) + +var ( + ErrFrameTooBig = errors.New("frame length is bigger than the maximum allowed") +) + +const maxFrameHeaderSize = 9 + +func readInt(p []byte) int32 { + return int32(p[0])<<24 | int32(p[1])<<16 | int32(p[2])<<8 | int32(p[3]) +} + +type frameHeader struct { + version protoVersion + flags byte + stream int + op frameOp + length int + warnings []string +} + +func (f frameHeader) String() string { + return fmt.Sprintf("[header version=%s flags=0x%x stream=%d op=%s length=%d]", f.version, f.flags, f.stream, f.op, f.length) +} + +func (f frameHeader) Header() frameHeader { + return f +} + +const defaultBufSize = 128 + +type ObservedFrameHeader struct { + Version protoVersion + Flags byte + Stream int16 + Opcode frameOp + Length int32 + + // StartHeader is the time we started reading the frame header off the network connection. + Start time.Time + // EndHeader is the time we finished reading the frame header off the network connection. + End time.Time + + // Host is Host of the connection the frame header was read from. + Host *HostInfo +} + +func (f ObservedFrameHeader) String() string { + return fmt.Sprintf("[observed header version=%s flags=0x%x stream=%d op=%s length=%d]", f.Version, f.Flags, f.Stream, f.Opcode, f.Length) +} + +// FrameHeaderObserver is the interface implemented by frame observers / stat collectors. +// +// Experimental, this interface and use may change +type FrameHeaderObserver interface { + // ObserveFrameHeader gets called on every received frame header. + ObserveFrameHeader(context.Context, ObservedFrameHeader) +} + +type framerInterface interface { + ReadBytesInternal() ([]byte, error) + GetCustomPayload() map[string][]byte + GetHeaderWarnings() []string +} + +// a framer is responsible for reading, writing and parsing frames on a single stream +type framer struct { + proto byte + // flags are for outgoing flags, enabling compression and tracing etc + flags byte + compres Compressor + headSize int + // if this frame was read then the header will be here + header *frameHeader + + // if tracing flag is set this is not nil + traceID []byte + + // holds a ref to the whole byte slice for buf so that it can be reset to + // 0 after a read. 
+ readBuffer []byte + + buf []byte + + customPayload map[string][]byte + + flagLWT int + rateLimitingErrorCode int + tabletsRoutingV1 bool +} + +func newFramer(compressor Compressor, version byte) *framer { + buf := make([]byte, defaultBufSize) + f := &framer{ + buf: buf[:0], + readBuffer: buf, + } + var flags byte + if compressor != nil { + flags |= flagCompress + } + if version == protoVersion5 { + flags |= flagBetaProtocol + } + + version &= protoVersionMask + + headSize := 8 + if version > protoVersion2 { + headSize = 9 + } + + f.compres = compressor + f.proto = version + f.flags = flags + f.headSize = headSize + + f.header = nil + f.traceID = nil + + f.tabletsRoutingV1 = false + + return f +} + +func newFramerWithExts(compressor Compressor, version byte, cqlProtoExts []cqlProtocolExtension) *framer { + + f := newFramer(compressor, version) + + if lwtExt := findCQLProtoExtByName(cqlProtoExts, lwtAddMetadataMarkKey); lwtExt != nil { + castedExt, ok := lwtExt.(*lwtAddMetadataMarkExt) + if !ok { + Logger.Println( + fmt.Errorf("Failed to cast CQL protocol extension identified by name %s to type %T", + lwtAddMetadataMarkKey, lwtAddMetadataMarkExt{})) + return f + } + f.flagLWT = castedExt.lwtOptMetaBitMask + } + + if rateLimitErrorExt := findCQLProtoExtByName(cqlProtoExts, rateLimitError); rateLimitErrorExt != nil { + castedExt, ok := rateLimitErrorExt.(*rateLimitExt) + if !ok { + Logger.Println( + fmt.Errorf("Failed to cast CQL protocol extension identified by name %s to type %T", + rateLimitError, rateLimitExt{})) + return f + } + f.rateLimitingErrorCode = castedExt.rateLimitErrorCode + } + + if tabletsExt := findCQLProtoExtByName(cqlProtoExts, tabletsRoutingV1); tabletsExt != nil { + _, ok := tabletsExt.(*tabletsRoutingV1Ext) + if !ok { + Logger.Println( + fmt.Errorf("Failed to cast CQL protocol extension identified by name %s to type %T", + tabletsRoutingV1, tabletsRoutingV1Ext{})) + return f + } + f.tabletsRoutingV1 = true + } + + return f +} + +type frame interface { + Header() frameHeader +} + +func readHeader(r io.Reader, p []byte) (head frameHeader, err error) { + _, err = io.ReadFull(r, p[:1]) + if err != nil { + return frameHeader{}, err + } + + version := p[0] & protoVersionMask + + if version < protoVersion1 || version > protoVersion5 { + return frameHeader{}, fmt.Errorf("gocql: unsupported protocol response version: %d", version) + } + + headSize := 9 + if version < protoVersion3 { + headSize = 8 + } + + _, err = io.ReadFull(r, p[1:headSize]) + if err != nil { + return frameHeader{}, err + } + + p = p[:headSize] + + head.version = protoVersion(p[0]) + head.flags = p[1] + + if version > protoVersion2 { + if len(p) != 9 { + return frameHeader{}, fmt.Errorf("not enough bytes to read header require 9 got: %d", len(p)) + } + + head.stream = int(int16(p[2])<<8 | int16(p[3])) + head.op = frameOp(p[4]) + head.length = int(readInt(p[5:])) + } else { + if len(p) != 8 { + return frameHeader{}, fmt.Errorf("not enough bytes to read header require 8 got: %d", len(p)) + } + + head.stream = int(int8(p[2])) + head.op = frameOp(p[3]) + head.length = int(readInt(p[4:])) + } + + return head, nil +} + +// explicitly enables tracing for the framers outgoing requests +func (f *framer) trace() { + f.flags |= flagTracing +} + +// explicitly enables the custom payload flag +func (f *framer) payload() { + f.flags |= flagCustomPayload +} + +// reads a frame form the wire into the framers buffer +func (f *framer) readFrame(r io.Reader, head *frameHeader) error { + if head.length < 0 { + return 
fmt.Errorf("frame body length can not be less than 0: %d", head.length) + } else if head.length > maxFrameSize { + // need to free up the connection to be used again + _, err := io.CopyN(ioutil.Discard, r, int64(head.length)) + if err != nil { + return fmt.Errorf("error whilst trying to discard frame with invalid length: %v", err) + } + return ErrFrameTooBig + } + + if cap(f.readBuffer) >= head.length { + f.buf = f.readBuffer[:head.length] + } else { + f.readBuffer = make([]byte, head.length) + f.buf = f.readBuffer + } + + // assume the underlying reader takes care of timeouts and retries + n, err := io.ReadFull(r, f.buf) + if err != nil { + return fmt.Errorf("unable to read frame body: read %d/%d bytes: %v", n, head.length, err) + } + + if head.flags&flagCompress == flagCompress { + if f.compres == nil { + return NewErrProtocol("no compressor available with compressed frame body") + } + + f.buf, err = f.compres.Decode(f.buf) + if err != nil { + return err + } + } + + f.header = head + return nil +} + +func (f *framer) parseFrame() (frame frame, err error) { + defer func() { + if r := recover(); r != nil { + if _, ok := r.(runtime.Error); ok { + panic(r) + } + err = r.(error) + } + }() + + if f.header.version.request() { + return nil, NewErrProtocol("got a request frame from server: %v", f.header.version) + } + + if f.header.flags&flagTracing == flagTracing { + f.readTrace() + } + + if f.header.flags&flagWarning == flagWarning { + f.header.warnings = f.readStringList() + } + + if f.header.flags&flagCustomPayload == flagCustomPayload { + f.customPayload = f.readBytesMap() + } + + // assumes that the frame body has been read into rbuf + switch f.header.op { + case opError: + frame = f.parseErrorFrame() + case opReady: + frame = f.parseReadyFrame() + case opResult: + frame, err = f.parseResultFrame() + case opSupported: + frame = f.parseSupportedFrame() + case opAuthenticate: + frame = f.parseAuthenticateFrame() + case opAuthChallenge: + frame = f.parseAuthChallengeFrame() + case opAuthSuccess: + frame = f.parseAuthSuccessFrame() + case opEvent: + frame = f.parseEventFrame() + default: + return nil, NewErrProtocol("unknown op in frame header: %s", f.header.op) + } + + return +} + +func (f *framer) parseErrorFrame() frame { + code := f.readInt() + msg := f.readString() + + errD := errorFrame{ + frameHeader: *f.header, + code: code, + message: msg, + } + + switch code { + case ErrCodeUnavailable: + cl := f.readConsistency() + required := f.readInt() + alive := f.readInt() + return &RequestErrUnavailable{ + errorFrame: errD, + Consistency: cl, + Required: required, + Alive: alive, + } + case ErrCodeWriteTimeout: + cl := f.readConsistency() + received := f.readInt() + blockfor := f.readInt() + writeType := f.readString() + return &RequestErrWriteTimeout{ + errorFrame: errD, + Consistency: cl, + Received: received, + BlockFor: blockfor, + WriteType: writeType, + } + case ErrCodeReadTimeout: + cl := f.readConsistency() + received := f.readInt() + blockfor := f.readInt() + dataPresent := f.readByte() + return &RequestErrReadTimeout{ + errorFrame: errD, + Consistency: cl, + Received: received, + BlockFor: blockfor, + DataPresent: dataPresent, + } + case ErrCodeAlreadyExists: + ks := f.readString() + table := f.readString() + return &RequestErrAlreadyExists{ + errorFrame: errD, + Keyspace: ks, + Table: table, + } + case ErrCodeUnprepared: + stmtId := f.readShortBytes() + return &RequestErrUnprepared{ + errorFrame: errD, + StatementId: copyBytes(stmtId), // defensively copy + } + case 
ErrCodeReadFailure: + res := &RequestErrReadFailure{ + errorFrame: errD, + } + res.Consistency = f.readConsistency() + res.Received = f.readInt() + res.BlockFor = f.readInt() + if f.proto > protoVersion4 { + res.ErrorMap = f.readErrorMap() + res.NumFailures = len(res.ErrorMap) + } else { + res.NumFailures = f.readInt() + } + res.DataPresent = f.readByte() != 0 + + return res + case ErrCodeWriteFailure: + res := &RequestErrWriteFailure{ + errorFrame: errD, + } + res.Consistency = f.readConsistency() + res.Received = f.readInt() + res.BlockFor = f.readInt() + if f.proto > protoVersion4 { + res.ErrorMap = f.readErrorMap() + res.NumFailures = len(res.ErrorMap) + } else { + res.NumFailures = f.readInt() + } + res.WriteType = f.readString() + return res + case ErrCodeFunctionFailure: + res := &RequestErrFunctionFailure{ + errorFrame: errD, + } + res.Keyspace = f.readString() + res.Function = f.readString() + res.ArgTypes = f.readStringList() + return res + + case ErrCodeCDCWriteFailure: + res := &RequestErrCDCWriteFailure{ + errorFrame: errD, + } + return res + case ErrCodeCASWriteUnknown: + res := &RequestErrCASWriteUnknown{ + errorFrame: errD, + } + res.Consistency = f.readConsistency() + res.Received = f.readInt() + res.BlockFor = f.readInt() + return res + case ErrCodeInvalid, ErrCodeBootstrapping, ErrCodeConfig, ErrCodeCredentials, ErrCodeOverloaded, + ErrCodeProtocol, ErrCodeServer, ErrCodeSyntax, ErrCodeTruncate, ErrCodeUnauthorized: + // TODO(zariel): we should have some distinct types for these errors + return errD + default: + if f.rateLimitingErrorCode != 0 && code == f.rateLimitingErrorCode { + res := &RequestErrRateLimitReached{ + errorFrame: errD, + } + res.OpType = OpType(f.readByte()) + res.RejectedByCoordinator = f.readByte() != 0 + return res + } else { + return &UnknownServerError{ + errorFrame: errD, + } + } + } +} + +func (f *framer) readErrorMap() (errMap ErrorMap) { + errMap = make(ErrorMap) + numErrs := f.readInt() + for i := 0; i < numErrs; i++ { + ip := f.readInetAdressOnly().String() + errMap[ip] = f.readShort() + } + return +} + +func (f *framer) writeHeader(flags byte, op frameOp, stream int) { + f.buf = f.buf[:0] + f.buf = append(f.buf, + f.proto, + flags, + ) + + if f.proto > protoVersion2 { + f.buf = append(f.buf, + byte(stream>>8), + byte(stream), + ) + } else { + f.buf = append(f.buf, + byte(stream), + ) + } + + // pad out length + f.buf = append(f.buf, + byte(op), + 0, + 0, + 0, + 0, + ) +} + +func (f *framer) setLength(length int) { + p := 4 + if f.proto > protoVersion2 { + p = 5 + } + + f.buf[p+0] = byte(length >> 24) + f.buf[p+1] = byte(length >> 16) + f.buf[p+2] = byte(length >> 8) + f.buf[p+3] = byte(length) +} + +func (f *framer) finish() error { + if len(f.buf) > maxFrameSize { + // huge app frame, lets remove it so it doesn't bloat the heap + f.buf = make([]byte, defaultBufSize) + return ErrFrameTooBig + } + + if f.buf[1]&flagCompress == flagCompress { + if f.compres == nil { + panic("compress flag set with no compressor") + } + + // TODO: only compress frames which are big enough + compressed, err := f.compres.Encode(f.buf[f.headSize:]) + if err != nil { + return err + } + + f.buf = append(f.buf[:f.headSize], compressed...) 
+ } + length := len(f.buf) - f.headSize + f.setLength(length) + + return nil +} + +func (f *framer) writeTo(w io.Writer) error { + _, err := w.Write(f.buf) + return err +} + +func (f *framer) readTrace() { + f.traceID = f.readUUID().Bytes() +} + +type readyFrame struct { + frameHeader +} + +func (f *framer) parseReadyFrame() frame { + return &readyFrame{ + frameHeader: *f.header, + } +} + +type supportedFrame struct { + frameHeader + + supported map[string][]string +} + +// TODO: if we move the body buffer onto the frameHeader then we only need a single +// framer, and can move the methods onto the header. +func (f *framer) parseSupportedFrame() frame { + return &supportedFrame{ + frameHeader: *f.header, + + supported: f.readStringMultiMap(), + } +} + +type writeStartupFrame struct { + opts map[string]string +} + +func (w writeStartupFrame) String() string { + return fmt.Sprintf("[startup opts=%+v]", w.opts) +} + +func (w *writeStartupFrame) buildFrame(f *framer, streamID int) error { + f.writeHeader(f.flags&^flagCompress, opStartup, streamID) + f.writeStringMap(w.opts) + + return f.finish() +} + +type writePrepareFrame struct { + statement string + keyspace string + customPayload map[string][]byte +} + +func (w *writePrepareFrame) buildFrame(f *framer, streamID int) error { + if len(w.customPayload) > 0 { + f.payload() + } + f.writeHeader(f.flags, opPrepare, streamID) + f.writeCustomPayload(&w.customPayload) + f.writeLongString(w.statement) + + var flags uint32 = 0 + if w.keyspace != "" { + if f.proto > protoVersion4 { + flags |= flagWithPreparedKeyspace + } else { + panic(fmt.Errorf("the keyspace can only be set with protocol 5 or higher")) + } + } + if f.proto > protoVersion4 { + f.writeUint(flags) + } + if w.keyspace != "" { + f.writeString(w.keyspace) + } + + return f.finish() +} + +func (f *framer) readTypeInfo() TypeInfo { + // TODO: factor this out so the same code paths can be used to parse custom + // types and other types, as much of the logic will be duplicated. 
+ id := f.readShort() + + simple := NativeType{ + proto: f.proto, + typ: Type(id), + } + + if simple.typ == TypeCustom { + simple.custom = f.readString() + if cassType := getApacheCassandraType(simple.custom); cassType != TypeCustom { + simple.typ = cassType + } + } + + switch simple.typ { + case TypeTuple: + n := f.readShort() + tuple := TupleTypeInfo{ + NativeType: simple, + Elems: make([]TypeInfo, n), + } + + for i := 0; i < int(n); i++ { + tuple.Elems[i] = f.readTypeInfo() + } + + return tuple + + case TypeUDT: + udt := UDTTypeInfo{ + NativeType: simple, + } + udt.KeySpace = f.readString() + udt.Name = f.readString() + + n := f.readShort() + udt.Elements = make([]UDTField, n) + for i := 0; i < int(n); i++ { + field := &udt.Elements[i] + field.Name = f.readString() + field.Type = f.readTypeInfo() + } + + return udt + case TypeMap, TypeList, TypeSet: + collection := CollectionType{ + NativeType: simple, + } + + if simple.typ == TypeMap { + collection.Key = f.readTypeInfo() + } + + collection.Elem = f.readTypeInfo() + + return collection + } + + return simple +} + +type preparedMetadata struct { + resultMetadata + + // LWT query detected + lwt bool + + // proto v4+ + pkeyColumns []int + + keyspace string + + table string +} + +func (r preparedMetadata) String() string { + return fmt.Sprintf("[prepared flags=0x%x pkey=%v paging_state=% X columns=%v col_count=%d actual_col_count=%d lwt=%t]", + r.flags, r.pkeyColumns, r.pagingState, r.columns, r.colCount, r.actualColCount, r.lwt) +} + +func (f *framer) parsePreparedMetadata() preparedMetadata { + // TODO: deduplicate this from parseMetadata + meta := preparedMetadata{} + + meta.flags = f.readInt() + meta.colCount = f.readInt() + if meta.colCount < 0 { + panic(fmt.Errorf("received negative column count: %d", meta.colCount)) + } + meta.actualColCount = meta.colCount + + if f.proto >= protoVersion4 { + pkeyCount := f.readInt() + pkeys := make([]int, pkeyCount) + for i := 0; i < pkeyCount; i++ { + pkeys[i] = int(f.readShort()) + } + meta.pkeyColumns = pkeys + } + + meta.lwt = meta.flags&f.flagLWT == f.flagLWT + + if meta.flags&flagHasMorePages == flagHasMorePages { + meta.pagingState = copyBytes(f.readBytes()) + } + + if meta.flags&flagNoMetaData == flagNoMetaData { + return meta + } + + globalSpec := meta.flags&flagGlobalTableSpec == flagGlobalTableSpec + if globalSpec { + meta.keyspace = f.readString() + meta.table = f.readString() + } + + var cols []ColumnInfo + if meta.colCount < 1000 { + // preallocate columninfo to avoid excess copying + cols = make([]ColumnInfo, meta.colCount) + for i := 0; i < meta.colCount; i++ { + f.readCol(&cols[i], &meta.resultMetadata, globalSpec, meta.keyspace, meta.table) + } + } else { + // use append, huge number of columns usually indicates a corrupt frame or + // just a huge row. + for i := 0; i < meta.colCount; i++ { + var col ColumnInfo + f.readCol(&col, &meta.resultMetadata, globalSpec, meta.keyspace, meta.table) + cols = append(cols, col) + } + } + + meta.columns = cols + + return meta +} + +type resultMetadata struct { + flags int + + // only if flagPageState + pagingState []byte + + columns []ColumnInfo + colCount int + + // this is a count of the total number of columns which can be scanned, + // it is at minimum len(columns) but may be larger, for instance when a column + // is a UDT or tuple. 
+ actualColCount int +} + +func (r *resultMetadata) morePages() bool { + return r.flags&flagHasMorePages == flagHasMorePages +} + +func (r resultMetadata) String() string { + return fmt.Sprintf("[metadata flags=0x%x paging_state=% X columns=%v]", r.flags, r.pagingState, r.columns) +} + +func (f *framer) readCol(col *ColumnInfo, meta *resultMetadata, globalSpec bool, keyspace, table string) { + if !globalSpec { + col.Keyspace = f.readString() + col.Table = f.readString() + } else { + col.Keyspace = keyspace + col.Table = table + } + + col.Name = f.readString() + col.TypeInfo = f.readTypeInfo() + switch v := col.TypeInfo.(type) { + // maybe also UDT + case TupleTypeInfo: + // -1 because we already included the tuple column + meta.actualColCount += len(v.Elems) - 1 + } +} + +func (f *framer) parseResultMetadata() resultMetadata { + var meta resultMetadata + + meta.flags = f.readInt() + meta.colCount = f.readInt() + if meta.colCount < 0 { + panic(fmt.Errorf("received negative column count: %d", meta.colCount)) + } + meta.actualColCount = meta.colCount + + if meta.flags&flagHasMorePages == flagHasMorePages { + meta.pagingState = copyBytes(f.readBytes()) + } + + if meta.flags&flagNoMetaData == flagNoMetaData { + return meta + } + + var keyspace, table string + globalSpec := meta.flags&flagGlobalTableSpec == flagGlobalTableSpec + if globalSpec { + keyspace = f.readString() + table = f.readString() + } + + var cols []ColumnInfo + if meta.colCount < 1000 { + // preallocate columninfo to avoid excess copying + cols = make([]ColumnInfo, meta.colCount) + for i := 0; i < meta.colCount; i++ { + f.readCol(&cols[i], &meta, globalSpec, keyspace, table) + } + + } else { + // use append, huge number of columns usually indicates a corrupt frame or + // just a huge row. 
+ for i := 0; i < meta.colCount; i++ { + var col ColumnInfo + f.readCol(&col, &meta, globalSpec, keyspace, table) + cols = append(cols, col) + } + } + + meta.columns = cols + + return meta +} + +type resultVoidFrame struct { + frameHeader +} + +func (f *resultVoidFrame) String() string { + return "[result_void]" +} + +func (f *framer) parseResultFrame() (frame, error) { + kind := f.readInt() + + switch kind { + case resultKindVoid: + return &resultVoidFrame{frameHeader: *f.header}, nil + case resultKindRows: + return f.parseResultRows(), nil + case resultKindKeyspace: + return f.parseResultSetKeyspace(), nil + case resultKindPrepared: + return f.parseResultPrepared(), nil + case resultKindSchemaChanged: + return f.parseResultSchemaChange(), nil + } + + return nil, NewErrProtocol("unknown result kind: %x", kind) +} + +type resultRowsFrame struct { + frameHeader + + meta resultMetadata + // dont parse the rows here as we only need to do it once + numRows int +} + +func (f *resultRowsFrame) String() string { + return fmt.Sprintf("[result_rows meta=%v]", f.meta) +} + +func (f *framer) parseResultRows() frame { + result := &resultRowsFrame{} + result.meta = f.parseResultMetadata() + + result.numRows = f.readInt() + if result.numRows < 0 { + panic(fmt.Errorf("invalid row_count in result frame: %d", result.numRows)) + } + + return result +} + +type resultKeyspaceFrame struct { + frameHeader + keyspace string +} + +func (r *resultKeyspaceFrame) String() string { + return fmt.Sprintf("[result_keyspace keyspace=%s]", r.keyspace) +} + +func (f *framer) parseResultSetKeyspace() frame { + return &resultKeyspaceFrame{ + frameHeader: *f.header, + keyspace: f.readString(), + } +} + +type resultPreparedFrame struct { + frameHeader + + preparedID []byte + reqMeta preparedMetadata + respMeta resultMetadata +} + +func (f *framer) parseResultPrepared() frame { + frame := &resultPreparedFrame{ + frameHeader: *f.header, + preparedID: f.readShortBytes(), + reqMeta: f.parsePreparedMetadata(), + } + + if f.proto < protoVersion2 { + return frame + } + + frame.respMeta = f.parseResultMetadata() + + return frame +} + +type schemaChangeKeyspace struct { + frameHeader + + change string + keyspace string +} + +func (f schemaChangeKeyspace) String() string { + return fmt.Sprintf("[event schema_change_keyspace change=%q keyspace=%q]", f.change, f.keyspace) +} + +type schemaChangeTable struct { + frameHeader + + change string + keyspace string + object string +} + +func (f schemaChangeTable) String() string { + return fmt.Sprintf("[event schema_change change=%q keyspace=%q object=%q]", f.change, f.keyspace, f.object) +} + +type schemaChangeType struct { + frameHeader + + change string + keyspace string + object string +} + +type schemaChangeFunction struct { + frameHeader + + change string + keyspace string + name string + args []string +} + +type schemaChangeAggregate struct { + frameHeader + + change string + keyspace string + name string + args []string +} + +func (f *framer) parseResultSchemaChange() frame { + if f.proto <= protoVersion2 { + change := f.readString() + keyspace := f.readString() + table := f.readString() + + if table != "" { + return &schemaChangeTable{ + frameHeader: *f.header, + change: change, + keyspace: keyspace, + object: table, + } + } else { + return &schemaChangeKeyspace{ + frameHeader: *f.header, + change: change, + keyspace: keyspace, + } + } + } else { + change := f.readString() + target := f.readString() + + // TODO: could just use a separate type for each target + switch target { + case 
"KEYSPACE": + frame := &schemaChangeKeyspace{ + frameHeader: *f.header, + change: change, + } + + frame.keyspace = f.readString() + + return frame + case "TABLE": + frame := &schemaChangeTable{ + frameHeader: *f.header, + change: change, + } + + frame.keyspace = f.readString() + frame.object = f.readString() + + return frame + case "TYPE": + frame := &schemaChangeType{ + frameHeader: *f.header, + change: change, + } + + frame.keyspace = f.readString() + frame.object = f.readString() + + return frame + case "FUNCTION": + frame := &schemaChangeFunction{ + frameHeader: *f.header, + change: change, + } + + frame.keyspace = f.readString() + frame.name = f.readString() + frame.args = f.readStringList() + + return frame + case "AGGREGATE": + frame := &schemaChangeAggregate{ + frameHeader: *f.header, + change: change, + } + + frame.keyspace = f.readString() + frame.name = f.readString() + frame.args = f.readStringList() + + return frame + default: + panic(fmt.Errorf("gocql: unknown SCHEMA_CHANGE target: %q change: %q", target, change)) + } + } + +} + +type authenticateFrame struct { + frameHeader + + class string +} + +func (a *authenticateFrame) String() string { + return fmt.Sprintf("[authenticate class=%q]", a.class) +} + +func (f *framer) parseAuthenticateFrame() frame { + return &authenticateFrame{ + frameHeader: *f.header, + class: f.readString(), + } +} + +type authSuccessFrame struct { + frameHeader + + data []byte +} + +func (a *authSuccessFrame) String() string { + return fmt.Sprintf("[auth_success data=%q]", a.data) +} + +func (f *framer) parseAuthSuccessFrame() frame { + return &authSuccessFrame{ + frameHeader: *f.header, + data: f.readBytes(), + } +} + +type authChallengeFrame struct { + frameHeader + + data []byte +} + +func (a *authChallengeFrame) String() string { + return fmt.Sprintf("[auth_challenge data=%q]", a.data) +} + +func (f *framer) parseAuthChallengeFrame() frame { + return &authChallengeFrame{ + frameHeader: *f.header, + data: f.readBytes(), + } +} + +type statusChangeEventFrame struct { + frameHeader + + change string + host net.IP + port int +} + +func (t statusChangeEventFrame) String() string { + return fmt.Sprintf("[status_change change=%s host=%v port=%v]", t.change, t.host, t.port) +} + +// essentially the same as statusChange +type topologyChangeEventFrame struct { + frameHeader + + change string + host net.IP + port int +} + +func (t topologyChangeEventFrame) String() string { + return fmt.Sprintf("[topology_change change=%s host=%v port=%v]", t.change, t.host, t.port) +} + +func (f *framer) parseEventFrame() frame { + eventType := f.readString() + + switch eventType { + case "TOPOLOGY_CHANGE": + frame := &topologyChangeEventFrame{frameHeader: *f.header} + frame.change = f.readString() + frame.host, frame.port = f.readInet() + + return frame + case "STATUS_CHANGE": + frame := &statusChangeEventFrame{frameHeader: *f.header} + frame.change = f.readString() + frame.host, frame.port = f.readInet() + + return frame + case "SCHEMA_CHANGE": + // this should work for all versions + return f.parseResultSchemaChange() + default: + panic(fmt.Errorf("gocql: unknown event type: %q", eventType)) + } + +} + +type writeAuthResponseFrame struct { + data []byte +} + +func (a *writeAuthResponseFrame) String() string { + return fmt.Sprintf("[auth_response data=%q]", a.data) +} + +func (a *writeAuthResponseFrame) buildFrame(framer *framer, streamID int) error { + return framer.writeAuthResponseFrame(streamID, a.data) +} + +func (f *framer) writeAuthResponseFrame(streamID int, data 
[]byte) error { + f.writeHeader(f.flags, opAuthResponse, streamID) + f.writeBytes(data) + return f.finish() +} + +type queryValues struct { + value []byte + + // optional name, will set With names for values flag + name string + isUnset bool +} + +type queryParams struct { + consistency Consistency + // v2+ + skipMeta bool + values []queryValues + pageSize int + pagingState []byte + serialConsistency Consistency + // v3+ + defaultTimestamp bool + defaultTimestampValue int64 + // v5+ + keyspace string +} + +func (q queryParams) String() string { + return fmt.Sprintf("[query_params consistency=%v skip_meta=%v page_size=%d paging_state=%q serial_consistency=%v default_timestamp=%v values=%v keyspace=%s]", + q.consistency, q.skipMeta, q.pageSize, q.pagingState, q.serialConsistency, q.defaultTimestamp, q.values, q.keyspace) +} + +func (f *framer) writeQueryParams(opts *queryParams) { + f.writeConsistency(opts.consistency) + + if f.proto == protoVersion1 { + return + } + + var flags byte + if len(opts.values) > 0 { + flags |= flagValues + } + if opts.skipMeta { + flags |= flagSkipMetaData + } + if opts.pageSize > 0 { + flags |= flagPageSize + } + if len(opts.pagingState) > 0 { + flags |= flagWithPagingState + } + if opts.serialConsistency > 0 { + flags |= flagWithSerialConsistency + } + + names := false + + // protoV3 specific things + if f.proto > protoVersion2 { + if opts.defaultTimestamp { + flags |= flagDefaultTimestamp + } + + if len(opts.values) > 0 && opts.values[0].name != "" { + flags |= flagWithNameValues + names = true + } + } + + if opts.keyspace != "" { + if f.proto > protoVersion4 { + flags |= flagWithKeyspace + } else { + panic(fmt.Errorf("the keyspace can only be set with protocol 5 or higher")) + } + } + + if f.proto > protoVersion4 { + f.writeUint(uint32(flags)) + } else { + f.writeByte(flags) + } + + if n := len(opts.values); n > 0 { + f.writeShort(uint16(n)) + + for i := 0; i < n; i++ { + if names { + f.writeString(opts.values[i].name) + } + if opts.values[i].isUnset { + f.writeUnset() + } else { + f.writeBytes(opts.values[i].value) + } + } + } + + if opts.pageSize > 0 { + f.writeInt(int32(opts.pageSize)) + } + + if len(opts.pagingState) > 0 { + f.writeBytes(opts.pagingState) + } + + if opts.serialConsistency > 0 { + f.writeConsistency(opts.serialConsistency) + } + + if f.proto > protoVersion2 && opts.defaultTimestamp { + // timestamp in microseconds + var ts int64 + if opts.defaultTimestampValue != 0 { + ts = opts.defaultTimestampValue + } else { + ts = time.Now().UnixNano() / 1000 + } + f.writeLong(ts) + } + + if opts.keyspace != "" { + f.writeString(opts.keyspace) + } +} + +type writeQueryFrame struct { + statement string + params queryParams + + // v4+ + customPayload map[string][]byte +} + +func (w *writeQueryFrame) String() string { + return fmt.Sprintf("[query statement=%q params=%v]", w.statement, w.params) +} + +func (w *writeQueryFrame) buildFrame(framer *framer, streamID int) error { + return framer.writeQueryFrame(streamID, w.statement, &w.params, w.customPayload) +} + +func (f *framer) writeQueryFrame(streamID int, statement string, params *queryParams, customPayload map[string][]byte) error { + if len(customPayload) > 0 { + f.payload() + } + f.writeHeader(f.flags, opQuery, streamID) + f.writeCustomPayload(&customPayload) + f.writeLongString(statement) + f.writeQueryParams(params) + + return f.finish() +} + +type frameBuilder interface { + buildFrame(framer *framer, streamID int) error +} + +type frameWriterFunc func(framer *framer, streamID int) error + +func (f 
frameWriterFunc) buildFrame(framer *framer, streamID int) error { + return f(framer, streamID) +} + +type writeExecuteFrame struct { + preparedID []byte + params queryParams + + // v4+ + customPayload map[string][]byte +} + +func (e *writeExecuteFrame) String() string { + return fmt.Sprintf("[execute id=% X params=%v]", e.preparedID, &e.params) +} + +func (e *writeExecuteFrame) buildFrame(fr *framer, streamID int) error { + return fr.writeExecuteFrame(streamID, e.preparedID, &e.params, &e.customPayload) +} + +func (f *framer) writeExecuteFrame(streamID int, preparedID []byte, params *queryParams, customPayload *map[string][]byte) error { + if len(*customPayload) > 0 { + f.payload() + } + f.writeHeader(f.flags, opExecute, streamID) + f.writeCustomPayload(customPayload) + f.writeShortBytes(preparedID) + if f.proto > protoVersion1 { + f.writeQueryParams(params) + } else { + n := len(params.values) + f.writeShort(uint16(n)) + for i := 0; i < n; i++ { + if params.values[i].isUnset { + f.writeUnset() + } else { + f.writeBytes(params.values[i].value) + } + } + f.writeConsistency(params.consistency) + } + + return f.finish() +} + +// TODO: can we replace BatchStatemt with batchStatement? As they prety much +// duplicate each other +type batchStatment struct { + preparedID []byte + statement string + values []queryValues +} + +type writeBatchFrame struct { + typ BatchType + statements []batchStatment + consistency Consistency + + // v3+ + serialConsistency Consistency + defaultTimestamp bool + defaultTimestampValue int64 + + //v4+ + customPayload map[string][]byte +} + +func (w *writeBatchFrame) buildFrame(framer *framer, streamID int) error { + return framer.writeBatchFrame(streamID, w, w.customPayload) +} + +func (f *framer) writeBatchFrame(streamID int, w *writeBatchFrame, customPayload map[string][]byte) error { + if len(customPayload) > 0 { + f.payload() + } + f.writeHeader(f.flags, opBatch, streamID) + f.writeCustomPayload(&customPayload) + f.writeByte(byte(w.typ)) + + n := len(w.statements) + f.writeShort(uint16(n)) + + var flags byte + + for i := 0; i < n; i++ { + b := &w.statements[i] + if len(b.preparedID) == 0 { + f.writeByte(0) + f.writeLongString(b.statement) + } else { + f.writeByte(1) + f.writeShortBytes(b.preparedID) + } + + f.writeShort(uint16(len(b.values))) + for j := range b.values { + col := b.values[j] + if f.proto > protoVersion2 && col.name != "" { + // TODO: move this check into the caller and set a flag on writeBatchFrame + // to indicate using named values + if f.proto <= protoVersion5 { + return fmt.Errorf("gocql: named query values are not supported in batches, please see https://issues.apache.org/jira/browse/CASSANDRA-10246") + } + flags |= flagWithNameValues + f.writeString(col.name) + } + if col.isUnset { + f.writeUnset() + } else { + f.writeBytes(col.value) + } + } + } + + f.writeConsistency(w.consistency) + + if f.proto > protoVersion2 { + if w.serialConsistency > 0 { + flags |= flagWithSerialConsistency + } + if w.defaultTimestamp { + flags |= flagDefaultTimestamp + } + + if f.proto > protoVersion4 { + f.writeUint(uint32(flags)) + } else { + f.writeByte(flags) + } + + if w.serialConsistency > 0 { + f.writeConsistency(w.serialConsistency) + } + + if w.defaultTimestamp { + var ts int64 + if w.defaultTimestampValue != 0 { + ts = w.defaultTimestampValue + } else { + ts = time.Now().UnixNano() / 1000 + } + f.writeLong(ts) + } + } + + return f.finish() +} + +type writeOptionsFrame struct{} + +func (w *writeOptionsFrame) buildFrame(framer *framer, streamID int) error { + 
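// OPTIONS requests carry no body, so building the frame reduces to a
+ // bare header write; writeOptionsFrame also masks off the compress
+ // flag, since there is nothing to compress. + 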
return framer.writeOptionsFrame(streamID, w) +} + +func (f *framer) writeOptionsFrame(stream int, _ *writeOptionsFrame) error { + f.writeHeader(f.flags&^flagCompress, opOptions, stream) + return f.finish() +} + +type writeRegisterFrame struct { + events []string +} + +func (w *writeRegisterFrame) buildFrame(framer *framer, streamID int) error { + return framer.writeRegisterFrame(streamID, w) +} + +func (f *framer) writeRegisterFrame(streamID int, w *writeRegisterFrame) error { + f.writeHeader(f.flags, opRegister, streamID) + f.writeStringList(w.events) + + return f.finish() +} + +func (f *framer) readByte() byte { + if len(f.buf) < 1 { + panic(fmt.Errorf("not enough bytes in buffer to read byte require 1 got: %d", len(f.buf))) + } + + b := f.buf[0] + f.buf = f.buf[1:] + return b +} + +func (f *framer) readInt() (n int) { + if len(f.buf) < 4 { + panic(fmt.Errorf("not enough bytes in buffer to read int require 4 got: %d", len(f.buf))) + } + + n = int(int32(f.buf[0])<<24 | int32(f.buf[1])<<16 | int32(f.buf[2])<<8 | int32(f.buf[3])) + f.buf = f.buf[4:] + return +} + +func (f *framer) readShort() (n uint16) { + if len(f.buf) < 2 { + panic(fmt.Errorf("not enough bytes in buffer to read short require 2 got: %d", len(f.buf))) + } + n = uint16(f.buf[0])<<8 | uint16(f.buf[1]) + f.buf = f.buf[2:] + return +} + +func (f *framer) readString() (s string) { + size := f.readShort() + + if len(f.buf) < int(size) { + panic(fmt.Errorf("not enough bytes in buffer to read string require %d got: %d", size, len(f.buf))) + } + + s = string(f.buf[:size]) + f.buf = f.buf[size:] + return +} + +func (f *framer) readLongString() (s string) { + size := f.readInt() + + if len(f.buf) < size { + panic(fmt.Errorf("not enough bytes in buffer to read long string require %d got: %d", size, len(f.buf))) + } + + s = string(f.buf[:size]) + f.buf = f.buf[size:] + return +} + +func (f *framer) readUUID() *UUID { + if len(f.buf) < 16 { + panic(fmt.Errorf("not enough bytes in buffer to read uuid require %d got: %d", 16, len(f.buf))) + } + + // TODO: how to handle this error, if it is a uuid, then sureley, problems? 
+ u, _ := UUIDFromBytes(f.buf[:16]) + f.buf = f.buf[16:] + return &u +} + +func (f *framer) readStringList() []string { + size := f.readShort() + + l := make([]string, size) + for i := 0; i < int(size); i++ { + l[i] = f.readString() + } + + return l +} + +func (f *framer) ReadBytesInternal() ([]byte, error) { + size := f.readInt() + if size < 0 { + return nil, nil + } + + if len(f.buf) < size { + return nil, fmt.Errorf("not enough bytes in buffer to read bytes require %d got: %d", size, len(f.buf)) + } + + l := f.buf[:size] + f.buf = f.buf[size:] + + return l, nil +} + +func (f *framer) readBytes() []byte { + l, err := f.ReadBytesInternal() + if err != nil { + panic(err) + } + + return l +} + +func (f *framer) readShortBytes() []byte { + size := f.readShort() + if len(f.buf) < int(size) { + panic(fmt.Errorf("not enough bytes in buffer to read short bytes: require %d got %d", size, len(f.buf))) + } + + l := f.buf[:size] + f.buf = f.buf[size:] + + return l +} + +func (f *framer) readInetAdressOnly() net.IP { + if len(f.buf) < 1 { + panic(fmt.Errorf("not enough bytes in buffer to read inet size require %d got: %d", 1, len(f.buf))) + } + + size := f.buf[0] + f.buf = f.buf[1:] + + if !(size == 4 || size == 16) { + panic(fmt.Errorf("invalid IP size: %d", size)) + } + + if len(f.buf) < 1 { + panic(fmt.Errorf("not enough bytes in buffer to read inet require %d got: %d", size, len(f.buf))) + } + + ip := make([]byte, size) + copy(ip, f.buf[:size]) + f.buf = f.buf[size:] + return net.IP(ip) +} + +func (f *framer) readInet() (net.IP, int) { + return f.readInetAdressOnly(), f.readInt() +} + +func (f *framer) readConsistency() Consistency { + return Consistency(f.readShort()) +} + +func (f *framer) readBytesMap() map[string][]byte { + size := f.readShort() + m := make(map[string][]byte, size) + + for i := 0; i < int(size); i++ { + k := f.readString() + v := f.readBytes() + m[k] = v + } + + return m +} + +func (f *framer) readStringMultiMap() map[string][]string { + size := f.readShort() + m := make(map[string][]string, size) + + for i := 0; i < int(size); i++ { + k := f.readString() + v := f.readStringList() + m[k] = v + } + + return m +} + +func (f *framer) writeByte(b byte) { + f.buf = append(f.buf, b) +} + +func appendBytes(p []byte, d []byte) []byte { + if d == nil { + return appendInt(p, -1) + } + p = appendInt(p, int32(len(d))) + p = append(p, d...) 
+ return p +} + +func appendShort(p []byte, n uint16) []byte { + return append(p, + byte(n>>8), + byte(n), + ) +} + +func appendInt(p []byte, n int32) []byte { + return append(p, byte(n>>24), + byte(n>>16), + byte(n>>8), + byte(n)) +} + +func appendUint(p []byte, n uint32) []byte { + return append(p, byte(n>>24), + byte(n>>16), + byte(n>>8), + byte(n)) +} + +func appendLong(p []byte, n int64) []byte { + return append(p, + byte(n>>56), + byte(n>>48), + byte(n>>40), + byte(n>>32), + byte(n>>24), + byte(n>>16), + byte(n>>8), + byte(n), + ) +} + +func (f *framer) writeCustomPayload(customPayload *map[string][]byte) { + if len(*customPayload) > 0 { + if f.proto < protoVersion4 { + panic("Custom payload is not supported with version V3 or less") + } + f.writeBytesMap(*customPayload) + } +} + +func (f *framer) GetCustomPayload() map[string][]byte { + return f.customPayload +} + +func (f *framer) GetHeaderWarnings() []string { + return f.header.warnings +} + +// these are protocol level binary types +func (f *framer) writeInt(n int32) { + f.buf = appendInt(f.buf, n) +} + +func (f *framer) writeUint(n uint32) { + f.buf = appendUint(f.buf, n) +} + +func (f *framer) writeShort(n uint16) { + f.buf = appendShort(f.buf, n) +} + +func (f *framer) writeLong(n int64) { + f.buf = appendLong(f.buf, n) +} + +func (f *framer) writeString(s string) { + f.writeShort(uint16(len(s))) + f.buf = append(f.buf, s...) +} + +func (f *framer) writeLongString(s string) { + f.writeInt(int32(len(s))) + f.buf = append(f.buf, s...) +} + +func (f *framer) writeStringList(l []string) { + f.writeShort(uint16(len(l))) + for _, s := range l { + f.writeString(s) + } +} + +func (f *framer) writeUnset() { + // Protocol version 4 specifies that bind variables do not require having a + // value when executing a statement. Bind variables without a value are + // called 'unset'. The 'unset' bind variable is serialized as the int + // value '-2' without following bytes. + f.writeInt(-2) +} + +func (f *framer) writeBytes(p []byte) { + // TODO: handle null case correctly, + // [bytes] A [int] n, followed by n bytes if n >= 0. If n < 0, + // no byte should follow and the value represented is `null`. + if p == nil { + f.writeInt(-1) + } else { + f.writeInt(int32(len(p))) + f.buf = append(f.buf, p...) + } +} + +func (f *framer) writeShortBytes(p []byte) { + f.writeShort(uint16(len(p))) + f.buf = append(f.buf, p...) 
+} + +func (f *framer) writeConsistency(cons Consistency) { + f.writeShort(uint16(cons)) +} + +func (f *framer) writeStringMap(m map[string]string) { + f.writeShort(uint16(len(m))) + for k, v := range m { + f.writeString(k) + f.writeString(v) + } +} + +func (f *framer) writeStringMultiMap(m map[string][]string) { + f.writeShort(uint16(len(m))) + for k, v := range m { + f.writeString(k) + f.writeStringList(v) + } +} + +func (f *framer) writeBytesMap(m map[string][]byte) { + f.writeShort(uint16(len(m))) + for k, v := range m { + f.writeString(k) + f.writeBytes(v) + } +} diff --git a/vendor/github.com/gocql/gocql/fuzz.go b/vendor/github.com/gocql/gocql/fuzz.go new file mode 100644 index 0000000..0d4cff0 --- /dev/null +++ b/vendor/github.com/gocql/gocql/fuzz.go @@ -0,0 +1,34 @@ +//go:build gofuzz +// +build gofuzz + +package gocql + +import "bytes" + +func Fuzz(data []byte) int { + var bw bytes.Buffer + + r := bytes.NewReader(data) + + head, err := readHeader(r, make([]byte, 9)) + if err != nil { + return 0 + } + + framer := newFramer(r, &bw, nil, byte(head.version)) + err = framer.readFrame(&head) + if err != nil { + return 0 + } + + frame, err := framer.parseFrame() + if err != nil { + return 0 + } + + if frame != nil { + return 1 + } + + return 2 +} diff --git a/vendor/github.com/gocql/gocql/helpers.go b/vendor/github.com/gocql/gocql/helpers.go new file mode 100644 index 0000000..00f3397 --- /dev/null +++ b/vendor/github.com/gocql/gocql/helpers.go @@ -0,0 +1,448 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "fmt" + "math/big" + "net" + "reflect" + "strings" + "time" + + "gopkg.in/inf.v0" +) + +type RowData struct { + Columns []string + Values []interface{} +} + +func goType(t TypeInfo) (reflect.Type, error) { + switch t.Type() { + case TypeVarchar, TypeAscii, TypeInet, TypeText: + return reflect.TypeOf(*new(string)), nil + case TypeBigInt, TypeCounter: + return reflect.TypeOf(*new(int64)), nil + case TypeTime: + return reflect.TypeOf(*new(time.Duration)), nil + case TypeTimestamp: + return reflect.TypeOf(*new(time.Time)), nil + case TypeBlob: + return reflect.TypeOf(*new([]byte)), nil + case TypeBoolean: + return reflect.TypeOf(*new(bool)), nil + case TypeFloat: + return reflect.TypeOf(*new(float32)), nil + case TypeDouble: + return reflect.TypeOf(*new(float64)), nil + case TypeInt: + return reflect.TypeOf(*new(int)), nil + case TypeSmallInt: + return reflect.TypeOf(*new(int16)), nil + case TypeTinyInt: + return reflect.TypeOf(*new(int8)), nil + case TypeDecimal: + return reflect.TypeOf(*new(*inf.Dec)), nil + case TypeUUID, TypeTimeUUID: + return reflect.TypeOf(*new(UUID)), nil + case TypeList, TypeSet: + elemType, err := goType(t.(CollectionType).Elem) + if err != nil { + return nil, err + } + return reflect.SliceOf(elemType), nil + case TypeMap: + keyType, err := goType(t.(CollectionType).Key) + if err != nil { + return nil, err + } + valueType, err := goType(t.(CollectionType).Elem) + if err != nil { + return nil, err + } + return reflect.MapOf(keyType, valueType), nil + case TypeVarint: + return reflect.TypeOf(*new(*big.Int)), nil + case TypeTuple: + // what can we do here? 
all there is to do is to make a list of interface{} + tuple := t.(TupleTypeInfo) + return reflect.TypeOf(make([]interface{}, len(tuple.Elems))), nil + case TypeUDT: + return reflect.TypeOf(make(map[string]interface{})), nil + case TypeDate: + return reflect.TypeOf(*new(time.Time)), nil + case TypeDuration: + return reflect.TypeOf(*new(Duration)), nil + default: + return nil, fmt.Errorf("cannot create Go type for unknown CQL type %s", t) + } +} + +func dereference(i interface{}) interface{} { + return reflect.Indirect(reflect.ValueOf(i)).Interface() +} + +func getCassandraBaseType(name string) Type { + switch name { + case "ascii": + return TypeAscii + case "bigint": + return TypeBigInt + case "blob": + return TypeBlob + case "boolean": + return TypeBoolean + case "counter": + return TypeCounter + case "date": + return TypeDate + case "decimal": + return TypeDecimal + case "double": + return TypeDouble + case "duration": + return TypeDuration + case "float": + return TypeFloat + case "int": + return TypeInt + case "smallint": + return TypeSmallInt + case "tinyint": + return TypeTinyInt + case "time": + return TypeTime + case "timestamp": + return TypeTimestamp + case "uuid": + return TypeUUID + case "varchar": + return TypeVarchar + case "text": + return TypeText + case "varint": + return TypeVarint + case "timeuuid": + return TypeTimeUUID + case "inet": + return TypeInet + case "MapType": + return TypeMap + case "ListType": + return TypeList + case "SetType": + return TypeSet + case "TupleType": + return TypeTuple + default: + return TypeCustom + } +} + +func getCassandraType(name string, logger StdLogger) TypeInfo { + if strings.HasPrefix(name, "frozen<") { + return getCassandraType(strings.TrimPrefix(name[:len(name)-1], "frozen<"), logger) + } else if strings.HasPrefix(name, "set<") { + return CollectionType{ + NativeType: NativeType{typ: TypeSet}, + Elem: getCassandraType(strings.TrimPrefix(name[:len(name)-1], "set<"), logger), + } + } else if strings.HasPrefix(name, "list<") { + return CollectionType{ + NativeType: NativeType{typ: TypeList}, + Elem: getCassandraType(strings.TrimPrefix(name[:len(name)-1], "list<"), logger), + } + } else if strings.HasPrefix(name, "map<") { + names := splitCompositeTypes(strings.TrimPrefix(name[:len(name)-1], "map<")) + if len(names) != 2 { + logger.Printf("Error parsing map type, it has %d subelements, expecting 2\n", len(names)) + return NativeType{ + typ: TypeCustom, + } + } + return CollectionType{ + NativeType: NativeType{typ: TypeMap}, + Key: getCassandraType(names[0], logger), + Elem: getCassandraType(names[1], logger), + } + } else if strings.HasPrefix(name, "tuple<") { + names := splitCompositeTypes(strings.TrimPrefix(name[:len(name)-1], "tuple<")) + types := make([]TypeInfo, len(names)) + + for i, name := range names { + types[i] = getCassandraType(name, logger) + } + + return TupleTypeInfo{ + NativeType: NativeType{typ: TypeTuple}, + Elems: types, + } + } else { + return NativeType{ + typ: getCassandraBaseType(name), + } + } +} + +func splitCompositeTypes(name string) []string { + if !strings.Contains(name, "<") { + return strings.Split(name, ", ") + } + var parts []string + lessCount := 0 + segment := "" + for _, char := range name { + if char == ',' && lessCount == 0 { + if segment != "" { + parts = append(parts, strings.TrimSpace(segment)) + } + segment = "" + continue + } + segment += string(char) + if char == '<' { + lessCount++ + } else if char == '>' { + lessCount-- + } + } + if segment != "" { + parts = append(parts, 
strings.TrimSpace(segment)) + } + return parts +} + +func apacheToCassandraType(t string) string { + t = strings.Replace(t, apacheCassandraTypePrefix, "", -1) + t = strings.Replace(t, "(", "<", -1) + t = strings.Replace(t, ")", ">", -1) + types := strings.FieldsFunc(t, func(r rune) bool { + return r == '<' || r == '>' || r == ',' + }) + for _, typ := range types { + t = strings.Replace(t, typ, getApacheCassandraType(typ).String(), -1) + } + // This is done so it exactly matches what Cassandra returns + return strings.Replace(t, ",", ", ", -1) +} + +func getApacheCassandraType(class string) Type { + switch strings.TrimPrefix(class, apacheCassandraTypePrefix) { + case "AsciiType": + return TypeAscii + case "LongType": + return TypeBigInt + case "BytesType": + return TypeBlob + case "BooleanType": + return TypeBoolean + case "CounterColumnType": + return TypeCounter + case "DecimalType": + return TypeDecimal + case "DoubleType": + return TypeDouble + case "FloatType": + return TypeFloat + case "Int32Type": + return TypeInt + case "ShortType": + return TypeSmallInt + case "ByteType": + return TypeTinyInt + case "TimeType": + return TypeTime + case "DateType", "TimestampType": + return TypeTimestamp + case "UUIDType", "LexicalUUIDType": + return TypeUUID + case "UTF8Type": + return TypeVarchar + case "IntegerType": + return TypeVarint + case "TimeUUIDType": + return TypeTimeUUID + case "InetAddressType": + return TypeInet + case "MapType": + return TypeMap + case "ListType": + return TypeList + case "SetType": + return TypeSet + case "TupleType": + return TypeTuple + case "DurationType": + return TypeDuration + default: + return TypeCustom + } +} + +func (r *RowData) rowMap(m map[string]interface{}) { + for i, column := range r.Columns { + val := dereference(r.Values[i]) + if valVal := reflect.ValueOf(val); valVal.Kind() == reflect.Slice { + valCopy := reflect.MakeSlice(valVal.Type(), valVal.Len(), valVal.Cap()) + reflect.Copy(valCopy, valVal) + m[column] = valCopy.Interface() + } else { + m[column] = val + } + } +} + +// TupeColumnName will return the column name of a tuple value in a column named +// c at index n. It should be used if a specific element within a tuple is needed +// to be extracted from a map returned from SliceMap or MapScan. +func TupleColumnName(c string, n int) string { + return fmt.Sprintf("%s[%d]", c, n) +} + +func (iter *Iter) RowData() (RowData, error) { + if iter.err != nil { + return RowData{}, iter.err + } + + columns := make([]string, 0, len(iter.Columns())) + values := make([]interface{}, 0, len(iter.Columns())) + + for _, column := range iter.Columns() { + if c, ok := column.TypeInfo.(TupleTypeInfo); !ok { + val, err := column.TypeInfo.NewWithError() + if err != nil { + return RowData{}, err + } + columns = append(columns, column.Name) + values = append(values, val) + } else { + for i, elem := range c.Elems { + columns = append(columns, TupleColumnName(column.Name, i)) + val, err := elem.NewWithError() + if err != nil { + return RowData{}, err + } + values = append(values, val) + } + } + } + + rowData := RowData{ + Columns: columns, + Values: values, + } + + return rowData, nil +} + +// TODO(zariel): is it worth exporting this? +func (iter *Iter) rowMap() (map[string]interface{}, error) { + if iter.err != nil { + return nil, iter.err + } + + rowData, _ := iter.RowData() + iter.Scan(rowData.Values...) 
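+ // Scan decoded one row into the typed placeholders allocated by
+ // RowData; they are then copied into a map keyed by column name.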
+ m := make(map[string]interface{}, len(rowData.Columns)) + rowData.rowMap(m) + return m, nil +} + +// SliceMap is a helper function to make the API easier to use +// returns the data from the query in the form of []map[string]interface{} +func (iter *Iter) SliceMap() ([]map[string]interface{}, error) { + if iter.err != nil { + return nil, iter.err + } + + // Not checking for the error because we just did + rowData, _ := iter.RowData() + dataToReturn := make([]map[string]interface{}, 0) + for iter.Scan(rowData.Values...) { + m := make(map[string]interface{}, len(rowData.Columns)) + rowData.rowMap(m) + dataToReturn = append(dataToReturn, m) + } + if iter.err != nil { + return nil, iter.err + } + return dataToReturn, nil +} + +// MapScan takes a map[string]interface{} and populates it with a row +// that is returned from cassandra. +// +// Each call to MapScan() must be called with a new map object. +// During the call to MapScan() any pointers in the existing map +// are replaced with non pointer types before the call returns +// +// iter := session.Query(`SELECT * FROM mytable`).Iter() +// for { +// // New map each iteration +// row := make(map[string]interface{}) +// if !iter.MapScan(row) { +// break +// } +// // Do things with row +// if fullname, ok := row["fullname"]; ok { +// fmt.Printf("Full Name: %s\n", fullname) +// } +// } +// +// You can also pass pointers in the map before each call +// +// var fullName FullName // Implements gocql.Unmarshaler and gocql.Marshaler interfaces +// var address net.IP +// var age int +// iter := session.Query(`SELECT * FROM scan_map_table`).Iter() +// for { +// // New map each iteration +// row := map[string]interface{}{ +// "fullname": &fullName, +// "age": &age, +// "address": &address, +// } +// if !iter.MapScan(row) { +// break +// } +// fmt.Printf("First: %s Age: %d Address: %q\n", fullName.FirstName, age, address) +// } +func (iter *Iter) MapScan(m map[string]interface{}) bool { + if iter.err != nil { + return false + } + + // Not checking for the error because we just did + rowData, _ := iter.RowData() + + for i, col := range rowData.Columns { + if dest, ok := m[col]; ok { + rowData.Values[i] = dest + } + } + + if iter.Scan(rowData.Values...) 
{ + rowData.rowMap(m) + return true + } + return false +} + +func copyBytes(p []byte) []byte { + b := make([]byte, len(p)) + copy(b, p) + return b +} + +var failDNS = false + +func LookupIP(host string) ([]net.IP, error) { + if failDNS { + return nil, &net.DNSError{} + } + return net.LookupIP(host) + +} diff --git a/vendor/github.com/gocql/gocql/host_source.go b/vendor/github.com/gocql/gocql/host_source.go new file mode 100644 index 0000000..1d4dfd1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/host_source.go @@ -0,0 +1,722 @@ +package gocql + +import ( + "errors" + "fmt" + "net" + "strconv" + "strings" + "sync" + "time" +) + +var ErrCannotFindHost = errors.New("cannot find host") +var ErrHostAlreadyExists = errors.New("host already exists") + +type nodeState int32 + +func (n nodeState) String() string { + if n == NodeUp { + return "UP" + } else if n == NodeDown { + return "DOWN" + } + return fmt.Sprintf("UNKNOWN_%d", n) +} + +const ( + NodeUp nodeState = iota + NodeDown +) + +type cassVersion struct { + Major, Minor, Patch int +} + +func (c *cassVersion) Set(v string) error { + if v == "" { + return nil + } + + return c.UnmarshalCQL(nil, []byte(v)) +} + +func (c *cassVersion) UnmarshalCQL(info TypeInfo, data []byte) error { + return c.unmarshal(data) +} + +func (c *cassVersion) unmarshal(data []byte) error { + version := strings.TrimSuffix(string(data), "-SNAPSHOT") + version = strings.TrimPrefix(version, "v") + v := strings.Split(version, ".") + + if len(v) < 2 { + return fmt.Errorf("invalid version string: %s", data) + } + + var err error + c.Major, err = strconv.Atoi(v[0]) + if err != nil { + return fmt.Errorf("invalid major version %v: %v", v[0], err) + } + + c.Minor, err = strconv.Atoi(v[1]) + if err != nil { + return fmt.Errorf("invalid minor version %v: %v", v[1], err) + } + + if len(v) > 2 { + c.Patch, err = strconv.Atoi(v[2]) + if err != nil { + return fmt.Errorf("invalid patch version %v: %v", v[2], err) + } + } + + return nil +} + +func (c cassVersion) Before(major, minor, patch int) bool { + // We're comparing us (cassVersion) with the provided version (major, minor, patch) + // We return true if our version is lower (comes before) than the provided one. + if c.Major < major { + return true + } else if c.Major == major { + if c.Minor < minor { + return true + } else if c.Minor == minor && c.Patch < patch { + return true + } + + } + return false +} + +func (c cassVersion) AtLeast(major, minor, patch int) bool { + return !c.Before(major, minor, patch) +} + +func (c cassVersion) String() string { + return fmt.Sprintf("v%d.%d.%d", c.Major, c.Minor, c.Patch) +} + +func (c cassVersion) nodeUpDelay() time.Duration { + if c.Major >= 2 && c.Minor >= 2 { + // CASSANDRA-8236 + return 0 + } + + return 10 * time.Second +} + +type HostInfo struct { + // TODO(zariel): reduce locking maybe, not all values will change, but to ensure + // that we are thread safe use a mutex to access all fields. 
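+ // mu guards every field below; always go through the accessor methods
+ // so reads and writes take the lock.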
+ mu sync.RWMutex + hostname string + peer net.IP + broadcastAddress net.IP + listenAddress net.IP + rpcAddress net.IP + preferredIP net.IP + connectAddress net.IP + untranslatedConnectAddress net.IP + port int + dataCenter string + rack string + hostId string + workload string + graph bool + dseVersion string + partitioner string + clusterName string + version cassVersion + state nodeState + schemaVersion string + tokens []string + + scyllaShardAwarePort uint16 + scyllaShardAwarePortTLS uint16 +} + +func (h *HostInfo) Equal(host *HostInfo) bool { + if h == host { + // prevent rlock reentry + return true + } + + return h.HostID() == host.HostID() && h.ConnectAddressAndPort() == host.ConnectAddressAndPort() +} + +func (h *HostInfo) Peer() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + return h.peer +} + +func (h *HostInfo) invalidConnectAddr() bool { + h.mu.RLock() + defer h.mu.RUnlock() + addr, _ := h.connectAddressLocked() + return !validIpAddr(addr) +} + +func validIpAddr(addr net.IP) bool { + return addr != nil && !addr.IsUnspecified() +} + +func (h *HostInfo) connectAddressLocked() (net.IP, string) { + if validIpAddr(h.connectAddress) { + return h.connectAddress, "connect_address" + } else if validIpAddr(h.rpcAddress) { + return h.rpcAddress, "rpc_adress" + } else if validIpAddr(h.preferredIP) { + // where does perferred_ip get set? + return h.preferredIP, "preferred_ip" + } else if validIpAddr(h.broadcastAddress) { + return h.broadcastAddress, "broadcast_address" + } else if validIpAddr(h.peer) { + return h.peer, "peer" + } + return net.IPv4zero, "invalid" +} + +// nodeToNodeAddress returns address broadcasted between node to nodes. +// It's either `broadcast_address` if host info is read from system.local or `peer` if read from system.peers. +// This IP address is also part of CQL Event emitted on topology/status changes, +// but does not uniquely identify the node in case multiple nodes use the same IP address. +func (h *HostInfo) nodeToNodeAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + + if validIpAddr(h.broadcastAddress) { + return h.broadcastAddress + } else if validIpAddr(h.peer) { + return h.peer + } + return net.IPv4zero +} + +// Returns the address that should be used to connect to the host. +// If you wish to override this, use an AddressTranslator or +// use a HostFilter to SetConnectAddress() +func (h *HostInfo) ConnectAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + + if addr, _ := h.connectAddressLocked(); validIpAddr(addr) { + return addr + } + panic(fmt.Sprintf("no valid connect address for host: %v. Is your cluster configured correctly?", h)) +} + +func (h *HostInfo) UntranslatedConnectAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + + if len(h.untranslatedConnectAddress) != 0 { + return h.untranslatedConnectAddress + } + + if addr, _ := h.connectAddressLocked(); validIpAddr(addr) { + return addr + } + panic(fmt.Sprintf("no valid connect address for host: %v. Is your cluster configured correctly?", h)) +} + +func (h *HostInfo) SetConnectAddress(address net.IP) *HostInfo { + // TODO(zariel): should this not be exported? 
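+ // Take the write lock so concurrent readers never observe a partially
+ // updated connect address.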
+ h.mu.Lock() + defer h.mu.Unlock() + h.connectAddress = address + return h +} + +func (h *HostInfo) BroadcastAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + return h.broadcastAddress +} + +func (h *HostInfo) ListenAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + return h.listenAddress +} + +func (h *HostInfo) RPCAddress() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + return h.rpcAddress +} + +func (h *HostInfo) PreferredIP() net.IP { + h.mu.RLock() + defer h.mu.RUnlock() + return h.preferredIP +} + +func (h *HostInfo) DataCenter() string { + h.mu.RLock() + dc := h.dataCenter + h.mu.RUnlock() + return dc +} + +func (h *HostInfo) Rack() string { + h.mu.RLock() + rack := h.rack + h.mu.RUnlock() + return rack +} + +func (h *HostInfo) HostID() string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.hostId +} + +func (h *HostInfo) SetHostID(hostID string) { + h.mu.Lock() + defer h.mu.Unlock() + h.hostId = hostID +} + +func (h *HostInfo) WorkLoad() string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.workload +} + +func (h *HostInfo) Graph() bool { + h.mu.RLock() + defer h.mu.RUnlock() + return h.graph +} + +func (h *HostInfo) DSEVersion() string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.dseVersion +} + +func (h *HostInfo) Partitioner() string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.partitioner +} + +func (h *HostInfo) ClusterName() string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.clusterName +} + +func (h *HostInfo) Version() cassVersion { + h.mu.RLock() + defer h.mu.RUnlock() + return h.version +} + +func (h *HostInfo) State() nodeState { + h.mu.RLock() + defer h.mu.RUnlock() + return h.state +} + +func (h *HostInfo) setState(state nodeState) *HostInfo { + h.mu.Lock() + defer h.mu.Unlock() + h.state = state + return h +} + +func (h *HostInfo) Tokens() []string { + h.mu.RLock() + defer h.mu.RUnlock() + return h.tokens +} + +func (h *HostInfo) Port() int { + h.mu.RLock() + defer h.mu.RUnlock() + return h.port +} + +func (h *HostInfo) update(from *HostInfo) { + if h == from { + return + } + + h.mu.Lock() + defer h.mu.Unlock() + + from.mu.RLock() + defer from.mu.RUnlock() + + // autogenerated do not update + if h.peer == nil { + h.peer = from.peer + } + if h.broadcastAddress == nil { + h.broadcastAddress = from.broadcastAddress + } + if h.listenAddress == nil { + h.listenAddress = from.listenAddress + } + if h.rpcAddress == nil { + h.rpcAddress = from.rpcAddress + } + if h.preferredIP == nil { + h.preferredIP = from.preferredIP + } + if h.connectAddress == nil { + h.connectAddress = from.connectAddress + } + if h.port == 0 { + h.port = from.port + } + if h.dataCenter == "" { + h.dataCenter = from.dataCenter + } + if h.rack == "" { + h.rack = from.rack + } + if h.hostId == "" { + h.hostId = from.hostId + } + if h.workload == "" { + h.workload = from.workload + } + if h.dseVersion == "" { + h.dseVersion = from.dseVersion + } + if h.partitioner == "" { + h.partitioner = from.partitioner + } + if h.clusterName == "" { + h.clusterName = from.clusterName + } + if h.version == (cassVersion{}) { + h.version = from.version + } + if h.tokens == nil { + h.tokens = from.tokens + } +} + +func (h *HostInfo) IsUp() bool { + return h != nil && h.State() == NodeUp +} + +func (h *HostInfo) IsBusy(s *Session) bool { + pool, ok := s.pool.getPool(h) + return ok && h != nil && pool.InFlight() >= MAX_IN_FLIGHT_THRESHOLD +} + +func (h *HostInfo) HostnameAndPort() string { + // Fast path: in most cases hostname is not empty + var ( + hostname string + port int + 
) + h.mu.RLock() + hostname = h.hostname + port = h.port + h.mu.RUnlock() + + if hostname != "" { + return net.JoinHostPort(hostname, strconv.Itoa(port)) + } + + // Slow path: hostname is empty + h.mu.Lock() + defer h.mu.Unlock() + if h.hostname == "" { // recheck is hostname empty + // if yes - fill it + addr, _ := h.connectAddressLocked() + h.hostname = addr.String() + } + return net.JoinHostPort(h.hostname, strconv.Itoa(h.port)) +} + +func (h *HostInfo) Hostname() string { + // Fast path: in most cases hostname is not empty + var hostname string + h.mu.RLock() + hostname = h.hostname + h.mu.RUnlock() + + if hostname != "" { + return hostname + } + + // Slow path: hostname is empty + h.mu.Lock() + defer h.mu.Unlock() + if h.hostname == "" { + addr, _ := h.connectAddressLocked() + h.hostname = addr.String() + } + return h.hostname +} + +func (h *HostInfo) ConnectAddressAndPort() string { + h.mu.Lock() + defer h.mu.Unlock() + addr, _ := h.connectAddressLocked() + return net.JoinHostPort(addr.String(), strconv.Itoa(h.port)) +} + +func (h *HostInfo) String() string { + h.mu.RLock() + defer h.mu.RUnlock() + + connectAddr, source := h.connectAddressLocked() + return fmt.Sprintf("[HostInfo hostname=%q connectAddress=%q peer=%q rpc_address=%q broadcast_address=%q "+ + "preferred_ip=%q connect_addr=%q connect_addr_source=%q "+ + "port=%d data_centre=%q rack=%q host_id=%q version=%q state=%s num_tokens=%d]", + h.hostname, h.connectAddress, h.peer, h.rpcAddress, h.broadcastAddress, h.preferredIP, + connectAddr, source, + h.port, h.dataCenter, h.rack, h.hostId, h.version, h.state, len(h.tokens)) +} + +func (h *HostInfo) setScyllaSupported(s scyllaSupported) { + h.mu.Lock() + defer h.mu.Unlock() + h.scyllaShardAwarePort = s.shardAwarePort + h.scyllaShardAwarePortTLS = s.shardAwarePortSSL +} + +// ScyllaShardAwarePort returns the shard aware port of this host. +// Returns zero if the shard aware port is not known. +func (h *HostInfo) ScyllaShardAwarePort() uint16 { + h.mu.RLock() + defer h.mu.RUnlock() + return h.scyllaShardAwarePort +} + +// ScyllaShardAwarePortTLS returns the TLS-enabled shard aware port of this host. +// Returns zero if the shard aware port is not known. 
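+// Both shard aware ports are populated from the node's SUPPORTED
+// options via setScyllaSupported.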
+func (h *HostInfo) ScyllaShardAwarePortTLS() uint16 { + h.mu.RLock() + defer h.mu.RUnlock() + return h.scyllaShardAwarePortTLS +} + +// Returns true if we are using system_schema.keyspaces instead of system.schema_keyspaces +func checkSystemSchema(control controlConnection) (bool, error) { + iter := control.query("SELECT * FROM system_schema.keyspaces" + control.getSession().usingTimeoutClause) + if err := iter.err; err != nil { + if errf, ok := err.(*errorFrame); ok { + if errf.code == ErrCodeSyntax { + return false, nil + } + } + + return false, err + } + + return true, nil +} + +// Given a map that represents a row from either system.local or system.peers +// return as much information as we can in *HostInfo +func hostInfoFromMap(row map[string]interface{}, host *HostInfo, translateAddressPort func(addr net.IP, port int) (net.IP, int)) (*HostInfo, error) { + const assertErrorMsg = "Assertion failed for %s" + var ok bool + + // Default to our connected port if the cluster doesn't have port information + for key, value := range row { + switch key { + case "data_center": + host.dataCenter, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "data_center") + } + case "rack": + host.rack, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "rack") + } + case "host_id": + hostId, ok := value.(UUID) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "host_id") + } + host.hostId = hostId.String() + case "release_version": + version, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "release_version") + } + host.version.Set(version) + case "peer": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "peer") + } + host.peer = net.ParseIP(ip) + case "cluster_name": + host.clusterName, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "cluster_name") + } + case "partitioner": + host.partitioner, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "partitioner") + } + case "broadcast_address": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "broadcast_address") + } + host.broadcastAddress = net.ParseIP(ip) + case "preferred_ip": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "preferred_ip") + } + host.preferredIP = net.ParseIP(ip) + case "rpc_address": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "rpc_address") + } + host.rpcAddress = net.ParseIP(ip) + case "native_address": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "native_address") + } + host.rpcAddress = net.ParseIP(ip) + case "listen_address": + ip, ok := value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "listen_address") + } + host.listenAddress = net.ParseIP(ip) + case "native_port": + native_port, ok := value.(int) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "native_port") + } + host.port = native_port + case "workload": + host.workload, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "workload") + } + case "graph": + host.graph, ok = value.(bool) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "graph") + } + case "tokens": + host.tokens, ok = value.([]string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "tokens") + } + case "dse_version": + host.dseVersion, ok = value.(string) + if !ok { + return nil, fmt.Errorf(assertErrorMsg, "dse_version") + } + case "schema_version": + schemaVersion, ok := value.(UUID) + if !ok { + 
return nil, fmt.Errorf(assertErrorMsg, "schema_version") + } + host.schemaVersion = schemaVersion.String() + } + // TODO(thrawn01): Add 'port'? once CASSANDRA-7544 is complete + // Not sure what the port field will be called until the JIRA issue is complete + } + + host.untranslatedConnectAddress = host.ConnectAddress() + ip, port := translateAddressPort(host.untranslatedConnectAddress, host.port) + host.connectAddress = ip + host.port = port + + return host, nil +} + +func hostInfoFromIter(iter *Iter, connectAddress net.IP, defaultPort int, translateAddressPort func(addr net.IP, port int) (net.IP, int)) (*HostInfo, error) { + rows, err := iter.SliceMap() + if err != nil { + // TODO(zariel): make typed error + return nil, err + } + + if len(rows) == 0 { + return nil, errors.New("query returned 0 rows") + } + + host, err := hostInfoFromMap(rows[0], &HostInfo{connectAddress: connectAddress, port: defaultPort}, translateAddressPort) + if err != nil { + return nil, err + } + return host, nil +} + +// debounceRingRefresh submits a ring refresh request to the ring refresh debouncer. +func (s *Session) debounceRingRefresh() { + s.ringRefresher.Debounce() +} + +// refreshRing executes a ring refresh immediately and cancels pending debounce ring refresh requests. +func (s *Session) refreshRingNow() error { + err, ok := <-s.ringRefresher.RefreshNow() + if !ok { + return errors.New("could not refresh ring because stop was requested") + } + + return err +} + +func (s *Session) refreshRing() error { + hosts, partitioner, err := s.hostSource.GetHostsFromSystem() + if err != nil { + return err + } + prevHosts := s.hostSource.getHostsMap() + + for _, h := range hosts { + if s.cfg.filterHost(h) { + continue + } + + if host, ok := s.hostSource.addHostIfMissing(h); !ok { + s.startPoolFill(h) + } else { + // host (by hostID) already exists; determine if IP has changed + newHostID := h.HostID() + existing, ok := prevHosts[newHostID] + if !ok { + return fmt.Errorf("get existing host=%s from prevHosts: %w", h, ErrCannotFindHost) + } + if h.connectAddress.Equal(existing.connectAddress) && h.nodeToNodeAddress().Equal(existing.nodeToNodeAddress()) { + // no host IP change + host.update(h) + } else { + // host IP has changed + // remove old HostInfo (w/old IP) + s.removeHost(existing) + if _, alreadyExists := s.hostSource.addHostIfMissing(h); alreadyExists { + return fmt.Errorf("add new host=%s after removal: %w", h, ErrHostAlreadyExists) + } + // add new HostInfo (same hostID, new IP) + s.startPoolFill(h) + } + } + delete(prevHosts, h.HostID()) + } + + for _, host := range prevHosts { + s.metadataDescriber.removeTabletsWithHost(host) + s.removeHost(host) + } + s.policy.SetPartitioner(partitioner) + + return nil +} diff --git a/vendor/github.com/gocql/gocql/host_source_gen.go b/vendor/github.com/gocql/gocql/host_source_gen.go new file mode 100644 index 0000000..8c096ff --- /dev/null +++ b/vendor/github.com/gocql/gocql/host_source_gen.go @@ -0,0 +1,46 @@ +//go:build genhostinfo +// +build genhostinfo + +package main + +import ( + "fmt" + "reflect" + "sync" + + "github.com/gocql/gocql" +) + +func gen(clause, field string) { + fmt.Printf("if h.%s == %s {\n", field, clause) + fmt.Printf("\th.%s = from.%s\n", field, field) + fmt.Println("}") +} + +func main() { + t := reflect.ValueOf(&gocql.HostInfo{}).Elem().Type() + mu := reflect.TypeOf(sync.RWMutex{}) + + for i := 0; i < t.NumField(); i++ { + f := t.Field(i) + if f.Type == mu { + continue + } + + switch f.Type.Kind() { + case reflect.Slice: + gen("nil", f.Name) + case 
reflect.String: + gen(`""`, f.Name) + case reflect.Int: + gen("0", f.Name) + case reflect.Struct: + gen("("+f.Type.Name()+"{})", f.Name) + case reflect.Bool, reflect.Int32: + continue + default: + panic(fmt.Sprintf("unknown field: %s", f)) + } + } + +} diff --git a/vendor/github.com/gocql/gocql/host_source_scylla.go b/vendor/github.com/gocql/gocql/host_source_scylla.go new file mode 100644 index 0000000..be49315 --- /dev/null +++ b/vendor/github.com/gocql/gocql/host_source_scylla.go @@ -0,0 +1,7 @@ +package gocql + +func (h *HostInfo) SetDatacenter(dc string) { + h.mu.Lock() + defer h.mu.Unlock() + h.dataCenter = dc +} diff --git a/vendor/github.com/gocql/gocql/install_test_deps.sh b/vendor/github.com/gocql/gocql/install_test_deps.sh new file mode 100644 index 0000000..e69de29 diff --git a/vendor/github.com/gocql/gocql/integration.sh b/vendor/github.com/gocql/gocql/integration.sh new file mode 100644 index 0000000..07d67f6 --- /dev/null +++ b/vendor/github.com/gocql/gocql/integration.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Copyright (C) 2017 ScyllaDB +# + +readonly SCYLLA_IMAGE=${SCYLLA_IMAGE} + +set -eu -o pipefail + +function scylla_up() { + local -r exec="docker compose exec -T" + + echo "==> Running Scylla ${SCYLLA_IMAGE}" + docker pull ${SCYLLA_IMAGE} + docker compose up -d --wait || ( docker compose ps --format json | jq -M 'select(.Health == "unhealthy") | .Service' | xargs docker compose logs; exit 1 ) +} + +function scylla_down() { + echo "==> Stopping Scylla" + docker compose down +} + +function scylla_restart() { + scylla_down + scylla_up +} + +scylla_restart + +sudo chmod 0777 /tmp/scylla/cql.m + +readonly clusterSize=1 +readonly multiNodeClusterSize=3 +readonly scylla_liveset="192.168.100.11" +readonly scylla_tablet_liveset="192.168.100.12" +readonly cversion="3.11.4" +readonly proto=4 +readonly args="-gocql.timeout=60s -proto=${proto} -rf=${clusterSize} -clusterSize=${clusterSize} -autowait=2000ms -compressor=snappy -gocql.cversion=${cversion} -cluster=${scylla_liveset}" +readonly tabletArgs="-gocql.timeout=60s -proto=${proto} -rf=1 -clusterSize=${multiNodeClusterSize} -autowait=2000ms -compressor=snappy -gocql.cversion=${cversion} -multiCluster=${scylla_tablet_liveset}" + +if [[ "$*" == *"tablet"* ]]; +then + echo "==> Running tablet tests with args: ${tabletArgs}" + go test -timeout=5m -race -tags="tablet" ${tabletArgs} ./... +fi + +TAGS=$* +TAGS=${TAGS//"tablet"/} + +if [ ! -z "$TAGS" ]; +then + echo "==> Running ${TAGS} tests with args: ${args}" + go test -timeout=5m -race -tags="$TAGS" ${args} ./... +fi diff --git a/vendor/github.com/gocql/gocql/internal/lru/lru.go b/vendor/github.com/gocql/gocql/internal/lru/lru.go new file mode 100644 index 0000000..14ca1f4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/internal/lru/lru.go @@ -0,0 +1,127 @@ +/* +Copyright 2015 To gocql authors +Copyright 2013 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package lru implements an LRU cache. +package lru + +import "container/list" + +// Cache is an LRU cache. 
It is not safe for concurrent access. +// +// This cache has been forked from github.com/golang/groupcache/lru, but +// specialized with string keys to avoid the allocations caused by wrapping them +// in interface{}. +type Cache struct { + // MaxEntries is the maximum number of cache entries before + // an item is evicted. Zero means no limit. + MaxEntries int + + // OnEvicted optionally specifies a callback function to be + // executed when an entry is purged from the cache. + OnEvicted func(key string, value interface{}) + + ll *list.List + cache map[string]*list.Element +} + +type entry struct { + key string + value interface{} +} + +// New creates a new Cache. +// If maxEntries is zero, the cache has no limit and it's assumed +// that eviction is done by the caller. +func New(maxEntries int) *Cache { + return &Cache{ + MaxEntries: maxEntries, + ll: list.New(), + cache: make(map[string]*list.Element), + } +} + +// Add adds a value to the cache. +func (c *Cache) Add(key string, value interface{}) { + if c.cache == nil { + c.cache = make(map[string]*list.Element) + c.ll = list.New() + } + if ee, ok := c.cache[key]; ok { + c.ll.MoveToFront(ee) + ee.Value.(*entry).value = value + return + } + ele := c.ll.PushFront(&entry{key, value}) + c.cache[key] = ele + if c.MaxEntries != 0 && c.ll.Len() > c.MaxEntries { + c.RemoveOldest() + } +} + +// Get looks up a key's value from the cache. +func (c *Cache) Get(key string) (value interface{}, ok bool) { + if c.cache == nil { + return + } + if ele, hit := c.cache[key]; hit { + c.ll.MoveToFront(ele) + return ele.Value.(*entry).value, true + } + return +} + +// Remove removes the provided key from the cache. +func (c *Cache) Remove(key string) bool { + if c.cache == nil { + return false + } + + if ele, hit := c.cache[key]; hit { + c.removeElement(ele) + return true + } + + return false +} + +// RemoveOldest removes the oldest item from the cache. +func (c *Cache) RemoveOldest() { + if c.cache == nil { + return + } + ele := c.ll.Back() + if ele != nil { + c.removeElement(ele) + } +} + +func (c *Cache) removeElement(e *list.Element) { + c.ll.Remove(e) + kv := e.Value.(*entry) + delete(c.cache, kv.key) + if c.OnEvicted != nil { + c.OnEvicted(kv.key, kv.value) + } +} + +// Len returns the number of items in the cache. 
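+// It is O(1): container/list tracks its length.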
+func (c *Cache) Len() int { + if c.cache == nil { + return 0 + } + return c.ll.Len() +} diff --git a/vendor/github.com/gocql/gocql/internal/murmur/murmur.go b/vendor/github.com/gocql/gocql/internal/murmur/murmur.go new file mode 100644 index 0000000..d006cc0 --- /dev/null +++ b/vendor/github.com/gocql/gocql/internal/murmur/murmur.go @@ -0,0 +1,135 @@ +package murmur + +const ( + c1 int64 = -8663945395140668459 // 0x87c37b91114253d5 + c2 int64 = 5545529020109919103 // 0x4cf5ad432745937f + fmix1 int64 = -49064778989728563 // 0xff51afd7ed558ccd + fmix2 int64 = -4265267296055464877 // 0xc4ceb9fe1a85ec53 +) + +func fmix(n int64) int64 { + // cast to unsigned for logical right bitshift (to match C* MM3 implementation) + n ^= int64(uint64(n) >> 33) + n *= fmix1 + n ^= int64(uint64(n) >> 33) + n *= fmix2 + n ^= int64(uint64(n) >> 33) + + return n +} + +func block(p byte) int64 { + return int64(int8(p)) +} + +func rotl(x int64, r uint8) int64 { + // cast to unsigned for logical right bitshift (to match C* MM3 implementation) + return (x << r) | (int64)((uint64(x) >> (64 - r))) +} + +func Murmur3H1(data []byte) int64 { + length := len(data) + + var h1, h2, k1, k2 int64 + + // body + nBlocks := length / 16 + for i := 0; i < nBlocks; i++ { + k1, k2 = getBlock(data, i) + + k1 *= c1 + k1 = rotl(k1, 31) + k1 *= c2 + h1 ^= k1 + + h1 = rotl(h1, 27) + h1 += h2 + h1 = h1*5 + 0x52dce729 + + k2 *= c2 + k2 = rotl(k2, 33) + k2 *= c1 + h2 ^= k2 + + h2 = rotl(h2, 31) + h2 += h1 + h2 = h2*5 + 0x38495ab5 + } + + // tail + tail := data[nBlocks*16:] + k1 = 0 + k2 = 0 + switch length & 15 { + case 15: + k2 ^= block(tail[14]) << 48 + fallthrough + case 14: + k2 ^= block(tail[13]) << 40 + fallthrough + case 13: + k2 ^= block(tail[12]) << 32 + fallthrough + case 12: + k2 ^= block(tail[11]) << 24 + fallthrough + case 11: + k2 ^= block(tail[10]) << 16 + fallthrough + case 10: + k2 ^= block(tail[9]) << 8 + fallthrough + case 9: + k2 ^= block(tail[8]) + + k2 *= c2 + k2 = rotl(k2, 33) + k2 *= c1 + h2 ^= k2 + + fallthrough + case 8: + k1 ^= block(tail[7]) << 56 + fallthrough + case 7: + k1 ^= block(tail[6]) << 48 + fallthrough + case 6: + k1 ^= block(tail[5]) << 40 + fallthrough + case 5: + k1 ^= block(tail[4]) << 32 + fallthrough + case 4: + k1 ^= block(tail[3]) << 24 + fallthrough + case 3: + k1 ^= block(tail[2]) << 16 + fallthrough + case 2: + k1 ^= block(tail[1]) << 8 + fallthrough + case 1: + k1 ^= block(tail[0]) + + k1 *= c1 + k1 = rotl(k1, 31) + k1 *= c2 + h1 ^= k1 + } + + h1 ^= int64(length) + h2 ^= int64(length) + + h1 += h2 + h2 += h1 + + h1 = fmix(h1) + h2 = fmix(h2) + + h1 += h2 + // the following is extraneous since h2 is discarded + // h2 += h1 + + return h1 +} diff --git a/vendor/github.com/gocql/gocql/internal/murmur/murmur_appengine.go b/vendor/github.com/gocql/gocql/internal/murmur/murmur_appengine.go new file mode 100644 index 0000000..79a05e2 --- /dev/null +++ b/vendor/github.com/gocql/gocql/internal/murmur/murmur_appengine.go @@ -0,0 +1,12 @@ +//go:build appengine || s390x +// +build appengine s390x + +package murmur + +import "encoding/binary" + +func getBlock(data []byte, n int) (int64, int64) { + k1 := int64(binary.LittleEndian.Uint64(data[n*16:])) + k2 := int64(binary.LittleEndian.Uint64(data[(n*16)+8:])) + return k1, k2 +} diff --git a/vendor/github.com/gocql/gocql/internal/murmur/murmur_unsafe.go b/vendor/github.com/gocql/gocql/internal/murmur/murmur_unsafe.go new file mode 100644 index 0000000..e8b56e5 --- /dev/null +++ b/vendor/github.com/gocql/gocql/internal/murmur/murmur_unsafe.go @@ -0,0 +1,16 
@@ +//go:build !appengine && !s390x +// +build !appengine,!s390x + +package murmur + +import ( + "unsafe" +) + +func getBlock(data []byte, n int) (int64, int64) { + block := (*[2]int64)(unsafe.Pointer(&data[n*16])) + + k1 := block[0] + k2 := block[1] + return k1, k2 +} diff --git a/vendor/github.com/gocql/gocql/internal/streams/streams.go b/vendor/github.com/gocql/gocql/internal/streams/streams.go new file mode 100644 index 0000000..b31a969 --- /dev/null +++ b/vendor/github.com/gocql/gocql/internal/streams/streams.go @@ -0,0 +1,146 @@ +package streams + +import ( + "math" + "strconv" + "sync/atomic" +) + +const bucketBits = 64 + +// IDGenerator tracks and allocates streams which are in use. +type IDGenerator struct { + NumStreams int + inuseStreams int32 + numBuckets uint32 + + // streams is a bitset where each bit represents a stream, a 1 implies in use + streams []uint64 + offset uint32 +} + +func New(protocol int) *IDGenerator { + maxStreams := 128 + if protocol > 2 { + maxStreams = 32768 + } + return NewLimited(maxStreams) +} + +func NewLimited(maxStreams int) *IDGenerator { + // Round up maxStreams to a nearest + // multiple of 64 + maxStreams = ((maxStreams + 63) / 64) * 64 + + buckets := maxStreams / 64 + // reserve stream 0 + streams := make([]uint64, buckets) + streams[0] = 1 << 63 + + return &IDGenerator{ + NumStreams: maxStreams, + streams: streams, + numBuckets: uint32(buckets), + offset: uint32(buckets) - 1, + } +} + +func streamFromBucket(bucket, streamInBucket int) int { + return (bucket * bucketBits) + streamInBucket +} + +func (s *IDGenerator) GetStream() (int, bool) { + // Reduce collisions by offsetting the starting point + offset := atomic.AddUint32(&s.offset, 1) + + for i := uint32(0); i < s.numBuckets; i++ { + pos := int((i + offset) % s.numBuckets) + + bucket := atomic.LoadUint64(&s.streams[pos]) + if bucket == math.MaxUint64 { + // all streams in use + continue + } + + for j := 0; j < bucketBits; j++ { + mask := uint64(1 << streamOffset(j)) + for bucket&mask == 0 { + if atomic.CompareAndSwapUint64(&s.streams[pos], bucket, bucket|mask) { + atomic.AddInt32(&s.inuseStreams, 1) + return streamFromBucket(int(pos), j), true + } + bucket = atomic.LoadUint64(&s.streams[pos]) + } + } + } + + return 0, false +} + +func bitfmt(b uint64) string { + return strconv.FormatUint(b, 16) +} + +// returns the bucket offset of a given stream +func bucketOffset(i int) int { + return i / bucketBits +} + +func streamOffset(stream int) uint64 { + return bucketBits - uint64(stream%bucketBits) - 1 +} + +func isSet(bits uint64, stream int) bool { + return bits>>streamOffset(stream)&1 == 1 +} + +func (s *IDGenerator) isSet(stream int) bool { + bits := atomic.LoadUint64(&s.streams[bucketOffset(stream)]) + return isSet(bits, stream) +} + +func (s *IDGenerator) String() string { + size := s.numBuckets * (bucketBits + 1) + buf := make([]byte, 0, size) + for i := 0; i < int(s.numBuckets); i++ { + bits := atomic.LoadUint64(&s.streams[i]) + buf = append(buf, bitfmt(bits)...) 
+ buf = append(buf, ' ') + } + return string(buf[: size-1 : size-1]) +} + +func (s *IDGenerator) Clear(stream int) (inuse bool) { + offset := bucketOffset(stream) + bucket := atomic.LoadUint64(&s.streams[offset]) + + mask := uint64(1) << streamOffset(stream) + if bucket&mask != mask { + // already cleared + return false + } + + for !atomic.CompareAndSwapUint64(&s.streams[offset], bucket, bucket & ^mask) { + bucket = atomic.LoadUint64(&s.streams[offset]) + if bucket&mask != mask { + // already cleared + return false + } + } + + // TODO: make this account for 0 stream being reserved + if atomic.AddInt32(&s.inuseStreams, -1) < 0 { + // TODO(zariel): remove this + panic("negative streams inuse") + } + + return true +} + +func (s *IDGenerator) Available() int { + return s.NumStreams - int(atomic.LoadInt32(&s.inuseStreams)) - 1 +} + +func (s *IDGenerator) InUse() int { + return int(atomic.LoadInt32(&s.inuseStreams)) +} diff --git a/vendor/github.com/gocql/gocql/logger.go b/vendor/github.com/gocql/gocql/logger.go new file mode 100644 index 0000000..8ff6658 --- /dev/null +++ b/vendor/github.com/gocql/gocql/logger.go @@ -0,0 +1,40 @@ +package gocql + +import ( + "bytes" + "fmt" + "log" +) + +type StdLogger interface { + Print(v ...interface{}) + Printf(format string, v ...interface{}) + Println(v ...interface{}) +} + +type nopLogger struct{} + +func (n nopLogger) Print(_ ...interface{}) {} + +func (n nopLogger) Printf(_ string, _ ...interface{}) {} + +func (n nopLogger) Println(_ ...interface{}) {} + +type testLogger struct { + capture bytes.Buffer +} + +func (l *testLogger) Print(v ...interface{}) { fmt.Fprint(&l.capture, v...) } +func (l *testLogger) Printf(format string, v ...interface{}) { fmt.Fprintf(&l.capture, format, v...) } +func (l *testLogger) Println(v ...interface{}) { fmt.Fprintln(&l.capture, v...) } +func (l *testLogger) String() string { return l.capture.String() } + +type defaultLogger struct{} + +func (l *defaultLogger) Print(v ...interface{}) { log.Print(v...) } +func (l *defaultLogger) Printf(format string, v ...interface{}) { log.Printf(format, v...) } +func (l *defaultLogger) Println(v ...interface{}) { log.Println(v...) } + +// Logger for logging messages. +// Deprecated: Use ClusterConfig.Logger instead. +var Logger StdLogger = &defaultLogger{} diff --git a/vendor/github.com/gocql/gocql/marshal.go b/vendor/github.com/gocql/gocql/marshal.go new file mode 100644 index 0000000..af12ec7 --- /dev/null +++ b/vendor/github.com/gocql/gocql/marshal.go @@ -0,0 +1,1846 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
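A minimal usage sketch for the IDGenerator above (illustrative only: internal/streams is importable solely from within the gocql module, and sendOnStream/errNoStreams are hypothetical names). One stream ID is claimed per in-flight request and released once the matching response frame has been handled:

	package conn_sketch // hypothetical

	import (
		"errors"

		"github.com/gocql/gocql/internal/streams"
	)

	var errNoStreams = errors.New("gocql: no request streams available")

	func sendOnStream(ids *streams.IDGenerator, send func(stream int) error) error {
		stream, ok := ids.GetStream()
		if !ok {
			// every bit in every bucket is set: back off or open another connection
			return errNoStreams
		}
		defer ids.Clear(stream) // flip the bit back so the stream can be reused
		return send(stream)
	}

With streams.New(4) (protocol v3+) this allows up to 32768 concurrent requests per connection, with stream 0 reserved.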
+
+package gocql
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"math"
+	"reflect"
+	"strings"
+	"unsafe"
+
+	"github.com/gocql/gocql/serialization/ascii"
+	"github.com/gocql/gocql/serialization/bigint"
+	"github.com/gocql/gocql/serialization/blob"
+	"github.com/gocql/gocql/serialization/boolean"
+	"github.com/gocql/gocql/serialization/counter"
+	"github.com/gocql/gocql/serialization/cqlint"
+	"github.com/gocql/gocql/serialization/cqltime"
+	"github.com/gocql/gocql/serialization/date"
+	"github.com/gocql/gocql/serialization/decimal"
+	"github.com/gocql/gocql/serialization/double"
+	"github.com/gocql/gocql/serialization/duration"
+	"github.com/gocql/gocql/serialization/float"
+	"github.com/gocql/gocql/serialization/inet"
+	"github.com/gocql/gocql/serialization/smallint"
+	"github.com/gocql/gocql/serialization/text"
+	"github.com/gocql/gocql/serialization/timestamp"
+	"github.com/gocql/gocql/serialization/timeuuid"
+	"github.com/gocql/gocql/serialization/tinyint"
+	"github.com/gocql/gocql/serialization/uuid"
+	"github.com/gocql/gocql/serialization/varchar"
+	"github.com/gocql/gocql/serialization/varint"
+)
+
+var (
+	emptyValue reflect.Value
+)
+
+var (
+	ErrorUDTUnavailable = errors.New("UDT are not available on protocols less than 3, please update config")
+)
+
+// Marshaler is an interface for custom marshaler.
+// Each value of the 'CQL binary protocol' consist of <value_len> and <value_data>.
+// <value_len> can be 'unset'(-2), 'nil'(-1), 'zero'(0) or any value up to 2147483647.
+// When <value_len> is 'unset', 'nil' or 'zero', <value_data> is not present.
+// 'unset' is applicable only to columns, with some exceptions.
+// As you can see from the API, MarshalCQL only returns <value_data>, but there is a way for it to control <value_len>:
+// 1. If MarshalCQL returns (gocql.UnsetValue, nil), gocql writes 'unset' to <value_len>
+// 2. If MarshalCQL returns ([]byte(nil), nil), gocql writes 'nil' to <value_len>
+// 3. If MarshalCQL returns ([]byte{}, nil), gocql writes 'zero' to <value_len>
+//
+// Some CQL databases have proprietary value coding features, which you may want to consider.
+// CQL binary protocol info: https://github.com/apache/cassandra/tree/trunk/doc
+type Marshaler interface {
+	MarshalCQL(info TypeInfo) ([]byte, error)
+}
+
+type DirectMarshal []byte
+
+func (m DirectMarshal) MarshalCQL(_ TypeInfo) ([]byte, error) {
+	return m, nil
+}
+
+// Unmarshaler is an interface for custom unmarshaler.
+// Each value of the 'CQL binary protocol' consist of <value_len> and <value_data>.
+// <value_len> can be 'unset'(-2), 'nil'(-1), 'zero'(0) or any value up to 2147483647.
+// When <value_len> is 'unset', 'nil' or 'zero', <value_data> is not present.
+// As you can see from the API, UnmarshalCQL receives only 'info TypeInfo' and
+// 'data []byte', but gocql has the following way to signal about <value_len>:
+// 1. When <value_len> is 'nil' gocql feeds nil to 'data []byte'
+// 2. When <value_len> is 'zero' gocql feeds []byte{} to 'data []byte'
+//
+// Some CQL databases have proprietary value coding features, which you may want to consider.
+// CQL binary protocol info: https://github.com/apache/cassandra/tree/trunk/doc
+type Unmarshaler interface {
+	UnmarshalCQL(info TypeInfo, data []byte) error
+}
+
+type DirectUnmarshal []byte
+
+func (d *DirectUnmarshal) UnmarshalCQL(_ TypeInfo, data []byte) error {
+	*d = bytes.Clone(data)
+	return nil
+}
+
+// Marshal returns the CQL encoding of the value for the Cassandra
+// internal type described by the info parameter.
+//
+// nil is serialized as CQL null.
+// If value implements Marshaler, its MarshalCQL method is called to marshal the data.
+// If value is a pointer, the pointed-to value is marshaled.
+// +// Supported conversions are as follows, other type combinations may be added in the future: +// +// CQL type | Go type (value) | Note +// varchar, ascii, blob, text | string, []byte | +// boolean | bool | +// tinyint, smallint, int | integer types | +// tinyint, smallint, int | string | formatted as base 10 number +// bigint, counter | integer types | +// bigint, counter | big.Int | +// bigint, counter | string | formatted as base 10 number +// float | float32 | +// double | float64 | +// decimal | inf.Dec | +// time | int64 | nanoseconds since start of day +// time | time.Duration | duration since start of day +// timestamp | int64 | milliseconds since Unix epoch +// timestamp | time.Time | +// list, set | slice, array | +// list, set | map[X]struct{} | +// map | map[X]Y | +// uuid, timeuuid | gocql.UUID | +// uuid, timeuuid | [16]byte | raw UUID bytes +// uuid, timeuuid | []byte | raw UUID bytes, length must be 16 bytes +// uuid, timeuuid | string | hex representation, see ParseUUID +// varint | integer types | +// varint | big.Int | +// varint | string | value of number in decimal notation +// inet | net.IP | +// inet | string | IPv4 or IPv6 address string +// tuple | slice, array | +// tuple | struct | fields are marshaled in order of declaration +// user-defined type | gocql.UDTMarshaler | MarshalUDT is called +// user-defined type | map[string]interface{} | +// user-defined type | struct | struct fields' cql tags are used for column names +// date | int64 | milliseconds since Unix epoch to start of day (in UTC) +// date | time.Time | start of day (in UTC) +// date | string | parsed using "2006-01-02" format +// duration | int64 | duration in nanoseconds +// duration | time.Duration | +// duration | gocql.Duration | +// duration | string | parsed with time.ParseDuration +func Marshal(info TypeInfo, value interface{}) ([]byte, error) { + if info.Version() < protoVersion1 { + panic("protocol version not set") + } + + if valueRef := reflect.ValueOf(value); valueRef.Kind() == reflect.Ptr { + if valueRef.IsNil() { + return nil, nil + } else if v, ok := value.(Marshaler); ok { + return v.MarshalCQL(info) + } else { + return Marshal(info, valueRef.Elem().Interface()) + } + } + + if v, ok := value.(Marshaler); ok { + return v.MarshalCQL(info) + } + + switch info.Type() { + case TypeVarchar: + return marshalVarchar(value) + case TypeText: + return marshalText(value) + case TypeBlob: + return marshalBlob(value) + case TypeAscii: + return marshalAscii(value) + case TypeBoolean: + return marshalBool(value) + case TypeTinyInt: + return marshalTinyInt(value) + case TypeSmallInt: + return marshalSmallInt(value) + case TypeInt: + return marshalInt(value) + case TypeBigInt: + return marshalBigInt(value) + case TypeCounter: + return marshalCounter(value) + case TypeFloat: + return marshalFloat(value) + case TypeDouble: + return marshalDouble(value) + case TypeDecimal: + return marshalDecimal(value) + case TypeTime: + return marshalTime(value) + case TypeTimestamp: + return marshalTimestamp(value) + case TypeList, TypeSet: + return marshalList(info, value) + case TypeMap: + return marshalMap(info, value) + case TypeUUID: + return marshalUUID(value) + case TypeTimeUUID: + return marshalTimeUUID(value) + case TypeVarint: + return marshalVarint(value) + case TypeInet: + return marshalInet(value) + case TypeTuple: + return marshalTuple(info, value) + case TypeUDT: + return marshalUDT(info, value) + case TypeDate: + return marshalDate(value) + case TypeDuration: + return marshalDuration(value) + } + + 
// detect protocol 2 UDT + if strings.HasPrefix(info.Custom(), "org.apache.cassandra.db.marshal.UserType") && info.Version() < 3 { + return nil, ErrorUDTUnavailable + } + + // TODO(tux21b): add the remaining types + return nil, fmt.Errorf("can not marshal %T into %s", value, info) +} + +// Unmarshal parses the CQL encoded data based on the info parameter that +// describes the Cassandra internal data type and stores the result in the +// value pointed by value. +// +// If value implements Unmarshaler, it's UnmarshalCQL method is called to +// unmarshal the data. +// If value is a pointer to pointer, it is set to nil if the CQL value is +// null. Otherwise, nulls are unmarshalled as zero value. +// +// Supported conversions are as follows, other type combinations may be added in the future: +// +// CQL type | Go type (value) | Note +// varchar, ascii, blob, text | *string | +// varchar, ascii, blob, text | *[]byte | non-nil buffer is reused +// bool | *bool | +// tinyint, smallint, int, bigint, counter | *integer types | +// tinyint, smallint, int, bigint, counter | *big.Int | +// tinyint, smallint, int, bigint, counter | *string | formatted as base 10 number +// float | *float32 | +// double | *float64 | +// decimal | *inf.Dec | +// time | *int64 | nanoseconds since start of day +// time | *time.Duration | +// timestamp | *int64 | milliseconds since Unix epoch +// timestamp | *time.Time | +// list, set | *slice, *array | +// map | *map[X]Y | +// uuid, timeuuid | *string | see UUID.String +// uuid, timeuuid | *[]byte | raw UUID bytes +// uuid, timeuuid | *gocql.UUID | +// timeuuid | *time.Time | timestamp of the UUID +// inet | *net.IP | +// inet | *string | IPv4 or IPv6 address string +// tuple | *slice, *array | +// tuple | *struct | struct fields are set in order of declaration +// user-defined types | gocql.UDTUnmarshaler | UnmarshalUDT is called +// user-defined types | *map[string]interface{} | +// user-defined types | *struct | cql tag is used to determine field name +// date | *time.Time | time of beginning of the day (in UTC) +// date | *string | formatted with 2006-01-02 format +// duration | *gocql.Duration | +func Unmarshal(info TypeInfo, data []byte, value interface{}) error { + if v, ok := value.(Unmarshaler); ok { + return v.UnmarshalCQL(info, data) + } + + if isNullableValue(value) { + return unmarshalNullable(info, data, value) + } + + switch info.Type() { + case TypeVarchar: + return unmarshalVarchar(data, value) + case TypeText: + return unmarshalText(data, value) + case TypeBlob: + return unmarshalBlob(data, value) + case TypeAscii: + return unmarshalAscii(data, value) + case TypeBoolean: + return unmarshalBool(data, value) + case TypeInt: + return unmarshalInt(data, value) + case TypeBigInt: + return unmarshalBigInt(data, value) + case TypeCounter: + return unmarshalCounter(data, value) + case TypeVarint: + return unmarshalVarint(data, value) + case TypeSmallInt: + return unmarshalSmallInt(data, value) + case TypeTinyInt: + return unmarshalTinyInt(data, value) + case TypeFloat: + return unmarshalFloat(data, value) + case TypeDouble: + return unmarshalDouble(data, value) + case TypeDecimal: + return unmarshalDecimal(data, value) + case TypeTime: + return unmarshalTime(data, value) + case TypeTimestamp: + return unmarshalTimestamp(data, value) + case TypeList, TypeSet: + return unmarshalList(info, data, value) + case TypeMap: + return unmarshalMap(info, data, value) + case TypeTimeUUID: + return unmarshalTimeUUID(data, value) + case TypeUUID: + return unmarshalUUID(data, 
value) + case TypeInet: + return unmarshalInet(data, value) + case TypeTuple: + return unmarshalTuple(info, data, value) + case TypeUDT: + return unmarshalUDT(info, data, value) + case TypeDate: + return unmarshalDate(data, value) + case TypeDuration: + return unmarshalDuration(data, value) + } + + // detect protocol 2 UDT + if strings.HasPrefix(info.Custom(), "org.apache.cassandra.db.marshal.UserType") && info.Version() < 3 { + return ErrorUDTUnavailable + } + + // TODO(tux21b): add the remaining types + return fmt.Errorf("can not unmarshal %s into %T", info, value) +} + +func isNullableValue(value interface{}) bool { + v := reflect.ValueOf(value) + return v.Kind() == reflect.Ptr && v.Type().Elem().Kind() == reflect.Ptr +} + +func isNullData(info TypeInfo, data []byte) bool { + return data == nil +} + +func unmarshalNullable(info TypeInfo, data []byte, value interface{}) error { + valueRef := reflect.ValueOf(value) + + if isNullData(info, data) { + nilValue := reflect.Zero(valueRef.Type().Elem()) + valueRef.Elem().Set(nilValue) + return nil + } + + newValue := reflect.New(valueRef.Type().Elem().Elem()) + valueRef.Elem().Set(newValue) + return Unmarshal(info, data, newValue.Interface()) +} + +func marshalVarchar(value interface{}) ([]byte, error) { + data, err := varchar.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalText(value interface{}) ([]byte, error) { + data, err := text.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalBlob(value interface{}) ([]byte, error) { + data, err := blob.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalAscii(value interface{}) ([]byte, error) { + data, err := ascii.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalVarchar(data []byte, value interface{}) error { + err := varchar.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalText(data []byte, value interface{}) error { + err := text.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalBlob(data []byte, value interface{}) error { + err := blob.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalAscii(data []byte, value interface{}) error { + err := ascii.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalSmallInt(value interface{}) ([]byte, error) { + data, err := smallint.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalTinyInt(value interface{}) ([]byte, error) { + data, err := tinyint.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalInt(value interface{}) ([]byte, error) { + data, err := cqlint.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalBigInt(value interface{}) ([]byte, error) { + data, err := bigint.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func marshalCounter(value interface{}) 
([]byte, error) { + data, err := counter.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalCounter(data []byte, value interface{}) error { + err := counter.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalInt(data []byte, value interface{}) error { + err := cqlint.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalBigInt(data []byte, value interface{}) error { + err := bigint.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalSmallInt(data []byte, value interface{}) error { + err := smallint.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalTinyInt(data []byte, value interface{}) error { + if err := tinyint.Unmarshal(data, value); err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func unmarshalVarint(data []byte, value interface{}) error { + if err := varint.Unmarshal(data, value); err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalVarint(value interface{}) ([]byte, error) { + data, err := varint.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func decBigInt(data []byte) int64 { + if len(data) != 8 { + return 0 + } + return int64(data[0])<<56 | int64(data[1])<<48 | + int64(data[2])<<40 | int64(data[3])<<32 | + int64(data[4])<<24 | int64(data[5])<<16 | + int64(data[6])<<8 | int64(data[7]) +} + +func marshalBool(value interface{}) ([]byte, error) { + data, err := boolean.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalBool(data []byte, value interface{}) error { + if err := boolean.Unmarshal(data, value); err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalFloat(value interface{}) ([]byte, error) { + data, err := float.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalFloat(data []byte, value interface{}) error { + if err := float.Unmarshal(data, value); err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalDouble(value interface{}) ([]byte, error) { + data, err := double.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalDouble(data []byte, value interface{}) error { + err := double.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalDecimal(value interface{}) ([]byte, error) { + data, err := decimal.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalDecimal(data []byte, value interface{}) error { + if err := decimal.Unmarshal(data, value); err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalTime(value interface{}) ([]byte, error) { + data, err := cqltime.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalTime(data []byte, value interface{}) error { 
+ err := cqltime.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalTimestamp(value interface{}) ([]byte, error) { + data, err := timestamp.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalTimestamp(data []byte, value interface{}) error { + err := timestamp.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalDate(value interface{}) ([]byte, error) { + data, err := date.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalDate(data []byte, value interface{}) error { + err := date.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalDuration(value interface{}) ([]byte, error) { + switch uv := value.(type) { + case Duration: + value = duration.Duration(uv) + case *Duration: + value = (*duration.Duration)(uv) + } + data, err := duration.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalDuration(data []byte, value interface{}) error { + switch uv := value.(type) { + case *Duration: + value = (*duration.Duration)(uv) + case **Duration: + if uv == nil { + value = (**duration.Duration)(nil) + } else { + value = (**duration.Duration)(unsafe.Pointer(uv)) + } + } + err := duration.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func writeCollectionSize(info CollectionType, n int, buf *bytes.Buffer) error { + if info.proto > protoVersion2 { + if n > math.MaxInt32 { + return marshalErrorf("marshal: collection too large") + } + + buf.WriteByte(byte(n >> 24)) + buf.WriteByte(byte(n >> 16)) + buf.WriteByte(byte(n >> 8)) + buf.WriteByte(byte(n)) + } else { + if n > math.MaxUint16 { + return marshalErrorf("marshal: collection too large") + } + + buf.WriteByte(byte(n >> 8)) + buf.WriteByte(byte(n)) + } + + return nil +} + +func marshalList(info TypeInfo, value interface{}) ([]byte, error) { + listInfo, ok := info.(CollectionType) + if !ok { + return nil, marshalErrorf("marshal: can not marshal non collection type into list") + } + + if value == nil { + return nil, nil + } else if _, ok := value.(unsetColumn); ok { + return nil, nil + } + + rv := reflect.ValueOf(value) + t := rv.Type() + k := t.Kind() + if k == reflect.Slice && rv.IsNil() { + return nil, nil + } + + switch k { + case reflect.Slice, reflect.Array: + buf := &bytes.Buffer{} + n := rv.Len() + + if err := writeCollectionSize(listInfo, n, buf); err != nil { + return nil, err + } + + for i := 0; i < n; i++ { + item, err := Marshal(listInfo.Elem, rv.Index(i).Interface()) + if err != nil { + return nil, err + } + itemLen := len(item) + // Set the value to null for supported protocols + if item == nil && listInfo.proto > protoVersion2 { + itemLen = -1 + } + if err := writeCollectionSize(listInfo, itemLen, buf); err != nil { + return nil, err + } + buf.Write(item) + } + return buf.Bytes(), nil + case reflect.Map: + elem := t.Elem() + if elem.Kind() == reflect.Struct && elem.NumField() == 0 { + rkeys := rv.MapKeys() + keys := make([]interface{}, len(rkeys)) + for i := 0; i < len(keys); i++ { + keys[i] = rkeys[i].Interface() + } + return marshalList(listInfo, keys) + } + } + return nil, marshalErrorf("can not marshal %T into %s", 
value, info) +} + +func readCollectionSize(info CollectionType, data []byte) (size, read int, err error) { + if info.proto > protoVersion2 { + if len(data) < 4 { + return 0, 0, unmarshalErrorf("unmarshal list: unexpected eof") + } + size = int(int32(data[0])<<24 | int32(data[1])<<16 | int32(data[2])<<8 | int32(data[3])) + read = 4 + } else { + if len(data) < 2 { + return 0, 0, unmarshalErrorf("unmarshal list: unexpected eof") + } + size = int(data[0])<<8 | int(data[1]) + read = 2 + } + return +} + +func unmarshalList(info TypeInfo, data []byte, value interface{}) error { + listInfo, ok := info.(CollectionType) + if !ok { + return unmarshalErrorf("unmarshal: can not unmarshal none collection type into list") + } + + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return unmarshalErrorf("can not unmarshal into non-pointer %T", value) + } + rv = rv.Elem() + t := rv.Type() + k := t.Kind() + + switch k { + case reflect.Slice, reflect.Array: + if data == nil { + if k == reflect.Array { + return unmarshalErrorf("unmarshal list: can not store nil in array value") + } + if rv.IsNil() { + return nil + } + rv.Set(reflect.Zero(t)) + return nil + } + n, p, err := readCollectionSize(listInfo, data) + if err != nil { + return err + } + data = data[p:] + if k == reflect.Array { + if rv.Len() != n { + return unmarshalErrorf("unmarshal list: array with wrong size") + } + } else { + rv.Set(reflect.MakeSlice(t, n, n)) + } + for i := 0; i < n; i++ { + m, p, err := readCollectionSize(listInfo, data) + if err != nil { + return err + } + data = data[p:] + // In case m < 0, the value is null, and unmarshalData should be nil. + var unmarshalData []byte + if m >= 0 { + if len(data) < m { + return unmarshalErrorf("unmarshal list: unexpected eof") + } + unmarshalData = data[:m] + data = data[m:] + } + if err := Unmarshal(listInfo.Elem, unmarshalData, rv.Index(i).Addr().Interface()); err != nil { + return err + } + } + return nil + } + return unmarshalErrorf("can not unmarshal %s into %T", info, value) +} + +func marshalMap(info TypeInfo, value interface{}) ([]byte, error) { + mapInfo, ok := info.(CollectionType) + if !ok { + return nil, marshalErrorf("marshal: can not marshal none collection type into map") + } + + if value == nil { + return nil, nil + } else if _, ok := value.(unsetColumn); ok { + return nil, nil + } + + rv := reflect.ValueOf(value) + + t := rv.Type() + if t.Kind() != reflect.Map { + return nil, marshalErrorf("can not marshal %T into %s", value, info) + } + + if rv.IsNil() { + return nil, nil + } + + buf := &bytes.Buffer{} + n := rv.Len() + + if err := writeCollectionSize(mapInfo, n, buf); err != nil { + return nil, err + } + + keys := rv.MapKeys() + for _, key := range keys { + item, err := Marshal(mapInfo.Key, key.Interface()) + if err != nil { + return nil, err + } + itemLen := len(item) + // Set the key to null for supported protocols + if item == nil && mapInfo.proto > protoVersion2 { + itemLen = -1 + } + if err := writeCollectionSize(mapInfo, itemLen, buf); err != nil { + return nil, err + } + buf.Write(item) + + item, err = Marshal(mapInfo.Elem, rv.MapIndex(key).Interface()) + if err != nil { + return nil, err + } + itemLen = len(item) + // Set the value to null for supported protocols + if item == nil && mapInfo.proto > protoVersion2 { + itemLen = -1 + } + if err := writeCollectionSize(mapInfo, itemLen, buf); err != nil { + return nil, err + } + buf.Write(item) + } + return buf.Bytes(), nil +} + +func unmarshalMap(info TypeInfo, data []byte, value interface{}) error { + 
mapInfo, ok := info.(CollectionType) + if !ok { + return unmarshalErrorf("unmarshal: can not unmarshal none collection type into map") + } + + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return unmarshalErrorf("can not unmarshal into non-pointer %T", value) + } + rv = rv.Elem() + t := rv.Type() + if t.Kind() != reflect.Map { + return unmarshalErrorf("can not unmarshal %s into %T", info, value) + } + if data == nil { + rv.Set(reflect.Zero(t)) + return nil + } + n, p, err := readCollectionSize(mapInfo, data) + if err != nil { + return err + } + if n < 0 { + return unmarshalErrorf("negative map size %d", n) + } + rv.Set(reflect.MakeMapWithSize(t, n)) + data = data[p:] + for i := 0; i < n; i++ { + m, p, err := readCollectionSize(mapInfo, data) + if err != nil { + return err + } + data = data[p:] + key := reflect.New(t.Key()) + // In case m < 0, the key is null, and unmarshalData should be nil. + var unmarshalData []byte + if m >= 0 { + if len(data) < m { + return unmarshalErrorf("unmarshal map: unexpected eof") + } + unmarshalData = data[:m] + data = data[m:] + } + if err := Unmarshal(mapInfo.Key, unmarshalData, key.Interface()); err != nil { + return err + } + + m, p, err = readCollectionSize(mapInfo, data) + if err != nil { + return err + } + data = data[p:] + val := reflect.New(t.Elem()) + + // In case m < 0, the value is null, and unmarshalData should be nil. + unmarshalData = nil + if m >= 0 { + if len(data) < m { + return unmarshalErrorf("unmarshal map: unexpected eof") + } + unmarshalData = data[:m] + data = data[m:] + } + if err := Unmarshal(mapInfo.Elem, unmarshalData, val.Interface()); err != nil { + return err + } + + rv.SetMapIndex(key.Elem(), val.Elem()) + } + return nil +} + +func marshalUUID(value interface{}) ([]byte, error) { + switch uv := value.(type) { + case UUID: + value = [16]byte(uv) + case *UUID: + value = (*[16]byte)(uv) + } + data, err := uuid.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalUUID(data []byte, value interface{}) error { + switch uv := value.(type) { + case *UUID: + value = (*[16]byte)(uv) + case **UUID: + if uv == nil { + value = (**[16]byte)(nil) + } else { + value = (**[16]byte)(unsafe.Pointer(uv)) + } + } + err := uuid.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalTimeUUID(value interface{}) ([]byte, error) { + switch uv := value.(type) { + case UUID: + value = [16]byte(uv) + case *UUID: + value = (*[16]byte)(uv) + } + data, err := timeuuid.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalTimeUUID(data []byte, value interface{}) error { + switch uv := value.(type) { + case *UUID: + value = (*[16]byte)(uv) + case **UUID: + if uv == nil { + value = (**[16]byte)(nil) + } else { + value = (**[16]byte)(unsafe.Pointer(uv)) + } + } + err := timeuuid.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalInet(value interface{}) ([]byte, error) { + data, err := inet.Marshal(value) + if err != nil { + return nil, wrapMarshalError(err, "marshal error") + } + return data, nil +} + +func unmarshalInet(data []byte, value interface{}) error { + err := inet.Unmarshal(data, value) + if err != nil { + return wrapUnmarshalError(err, "unmarshal error") + } + return nil +} + +func marshalTuple(info TypeInfo, value interface{}) ([]byte, 
error) {
+	tuple := info.(TupleTypeInfo)
+	switch v := value.(type) {
+	case unsetColumn:
+		return nil, marshalErrorf("Invalid request: UnsetValue is unsupported for tuples")
+	case []interface{}:
+		if len(v) != len(tuple.Elems) {
+			return nil, marshalErrorf("cannot marshal tuple: wrong number of elements")
+		}
+
+		var buf []byte
+		for i, elem := range v {
+			if elem == nil {
+				buf = appendInt(buf, int32(-1))
+				continue
+			}
+
+			data, err := Marshal(tuple.Elems[i], elem)
+			if err != nil {
+				return nil, err
+			}
+
+			n := len(data)
+			buf = appendInt(buf, int32(n))
+			buf = append(buf, data...)
+		}
+
+		return buf, nil
+	}
+
+	rv := reflect.ValueOf(value)
+	t := rv.Type()
+	k := t.Kind()
+
+	switch k {
+	case reflect.Struct:
+		if v := t.NumField(); v != len(tuple.Elems) {
+			return nil, marshalErrorf("can not marshal tuple into struct %v, not enough fields have %d need %d", t, v, len(tuple.Elems))
+		}
+
+		var buf []byte
+		for i, elem := range tuple.Elems {
+			field := rv.Field(i)
+
+			if field.Kind() == reflect.Ptr && field.IsNil() {
+				buf = appendInt(buf, int32(-1))
+				continue
+			}
+
+			data, err := Marshal(elem, field.Interface())
+			if err != nil {
+				return nil, err
+			}
+
+			n := len(data)
+			buf = appendInt(buf, int32(n))
+			buf = append(buf, data...)
+		}
+
+		return buf, nil
+	case reflect.Slice, reflect.Array:
+		size := rv.Len()
+		if size != len(tuple.Elems) {
+			return nil, marshalErrorf("can not marshal tuple into %v of length %d need %d elements", k, size, len(tuple.Elems))
+		}
+
+		var buf []byte
+		for i, elem := range tuple.Elems {
+			item := rv.Index(i)
+
+			if item.Kind() == reflect.Ptr && item.IsNil() {
+				buf = appendInt(buf, int32(-1))
+				continue
+			}
+
+			data, err := Marshal(elem, item.Interface())
+			if err != nil {
+				return nil, err
+			}
+
+			n := len(data)
+			buf = appendInt(buf, int32(n))
+			buf = append(buf, data...)
+		}
+
+		return buf, nil
+	}
+
+	return nil, marshalErrorf("cannot marshal %T into %s", value, tuple)
+}
+
+func readBytes(p []byte) ([]byte, []byte) {
+	// TODO: really should use a framer
+	size := readInt(p)
+	p = p[4:]
+	if size < 0 {
+		return nil, p
+	}
+	return p[:size], p[size:]
+}
+
+// currently only supports unmarshalling into a list of values; this makes it possible
+// to support tuples without changing the query API. In the future this can be extended
+// to allow unmarshalling into custom tuple types.
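A worked example of the [bytes] framing that readBytes consumes (illustrative values): a tuple<int, text> holding (42, "hi") is laid out as two length-prefixed elements,

	00 00 00 04  00 00 00 2a   // length 4, int 42
	00 00 00 02  68 69         // length 2, "hi"

and a null element is encoded as length -1 (ff ff ff ff) with no payload, which readBytes hands back as a nil slice.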
+func unmarshalTuple(info TypeInfo, data []byte, value interface{}) error { + if v, ok := value.(Unmarshaler); ok { + return v.UnmarshalCQL(info, data) + } + + tuple := info.(TupleTypeInfo) + switch v := value.(type) { + case []interface{}: + for i, elem := range tuple.Elems { + // each element inside data is a [bytes] + var p []byte + if len(data) >= 4 { + p, data = readBytes(data) + } + err := Unmarshal(elem, p, v[i]) + if err != nil { + return err + } + } + + return nil + } + + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return unmarshalErrorf("can not unmarshal into non-pointer %T", value) + } + + rv = rv.Elem() + t := rv.Type() + k := t.Kind() + + switch k { + case reflect.Struct: + if v := t.NumField(); v != len(tuple.Elems) { + return unmarshalErrorf("can not unmarshal tuple into struct %v, not enough fields have %d need %d", t, v, len(tuple.Elems)) + } + + for i, elem := range tuple.Elems { + var p []byte + if len(data) >= 4 { + p, data = readBytes(data) + } + + v, err := elem.NewWithError() + if err != nil { + return err + } + if err := Unmarshal(elem, p, v); err != nil { + return err + } + + switch rv.Field(i).Kind() { + case reflect.Ptr: + if p != nil { + rv.Field(i).Set(reflect.ValueOf(v)) + } else { + rv.Field(i).Set(reflect.Zero(reflect.TypeOf(v))) + } + default: + rv.Field(i).Set(reflect.ValueOf(v).Elem()) + } + } + + return nil + case reflect.Slice, reflect.Array: + if k == reflect.Array { + size := rv.Len() + if size != len(tuple.Elems) { + return unmarshalErrorf("can not unmarshal tuple into array of length %d need %d elements", size, len(tuple.Elems)) + } + } else { + rv.Set(reflect.MakeSlice(t, len(tuple.Elems), len(tuple.Elems))) + } + + for i, elem := range tuple.Elems { + var p []byte + if len(data) >= 4 { + p, data = readBytes(data) + } + + v, err := elem.NewWithError() + if err != nil { + return err + } + if err := Unmarshal(elem, p, v); err != nil { + return err + } + + switch rv.Index(i).Kind() { + case reflect.Ptr: + if p != nil { + rv.Index(i).Set(reflect.ValueOf(v)) + } else { + rv.Index(i).Set(reflect.Zero(reflect.TypeOf(v))) + } + default: + rv.Index(i).Set(reflect.ValueOf(v).Elem()) + } + } + + return nil + } + + return unmarshalErrorf("cannot unmarshal %s into %T", info, value) +} + +// UDTMarshaler is an interface which should be implemented by users wishing to +// handle encoding UDT types to sent to Cassandra. Note: due to current implentations +// methods defined for this interface must be value receivers not pointer receivers. +type UDTMarshaler interface { + // MarshalUDT will be called for each field in the the UDT returned by Cassandra, + // the implementor should marshal the type to return by for example calling + // Marshal. + MarshalUDT(name string, info TypeInfo) ([]byte, error) +} + +// UDTUnmarshaler should be implemented by users wanting to implement custom +// UDT unmarshaling. +type UDTUnmarshaler interface { + // UnmarshalUDT will be called for each field in the UDT return by Cassandra, + // the implementor should unmarshal the data into the value of their chosing, + // for example by calling Unmarshal. 
+ UnmarshalUDT(name string, info TypeInfo, data []byte) error +} + +func marshalUDT(info TypeInfo, value interface{}) ([]byte, error) { + udt := info.(UDTTypeInfo) + + switch v := value.(type) { + case Marshaler: + return v.MarshalCQL(info) + case unsetColumn: + return nil, unmarshalErrorf("invalid request: UnsetValue is unsupported for user defined types") + case UDTMarshaler: + var buf []byte + for _, e := range udt.Elements { + data, err := v.MarshalUDT(e.Name, e.Type) + if err != nil { + return nil, err + } + + buf = appendBytes(buf, data) + } + + return buf, nil + case map[string]interface{}: + var buf []byte + for _, e := range udt.Elements { + val, ok := v[e.Name] + var data []byte + + if ok { + var err error + data, err = Marshal(e.Type, val) + if err != nil { + return nil, err + } + } + + buf = appendBytes(buf, data) + } + + return buf, nil + } + + k := reflect.ValueOf(value) + if k.Kind() == reflect.Ptr { + if k.IsNil() { + return nil, marshalErrorf("cannot marshal %T into %s", value, info) + } + k = k.Elem() + } + + if k.Kind() != reflect.Struct || !k.IsValid() { + return nil, marshalErrorf("cannot marshal %T into %s", value, info) + } + + fields := make(map[string]reflect.Value) + t := reflect.TypeOf(value) + for i := 0; i < t.NumField(); i++ { + sf := t.Field(i) + + if tag := sf.Tag.Get("cql"); tag != "" { + fields[tag] = k.Field(i) + } + } + + var buf []byte + for _, e := range udt.Elements { + f, ok := fields[e.Name] + if !ok { + f = k.FieldByName(e.Name) + } + + var data []byte + if f.IsValid() && f.CanInterface() { + var err error + data, err = Marshal(e.Type, f.Interface()) + if err != nil { + return nil, err + } + } + + buf = appendBytes(buf, data) + } + + return buf, nil +} + +func unmarshalUDT(info TypeInfo, data []byte, value interface{}) error { + switch v := value.(type) { + case Unmarshaler: + return v.UnmarshalCQL(info, data) + case UDTUnmarshaler: + udt := info.(UDTTypeInfo) + + for id, e := range udt.Elements { + if len(data) == 0 { + return nil + } + if len(data) < 4 { + return unmarshalErrorf("can not unmarshal %s: field [%d]%s: unexpected eof", info, id, e.Name) + } + + var p []byte + p, data = readBytes(data) + if err := v.UnmarshalUDT(e.Name, e.Type, p); err != nil { + return err + } + } + + return nil + case *map[string]interface{}: + udt := info.(UDTTypeInfo) + + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return unmarshalErrorf("can not unmarshal into non-pointer %T", value) + } + + rv = rv.Elem() + t := rv.Type() + if t.Kind() != reflect.Map { + return unmarshalErrorf("can not unmarshal %s into %T", info, value) + } else if data == nil { + rv.Set(reflect.Zero(t)) + return nil + } + + rv.Set(reflect.MakeMap(t)) + m := *v + + for id, e := range udt.Elements { + if len(data) == 0 { + return nil + } + if len(data) < 4 { + return unmarshalErrorf("can not unmarshal %s: field [%d]%s: unexpected eof", info, id, e.Name) + } + + valType, err := goType(e.Type) + if err != nil { + return unmarshalErrorf("can not unmarshal %s: %v", info, err) + } + + val := reflect.New(valType) + + var p []byte + p, data = readBytes(data) + + if err := Unmarshal(e.Type, p, val.Interface()); err != nil { + return err + } + + m[e.Name] = val.Elem().Interface() + } + + return nil + } + + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return unmarshalErrorf("can not unmarshal into non-pointer %T", value) + } + k := rv.Elem() + if k.Kind() != reflect.Struct || !k.IsValid() { + return unmarshalErrorf("cannot unmarshal %s into %T", info, value) + } + + if 
len(data) == 0 { + if k.CanSet() { + k.Set(reflect.Zero(k.Type())) + } + + return nil + } + + t := k.Type() + fields := make(map[string]reflect.Value, t.NumField()) + for i := 0; i < t.NumField(); i++ { + sf := t.Field(i) + + if tag := sf.Tag.Get("cql"); tag != "" { + fields[tag] = k.Field(i) + } + } + + udt := info.(UDTTypeInfo) + for id, e := range udt.Elements { + if len(data) == 0 { + return nil + } + if len(data) < 4 { + // UDT def does not match the column value + return unmarshalErrorf("can not unmarshal %s: field [%d]%s: unexpected eof", info, id, e.Name) + } + + var p []byte + p, data = readBytes(data) + + f, ok := fields[e.Name] + if !ok { + f = k.FieldByName(e.Name) + if f == emptyValue { + // skip fields which exist in the UDT but not in + // the struct passed in + continue + } + } + + if !f.IsValid() || !f.CanAddr() { + return unmarshalErrorf("cannot unmarshal %s into %T: field %v is not valid", info, value, e.Name) + } + + fk := f.Addr().Interface() + if err := Unmarshal(e.Type, p, fk); err != nil { + return err + } + } + + return nil +} + +// TypeInfo describes a Cassandra specific data type. +type TypeInfo interface { + Type() Type + Version() byte + Custom() string + + // New creates a pointer to an empty version of whatever type + // is referenced by the TypeInfo receiver. + // + // If there is no corresponding Go type for the CQL type, New panics. + // + // Deprecated: Use NewWithError instead. + New() interface{} + + // NewWithError creates a pointer to an empty version of whatever type + // is referenced by the TypeInfo receiver. + // + // If there is no corresponding Go type for the CQL type, NewWithError returns an error. + NewWithError() (interface{}, error) +} + +type NativeType struct { + proto byte + typ Type + custom string // only used for TypeCustom +} + +func NewNativeType(proto byte, typ Type, custom string) NativeType { + return NativeType{proto, typ, custom} +} + +func (t NativeType) NewWithError() (interface{}, error) { + typ, err := goType(t) + if err != nil { + return nil, err + } + return reflect.New(typ).Interface(), nil +} + +func (t NativeType) New() interface{} { + val, err := t.NewWithError() + if err != nil { + panic(err.Error()) + } + return val +} + +func (s NativeType) Type() Type { + return s.typ +} + +func (s NativeType) Version() byte { + return s.proto +} + +func (s NativeType) Custom() string { + return s.custom +} + +func (s NativeType) String() string { + switch s.typ { + case TypeCustom: + return fmt.Sprintf("%s(%s)", s.typ, s.custom) + default: + return s.typ.String() + } +} + +func NewCollectionType(m NativeType, key, elem TypeInfo) CollectionType { + return CollectionType{ + NativeType: m, + Key: key, + Elem: elem, + } +} + +type CollectionType struct { + NativeType + Key TypeInfo // only used for TypeMap + Elem TypeInfo // only used for TypeMap, TypeList and TypeSet +} + +func (t CollectionType) NewWithError() (interface{}, error) { + typ, err := goType(t) + if err != nil { + return nil, err + } + return reflect.New(typ).Interface(), nil +} + +func (t CollectionType) New() interface{} { + val, err := t.NewWithError() + if err != nil { + panic(err.Error()) + } + return val +} + +func (c CollectionType) String() string { + switch c.typ { + case TypeMap: + return fmt.Sprintf("%s(%s, %s)", c.typ, c.Key, c.Elem) + case TypeList, TypeSet: + return fmt.Sprintf("%s(%s)", c.typ, c.Elem) + case TypeCustom: + return fmt.Sprintf("%s(%s)", c.typ, c.custom) + default: + return c.typ.String() + } +} + +func NewTupleType(n NativeType, elems 
...TypeInfo) TupleTypeInfo { + return TupleTypeInfo{ + NativeType: n, + Elems: elems, + } +} + +type TupleTypeInfo struct { + NativeType + Elems []TypeInfo +} + +func (t TupleTypeInfo) String() string { + var buf bytes.Buffer + buf.WriteString(fmt.Sprintf("%s(", t.typ)) + for _, elem := range t.Elems { + buf.WriteString(fmt.Sprintf("%s, ", elem)) + } + buf.Truncate(buf.Len() - 2) + buf.WriteByte(')') + return buf.String() +} + +func (t TupleTypeInfo) NewWithError() (interface{}, error) { + typ, err := goType(t) + if err != nil { + return nil, err + } + return reflect.New(typ).Interface(), nil +} + +func (t TupleTypeInfo) New() interface{} { + val, err := t.NewWithError() + if err != nil { + panic(err.Error()) + } + return val +} + +type UDTField struct { + Name string + Type TypeInfo +} + +func NewUDTType(proto byte, name, keySpace string, elems ...UDTField) UDTTypeInfo { + return UDTTypeInfo{ + NativeType: NativeType{proto, TypeUDT, ""}, + Name: name, + KeySpace: keySpace, + Elements: elems, + } +} + +type UDTTypeInfo struct { + NativeType + KeySpace string + Name string + Elements []UDTField +} + +func (u UDTTypeInfo) NewWithError() (interface{}, error) { + typ, err := goType(u) + if err != nil { + return nil, err + } + return reflect.New(typ).Interface(), nil +} + +func (u UDTTypeInfo) New() interface{} { + val, err := u.NewWithError() + if err != nil { + panic(err.Error()) + } + return val +} + +func (u UDTTypeInfo) String() string { + buf := &bytes.Buffer{} + + fmt.Fprintf(buf, "%s.%s{", u.KeySpace, u.Name) + first := true + for _, e := range u.Elements { + if !first { + fmt.Fprint(buf, ",") + } else { + first = false + } + + fmt.Fprintf(buf, "%s=%v", e.Name, e.Type) + } + fmt.Fprint(buf, "}") + + return buf.String() +} + +// String returns a human readable name for the Cassandra datatype +// described by t. +// Type is the identifier of a Cassandra internal datatype. +type Type int + +const ( + TypeCustom Type = 0x0000 + TypeAscii Type = 0x0001 + TypeBigInt Type = 0x0002 + TypeBlob Type = 0x0003 + TypeBoolean Type = 0x0004 + TypeCounter Type = 0x0005 + TypeDecimal Type = 0x0006 + TypeDouble Type = 0x0007 + TypeFloat Type = 0x0008 + TypeInt Type = 0x0009 + TypeText Type = 0x000A + TypeTimestamp Type = 0x000B + TypeUUID Type = 0x000C + TypeVarchar Type = 0x000D + TypeVarint Type = 0x000E + TypeTimeUUID Type = 0x000F + TypeInet Type = 0x0010 + TypeDate Type = 0x0011 + TypeTime Type = 0x0012 + TypeSmallInt Type = 0x0013 + TypeTinyInt Type = 0x0014 + TypeDuration Type = 0x0015 + TypeList Type = 0x0020 + TypeMap Type = 0x0021 + TypeSet Type = 0x0022 + TypeUDT Type = 0x0030 + TypeTuple Type = 0x0031 +) + +// String returns the name of the identifier. 
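A minimal sketch (assuming protocol version 4) of building TypeInfo values with the constructors above and round-tripping a tuple through Marshal/Unmarshal; in normal use the driver derives TypeInfo from result-frame metadata, so this is illustrative only:

	package typeinfo_sketch // hypothetical

	import "github.com/gocql/gocql"

	func roundTrip() (int, string, error) {
		pair := gocql.NewTupleType(
			gocql.NewNativeType(4, gocql.TypeTuple, ""),
			gocql.NewNativeType(4, gocql.TypeInt, ""),     // element 0
			gocql.NewNativeType(4, gocql.TypeVarchar, ""), // element 1
		)
		data, err := gocql.Marshal(pair, []interface{}{42, "hi"})
		if err != nil {
			return 0, "", err
		}
		var n int
		var s string
		// unmarshalTuple accepts a []interface{} of element pointers
		err = gocql.Unmarshal(pair, data, []interface{}{&n, &s})
		return n, s, err
	}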
+func (t Type) String() string { + switch t { + case TypeCustom: + return "custom" + case TypeAscii: + return "ascii" + case TypeBigInt: + return "bigint" + case TypeBlob: + return "blob" + case TypeBoolean: + return "boolean" + case TypeCounter: + return "counter" + case TypeDecimal: + return "decimal" + case TypeDouble: + return "double" + case TypeFloat: + return "float" + case TypeInt: + return "int" + case TypeText: + return "text" + case TypeTimestamp: + return "timestamp" + case TypeUUID: + return "uuid" + case TypeVarchar: + return "varchar" + case TypeTimeUUID: + return "timeuuid" + case TypeInet: + return "inet" + case TypeDate: + return "date" + case TypeDuration: + return "duration" + case TypeTime: + return "time" + case TypeSmallInt: + return "smallint" + case TypeTinyInt: + return "tinyint" + case TypeList: + return "list" + case TypeMap: + return "map" + case TypeSet: + return "set" + case TypeVarint: + return "varint" + case TypeTuple: + return "tuple" + default: + return fmt.Sprintf("unknown_type_%d", t) + } +} + +type MarshalError struct { + cause error + msg string +} + +func (m MarshalError) Error() string { + if m.cause != nil { + return m.msg + ": " + m.cause.Error() + } + return m.msg +} + +func (m MarshalError) Cause() error { return m.cause } + +func (m MarshalError) Unwrap() error { + return m.cause +} + +func marshalErrorf(format string, args ...interface{}) MarshalError { + return MarshalError{msg: fmt.Sprintf(format, args...)} +} + +func wrapMarshalError(err error, msg string) MarshalError { + return MarshalError{msg: msg, cause: err} +} + +func wrapMarshalErrorf(err error, format string, a ...interface{}) MarshalError { + return MarshalError{msg: fmt.Sprintf(format, a...), cause: err} +} + +type UnmarshalError struct { + cause error + msg string +} + +func (m UnmarshalError) Error() string { + if m.cause != nil { + return m.msg + ": " + m.cause.Error() + } + return m.msg +} + +func (m UnmarshalError) Cause() error { return m.cause } + +func (m UnmarshalError) Unwrap() error { + return m.cause +} + +func unmarshalErrorf(format string, args ...interface{}) UnmarshalError { + return UnmarshalError{msg: fmt.Sprintf(format, args...)} +} + +func wrapUnmarshalError(err error, msg string) UnmarshalError { + return UnmarshalError{msg: msg, cause: err} +} + +func wrapUnmarshalErrorf(err error, format string, a ...interface{}) UnmarshalError { + return UnmarshalError{msg: fmt.Sprintf(format, a...), cause: err} +} diff --git a/vendor/github.com/gocql/gocql/metadata_cassandra.go b/vendor/github.com/gocql/gocql/metadata_cassandra.go new file mode 100644 index 0000000..96f794b --- /dev/null +++ b/vendor/github.com/gocql/gocql/metadata_cassandra.go @@ -0,0 +1,1466 @@ +//go:build cassandra && !scylla +// +build cassandra,!scylla + +// Copyright (c) 2015 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "encoding/hex" + "encoding/json" + "fmt" + "strconv" + "strings" + "sync" +) + +// schema metadata for a keyspace +type KeyspaceMetadata struct { + Name string + DurableWrites bool + StrategyClass string + StrategyOptions map[string]interface{} + Tables map[string]*TableMetadata + Functions map[string]*FunctionMetadata + Aggregates map[string]*AggregateMetadata + // Deprecated: use the MaterializedViews field for views and UserTypes field for udts instead. 
+ Views map[string]*ViewMetadata + MaterializedViews map[string]*MaterializedViewMetadata + UserTypes map[string]*UserTypeMetadata +} + +// schema metadata for a table (a.k.a. column family) +type TableMetadata struct { + Keyspace string + Name string + KeyValidator string + Comparator string + DefaultValidator string + KeyAliases []string + ColumnAliases []string + ValueAlias string + PartitionKey []*ColumnMetadata + ClusteringColumns []*ColumnMetadata + Columns map[string]*ColumnMetadata + OrderedColumns []string +} + +// schema metadata for a column +type ColumnMetadata struct { + Keyspace string + Table string + Name string + ComponentIndex int + Kind ColumnKind + Validator string + Type TypeInfo + ClusteringOrder string + Order ColumnOrder + Index ColumnIndexMetadata +} + +// FunctionMetadata holds metadata for function constructs +type FunctionMetadata struct { + Keyspace string + Name string + ArgumentTypes []TypeInfo + ArgumentNames []string + Body string + CalledOnNullInput bool + Language string + ReturnType TypeInfo +} + +// AggregateMetadata holds metadata for aggregate constructs +type AggregateMetadata struct { + Keyspace string + Name string + ArgumentTypes []TypeInfo + FinalFunc FunctionMetadata + InitCond string + ReturnType TypeInfo + StateFunc FunctionMetadata + StateType TypeInfo + + stateFunc string + finalFunc string +} + +// ViewMetadata holds the metadata for views. +// Deprecated: this is kept for backwards compatibility issues. Use MaterializedViewMetadata. +type ViewMetadata struct { + Keyspace string + Name string + FieldNames []string + FieldTypes []TypeInfo +} + +// MaterializedViewMetadata holds the metadata for materialized views. +type MaterializedViewMetadata struct { + Keyspace string + Name string + BaseTableId UUID + BaseTable *TableMetadata + BloomFilterFpChance float64 + Caching map[string]string + Comment string + Compaction map[string]string + Compression map[string]string + CrcCheckChance float64 + DcLocalReadRepairChance float64 + DefaultTimeToLive int + Extensions map[string]string + GcGraceSeconds int + Id UUID + IncludeAllColumns bool + MaxIndexInterval int + MemtableFlushPeriodInMs int + MinIndexInterval int + ReadRepairChance float64 + SpeculativeRetry string + + baseTableName string +} + +type UserTypeMetadata struct { + Keyspace string + Name string + FieldNames []string + FieldTypes []TypeInfo +} + +// the ordering of the column with regard to its comparator +type ColumnOrder bool + +const ( + ASC ColumnOrder = false + DESC ColumnOrder = true +) + +type ColumnIndexMetadata struct { + Name string + Type string + Options map[string]interface{} +} + +type ColumnKind int + +const ( + ColumnUnkownKind ColumnKind = iota + ColumnPartitionKey + ColumnClusteringKey + ColumnRegular + ColumnCompact + ColumnStatic +) + +func (c ColumnKind) String() string { + switch c { + case ColumnPartitionKey: + return "partition_key" + case ColumnClusteringKey: + return "clustering_key" + case ColumnRegular: + return "regular" + case ColumnCompact: + return "compact" + case ColumnStatic: + return "static" + default: + return fmt.Sprintf("unknown_column_%d", c) + } +} + +func (c *ColumnKind) UnmarshalCQL(typ TypeInfo, p []byte) error { + if typ.Type() != TypeVarchar { + return unmarshalErrorf("unable to marshall %s into ColumnKind, expected Varchar", typ) + } + + kind, err := columnKindFromSchema(string(p)) + if err != nil { + return err + } + *c = kind + + return nil +} + +func columnKindFromSchema(kind string) (ColumnKind, error) { + switch kind { + case 
"partition_key": + return ColumnPartitionKey, nil + case "clustering_key", "clustering": + return ColumnClusteringKey, nil + case "regular": + return ColumnRegular, nil + case "compact_value": + return ColumnCompact, nil + case "static": + return ColumnStatic, nil + default: + return -1, fmt.Errorf("unknown column kind: %q", kind) + } +} + +// default alias values +const ( + DEFAULT_KEY_ALIAS = "key" + DEFAULT_COLUMN_ALIAS = "column" + DEFAULT_VALUE_ALIAS = "value" +) + +// queries the cluster for schema information for a specific keyspace +type schemaDescriber struct { + session *Session + mu sync.Mutex + + cache map[string]*KeyspaceMetadata +} + +// creates a session bound schema describer which will query and cache +// keyspace metadata +func newSchemaDescriber(session *Session) *schemaDescriber { + return &schemaDescriber{ + session: session, + cache: map[string]*KeyspaceMetadata{}, + } +} + +// returns the cached KeyspaceMetadata held by the describer for the named +// keyspace. +func (s *schemaDescriber) getSchema(keyspaceName string) (*KeyspaceMetadata, error) { + s.mu.Lock() + defer s.mu.Unlock() + + metadata, found := s.cache[keyspaceName] + if !found { + // refresh the cache for this keyspace + err := s.refreshSchema(keyspaceName) + if err != nil { + return nil, err + } + + metadata = s.cache[keyspaceName] + } + + return metadata, nil +} + +// clears the already cached keyspace metadata +func (s *schemaDescriber) clearSchema(keyspaceName string) { + s.mu.Lock() + defer s.mu.Unlock() + + delete(s.cache, keyspaceName) +} + +// forcibly updates the current KeyspaceMetadata held by the schema describer +// for a given named keyspace. +func (s *schemaDescriber) refreshSchema(keyspaceName string) error { + var err error + + // query the system keyspace for schema data + // TODO retrieve concurrently + keyspace, err := getKeyspaceMetadata(s.session, keyspaceName) + if err != nil { + return err + } + tables, err := getTableMetadata(s.session, keyspaceName) + if err != nil { + return err + } + columns, err := getColumnMetadata(s.session, keyspaceName) + if err != nil { + return err + } + functions, err := getFunctionsMetadata(s.session, keyspaceName) + if err != nil { + return err + } + aggregates, err := getAggregatesMetadata(s.session, keyspaceName) + if err != nil { + return err + } + views, err := getViewsMetadata(s.session, keyspaceName) + if err != nil { + return err + } + materializedViews, err := getMaterializedViewsMetadata(s.session, keyspaceName) + if err != nil { + return err + } + + // organize the schema data + compileMetadata(s.session.cfg.ProtoVersion, keyspace, tables, columns, functions, aggregates, views, + materializedViews, s.session.logger) + + // update the cache + s.cache[keyspaceName] = keyspace + + return nil +} + +// "compiles" derived information about keyspace, table, and column metadata +// for a keyspace from the basic queried metadata objects returned by +// getKeyspaceMetadata, getTableMetadata, and getColumnMetadata respectively; +// Links the metadata objects together and derives the column composition of +// the partition key and clustering key for a table. 
+func compileMetadata( + protoVersion int, + keyspace *KeyspaceMetadata, + tables []TableMetadata, + columns []ColumnMetadata, + functions []FunctionMetadata, + aggregates []AggregateMetadata, + views []ViewMetadata, + materializedViews []MaterializedViewMetadata, + logger StdLogger, +) { + keyspace.Tables = make(map[string]*TableMetadata) + for i := range tables { + tables[i].Columns = make(map[string]*ColumnMetadata) + + keyspace.Tables[tables[i].Name] = &tables[i] + } + keyspace.Functions = make(map[string]*FunctionMetadata, len(functions)) + for i := range functions { + keyspace.Functions[functions[i].Name] = &functions[i] + } + keyspace.Aggregates = make(map[string]*AggregateMetadata, len(aggregates)) + for i, _ := range aggregates { + aggregates[i].FinalFunc = *keyspace.Functions[aggregates[i].finalFunc] + aggregates[i].StateFunc = *keyspace.Functions[aggregates[i].stateFunc] + keyspace.Aggregates[aggregates[i].Name] = &aggregates[i] + } + keyspace.Views = make(map[string]*ViewMetadata, len(views)) + for i := range views { + keyspace.Views[views[i].Name] = &views[i] + } + // Views currently holds the types and hasn't been deleted for backward compatibility issues. + // That's why it's ok to copy Views into Types in this case. For the real Views use MaterializedViews. + types := make([]UserTypeMetadata, len(views)) + for i := range views { + types[i].Keyspace = views[i].Keyspace + types[i].Name = views[i].Name + types[i].FieldNames = views[i].FieldNames + types[i].FieldTypes = views[i].FieldTypes + } + keyspace.UserTypes = make(map[string]*UserTypeMetadata, len(views)) + for i := range types { + keyspace.UserTypes[types[i].Name] = &types[i] + } + keyspace.MaterializedViews = make(map[string]*MaterializedViewMetadata, len(materializedViews)) + for i, _ := range materializedViews { + materializedViews[i].BaseTable = keyspace.Tables[materializedViews[i].baseTableName] + keyspace.MaterializedViews[materializedViews[i].Name] = &materializedViews[i] + } + + // add columns from the schema data + for i := range columns { + col := &columns[i] + // decode the validator for TypeInfo and order + if col.ClusteringOrder != "" { // Cassandra 3.x+ + col.Type = getCassandraType(col.Validator, logger) + col.Order = ASC + if col.ClusteringOrder == "desc" { + col.Order = DESC + } + } else { + validatorParsed := parseType(col.Validator, logger) + col.Type = validatorParsed.types[0] + col.Order = ASC + if validatorParsed.reversed[0] { + col.Order = DESC + } + } + + table, ok := keyspace.Tables[col.Table] + if !ok { + // if the schema is being updated we will race between seeing + // the metadata be complete. Potentially we should check for + // schema versions before and after reading the metadata and + // if they dont match try again. + continue + } + + table.Columns[col.Name] = col + table.OrderedColumns = append(table.OrderedColumns, col.Name) + } + + if protoVersion == protoVersion1 { + compileV1Metadata(tables, logger) + } else { + compileV2Metadata(tables, logger) + } +} + +// Compiles derived information from TableMetadata which have had +// ColumnMetadata added already. V1 protocol does not return as much +// column metadata as V2+ (because V1 doesn't support the "type" column in the +// system.schema_columns table) so determining PartitionKey and ClusterColumns +// is more complex. 
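A worked example of the alias fallback implemented below (illustrative values): with KeyAliases = ["user_id"], the single partition key column is named "user_id"; with no aliases, a two-component key falls back to DEFAULT_KEY_ALIAS as "key" and "key2". Clustering columns fall back the same way to "column", "column2", ..., and the compact value column is named "value" unless ValueAlias is set.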
+func compileV1Metadata(tables []TableMetadata, logger StdLogger) { + for i := range tables { + table := &tables[i] + + // decode the key validator + keyValidatorParsed := parseType(table.KeyValidator, logger) + // decode the comparator + comparatorParsed := parseType(table.Comparator, logger) + + // the partition key length is the same as the number of types in the + // key validator + table.PartitionKey = make([]*ColumnMetadata, len(keyValidatorParsed.types)) + + // V1 protocol only returns "regular" columns from + // system.schema_columns (there is no type field for columns) + // so the alias information is used to + // create the partition key and clustering columns + + // construct the partition key from the alias + for i := range table.PartitionKey { + var alias string + if len(table.KeyAliases) > i { + alias = table.KeyAliases[i] + } else if i == 0 { + alias = DEFAULT_KEY_ALIAS + } else { + alias = DEFAULT_KEY_ALIAS + strconv.Itoa(i+1) + } + + column := &ColumnMetadata{ + Keyspace: table.Keyspace, + Table: table.Name, + Name: alias, + Type: keyValidatorParsed.types[i], + Kind: ColumnPartitionKey, + ComponentIndex: i, + } + + table.PartitionKey[i] = column + table.Columns[alias] = column + } + + // determine the number of clustering columns + size := len(comparatorParsed.types) + if comparatorParsed.isComposite { + if len(comparatorParsed.collections) != 0 || + (len(table.ColumnAliases) == size-1 && + comparatorParsed.types[size-1].Type() == TypeVarchar) { + size = size - 1 + } + } else { + if !(len(table.ColumnAliases) != 0 || len(table.Columns) == 0) { + size = 0 + } + } + + table.ClusteringColumns = make([]*ColumnMetadata, size) + + for i := range table.ClusteringColumns { + var alias string + if len(table.ColumnAliases) > i { + alias = table.ColumnAliases[i] + } else if i == 0 { + alias = DEFAULT_COLUMN_ALIAS + } else { + alias = DEFAULT_COLUMN_ALIAS + strconv.Itoa(i+1) + } + + order := ASC + if comparatorParsed.reversed[i] { + order = DESC + } + + column := &ColumnMetadata{ + Keyspace: table.Keyspace, + Table: table.Name, + Name: alias, + Type: comparatorParsed.types[i], + Order: order, + Kind: ColumnClusteringKey, + ComponentIndex: i, + } + + table.ClusteringColumns[i] = column + table.Columns[alias] = column + } + + if size != len(comparatorParsed.types)-1 { + alias := DEFAULT_VALUE_ALIAS + if len(table.ValueAlias) > 0 { + alias = table.ValueAlias + } + // decode the default validator + defaultValidatorParsed := parseType(table.DefaultValidator, logger) + column := &ColumnMetadata{ + Keyspace: table.Keyspace, + Table: table.Name, + Name: alias, + Type: defaultValidatorParsed.types[0], + Kind: ColumnRegular, + } + table.Columns[alias] = column + } + } +} + +// The simpler compile case for V2+ protocol +func compileV2Metadata(tables []TableMetadata, logger StdLogger) { + for i := range tables { + table := &tables[i] + + clusteringColumnCount := componentColumnCountOfType(table.Columns, ColumnClusteringKey) + table.ClusteringColumns = make([]*ColumnMetadata, clusteringColumnCount) + + if table.KeyValidator != "" { + keyValidatorParsed := parseType(table.KeyValidator, logger) + table.PartitionKey = make([]*ColumnMetadata, len(keyValidatorParsed.types)) + } else { // Cassandra 3.x+ + partitionKeyCount := componentColumnCountOfType(table.Columns, ColumnPartitionKey) + table.PartitionKey = make([]*ColumnMetadata, partitionKeyCount) + } + + for _, columnName := range table.OrderedColumns { + column := table.Columns[columnName] + if column.Kind == ColumnPartitionKey { + 
table.PartitionKey[column.ComponentIndex] = column + } else if column.Kind == ColumnClusteringKey { + table.ClusteringColumns[column.ComponentIndex] = column + } + } + } +} + +// returns the count of coluns with the given "kind" value. +func componentColumnCountOfType(columns map[string]*ColumnMetadata, kind ColumnKind) int { + maxComponentIndex := -1 + for _, column := range columns { + if column.Kind == kind && column.ComponentIndex > maxComponentIndex { + maxComponentIndex = column.ComponentIndex + } + } + return maxComponentIndex + 1 +} + +// query only for the keyspace metadata for the specified keyspace from system.schema_keyspace +func getKeyspaceMetadata(session *Session, keyspaceName string) (*KeyspaceMetadata, error) { + keyspace := &KeyspaceMetadata{Name: keyspaceName} + + if session.useSystemSchema { // Cassandra 3.x+ + const stmt = ` + SELECT durable_writes, replication + FROM system_schema.keyspaces + WHERE keyspace_name = ?` + + var replication map[string]string + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + if iter.NumRows() == 0 { + return nil, ErrKeyspaceDoesNotExist + } + iter.Scan(&keyspace.DurableWrites, &replication) + err := iter.Close() + if err != nil { + return nil, fmt.Errorf("error querying keyspace schema: %v", err) + } + + keyspace.StrategyClass = replication["class"] + delete(replication, "class") + + keyspace.StrategyOptions = make(map[string]interface{}, len(replication)) + for k, v := range replication { + keyspace.StrategyOptions[k] = v + } + } else { + + const stmt = ` + SELECT durable_writes, strategy_class, strategy_options + FROM system.schema_keyspaces + WHERE keyspace_name = ?` + + var strategyOptionsJSON []byte + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + if iter.NumRows() == 0 { + return nil, ErrKeyspaceDoesNotExist + } + iter.Scan(&keyspace.DurableWrites, &keyspace.StrategyClass, &strategyOptionsJSON) + err := iter.Close() + if err != nil { + return nil, fmt.Errorf("error querying keyspace schema: %v", err) + } + + err = json.Unmarshal(strategyOptionsJSON, &keyspace.StrategyOptions) + if err != nil { + return nil, fmt.Errorf( + "invalid JSON value '%s' as strategy_options for in keyspace '%s': %v", + strategyOptionsJSON, keyspace.Name, err, + ) + } + } + + return keyspace, nil +} + +// query for only the table metadata in the specified keyspace from system.schema_columnfamilies +func getTableMetadata(session *Session, keyspaceName string) ([]TableMetadata, error) { + + var ( + iter *Iter + scan func(iter *Iter, table *TableMetadata) bool + stmt string + + keyAliasesJSON []byte + columnAliasesJSON []byte + ) + + if session.useSystemSchema { // Cassandra 3.x+ + stmt = ` + SELECT + table_name + FROM system_schema.tables + WHERE keyspace_name = ?` + + switchIter := func() *Iter { + iter.Close() + stmt = ` + SELECT + view_name + FROM system_schema.views + WHERE keyspace_name = ?` + iter = session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + return iter + } + + scan = func(iter *Iter, table *TableMetadata) bool { + r := iter.Scan( + &table.Name, + ) + if !r { + iter = switchIter() + if iter != nil { + switchIter = func() *Iter { return nil } + r = iter.Scan(&table.Name) + } + } + return r + } + } else if session.cfg.ProtoVersion == protoVersion1 { + // we have key aliases + stmt = ` + SELECT + columnfamily_name, + key_validator, + comparator, + default_validator, + key_aliases, + column_aliases, + value_alias + FROM system.schema_columnfamilies + WHERE 
keyspace_name = ?` + + scan = func(iter *Iter, table *TableMetadata) bool { + return iter.Scan( + &table.Name, + &table.KeyValidator, + &table.Comparator, + &table.DefaultValidator, + &keyAliasesJSON, + &columnAliasesJSON, + &table.ValueAlias, + ) + } + } else { + stmt = ` + SELECT + columnfamily_name, + key_validator, + comparator, + default_validator + FROM system.schema_columnfamilies + WHERE keyspace_name = ?` + + scan = func(iter *Iter, table *TableMetadata) bool { + return iter.Scan( + &table.Name, + &table.KeyValidator, + &table.Comparator, + &table.DefaultValidator, + ) + } + } + + iter = session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + + tables := []TableMetadata{} + table := TableMetadata{Keyspace: keyspaceName} + + for scan(iter, &table) { + var err error + + // decode the key aliases + if keyAliasesJSON != nil { + table.KeyAliases = []string{} + err = json.Unmarshal(keyAliasesJSON, &table.KeyAliases) + if err != nil { + iter.Close() + return nil, fmt.Errorf( + "invalid JSON value '%s' as key_aliases for in table '%s': %v", + keyAliasesJSON, table.Name, err, + ) + } + } + + // decode the column aliases + if columnAliasesJSON != nil { + table.ColumnAliases = []string{} + err = json.Unmarshal(columnAliasesJSON, &table.ColumnAliases) + if err != nil { + iter.Close() + return nil, fmt.Errorf( + "invalid JSON value '%s' as column_aliases for in table '%s': %v", + columnAliasesJSON, table.Name, err, + ) + } + } + + tables = append(tables, table) + table = TableMetadata{Keyspace: keyspaceName} + } + + err := iter.Close() + if err != nil && err != ErrNotFound { + return nil, fmt.Errorf("error querying table schema: %v", err) + } + + return tables, nil +} + +func (s *Session) scanColumnMetadataV1(keyspace string) ([]ColumnMetadata, error) { + // V1 does not support the type column, and all returned rows are + // of kind "regular". 
+ const stmt = ` + SELECT + columnfamily_name, + column_name, + component_index, + validator, + index_name, + index_type, + index_options + FROM system.schema_columns + WHERE keyspace_name = ?` + + var columns []ColumnMetadata + + rows := s.control.query(stmt+s.usingTimeoutClause, keyspace).Scanner() + for rows.Next() { + var ( + column = ColumnMetadata{Keyspace: keyspace} + indexOptionsJSON []byte + ) + + // all columns returned by V1 are regular + column.Kind = ColumnRegular + + err := rows.Scan(&column.Table, + &column.Name, + &column.ComponentIndex, + &column.Validator, + &column.Index.Name, + &column.Index.Type, + &indexOptionsJSON) + + if err != nil { + return nil, err + } + + if len(indexOptionsJSON) > 0 { + err := json.Unmarshal(indexOptionsJSON, &column.Index.Options) + if err != nil { + return nil, fmt.Errorf( + "invalid JSON value '%s' as index_options for column '%s' in table '%s': %v", + indexOptionsJSON, + column.Name, + column.Table, + err) + } + } + + columns = append(columns, column) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return columns, nil +} + +func (s *Session) scanColumnMetadataV2(keyspace string) ([]ColumnMetadata, error) { + // V2+ supports the type column + const stmt = ` + SELECT + columnfamily_name, + column_name, + component_index, + validator, + index_name, + index_type, + index_options, + type + FROM system.schema_columns + WHERE keyspace_name = ?` + + var columns []ColumnMetadata + + rows := s.control.query(stmt+s.usingTimeoutClause, keyspace).Scanner() + for rows.Next() { + var ( + column = ColumnMetadata{Keyspace: keyspace} + indexOptionsJSON []byte + ) + + err := rows.Scan(&column.Table, + &column.Name, + &column.ComponentIndex, + &column.Validator, + &column.Index.Name, + &column.Index.Type, + &indexOptionsJSON, + &column.Kind, + ) + + if err != nil { + return nil, err + } + + if len(indexOptionsJSON) > 0 { + err := json.Unmarshal(indexOptionsJSON, &column.Index.Options) + if err != nil { + return nil, fmt.Errorf( + "invalid JSON value '%s' as index_options for column '%s' in table '%s': %v", + indexOptionsJSON, + column.Name, + column.Table, + err) + } + } + + columns = append(columns, column) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return columns, nil + +} + +func (s *Session) scanColumnMetadataSystem(keyspace string) ([]ColumnMetadata, error) { + const stmt = ` + SELECT + table_name, + column_name, + clustering_order, + type, + kind, + position + FROM system_schema.columns + WHERE keyspace_name = ?` + + var columns []ColumnMetadata + + rows := s.control.query(stmt+s.usingTimeoutClause, keyspace).Scanner() + for rows.Next() { + column := ColumnMetadata{Keyspace: keyspace} + + err := rows.Scan(&column.Table, + &column.Name, + &column.ClusteringOrder, + &column.Validator, + &column.Kind, + &column.ComponentIndex, + ) + + if err != nil { + return nil, err + } + + columns = append(columns, column) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + // TODO(zariel): get column index info from system_schema.indexes + + return columns, nil +} + +// query for only the column metadata in the specified keyspace from system.schema_columns +func getColumnMetadata(session *Session, keyspaceName string) ([]ColumnMetadata, error) { + var ( + columns []ColumnMetadata + err error + ) + + // Deal with differences in protocol versions + if session.cfg.ProtoVersion == 1 { + columns, err = session.scanColumnMetadataV1(keyspaceName) + } else if session.useSystemSchema { // Cassandra 3.x+ + columns, 
err = session.scanColumnMetadataSystem(keyspaceName)
+	} else {
+		columns, err = session.scanColumnMetadataV2(keyspaceName)
+	}
+
+	if err != nil && err != ErrNotFound {
+		return nil, fmt.Errorf("error querying column schema: %v", err)
+	}
+
+	return columns, nil
+}
+
+func getTypeInfo(t string, logger StdLogger) TypeInfo {
+	if strings.HasPrefix(t, apacheCassandraTypePrefix) {
+		t = apacheToCassandraType(t)
+	}
+	return getCassandraType(t, logger)
+}
+
+func getViewsMetadata(session *Session, keyspaceName string) ([]ViewMetadata, error) {
+	if session.cfg.ProtoVersion == protoVersion1 {
+		return nil, nil
+	}
+	var tableName string
+	if session.useSystemSchema {
+		tableName = "system_schema.types"
+	} else {
+		tableName = "system.schema_usertypes"
+	}
+	stmt := fmt.Sprintf(`
+	SELECT
+		type_name,
+		field_names,
+		field_types
+	FROM %s
+	WHERE keyspace_name = ?`, tableName)
+
+	var views []ViewMetadata
+
+	rows := session.control.query(stmt, keyspaceName).Scanner()
+	for rows.Next() {
+		view := ViewMetadata{Keyspace: keyspaceName}
+		var argumentTypes []string
+		err := rows.Scan(&view.Name,
+			&view.FieldNames,
+			&argumentTypes,
+		)
+		if err != nil {
+			return nil, err
+		}
+		view.FieldTypes = make([]TypeInfo, len(argumentTypes))
+		for i, argumentType := range argumentTypes {
+			view.FieldTypes[i] = getTypeInfo(argumentType, session.logger)
+		}
+		views = append(views, view)
+	}
+
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	return views, nil
+}
+
+func getMaterializedViewsMetadata(session *Session, keyspaceName string) ([]MaterializedViewMetadata, error) {
+	if !session.useSystemSchema {
+		return nil, nil
+	}
+	var tableName = "system_schema.views"
+	stmt := fmt.Sprintf(`
+	SELECT
+		view_name,
+		base_table_id,
+		base_table_name,
+		bloom_filter_fp_chance,
+		caching,
+		comment,
+		compaction,
+		compression,
+		crc_check_chance,
+		dclocal_read_repair_chance,
+		default_time_to_live,
+		extensions,
+		gc_grace_seconds,
+		id,
+		include_all_columns,
+		max_index_interval,
+		memtable_flush_period_in_ms,
+		min_index_interval,
+		read_repair_chance,
+		speculative_retry
+	FROM %s
+	WHERE keyspace_name = ?`, tableName)
+
+	var materializedViews []MaterializedViewMetadata
+
+	rows := session.control.query(stmt, keyspaceName).Scanner()
+	for rows.Next() {
+		materializedView := MaterializedViewMetadata{Keyspace: keyspaceName}
+		err := rows.Scan(&materializedView.Name,
+			&materializedView.BaseTableId,
+			&materializedView.baseTableName,
+			&materializedView.BloomFilterFpChance,
+			&materializedView.Caching,
+			&materializedView.Comment,
+			&materializedView.Compaction,
+			&materializedView.Compression,
+			&materializedView.CrcCheckChance,
+			&materializedView.DcLocalReadRepairChance,
+			&materializedView.DefaultTimeToLive,
+			&materializedView.Extensions,
+			&materializedView.GcGraceSeconds,
+			&materializedView.Id,
+			&materializedView.IncludeAllColumns,
+			&materializedView.MaxIndexInterval,
+			&materializedView.MemtableFlushPeriodInMs,
+			&materializedView.MinIndexInterval,
+			&materializedView.ReadRepairChance,
+			&materializedView.SpeculativeRetry,
+		)
+		if err != nil {
+			return nil, err
+		}
+		materializedViews = append(materializedViews, materializedView)
+	}
+
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	return materializedViews, nil
+}
+
+func getFunctionsMetadata(session *Session, keyspaceName string) ([]FunctionMetadata, error) {
+	if session.cfg.ProtoVersion == protoVersion1 || !session.hasAggregatesAndFunctions {
+		return nil, nil
+	}
+	var tableName string
+	if session.useSystemSchema {
+		tableName = "system_schema.functions"
+	} else {
+ tableName = "system.schema_functions" + } + stmt := fmt.Sprintf(` + SELECT + function_name, + argument_types, + argument_names, + body, + called_on_null_input, + language, + return_type + FROM %s + WHERE keyspace_name = ?`, tableName) + + var functions []FunctionMetadata + + rows := session.control.query(stmt, keyspaceName).Scanner() + for rows.Next() { + function := FunctionMetadata{Keyspace: keyspaceName} + var argumentTypes []string + var returnType string + err := rows.Scan(&function.Name, + &argumentTypes, + &function.ArgumentNames, + &function.Body, + &function.CalledOnNullInput, + &function.Language, + &returnType, + ) + if err != nil { + return nil, err + } + function.ReturnType = getTypeInfo(returnType, session.logger) + function.ArgumentTypes = make([]TypeInfo, len(argumentTypes)) + for i, argumentType := range argumentTypes { + function.ArgumentTypes[i] = getTypeInfo(argumentType, session.logger) + } + functions = append(functions, function) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return functions, nil +} + +func getAggregatesMetadata(session *Session, keyspaceName string) ([]AggregateMetadata, error) { + if session.cfg.ProtoVersion == protoVersion1 || !session.hasAggregatesAndFunctions { + return nil, nil + } + var tableName string + if session.useSystemSchema { + tableName = "system_schema.aggregates" + } else { + tableName = "system.schema_aggregates" + } + + stmt := fmt.Sprintf(` + SELECT + aggregate_name, + argument_types, + final_func, + initcond, + return_type, + state_func, + state_type + FROM %s + WHERE keyspace_name = ?`, tableName) + + var aggregates []AggregateMetadata + + rows := session.control.query(stmt, keyspaceName).Scanner() + for rows.Next() { + aggregate := AggregateMetadata{Keyspace: keyspaceName} + var argumentTypes []string + var returnType string + var stateType string + err := rows.Scan(&aggregate.Name, + &argumentTypes, + &aggregate.finalFunc, + &aggregate.InitCond, + &returnType, + &aggregate.stateFunc, + &stateType, + ) + if err != nil { + return nil, err + } + aggregate.ReturnType = getTypeInfo(returnType, session.logger) + aggregate.StateType = getTypeInfo(stateType, session.logger) + aggregate.ArgumentTypes = make([]TypeInfo, len(argumentTypes)) + for i, argumentType := range argumentTypes { + aggregate.ArgumentTypes[i] = getTypeInfo(argumentType, session.logger) + } + aggregates = append(aggregates, aggregate) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return aggregates, nil +} + +// type definition parser state +type typeParser struct { + input string + index int + logger StdLogger +} + +// the type definition parser result +type typeParserResult struct { + isComposite bool + types []TypeInfo + reversed []bool + collections map[string]TypeInfo +} + +// Parse the type definition used for validator and comparator schema data +func parseType(def string, logger StdLogger) typeParserResult { + parser := &typeParser{input: def, logger: logger} + return parser.parse() +} + +const ( + REVERSED_TYPE = "org.apache.cassandra.db.marshal.ReversedType" + COMPOSITE_TYPE = "org.apache.cassandra.db.marshal.CompositeType" + COLLECTION_TYPE = "org.apache.cassandra.db.marshal.ColumnToCollectionType" + LIST_TYPE = "org.apache.cassandra.db.marshal.ListType" + SET_TYPE = "org.apache.cassandra.db.marshal.SetType" + MAP_TYPE = "org.apache.cassandra.db.marshal.MapType" +) + +// represents a class specification in the type def AST +type typeParserClassNode struct { + name string + params []typeParserParamNode + // 
this is the segment of the input string that defined this node + input string +} + +// represents a class parameter in the type def AST +type typeParserParamNode struct { + name *string + class typeParserClassNode +} + +func (t *typeParser) parse() typeParserResult { + // parse the AST + ast, ok := t.parseClassNode() + if !ok { + // treat this is a custom type + return typeParserResult{ + isComposite: false, + types: []TypeInfo{ + NativeType{ + typ: TypeCustom, + custom: t.input, + }, + }, + reversed: []bool{false}, + collections: nil, + } + } + + // interpret the AST + if strings.HasPrefix(ast.name, COMPOSITE_TYPE) { + count := len(ast.params) + + // look for a collections param + last := ast.params[count-1] + collections := map[string]TypeInfo{} + if strings.HasPrefix(last.class.name, COLLECTION_TYPE) { + count-- + + for _, param := range last.class.params { + // decode the name + var name string + decoded, err := hex.DecodeString(*param.name) + if err != nil { + t.logger.Printf( + "Error parsing type '%s', contains collection name '%s' with an invalid format: %v", + t.input, + *param.name, + err, + ) + // just use the provided name + name = *param.name + } else { + name = string(decoded) + } + collections[name] = param.class.asTypeInfo() + } + } + + types := make([]TypeInfo, count) + reversed := make([]bool, count) + + for i, param := range ast.params[:count] { + class := param.class + reversed[i] = strings.HasPrefix(class.name, REVERSED_TYPE) + if reversed[i] { + class = class.params[0].class + } + types[i] = class.asTypeInfo() + } + + return typeParserResult{ + isComposite: true, + types: types, + reversed: reversed, + collections: collections, + } + } else { + // not composite, so one type + class := *ast + reversed := strings.HasPrefix(class.name, REVERSED_TYPE) + if reversed { + class = class.params[0].class + } + typeInfo := class.asTypeInfo() + + return typeParserResult{ + isComposite: false, + types: []TypeInfo{typeInfo}, + reversed: []bool{reversed}, + } + } +} + +func (class *typeParserClassNode) asTypeInfo() TypeInfo { + if strings.HasPrefix(class.name, LIST_TYPE) { + elem := class.params[0].class.asTypeInfo() + return CollectionType{ + NativeType: NativeType{ + typ: TypeList, + }, + Elem: elem, + } + } + if strings.HasPrefix(class.name, SET_TYPE) { + elem := class.params[0].class.asTypeInfo() + return CollectionType{ + NativeType: NativeType{ + typ: TypeSet, + }, + Elem: elem, + } + } + if strings.HasPrefix(class.name, MAP_TYPE) { + key := class.params[0].class.asTypeInfo() + elem := class.params[1].class.asTypeInfo() + return CollectionType{ + NativeType: NativeType{ + typ: TypeMap, + }, + Key: key, + Elem: elem, + } + } + + // must be a simple type or custom type + info := NativeType{typ: getApacheCassandraType(class.name)} + if info.typ == TypeCustom { + // add the entire class definition + info.custom = class.input + } + return info +} + +// CLASS := ID [ PARAMS ] +func (t *typeParser) parseClassNode() (node *typeParserClassNode, ok bool) { + t.skipWhitespace() + + startIndex := t.index + + name, ok := t.nextIdentifier() + if !ok { + return nil, false + } + + params, ok := t.parseParamNodes() + if !ok { + return nil, false + } + + endIndex := t.index + + node = &typeParserClassNode{ + name: name, + params: params, + input: t.input[startIndex:endIndex], + } + return node, true +} + +// PARAMS := "(" PARAM { "," PARAM } ")" +// PARAM := [ PARAM_NAME ":" ] CLASS +// PARAM_NAME := ID +func (t *typeParser) parseParamNodes() (params []typeParserParamNode, ok bool) { + 
t.skipWhitespace() + + // the params are optional + if t.index == len(t.input) || t.input[t.index] != '(' { + return nil, true + } + + params = []typeParserParamNode{} + + // consume the '(' + t.index++ + + t.skipWhitespace() + + for t.input[t.index] != ')' { + // look for a named param, but if no colon, then we want to backup + backupIndex := t.index + + // name will be a hex encoded version of a utf-8 string + name, ok := t.nextIdentifier() + if !ok { + return nil, false + } + hasName := true + + // TODO handle '=>' used for DynamicCompositeType + + t.skipWhitespace() + + if t.input[t.index] == ':' { + // there is a name for this parameter + + // consume the ':' + t.index++ + + t.skipWhitespace() + } else { + // no name, backup + hasName = false + t.index = backupIndex + } + + // parse the next full parameter + classNode, ok := t.parseClassNode() + if !ok { + return nil, false + } + + if hasName { + params = append( + params, + typeParserParamNode{name: &name, class: *classNode}, + ) + } else { + params = append( + params, + typeParserParamNode{class: *classNode}, + ) + } + + t.skipWhitespace() + + if t.input[t.index] == ',' { + // consume the comma + t.index++ + + t.skipWhitespace() + } + } + + // consume the ')' + t.index++ + + return params, true +} + +func (t *typeParser) skipWhitespace() { + for t.index < len(t.input) && isWhitespaceChar(t.input[t.index]) { + t.index++ + } +} + +func isWhitespaceChar(c byte) bool { + return c == ' ' || c == '\n' || c == '\t' +} + +// ID := LETTER { LETTER } +// LETTER := "0"..."9" | "a"..."z" | "A"..."Z" | "-" | "+" | "." | "_" | "&" +func (t *typeParser) nextIdentifier() (id string, found bool) { + startIndex := t.index + for t.index < len(t.input) && isIdentifierChar(t.input[t.index]) { + t.index++ + } + if startIndex == t.index { + return "", false + } + return t.input[startIndex:t.index], true +} + +func isIdentifierChar(c byte) bool { + return (c >= '0' && c <= '9') || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + c == '-' || + c == '+' || + c == '.' || + c == '_' || + c == '&' +} diff --git a/vendor/github.com/gocql/gocql/metadata_scylla.go b/vendor/github.com/gocql/gocql/metadata_scylla.go new file mode 100644 index 0000000..17a52c1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/metadata_scylla.go @@ -0,0 +1,1102 @@ +//go:build !cassandra || scylla +// +build !cassandra scylla + +// Copyright (c) 2015 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "fmt" + "reflect" + "strings" + "sync" + "sync/atomic" +) + +// schema metadata for a keyspace +type KeyspaceMetadata struct { + Name string + DurableWrites bool + StrategyClass string + StrategyOptions map[string]interface{} + Tables map[string]*TableMetadata + Functions map[string]*FunctionMetadata + Aggregates map[string]*AggregateMetadata + Types map[string]*TypeMetadata + Indexes map[string]*IndexMetadata + Views map[string]*ViewMetadata + CreateStmts string +} + +// schema metadata for a table (a.k.a. 
column family) +type TableMetadata struct { + Keyspace string + Name string + PartitionKey []*ColumnMetadata + ClusteringColumns []*ColumnMetadata + Columns map[string]*ColumnMetadata + OrderedColumns []string + Options TableMetadataOptions + Flags []string + Extensions map[string]interface{} +} + +type TableMetadataOptions struct { + BloomFilterFpChance float64 + Caching map[string]string + Comment string + Compaction map[string]string + Compression map[string]string + CrcCheckChance float64 + DcLocalReadRepairChance float64 + DefaultTimeToLive int + GcGraceSeconds int + MaxIndexInterval int + MemtableFlushPeriodInMs int + MinIndexInterval int + ReadRepairChance float64 + SpeculativeRetry string + CDC map[string]string + InMemory bool + Partitioner string + Version string +} + +func (t *TableMetadataOptions) Equals(other *TableMetadataOptions) bool { + if t == nil || other == nil { + return t == other // Both must be nil to be equal + } + + if t.BloomFilterFpChance != other.BloomFilterFpChance || + t.Comment != other.Comment || + t.CrcCheckChance != other.CrcCheckChance || + t.DcLocalReadRepairChance != other.DcLocalReadRepairChance || + t.DefaultTimeToLive != other.DefaultTimeToLive || + t.GcGraceSeconds != other.GcGraceSeconds || + t.MaxIndexInterval != other.MaxIndexInterval || + t.MemtableFlushPeriodInMs != other.MemtableFlushPeriodInMs || + t.MinIndexInterval != other.MinIndexInterval || + t.ReadRepairChance != other.ReadRepairChance || + t.SpeculativeRetry != other.SpeculativeRetry || + t.InMemory != other.InMemory || + t.Partitioner != other.Partitioner || + t.Version != other.Version { + return false + } + + if !compareStringMaps(t.Caching, other.Caching) || + !compareStringMaps(t.Compaction, other.Compaction) || + !compareStringMaps(t.Compression, other.Compression) || + !compareStringMaps(t.CDC, other.CDC) { + return false + } + + return true +} + +type ViewMetadata struct { + KeyspaceName string + ViewName string + BaseTableID string + BaseTableName string + ID string + IncludeAllColumns bool + Columns map[string]*ColumnMetadata + OrderedColumns []string + PartitionKey []*ColumnMetadata + ClusteringColumns []*ColumnMetadata + WhereClause string + Options TableMetadataOptions + Extensions map[string]interface{} +} + +// schema metadata for a column +type ColumnMetadata struct { + Keyspace string + Table string + Name string + ComponentIndex int + Kind ColumnKind + Type string + ClusteringOrder string + Order ColumnOrder + Index ColumnIndexMetadata +} + +func (c *ColumnMetadata) Equals(other *ColumnMetadata) bool { + if c == nil || other == nil { + return c == other + } + + return c.Keyspace == other.Keyspace && + c.Table == other.Table && + c.Name == other.Name && + c.ComponentIndex == other.ComponentIndex && + c.Kind == other.Kind && + c.Type == other.Type && + c.ClusteringOrder == other.ClusteringOrder && + c.Order == other.Order && + c.Index.Equals(&other.Index) +} + +// FunctionMetadata holds metadata for function constructs +type FunctionMetadata struct { + Keyspace string + Name string + ArgumentTypes []string + ArgumentNames []string + Body string + CalledOnNullInput bool + Language string + ReturnType string +} + +// AggregateMetadata holds metadata for aggregate constructs +type AggregateMetadata struct { + Keyspace string + Name string + ArgumentTypes []string + FinalFunc FunctionMetadata + InitCond string + ReturnType string + StateFunc FunctionMetadata + StateType string + + stateFunc string + finalFunc string +} + +// TypeMetadata holds the metadata for views. 
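The Equals helpers above handle nil receivers explicitly: two nils compare equal, nil against non-nil does not, and everything else is compared field by field, with maps going through the compare* helpers further down. A small sketch against TableMetadataOptions, assuming this vendored fork is the gocql in use:

```go
package main

import (
	"fmt"

	"github.com/gocql/gocql"
)

func main() {
	var a, b *gocql.TableMetadataOptions
	fmt.Println(a.Equals(b)) // true: both sides nil

	c := &gocql.TableMetadataOptions{Comment: "demo"}
	fmt.Println(a.Equals(c)) // false: only one side is nil

	d := &gocql.TableMetadataOptions{Comment: "demo"}
	fmt.Println(c.Equals(d)) // true: scalar fields and (empty) maps match
}
```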
+type TypeMetadata struct { + Keyspace string + Name string + FieldNames []string + FieldTypes []string +} + +type IndexMetadata struct { + Name string + KeyspaceName string + TableName string + Kind string + Options map[string]string +} + +func (t *TableMetadata) Equals(other *TableMetadata) bool { + if t == nil || other == nil { + return t == other + } + + if t.Keyspace != other.Keyspace || t.Name != other.Name { + return false + } + + if len(t.PartitionKey) != len(other.PartitionKey) || !compareColumnSlices(t.PartitionKey, other.PartitionKey) { + return false + } + + if len(t.ClusteringColumns) != len(other.ClusteringColumns) || !compareColumnSlices(t.ClusteringColumns, other.ClusteringColumns) { + return false + } + + if len(t.Columns) != len(other.Columns) || !compareColumnsMap(t.Columns, other.Columns) { + return false + } + + if len(t.OrderedColumns) != len(other.OrderedColumns) || !compareStringSlices(t.OrderedColumns, other.OrderedColumns) { + return false + } + + if !t.Options.Equals(&other.Options) { + return false + } + + if len(t.Flags) != len(other.Flags) || !compareStringSlices(t.Flags, other.Flags) { + return false + } + + if len(t.Extensions) != len(other.Extensions) || !compareInterfaceMaps(t.Extensions, other.Extensions) { + return false + } + + return true +} + +func compareColumnSlices(a, b []*ColumnMetadata) bool { + for i := range a { + if !a[i].Equals(b[i]) { + return false + } + } + return true +} + +func compareColumnsMap(a, b map[string]*ColumnMetadata) bool { + for k, v := range a { + otherValue, exists := b[k] + if !exists || !v.Equals(otherValue) { + return false + } + } + return true +} + +func compareStringSlices(a, b []string) bool { + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func compareStringMaps(a, b map[string]string) bool { + if len(a) != len(b) { + return false + } + for k, v := range a { + if otherValue, exists := b[k]; !exists || v != otherValue { + return false + } + } + return true +} + +func compareInterfaceMaps(a, b map[string]interface{}) bool { + for k, v := range a { + otherValue, exists := b[k] + if !exists || !reflect.DeepEqual(v, otherValue) { + return false + } + } + return true +} + +// cowTabletList implements a copy on write keyspace metadata map, its equivalent type is map[string]*KeyspaceMetadata +type cowKeyspaceMetadataMap struct { + keyspaceMap atomic.Value + mu sync.Mutex +} + +func (c *cowKeyspaceMetadataMap) get() map[string]*KeyspaceMetadata { + l, ok := c.keyspaceMap.Load().(map[string]*KeyspaceMetadata) + if !ok { + return nil + } + return l +} + +func (c *cowKeyspaceMetadataMap) getKeyspace(keyspaceName string) (*KeyspaceMetadata, bool) { + m, ok := c.keyspaceMap.Load().(map[string]*KeyspaceMetadata) + if !ok { + return nil, ok + } + val, ok := m[keyspaceName] + return val, ok +} + +func (c *cowKeyspaceMetadataMap) set(keyspaceName string, keyspaceMetadata *KeyspaceMetadata) bool { + c.mu.Lock() + m := c.get() + + newM := map[string]*KeyspaceMetadata{} + for name, metadata := range m { + newM[name] = metadata + } + newM[keyspaceName] = keyspaceMetadata + + c.keyspaceMap.Store(newM) + c.mu.Unlock() + return true +} + +func (c *cowKeyspaceMetadataMap) remove(keyspaceName string) { + c.mu.Lock() + m := c.get() + + newM := map[string]*KeyspaceMetadata{} + for name, meta := range m { + if name != keyspaceName { + newM[name] = meta + } + } + + c.keyspaceMap.Store(newM) + c.mu.Unlock() +} + +const ( + IndexKindCustom = "CUSTOM" +) + +const ( + TableFlagDense = "dense" + TableFlagSuper = 
"super" + TableFlagCompound = "compound" +) + +// the ordering of the column with regard to its comparator +type ColumnOrder bool + +const ( + ASC ColumnOrder = false + DESC = true +) + +type ColumnIndexMetadata struct { + Name string + Type string + Options map[string]interface{} +} + +func (c *ColumnIndexMetadata) Equals(other *ColumnIndexMetadata) bool { + if c == nil || other == nil { + return c == other + } + + if c.Name != other.Name || c.Type != other.Type { + return false + } + + // Compare the Options map + if len(c.Options) != len(other.Options) { + return false + } + for k, v := range c.Options { + otherValue, exists := other.Options[k] + if !exists || !reflect.DeepEqual(v, otherValue) { + return false + } + } + + return true +} + +type ColumnKind int + +const ( + ColumnUnkownKind ColumnKind = iota + ColumnPartitionKey + ColumnClusteringKey + ColumnRegular + ColumnCompact + ColumnStatic +) + +func (c ColumnKind) String() string { + switch c { + case ColumnPartitionKey: + return "partition_key" + case ColumnClusteringKey: + return "clustering_key" + case ColumnRegular: + return "regular" + case ColumnCompact: + return "compact" + case ColumnStatic: + return "static" + default: + return fmt.Sprintf("unknown_column_%d", c) + } +} + +func (c *ColumnKind) UnmarshalCQL(typ TypeInfo, p []byte) error { + if typ.Type() != TypeVarchar { + return unmarshalErrorf("unable to marshall %s into ColumnKind, expected Varchar", typ) + } + + kind, err := columnKindFromSchema(string(p)) + if err != nil { + return err + } + *c = kind + + return nil +} + +func columnKindFromSchema(kind string) (ColumnKind, error) { + switch kind { + case "partition_key": + return ColumnPartitionKey, nil + case "clustering_key", "clustering": + return ColumnClusteringKey, nil + case "regular": + return ColumnRegular, nil + case "compact_value": + return ColumnCompact, nil + case "static": + return ColumnStatic, nil + default: + return -1, fmt.Errorf("unknown column kind: %q", kind) + } +} + +type Metadata struct { + tabletsMetadata cowTabletList + keyspaceMetadata cowKeyspaceMetadataMap +} + +// queries the cluster for schema information for a specific keyspace and for tablets +type metadataDescriber struct { + session *Session + mu sync.Mutex + + metadata *Metadata +} + +// creates a session bound schema describer which will query and cache +// keyspace metadata and tablets metadata +func newMetadataDescriber(session *Session) *metadataDescriber { + return &metadataDescriber{ + session: session, + metadata: &Metadata{}, + } +} + +// returns the cached KeyspaceMetadata held by the describer for the named +// keyspace. 
+func (s *metadataDescriber) getSchema(keyspaceName string) (*KeyspaceMetadata, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	metadata, found := s.metadata.keyspaceMetadata.getKeyspace(keyspaceName)
+	if !found {
+		// refresh the cache for this keyspace
+		err := s.refreshSchema(keyspaceName)
+		if err != nil {
+			return nil, err
+		}
+
+		metadata, found = s.metadata.keyspaceMetadata.getKeyspace(keyspaceName)
+		if !found {
+			return nil, fmt.Errorf("Metadata not found for keyspace: %s", keyspaceName)
+		}
+	}
+
+	return metadata, nil
+}
+
+func (s *metadataDescriber) setTablets(tablets TabletInfoList) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.metadata.tabletsMetadata.set(tablets)
+}
+
+func (s *metadataDescriber) getTablets() TabletInfoList {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	return s.metadata.tabletsMetadata.get()
+}
+
+func (s *metadataDescriber) addTablet(tablet *TabletInfo) error {
+	tablets := s.getTablets()
+	tablets = tablets.addTabletToTabletsList(tablet)
+
+	s.setTablets(tablets)
+
+	return nil
+}
+
+func (s *metadataDescriber) removeTabletsWithHost(host *HostInfo) error {
+	tablets := s.getTablets()
+	tablets = tablets.removeTabletsWithHostFromTabletsList(host)
+
+	s.setTablets(tablets)
+
+	return nil
+}
+
+func (s *metadataDescriber) removeTabletsWithKeyspace(keyspace string) error {
+	tablets := s.getTablets()
+	tablets = tablets.removeTabletsWithKeyspaceFromTabletsList(keyspace)
+
+	s.setTablets(tablets)
+
+	return nil
+}
+
+func (s *metadataDescriber) removeTabletsWithTable(keyspace string, table string) error {
+	tablets := s.getTablets()
+	tablets = tablets.removeTabletsWithTableFromTabletsList(keyspace, table)
+
+	s.setTablets(tablets)
+
+	return nil
+}
+
+// clears the already cached keyspace metadata
+func (s *metadataDescriber) clearSchema(keyspaceName string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.metadata.keyspaceMetadata.remove(keyspaceName)
+}
+
+func (s *metadataDescriber) refreshAllSchema() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	copiedMap := make(map[string]*KeyspaceMetadata)
+
+	for key, value := range s.metadata.keyspaceMetadata.get() {
+		if value != nil {
+			copiedMap[key] = &KeyspaceMetadata{
+				Name:            value.Name,
+				DurableWrites:   value.DurableWrites,
+				StrategyClass:   value.StrategyClass,
+				StrategyOptions: value.StrategyOptions,
+				Tables:          value.Tables,
+				Functions:       value.Functions,
+				Aggregates:      value.Aggregates,
+				Types:           value.Types,
+				Indexes:         value.Indexes,
+				Views:           value.Views,
+				CreateStmts:     value.CreateStmts,
+			}
+		} else {
+			copiedMap[key] = nil
+		}
+	}
+
+	for keyspaceName, metadata := range copiedMap {
+		// refresh the cache for this keyspace
+		err := s.refreshSchema(keyspaceName)
+		if err == ErrKeyspaceDoesNotExist {
+			s.clearSchema(keyspaceName)
+			s.removeTabletsWithKeyspace(keyspaceName)
+			continue
+		} else if err != nil {
+			return err
+		}
+
+		updatedMetadata, err := s.getSchema(keyspaceName)
+		if err != nil {
+			return err
+		}
+
+		if !compareInterfaceMaps(metadata.StrategyOptions, updatedMetadata.StrategyOptions) {
+			s.removeTabletsWithKeyspace(keyspaceName)
+			continue
+		}
+
+		for tableName, tableMetadata := range metadata.Tables {
+			if updatedTableMetadata, ok := updatedMetadata.Tables[tableName]; !ok || !tableMetadata.Equals(updatedTableMetadata) {
+				s.removeTabletsWithTable(keyspaceName, tableName)
+			}
+		}
+	}
+	return nil
+}
+
+// forcibly updates the current KeyspaceMetadata held by the schema describer
+// for a given named keyspace.
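refreshSchema below runs its system-table queries one after another; the in-code TODO notes they are independent and could be retrieved concurrently. A sketch of what that might look like for two of the fetches, using golang.org/x/sync/errgroup (an assumed extra dependency), written as it would appear inside the method body:

```go
// Hypothetical concurrent variant of the serial fetches in refreshSchema.
var (
	tables  []TableMetadata
	columns []ColumnMetadata
)
var g errgroup.Group
g.Go(func() error {
	var err error
	tables, err = getTableMetadata(s.session, keyspaceName)
	return err
})
g.Go(func() error {
	var err error
	columns, err = getColumnMetadata(s.session, keyspaceName)
	return err
})
if err := g.Wait(); err != nil {
	return err
}
// tables and columns then feed compileMetadata exactly as in the serial version.
```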
+func (s *metadataDescriber) refreshSchema(keyspaceName string) error { + var err error + + // query the system keyspace for schema data + // TODO retrieve concurrently + keyspace, err := getKeyspaceMetadata(s.session, keyspaceName) + if err != nil { + return err + } + tables, err := getTableMetadata(s.session, keyspaceName) + if err != nil { + return err + } + columns, err := getColumnMetadata(s.session, keyspaceName) + if err != nil { + return err + } + functions, err := getFunctionsMetadata(s.session, keyspaceName) + if err != nil { + return err + } + aggregates, err := getAggregatesMetadata(s.session, keyspaceName) + if err != nil { + return err + } + types, err := getTypeMetadata(s.session, keyspaceName) + if err != nil { + return err + } + indexes, err := getIndexMetadata(s.session, keyspaceName) + if err != nil { + return err + } + views, err := getViewMetadata(s.session, keyspaceName) + if err != nil { + return err + } + + createStmts, err := getCreateStatements(s.session, keyspaceName) + if err != nil { + return err + } + + // organize the schema data + compileMetadata(keyspace, tables, columns, functions, aggregates, types, indexes, views, createStmts) + + // update the cache + s.metadata.keyspaceMetadata.set(keyspaceName, keyspace) + + return nil +} + +// "compiles" derived information about keyspace, table, and column metadata +// for a keyspace from the basic queried metadata objects returned by +// getKeyspaceMetadata, getTableMetadata, and getColumnMetadata respectively; +// Links the metadata objects together and derives the column composition of +// the partition key and clustering key for a table. +func compileMetadata( + keyspace *KeyspaceMetadata, + tables []TableMetadata, + columns []ColumnMetadata, + functions []FunctionMetadata, + aggregates []AggregateMetadata, + types []TypeMetadata, + indexes []IndexMetadata, + views []ViewMetadata, + createStmts []byte, +) { + keyspace.Tables = make(map[string]*TableMetadata) + for i := range tables { + tables[i].Columns = make(map[string]*ColumnMetadata) + keyspace.Tables[tables[i].Name] = &tables[i] + } + keyspace.Functions = make(map[string]*FunctionMetadata, len(functions)) + for i := range functions { + keyspace.Functions[functions[i].Name] = &functions[i] + } + keyspace.Aggregates = make(map[string]*AggregateMetadata, len(aggregates)) + for _, aggregate := range aggregates { + aggregate.FinalFunc = *keyspace.Functions[aggregate.finalFunc] + aggregate.StateFunc = *keyspace.Functions[aggregate.stateFunc] + keyspace.Aggregates[aggregate.Name] = &aggregate + } + keyspace.Types = make(map[string]*TypeMetadata, len(types)) + for i := range types { + keyspace.Types[types[i].Name] = &types[i] + } + keyspace.Indexes = make(map[string]*IndexMetadata, len(indexes)) + for i := range indexes { + keyspace.Indexes[indexes[i].Name] = &indexes[i] + } + keyspace.Views = make(map[string]*ViewMetadata, len(views)) + for i := range views { + v := &views[i] + if _, ok := keyspace.Indexes[strings.TrimSuffix(v.ViewName, "_index")]; ok { + continue + } + + v.Columns = make(map[string]*ColumnMetadata) + keyspace.Views[v.ViewName] = v + } + + // add columns from the schema data + for i := range columns { + col := &columns[i] + col.Order = ASC + if col.ClusteringOrder == "desc" { + col.Order = DESC + } + + table, ok := keyspace.Tables[col.Table] + if !ok { + view, ok := keyspace.Views[col.Table] + if !ok { + // if the schema is being updated we will race between seeing + // the metadata be complete. 
Potentially we should check for
+				// schema versions before and after reading the metadata and
+				// if they don't match try again.
+				continue
+			}
+
+			view.Columns[col.Name] = col
+			view.OrderedColumns = append(view.OrderedColumns, col.Name)
+			continue
+		}
+
+		table.Columns[col.Name] = col
+		table.OrderedColumns = append(table.OrderedColumns, col.Name)
+	}
+
+	for i := range tables {
+		t := &tables[i]
+		t.PartitionKey, t.ClusteringColumns, t.OrderedColumns = compileColumns(t.Columns, t.OrderedColumns)
+	}
+	for i := range views {
+		v := &views[i]
+		v.PartitionKey, v.ClusteringColumns, v.OrderedColumns = compileColumns(v.Columns, v.OrderedColumns)
+	}
+
+	keyspace.CreateStmts = string(createStmts)
+}
+
+func compileColumns(columns map[string]*ColumnMetadata, orderedColumns []string) (
+	partitionKey, clusteringColumns []*ColumnMetadata, sortedColumns []string) {
+	clusteringColumnCount := componentColumnCountOfType(columns, ColumnClusteringKey)
+	clusteringColumns = make([]*ColumnMetadata, clusteringColumnCount)
+
+	partitionKeyCount := componentColumnCountOfType(columns, ColumnPartitionKey)
+	partitionKey = make([]*ColumnMetadata, partitionKeyCount)
+
+	var otherColumns []string
+	for _, columnName := range orderedColumns {
+		column := columns[columnName]
+		if column.Kind == ColumnPartitionKey {
+			partitionKey[column.ComponentIndex] = column
+		} else if column.Kind == ColumnClusteringKey {
+			clusteringColumns[column.ComponentIndex] = column
+		} else {
+			otherColumns = append(otherColumns, columnName)
+		}
+	}
+
+	sortedColumns = orderedColumns[:0]
+	for _, pk := range partitionKey {
+		sortedColumns = append(sortedColumns, pk.Name)
+	}
+	for _, ck := range clusteringColumns {
+		sortedColumns = append(sortedColumns, ck.Name)
+	}
+	for _, oc := range otherColumns {
+		sortedColumns = append(sortedColumns, oc)
+	}
+
+	return
+}
+
+// returns the count of columns with the given "kind" value.
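componentColumnCountOfType, just below, sizes the key slices as max(ComponentIndex)+1 rather than by counting matching columns; if a middle component is momentarily absent while the schema is being updated, the slice keeps a nil slot instead of shifting later components. A standalone illustration of the counting rule, as a hypothetical helper over bare positions:

```go
package main

import "fmt"

// countOfKind mirrors the max-position-plus-one rule.
func countOfKind(componentIndexes []int) int {
	max := -1
	for _, p := range componentIndexes {
		if p > max {
			max = p
		}
	}
	return max + 1
}

func main() {
	fmt.Println(countOfKind([]int{0, 1, 2})) // 3
	fmt.Println(countOfKind([]int{0, 2}))    // 3: position 1 would be a nil slot
	fmt.Println(countOfKind(nil))            // 0
}
```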
+func componentColumnCountOfType(columns map[string]*ColumnMetadata, kind ColumnKind) int { + maxComponentIndex := -1 + for _, column := range columns { + if column.Kind == kind && column.ComponentIndex > maxComponentIndex { + maxComponentIndex = column.ComponentIndex + } + } + return maxComponentIndex + 1 +} + +// query for keyspace metadata in the system_schema.keyspaces +func getKeyspaceMetadata(session *Session, keyspaceName string) (*KeyspaceMetadata, error) { + if !session.useSystemSchema { + return nil, ErrKeyspaceDoesNotExist + } + keyspace := &KeyspaceMetadata{Name: keyspaceName} + + const stmt = ` + SELECT durable_writes, replication + FROM system_schema.keyspaces + WHERE keyspace_name = ?` + + var replication map[string]string + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + if iter.NumRows() == 0 { + return nil, ErrKeyspaceDoesNotExist + } + iter.Scan(&keyspace.DurableWrites, &replication) + err := iter.Close() + if err != nil { + return nil, fmt.Errorf("error querying keyspace schema: %v", err) + } + + keyspace.StrategyClass = replication["class"] + delete(replication, "class") + + keyspace.StrategyOptions = make(map[string]interface{}, len(replication)) + for k, v := range replication { + keyspace.StrategyOptions[k] = v + } + + return keyspace, nil +} + +// query for table metadata in the system_schema.tables and system_schema.scylla_tables +func getTableMetadata(session *Session, keyspaceName string) ([]TableMetadata, error) { + if !session.useSystemSchema { + return nil, nil + } + + stmt := `SELECT * FROM system_schema.tables WHERE keyspace_name = ?` + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + + var tables []TableMetadata + table := TableMetadata{Keyspace: keyspaceName} + for iter.MapScan(map[string]interface{}{ + "table_name": &table.Name, + "bloom_filter_fp_chance": &table.Options.BloomFilterFpChance, + "caching": &table.Options.Caching, + "comment": &table.Options.Comment, + "compaction": &table.Options.Compaction, + "compression": &table.Options.Compression, + "crc_check_chance": &table.Options.CrcCheckChance, + "default_time_to_live": &table.Options.DefaultTimeToLive, + "gc_grace_seconds": &table.Options.GcGraceSeconds, + "max_index_interval": &table.Options.MaxIndexInterval, + "memtable_flush_period_in_ms": &table.Options.MemtableFlushPeriodInMs, + "min_index_interval": &table.Options.MinIndexInterval, + "speculative_retry": &table.Options.SpeculativeRetry, + "flags": &table.Flags, + "extensions": &table.Extensions, + }) { + tables = append(tables, table) + table = TableMetadata{Keyspace: keyspaceName} + } + + err := iter.Close() + if err != nil && err != ErrNotFound { + return nil, fmt.Errorf("error querying table schema: %v", err) + } + + stmt = `SELECT * FROM system_schema.scylla_tables WHERE keyspace_name = ? 
AND table_name = ?` + for i, t := range tables { + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName, t.Name) + + table := TableMetadata{} + if iter.MapScan(map[string]interface{}{ + "cdc": &table.Options.CDC, + "in_memory": &table.Options.InMemory, + "partitioner": &table.Options.Partitioner, + "version": &table.Options.Version, + }) { + tables[i].Options.CDC = table.Options.CDC + tables[i].Options.Version = table.Options.Version + tables[i].Options.Partitioner = table.Options.Partitioner + tables[i].Options.InMemory = table.Options.InMemory + } + if err := iter.Close(); err != nil && err != ErrNotFound { + return nil, fmt.Errorf("error querying scylla table schema: %v", err) + } + } + + return tables, nil +} + +// query for column metadata in the system_schema.columns +func getColumnMetadata(session *Session, keyspaceName string) ([]ColumnMetadata, error) { + const stmt = `SELECT * FROM system_schema.columns WHERE keyspace_name = ?` + + var columns []ColumnMetadata + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + column := ColumnMetadata{Keyspace: keyspaceName} + + for iter.MapScan(map[string]interface{}{ + "table_name": &column.Table, + "column_name": &column.Name, + "clustering_order": &column.ClusteringOrder, + "type": &column.Type, + "kind": &column.Kind, + "position": &column.ComponentIndex, + }) { + columns = append(columns, column) + column = ColumnMetadata{Keyspace: keyspaceName} + } + + if err := iter.Close(); err != nil && err != ErrNotFound { + return nil, fmt.Errorf("error querying column schema: %v", err) + } + + return columns, nil +} + +// query for type metadata in the system_schema.types +func getTypeMetadata(session *Session, keyspaceName string) ([]TypeMetadata, error) { + if !session.useSystemSchema { + return nil, nil + } + + stmt := `SELECT * FROM system_schema.types WHERE keyspace_name = ?` + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + + var types []TypeMetadata + tm := TypeMetadata{Keyspace: keyspaceName} + + for iter.MapScan(map[string]interface{}{ + "type_name": &tm.Name, + "field_names": &tm.FieldNames, + "field_types": &tm.FieldTypes, + }) { + types = append(types, tm) + tm = TypeMetadata{Keyspace: keyspaceName} + } + + if err := iter.Close(); err != nil { + return nil, err + } + + return types, nil +} + +// query for function metadata in the system_schema.functions +func getFunctionsMetadata(session *Session, keyspaceName string) ([]FunctionMetadata, error) { + if !session.hasAggregatesAndFunctions || !session.useSystemSchema { + return nil, nil + } + stmt := `SELECT * FROM system_schema.functions WHERE keyspace_name = ?` + + var functions []FunctionMetadata + function := FunctionMetadata{Keyspace: keyspaceName} + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + for iter.MapScan(map[string]interface{}{ + "function_name": &function.Name, + "argument_types": &function.ArgumentTypes, + "argument_names": &function.ArgumentNames, + "body": &function.Body, + "called_on_null_input": &function.CalledOnNullInput, + "language": &function.Language, + "return_type": &function.ReturnType, + }) { + functions = append(functions, function) + function = FunctionMetadata{Keyspace: keyspaceName} + } + + if err := iter.Close(); err != nil { + return nil, err + } + + return functions, nil +} + +// query for aggregate metadata in the system_schema.aggregates +func getAggregatesMetadata(session *Session, keyspaceName string) ([]AggregateMetadata, error) { + 
if !session.hasAggregatesAndFunctions || !session.useSystemSchema { + return nil, nil + } + + const stmt = `SELECT * FROM system_schema.aggregates WHERE keyspace_name = ?` + + var aggregates []AggregateMetadata + aggregate := AggregateMetadata{Keyspace: keyspaceName} + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + for iter.MapScan(map[string]interface{}{ + "aggregate_name": &aggregate.Name, + "argument_types": &aggregate.ArgumentTypes, + "final_func": &aggregate.finalFunc, + "initcond": &aggregate.InitCond, + "return_type": &aggregate.ReturnType, + "state_func": &aggregate.stateFunc, + "state_type": &aggregate.StateType, + }) { + aggregates = append(aggregates, aggregate) + aggregate = AggregateMetadata{Keyspace: keyspaceName} + } + + if err := iter.Close(); err != nil { + return nil, err + } + + return aggregates, nil +} + +// query for index metadata in the system_schema.indexes +func getIndexMetadata(session *Session, keyspaceName string) ([]IndexMetadata, error) { + if !session.useSystemSchema { + return nil, nil + } + + const stmt = `SELECT * FROM system_schema.indexes WHERE keyspace_name = ?` + + var indexes []IndexMetadata + index := IndexMetadata{} + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + for iter.MapScan(map[string]interface{}{ + "index_name": &index.Name, + "keyspace_name": &index.KeyspaceName, + "table_name": &index.TableName, + "kind": &index.Kind, + "options": &index.Options, + }) { + indexes = append(indexes, index) + index = IndexMetadata{} + } + + if err := iter.Close(); err != nil { + return nil, err + } + + return indexes, nil +} + +// get create statements for the keyspace +func getCreateStatements(session *Session, keyspaceName string) ([]byte, error) { + if !session.useSystemSchema { + return nil, nil + } + iter := session.control.query(fmt.Sprintf(`DESCRIBE KEYSPACE %s WITH INTERNALS`, keyspaceName)) + + var createStatements []string + + var stmt string + for iter.Scan(nil, nil, nil, &stmt) { + if stmt == "" { + continue + } + createStatements = append(createStatements, stmt) + } + + if err := iter.Close(); err != nil { + if errFrame, ok := err.(errorFrame); ok && errFrame.code == ErrCodeSyntax { + // DESCRIBE KEYSPACE is not supported on older versions of Cassandra and Scylla + // For such case schema statement is going to be recreated on the client side + return nil, nil + } + return nil, fmt.Errorf("error querying keyspace schema: %v", err) + } + + return []byte(strings.Join(createStatements, "\n")), nil +} + +// query for view metadata in the system_schema.views +func getViewMetadata(session *Session, keyspaceName string) ([]ViewMetadata, error) { + if !session.useSystemSchema { + return nil, nil + } + + stmt := `SELECT * FROM system_schema.views WHERE keyspace_name = ?` + + iter := session.control.query(stmt+session.usingTimeoutClause, keyspaceName) + + var views []ViewMetadata + view := ViewMetadata{KeyspaceName: keyspaceName} + + for iter.MapScan(map[string]interface{}{ + "id": &view.ID, + "view_name": &view.ViewName, + "base_table_id": &view.BaseTableID, + "base_table_name": &view.BaseTableName, + "include_all_columns": &view.IncludeAllColumns, + "where_clause": &view.WhereClause, + "bloom_filter_fp_chance": &view.Options.BloomFilterFpChance, + "caching": &view.Options.Caching, + "comment": &view.Options.Comment, + "compaction": &view.Options.Compaction, + "compression": &view.Options.Compression, + "crc_check_chance": &view.Options.CrcCheckChance, + "default_time_to_live": 
&view.Options.DefaultTimeToLive, + "gc_grace_seconds": &view.Options.GcGraceSeconds, + "max_index_interval": &view.Options.MaxIndexInterval, + "memtable_flush_period_in_ms": &view.Options.MemtableFlushPeriodInMs, + "min_index_interval": &view.Options.MinIndexInterval, + "speculative_retry": &view.Options.SpeculativeRetry, + "extensions": &view.Extensions, + }) { + views = append(views, view) + view = ViewMetadata{KeyspaceName: keyspaceName} + } + + err := iter.Close() + if err != nil && err != ErrNotFound { + return nil, fmt.Errorf("error querying view schema: %v", err) + } + + return views, nil +} diff --git a/vendor/github.com/gocql/gocql/policies.go b/vendor/github.com/gocql/gocql/policies.go new file mode 100644 index 0000000..d5ce456 --- /dev/null +++ b/vendor/github.com/gocql/gocql/policies.go @@ -0,0 +1,1326 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +//This file will be the future home for more policies + +import ( + "context" + "errors" + "fmt" + "math" + "math/rand" + "sync" + "sync/atomic" + "time" + + "github.com/hailocab/go-hostpool" +) + +// cowHostList implements a copy on write host list, its equivalent type is []*HostInfo +type cowHostList struct { + list atomic.Value + mu sync.Mutex +} + +func (c *cowHostList) String() string { + return fmt.Sprintf("%+v", c.get()) +} + +func (c *cowHostList) get() []*HostInfo { + // TODO(zariel): should we replace this with []*HostInfo? + l, ok := c.list.Load().(*[]*HostInfo) + if !ok { + return nil + } + return *l +} + +// add will add a host if it not already in the list +func (c *cowHostList) add(host *HostInfo) bool { + c.mu.Lock() + l := c.get() + + if n := len(l); n == 0 { + l = []*HostInfo{host} + } else { + newL := make([]*HostInfo, n+1) + for i := 0; i < n; i++ { + if host.Equal(l[i]) { + c.mu.Unlock() + return false + } + newL[i] = l[i] + } + newL[n] = host + l = newL + } + + c.list.Store(&l) + c.mu.Unlock() + return true +} + +func (c *cowHostList) remove(host *HostInfo) bool { + c.mu.Lock() + l := c.get() + size := len(l) + if size == 0 { + c.mu.Unlock() + return false + } + + found := false + newL := make([]*HostInfo, 0, size) + for i := 0; i < len(l); i++ { + if !l[i].Equal(host) { + newL = append(newL, l[i]) + } else { + found = true + } + } + + if !found { + c.mu.Unlock() + return false + } + + newL = newL[: size-1 : size-1] + c.list.Store(&newL) + c.mu.Unlock() + + return true +} + +// RetryableQuery is an interface that represents a query or batch statement that +// exposes the correct functions for the retry policy logic to evaluate correctly. +type RetryableQuery interface { + Attempts() int + SetConsistency(c Consistency) + GetConsistency() Consistency + Context() context.Context +} + +type RetryType uint16 + +const ( + Retry RetryType = 0x00 // retry on same connection + RetryNextHost RetryType = 0x01 // retry on another connection + Ignore RetryType = 0x02 // ignore error and return result + Rethrow RetryType = 0x03 // raise error and stop retrying +) + +// ErrUnknownRetryType is returned if the retry policy returns a retry type +// unknown to the query executor. +var ErrUnknownRetryType = errors.New("unknown retry type returned by retry policy") + +// RetryPolicy interface is used by gocql to determine if a query can be attempted +// again after a retryable error has been received. 
The interface allows gocql +// users to implement their own logic to determine if a query can be attempted +// again. +// +// See SimpleRetryPolicy as an example of implementing and using a RetryPolicy +// interface. +type RetryPolicy interface { + Attempt(RetryableQuery) bool + GetRetryType(error) RetryType +} + +// LWTRetryPolicy is a similar interface to RetryPolicy +// If a query is recognized as an LWT query and its RetryPolicy satisfies this +// interface, then this interface will be used instead of RetryPolicy. +type LWTRetryPolicy interface { + AttemptLWT(RetryableQuery) bool + GetRetryTypeLWT(error) RetryType +} + +// SimpleRetryPolicy has simple logic for attempting a query a fixed number of times. +// +// See below for examples of usage: +// +// //Assign to the cluster +// cluster.RetryPolicy = &gocql.SimpleRetryPolicy{NumRetries: 3} +// +// //Assign to a query +// query.RetryPolicy(&gocql.SimpleRetryPolicy{NumRetries: 1}) +type SimpleRetryPolicy struct { + NumRetries int //Number of times to retry a query +} + +// Attempt tells gocql to attempt the query again based on query.Attempts being less +// than the NumRetries defined in the policy. +func (s *SimpleRetryPolicy) Attempt(q RetryableQuery) bool { + return q.Attempts() <= s.NumRetries +} + +func (s *SimpleRetryPolicy) AttemptLWT(q RetryableQuery) bool { + return s.Attempt(q) +} + +func (s *SimpleRetryPolicy) GetRetryType(err error) RetryType { + var executedErr *QueryError + if errors.As(err, &executedErr) && executedErr.PotentiallyExecuted() && !executedErr.IsIdempotent() { + return Rethrow + } + return RetryNextHost +} + +// Retrying on a different host is fine for normal (non-LWT) queries, +// but in case of LWTs it will cause Paxos contention and possibly +// even timeouts if other clients send statements touching the same +// partition to the original node at the same time. 
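// Editor's sketch (illustrative, not part of this diff): end-to-end wiring of
// SimpleRetryPolicy, extending the usage comments above; the contact point and
// statement are placeholders. For LWT statements the same policy retries on the
// same host (see GetRetryTypeLWT below), avoiding the Paxos contention described above.
//
//	cluster := gocql.NewCluster("10.0.0.1")
//	cluster.RetryPolicy = &gocql.SimpleRetryPolicy{NumRetries: 3}
//	session, err := cluster.CreateSession()
//	if err != nil {
//		// handle error
//	}
//	defer session.Close()
//
//	// Per-query override: a single retry for this statement only.
//	err = session.Query(`SELECT now() FROM system.local`).
//		RetryPolicy(&gocql.SimpleRetryPolicy{NumRetries: 1}).
//		Exec()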
+func (s *SimpleRetryPolicy) GetRetryTypeLWT(err error) RetryType { + var executedErr *QueryError + if errors.As(err, &executedErr) && executedErr.PotentiallyExecuted() && !executedErr.IsIdempotent() { + return Rethrow + } + return Retry +} + +// ExponentialBackoffRetryPolicy sleeps between attempts +type ExponentialBackoffRetryPolicy struct { + NumRetries int + Min, Max time.Duration +} + +func (e *ExponentialBackoffRetryPolicy) Attempt(q RetryableQuery) bool { + if q.Attempts() > e.NumRetries { + return false + } + time.Sleep(e.napTime(q.Attempts())) + return true +} + +func (e *ExponentialBackoffRetryPolicy) AttemptLWT(q RetryableQuery) bool { + return e.Attempt(q) +} + +// used to calculate exponentially growing time +func getExponentialTime(min time.Duration, max time.Duration, attempts int) time.Duration { + if min <= 0 { + min = 100 * time.Millisecond + } + if max <= 0 { + max = 10 * time.Second + } + minFloat := float64(min) + napDuration := minFloat * math.Pow(2, float64(attempts-1)) + // add some jitter + napDuration += rand.Float64()*minFloat - (minFloat / 2) + if napDuration > float64(max) { + return time.Duration(max) + } + return time.Duration(napDuration) +} + +func (e *ExponentialBackoffRetryPolicy) GetRetryType(err error) RetryType { + var executedErr *QueryError + if errors.As(err, &executedErr) && executedErr.PotentiallyExecuted() && !executedErr.IsIdempotent() { + return Rethrow + } + return RetryNextHost +} + +// Retrying on a different host is fine for normal (non-LWT) queries, +// but in case of LWTs it will cause Paxos contention and possibly +// even timeouts if other clients send statements touching the same +// partition to the original node at the same time. +func (e *ExponentialBackoffRetryPolicy) GetRetryTypeLWT(err error) RetryType { + var executedErr *QueryError + if errors.As(err, &executedErr) && executedErr.PotentiallyExecuted() && !executedErr.IsIdempotent() { + return Rethrow + } + return Retry +} + +// DowngradingConsistencyRetryPolicy: Next retry will be with the next consistency level +// provided in the slice +// +// On a read timeout: the operation is retried with the next provided consistency +// level. +// +// On a write timeout: if the operation is an :attr:`~.UNLOGGED_BATCH` +// and at least one replica acknowledged the write, the operation is +// retried with the next consistency level. Furthermore, for other +// write types, if at least one replica acknowledged the write, the +// timeout is ignored. +// +// On an unavailable exception: if at least one replica is alive, the +// operation is retried with the next provided consistency level. 
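// Editor's sketch (illustrative, not part of this diff): configuring the
// downgrading policy defined below. With this list, the first retry runs at
// Quorum, the second at One, the third at Any; the levels are illustrative.
//
//	cluster.RetryPolicy = &gocql.DowngradingConsistencyRetryPolicy{
//		ConsistencyLevelsToTry: []gocql.Consistency{gocql.Quorum, gocql.One, gocql.Any},
//	}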
+
+type DowngradingConsistencyRetryPolicy struct {
+	ConsistencyLevelsToTry []Consistency
+}
+
+func (d *DowngradingConsistencyRetryPolicy) Attempt(q RetryableQuery) bool {
+	currentAttempt := q.Attempts()
+
+	if currentAttempt > len(d.ConsistencyLevelsToTry) {
+		return false
+	} else if currentAttempt > 0 {
+		q.SetConsistency(d.ConsistencyLevelsToTry[currentAttempt-1])
+	}
+	return true
+}
+
+func (d *DowngradingConsistencyRetryPolicy) GetRetryType(err error) RetryType {
+	var executedErr *QueryError
+	if errors.As(err, &executedErr) {
+		err = executedErr.err
+		if executedErr.PotentiallyExecuted() && !executedErr.IsIdempotent() {
+			return Rethrow
+		}
+	}
+
+	switch t := err.(type) {
+	case *RequestErrUnavailable:
+		if t.Alive > 0 {
+			return Retry
+		}
+		return Rethrow
+	case *RequestErrWriteTimeout:
+		if t.WriteType == "SIMPLE" || t.WriteType == "BATCH" || t.WriteType == "COUNTER" {
+			if t.Received > 0 {
+				return Ignore
+			}
+			return Rethrow
+		}
+		if t.WriteType == "UNLOGGED_BATCH" {
+			return Retry
+		}
+		return Rethrow
+	case *RequestErrReadTimeout:
+		return Retry
+	default:
+		return RetryNextHost
+	}
+}
+
+func (e *ExponentialBackoffRetryPolicy) napTime(attempts int) time.Duration {
+	return getExponentialTime(e.Min, e.Max, attempts)
+}
+
+type HostStateNotifier interface {
+	AddHost(host *HostInfo)
+	RemoveHost(host *HostInfo)
+	HostUp(host *HostInfo)
+	HostDown(host *HostInfo)
+}
+
+type KeyspaceUpdateEvent struct {
+	Keyspace string
+	Change   string
+}
+
+type HostTierer interface {
+	// HostTier returns an integer specifying how far a host is from the client.
+	// Tier must start at 0.
+	// The value is used to prioritize closer hosts during host selection.
+	// For example this could be:
+	// 0 - local rack, 1 - local DC, 2 - remote DC
+	// or:
+	// 0 - local DC, 1 - remote DC
+	HostTier(host *HostInfo) uint
+
+	// MaxHostTier returns the maximum possible host tier.
+	MaxHostTier() uint
+}
+
+// HostSelectionPolicy is an interface for selecting
+// the most appropriate host to execute a given query.
+// HostSelectionPolicy instances cannot be shared between sessions.
+type HostSelectionPolicy interface {
+	HostStateNotifier
+	SetPartitioner
+	KeyspaceChanged(KeyspaceUpdateEvent)
+	Init(*Session)
+	// Reset gives the policy an opportunity to reset its state if Session initialization
+	// failed and we want to call HostSelectionPolicy.Init() again with a new Session.
+	Reset()
+	IsLocal(host *HostInfo) bool
+	// Pick returns an iteration function over selected hosts.
+	// Multiple attempts of a single query execution won't call the returned NextHost function concurrently,
+	// so it's safe to have internal state without additional synchronization as long as every call to Pick returns
+	// a different instance of NextHost.
+	Pick(ExecutableQuery) NextHost
+	// IsOperational checks whether the host policy can work properly with the given Session/Cluster/ClusterConfig.
+	IsOperational(*Session) error
+}
+
+// SelectedHost is an interface returned when picking a host from a host
+// selection policy.
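// Editor's sketch (illustrative, not part of this diff): the consumption
// pattern for the NextHost iterator returned by Pick — drain hosts until nil,
// reporting each attempt's outcome via SelectedHost.Mark. The names policy,
// qry, and tryHost are hypothetical stand-ins, not part of this diff.
//
//	next := policy.Pick(qry)
//	for sh := next(); sh != nil; sh = next() {
//		err := tryHost(sh.Info()) // hypothetical attempt helper
//		sh.Mark(err)
//		if err == nil {
//			break
//		}
//	}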
+type SelectedHost interface { + Info() *HostInfo + Token() Token + Mark(error) +} + +type selectedHost struct { + info *HostInfo + token Token +} + +func (host selectedHost) Info() *HostInfo { + return host.info +} + +func (host selectedHost) Token() Token { + return host.token +} + +func (host selectedHost) Mark(err error) {} + +// NextHost is an iteration function over picked hosts +type NextHost func() SelectedHost + +// RoundRobinHostPolicy is a round-robin load balancing policy, where each host +// is tried sequentially for each query. +func RoundRobinHostPolicy() HostSelectionPolicy { + return &roundRobinHostPolicy{} +} + +type roundRobinHostPolicy struct { + hosts cowHostList + lastUsedHostIdx uint64 +} + +func (r *roundRobinHostPolicy) IsLocal(*HostInfo) bool { return true } +func (r *roundRobinHostPolicy) KeyspaceChanged(KeyspaceUpdateEvent) {} +func (r *roundRobinHostPolicy) SetPartitioner(partitioner string) {} +func (r *roundRobinHostPolicy) Init(*Session) {} +func (r *roundRobinHostPolicy) Reset() {} +func (r *roundRobinHostPolicy) IsOperational(*Session) error { return nil } + +func (r *roundRobinHostPolicy) Pick(qry ExecutableQuery) NextHost { + nextStartOffset := atomic.AddUint64(&r.lastUsedHostIdx, 1) + return roundRobbin(int(nextStartOffset), r.hosts.get()) +} + +func (r *roundRobinHostPolicy) AddHost(host *HostInfo) { + r.hosts.add(host) +} + +func (r *roundRobinHostPolicy) RemoveHost(host *HostInfo) { + r.hosts.remove(host) +} + +func (r *roundRobinHostPolicy) HostUp(host *HostInfo) { + r.AddHost(host) +} + +func (r *roundRobinHostPolicy) HostDown(host *HostInfo) { + r.RemoveHost(host) +} + +func ShuffleReplicas() func(*tokenAwareHostPolicy) { + return func(t *tokenAwareHostPolicy) { + t.shuffleReplicas = true + } +} + +// AvoidSlowReplicas enabled avoiding slow replicas +// +// TokenAwareHostPolicy normally does not check how busy replica is, with avoidSlowReplicas enabled it avoids replicas +// if they have equal or more than MAX_IN_FLIGHT_THRESHOLD requests in flight +func AvoidSlowReplicas(max_in_flight_threshold int) func(policy *tokenAwareHostPolicy) { + return func(t *tokenAwareHostPolicy) { + t.avoidSlowReplicas = true + MAX_IN_FLIGHT_THRESHOLD = max_in_flight_threshold + } +} + +// NonLocalReplicasFallback enables fallback to replicas that are not considered local. +// +// TokenAwareHostPolicy used with DCAwareHostPolicy fallback first selects replicas by partition key in local DC, then +// falls back to other nodes in the local DC. Enabling NonLocalReplicasFallback causes TokenAwareHostPolicy +// to first select replicas by partition key in local DC, then replicas by partition key in remote DCs and fall back +// to other nodes in local DC. +func NonLocalReplicasFallback() func(policy *tokenAwareHostPolicy) { + return func(t *tokenAwareHostPolicy) { + t.nonLocalReplicasFallback = true + } +} + +// TokenAwareHostPolicy is a token aware host selection policy, where hosts are +// selected based on the partition key, so queries are sent to the host which +// owns the partition. Fallback is used when routing information is not available. +func TokenAwareHostPolicy(fallback HostSelectionPolicy, opts ...func(*tokenAwareHostPolicy)) HostSelectionPolicy { + p := &tokenAwareHostPolicy{fallback: fallback} + for _, opt := range opts { + opt(p) + } + return p +} + +// clusterMeta holds metadata about cluster topology. +// It is used inside atomic.Value and shallow copies are used when replacing it, +// so fields should not be modified in-place. 
Instead, to modify a field, a copy of the field should be made
+// and the pointer in clusterMeta updated to point to the new value.
+type clusterMeta struct {
+	// replicas is map[keyspace]map[token]hosts
+	replicas  map[string]tokenRingReplicas
+	tokenRing *tokenRing
+}
+
+var MAX_IN_FLIGHT_THRESHOLD int = 10
+
+type tokenAwareHostPolicy struct {
+	fallback            HostSelectionPolicy
+	getKeyspaceMetadata func(keyspace string) (*KeyspaceMetadata, error)
+	getKeyspaceName     func() string
+
+	shuffleReplicas          bool
+	nonLocalReplicasFallback bool
+
+	// mu protects writes to hosts, partitioner, metadata.
+	// reads can be unlocked as long as they are not used for updating state later.
+	mu          sync.Mutex
+	hosts       cowHostList
+	partitioner string
+	metadata    atomic.Value // *clusterMeta
+
+	logger StdLogger
+
+	avoidSlowReplicas bool
+}
+
+func (t *tokenAwareHostPolicy) Init(s *Session) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.getKeyspaceMetadata != nil {
+		// Init was already called.
+		// See https://github.com/scylladb/gocql/issues/94.
+		panic("sharing token aware host selection policy between sessions is not supported")
+	}
+	t.getKeyspaceMetadata = s.KeyspaceMetadata
+	t.getKeyspaceName = func() string { return s.cfg.Keyspace }
+	t.logger = s.logger
+}
+
+func (t *tokenAwareHostPolicy) Reset() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Sharing a token aware host selection policy between sessions is not supported,
+	// but session initialization can fail for various reasons, and the application
+	// may want to create a new session afterwards.
+	// Reset should therefore be called in Session.Close.
+	t.getKeyspaceMetadata = nil
+	t.getKeyspaceName = nil
+	t.logger = nil
+}
+
+func (t *tokenAwareHostPolicy) IsOperational(session *Session) error {
+	return t.fallback.IsOperational(session)
+}
+
+func (t *tokenAwareHostPolicy) IsLocal(host *HostInfo) bool {
+	return t.fallback.IsLocal(host)
+}
+
+func (t *tokenAwareHostPolicy) KeyspaceChanged(update KeyspaceUpdateEvent) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	meta := t.getMetadataForUpdate()
+	t.updateReplicas(meta, update.Keyspace)
+	t.metadata.Store(meta)
+}
+
+// updateReplicas updates replicas in clusterMeta.
+// It must be called with t.mu mutex locked.
+// meta must not be nil and its replicas field will be updated.
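// Editor's sketch (illustrative, not part of this diff): constructing the
// policy. Init above panics when an instance is reused, so build a fresh
// policy per cluster/session; the contact point, DC name, and options are
// placeholders.
//
//	cluster := gocql.NewCluster("10.0.0.1")
//	cluster.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy(
//		gocql.DCAwareRoundRobinPolicy("dc1"),
//		gocql.ShuffleReplicas(),
//	)
//	session, err := cluster.CreateSession() // reusing the policy elsewhere would panic in Init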
+func (t *tokenAwareHostPolicy) updateReplicas(meta *clusterMeta, keyspace string) { + newReplicas := make(map[string]tokenRingReplicas, len(meta.replicas)) + + ks, err := t.getKeyspaceMetadata(keyspace) + if err == nil { + strat := getStrategy(ks, t.logger) + if strat != nil { + if meta != nil && meta.tokenRing != nil { + newReplicas[keyspace] = strat.replicaMap(meta.tokenRing) + } + } + } + + for ks, replicas := range meta.replicas { + if ks != keyspace { + newReplicas[ks] = replicas + } + } + + meta.replicas = newReplicas +} + +func (t *tokenAwareHostPolicy) SetPartitioner(partitioner string) { + t.mu.Lock() + defer t.mu.Unlock() + + if t.partitioner != partitioner { + t.fallback.SetPartitioner(partitioner) + t.partitioner = partitioner + meta := t.getMetadataForUpdate() + meta.resetTokenRing(t.partitioner, t.hosts.get(), t.logger) + t.updateReplicas(meta, t.getKeyspaceName()) + t.metadata.Store(meta) + } +} + +func (t *tokenAwareHostPolicy) AddHost(host *HostInfo) { + t.mu.Lock() + if t.hosts.add(host) { + meta := t.getMetadataForUpdate() + meta.resetTokenRing(t.partitioner, t.hosts.get(), t.logger) + t.updateReplicas(meta, t.getKeyspaceName()) + t.metadata.Store(meta) + } + t.mu.Unlock() + + t.fallback.AddHost(host) +} + +func (t *tokenAwareHostPolicy) AddHosts(hosts []*HostInfo) { + t.mu.Lock() + + for _, host := range hosts { + t.hosts.add(host) + } + + meta := t.getMetadataForUpdate() + meta.resetTokenRing(t.partitioner, t.hosts.get(), t.logger) + t.updateReplicas(meta, t.getKeyspaceName()) + t.metadata.Store(meta) + + t.mu.Unlock() + + for _, host := range hosts { + t.fallback.AddHost(host) + } +} + +func (t *tokenAwareHostPolicy) RemoveHost(host *HostInfo) { + t.mu.Lock() + if t.hosts.remove(host) { + meta := t.getMetadataForUpdate() + meta.resetTokenRing(t.partitioner, t.hosts.get(), t.logger) + t.updateReplicas(meta, t.getKeyspaceName()) + t.metadata.Store(meta) + } + t.mu.Unlock() + + t.fallback.RemoveHost(host) +} + +func (t *tokenAwareHostPolicy) HostUp(host *HostInfo) { + t.fallback.HostUp(host) +} + +func (t *tokenAwareHostPolicy) HostDown(host *HostInfo) { + t.fallback.HostDown(host) +} + +// getMetadataReadOnly returns current cluster metadata. +// Metadata uses copy on write, so the returned value should be only used for reading. +// To obtain a copy that could be updated, use getMetadataForUpdate instead. +func (t *tokenAwareHostPolicy) getMetadataReadOnly() *clusterMeta { + meta, _ := t.metadata.Load().(*clusterMeta) + return meta +} + +// getMetadataForUpdate returns clusterMeta suitable for updating. +// It is a SHALLOW copy of current metadata in case it was already set or new empty clusterMeta otherwise. +// This function should be called with t.mu mutex locked and the mutex should not be released before +// storing the new metadata. +func (t *tokenAwareHostPolicy) getMetadataForUpdate() *clusterMeta { + metaReadOnly := t.getMetadataReadOnly() + meta := new(clusterMeta) + if metaReadOnly != nil { + *meta = *metaReadOnly + } + return meta +} + +// resetTokenRing creates a new tokenRing. +// It must be called with t.mu locked. 
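// Editor's sketch (illustrative, not part of this diff): the copy-on-write
// discipline used throughout this policy, in miniature — lock, shallow-copy the
// current metadata via getMetadataForUpdate, mutate only the copy, then publish
// it atomically so unlocked readers never observe a partial update.
//
//	t.mu.Lock()
//	meta := t.getMetadataForUpdate() // shallow copy of *clusterMeta
//	t.updateReplicas(meta, keyspace) // mutate the copy only
//	t.metadata.Store(meta)           // publish atomically
//	t.mu.Unlock()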
+func (m *clusterMeta) resetTokenRing(partitioner string, hosts []*HostInfo, logger StdLogger) { + if partitioner == "" { + // partitioner not yet set + return + } + + // create a new token ring + tokenRing, err := newTokenRing(partitioner, hosts) + if err != nil { + logger.Printf("Unable to update the token ring due to error: %s", err) + return + } + + // replace the token ring + m.tokenRing = tokenRing +} + +func (t *tokenAwareHostPolicy) Pick(qry ExecutableQuery) NextHost { + if qry == nil { + return t.fallback.Pick(qry) + } + + routingKey, err := qry.GetRoutingKey() + if err != nil { + return t.fallback.Pick(qry) + } else if routingKey == nil { + return t.fallback.Pick(qry) + } + + meta := t.getMetadataReadOnly() + if meta == nil || meta.tokenRing == nil { + return t.fallback.Pick(qry) + } + + partitioner := qry.GetCustomPartitioner() + if partitioner == nil { + partitioner = meta.tokenRing.partitioner + } + + token := partitioner.Hash(routingKey) + + var replicas []*HostInfo + + if session := qry.GetSession(); session != nil && session.tabletsRoutingV1 { + tablets := session.metadataDescriber.getTablets() + + // Search for tablets with Keyspace and Table from the Query + l, r := tablets.findTablets(qry.Keyspace(), qry.Table()) + if l != -1 { + tablet := tablets.findTabletForToken(token, l, r) + hosts := t.hosts.get() + for _, replica := range tablet.Replicas() { + for _, host := range hosts { + if host.hostId == replica.hostId.String() { + replicas = append(replicas, host) + break + } + } + } + } + } + + if len(replicas) == 0 { + ht := meta.replicas[qry.Keyspace()].replicasFor(token) + if ht != nil { + replicas = ht.hosts + } + } + + if len(replicas) == 0 { + host, _ := meta.tokenRing.GetHostForToken(token) + replicas = []*HostInfo{host} + } + + if t.shuffleReplicas && !qry.IsLWT() && len(replicas) > 1 { + replicas = shuffleHosts(replicas) + } + + if s := qry.GetSession(); s != nil && t.avoidSlowReplicas { + healthyReplicas := make([]*HostInfo, 0, len(replicas)) + unhealthyReplicas := make([]*HostInfo, 0, len(replicas)) + + for _, h := range replicas { + if h.IsBusy(s) { + unhealthyReplicas = append(unhealthyReplicas, h) + } else { + healthyReplicas = append(healthyReplicas, h) + } + } + + replicas = append(healthyReplicas, unhealthyReplicas...) 
+ } + + var ( + fallbackIter NextHost + i, j, k int + remote [][]*HostInfo + tierer HostTierer + tiererOk bool + maxTier uint + ) + + if tierer, tiererOk = t.fallback.(HostTierer); tiererOk { + maxTier = tierer.MaxHostTier() + } else { + maxTier = 1 + } + + if t.nonLocalReplicasFallback { + remote = make([][]*HostInfo, maxTier) + } + + used := make(map[*HostInfo]bool, len(replicas)) + return func() SelectedHost { + for i < len(replicas) { + h := replicas[i] + i++ + + var tier uint + if tiererOk { + tier = tierer.HostTier(h) + } else if t.fallback.IsLocal(h) { + tier = 0 + } else { + tier = 1 + } + + if tier != 0 { + if t.nonLocalReplicasFallback { + remote[tier-1] = append(remote[tier-1], h) + } + continue + } + + if h.IsUp() { + used[h] = true + return selectedHost{info: h, token: token} + } + } + + if t.nonLocalReplicasFallback { + for j < len(remote) && k < len(remote[j]) { + h := remote[j][k] + k++ + + if k >= len(remote[j]) { + j++ + k = 0 + } + + if h.IsUp() { + used[h] = true + return selectedHost{info: h, token: token} + } + } + } + + if fallbackIter == nil { + // fallback + fallbackIter = t.fallback.Pick(qry) + } + + // filter the token aware selected hosts from the fallback hosts + for fallbackHost := fallbackIter(); fallbackHost != nil; fallbackHost = fallbackIter() { + if !used[fallbackHost.Info()] { + used[fallbackHost.Info()] = true + return fallbackHost + } + } + + return nil + } +} + +// HostPoolHostPolicy is a host policy which uses the bitly/go-hostpool library +// to distribute queries between hosts and prevent sending queries to +// unresponsive hosts. When creating the host pool that is passed to the policy +// use an empty slice of hosts as the hostpool will be populated later by gocql. +// See below for examples of usage: +// +// // Create host selection policy using a simple host pool +// cluster.PoolConfig.HostSelectionPolicy = HostPoolHostPolicy(hostpool.New(nil)) +// +// // Create host selection policy using an epsilon greedy pool +// cluster.PoolConfig.HostSelectionPolicy = HostPoolHostPolicy( +// hostpool.NewEpsilonGreedy(nil, 0, &hostpool.LinearEpsilonValueCalculator{}), +// ) +func HostPoolHostPolicy(hp hostpool.HostPool) HostSelectionPolicy { + return &hostPoolHostPolicy{hostMap: map[string]*HostInfo{}, hp: hp} +} + +type hostPoolHostPolicy struct { + hp hostpool.HostPool + mu sync.RWMutex + hostMap map[string]*HostInfo +} + +func (r *hostPoolHostPolicy) Init(*Session) {} +func (r *hostPoolHostPolicy) Reset() {} +func (r *hostPoolHostPolicy) IsOperational(*Session) error { return nil } +func (r *hostPoolHostPolicy) KeyspaceChanged(KeyspaceUpdateEvent) {} +func (r *hostPoolHostPolicy) SetPartitioner(string) {} +func (r *hostPoolHostPolicy) IsLocal(*HostInfo) bool { return true } + +func (r *hostPoolHostPolicy) SetHosts(hosts []*HostInfo) { + peers := make([]string, len(hosts)) + hostMap := make(map[string]*HostInfo, len(hosts)) + + for i, host := range hosts { + ip := host.ConnectAddress().String() + peers[i] = ip + hostMap[ip] = host + } + + r.mu.Lock() + r.hp.SetHosts(peers) + r.hostMap = hostMap + r.mu.Unlock() +} + +func (r *hostPoolHostPolicy) AddHost(host *HostInfo) { + ip := host.ConnectAddress().String() + + r.mu.Lock() + defer r.mu.Unlock() + + // If the host addr is present and isn't nil return + if h, ok := r.hostMap[ip]; ok && h != nil { + return + } + // otherwise, add the host to the map + r.hostMap[ip] = host + // and construct a new peer list to give to the HostPool + hosts := make([]string, 0, len(r.hostMap)) + for addr := range r.hostMap { + 
hosts = append(hosts, addr)
+	}
+
+	r.hp.SetHosts(hosts)
+}
+
+func (r *hostPoolHostPolicy) RemoveHost(host *HostInfo) {
+	ip := host.ConnectAddress().String()
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if _, ok := r.hostMap[ip]; !ok {
+		return
+	}
+
+	delete(r.hostMap, ip)
+	hosts := make([]string, 0, len(r.hostMap))
+	for _, host := range r.hostMap {
+		hosts = append(hosts, host.ConnectAddress().String())
+	}
+
+	r.hp.SetHosts(hosts)
+}
+
+func (r *hostPoolHostPolicy) HostUp(host *HostInfo) {
+	r.AddHost(host)
+}
+
+func (r *hostPoolHostPolicy) HostDown(host *HostInfo) {
+	r.RemoveHost(host)
+}
+
+func (r *hostPoolHostPolicy) Pick(qry ExecutableQuery) NextHost {
+	return func() SelectedHost {
+		r.mu.RLock()
+		defer r.mu.RUnlock()
+
+		if len(r.hostMap) == 0 {
+			return nil
+		}
+
+		hostR := r.hp.Get()
+		host, ok := r.hostMap[hostR.Host()]
+		if !ok {
+			return nil
+		}
+
+		return selectedHostPoolHost{
+			policy: r,
+			info:   host,
+			hostR:  hostR,
+		}
+	}
+}
+
+// selectedHostPoolHost is a host returned by the hostPoolHostPolicy and
+// implements the SelectedHost interface
+type selectedHostPoolHost struct {
+	policy *hostPoolHostPolicy
+	info   *HostInfo
+	hostR  hostpool.HostPoolResponse
+}
+
+func (host selectedHostPoolHost) Info() *HostInfo {
+	return host.info
+}
+
+func (host selectedHostPoolHost) Token() Token {
+	return nil
+}
+
+func (host selectedHostPoolHost) Mark(err error) {
+	ip := host.info.ConnectAddress().String()
+
+	host.policy.mu.RLock()
+	defer host.policy.mu.RUnlock()
+
+	if _, ok := host.policy.hostMap[ip]; !ok {
+		// host was removed between pick and mark
+		return
+	}
+
+	host.hostR.Mark(err)
+}
+
+type dcAwareRR struct {
+	local             string
+	localHosts        cowHostList
+	remoteHosts       cowHostList
+	lastUsedHostIdx   uint64
+	disableDCFailover bool
+}
+
+type dcFailoverDisabledPolicy interface {
+	setDCFailoverDisabled()
+}
+
+type dcAwarePolicyOption func(p dcFailoverDisabledPolicy)
+
+func HostPolicyOptionDisableDCFailover(p dcFailoverDisabledPolicy) {
+	p.setDCFailoverDisabled()
+}
+
+// DCAwareRoundRobinPolicy is a host selection policy which will prioritize and
+// return hosts which are in the local datacenter before returning hosts in all
+// other datacenters.
+func DCAwareRoundRobinPolicy(localDC string, opts ...dcAwarePolicyOption) HostSelectionPolicy {
+	p := &dcAwareRR{local: localDC, disableDCFailover: false}
+	for _, opt := range opts {
+		opt(p)
+	}
+	return p
+}
+
+func (d *dcAwareRR) setDCFailoverDisabled() {
+	d.disableDCFailover = true
+}
+func (d *dcAwareRR) Init(*Session)                       {}
+func (d *dcAwareRR) Reset()                              {}
+func (d *dcAwareRR) KeyspaceChanged(KeyspaceUpdateEvent) {}
+func (d *dcAwareRR) SetPartitioner(p string)             {}
+
+func (d *dcAwareRR) IsOperational(session *Session) error {
+	if session.cfg.disableInit || session.cfg.disableControlConn {
+		return nil
+	}
+
+	hosts := session.hostSource.getHostsList()
+	for _, host := range hosts {
+		if !session.cfg.filterHost(host) && host.DataCenter() == d.local {
+			// Policy can work properly only if there is at least one host from the target DC.
+			// No need to check host status, since it could be down due to an outage;
+			// we only need to make sure that the policy is not misconfigured with the wrong DC.
+			return nil
+		}
+	}
+
+	return fmt.Errorf("gocql: datacenter %s in the policy was not found in the topology - probable DC aware policy misconfiguration", d.local)
+}
+
+func (d *dcAwareRR) IsLocal(host *HostInfo) bool {
+	return host.DataCenter() == d.local
+}
+
+func (d *dcAwareRR) AddHost(host *HostInfo) {
+	if d.IsLocal(host) {
+		d.localHosts.add(host)
+	} else {
+		d.remoteHosts.add(host)
+	}
+}
+
+func (d *dcAwareRR) RemoveHost(host *HostInfo) {
+	if d.IsLocal(host) {
+		d.localHosts.remove(host)
+	} else {
+		d.remoteHosts.remove(host)
+	}
+}
+
+func (d *dcAwareRR) HostUp(host *HostInfo)   { d.AddHost(host) }
+func (d *dcAwareRR) HostDown(host *HostInfo) { d.RemoveHost(host) }
+
+// This function is supposed to be called in a fashion
+// roundRobbin(offset, hostsPriority1, hostsPriority2, hostsPriority3 ... )
+//
+// E.g. for DC-naive strategy:
+// roundRobbin(offset, allHosts)
+//
+// For tiered and DC-aware strategy:
+// roundRobbin(offset, localHosts, remoteHosts)
+func roundRobbin(shift int, hosts ...[]*HostInfo) NextHost {
+	currentLayer := 0
+	currentlyObserved := 0
+
+	return func() SelectedHost {
+		// iterate over layers
+		for {
+			if currentLayer == len(hosts) {
+				return nil
+			}
+
+			currentLayerSize := len(hosts[currentLayer])
+
+			// iterate over hosts within a layer
+			for {
+				currentlyObserved++
+				if currentlyObserved > currentLayerSize {
+					currentLayer++
+					currentlyObserved = 0
+					break
+				}
+
+				h := hosts[currentLayer][(shift+currentlyObserved)%currentLayerSize]
+
+				if h.IsUp() {
+					return selectedHost{info: h}
+				}
+			}
+		}
+	}
+}
+
+func (d *dcAwareRR) Pick(q ExecutableQuery) NextHost {
+	nextStartOffset := atomic.AddUint64(&d.lastUsedHostIdx, 1)
+	if d.disableDCFailover {
+		return roundRobbin(int(nextStartOffset), d.localHosts.get())
+	}
+	return roundRobbin(int(nextStartOffset), d.localHosts.get(), d.remoteHosts.get())
+}
+
+// RackAwareRoundRobinPolicy is a host selection policy which will prioritize and
+// return hosts which are in the local rack, before hosts in the local datacenter but
+// a different rack, before hosts in all other datacenters.
+
+type rackAwareRR struct {
+	// lastUsedHostIdx keeps the index of the last used host.
+	// It is accessed atomically and needs to be aligned to 64 bits, so we
+	// keep it first in the struct. Do not move it or add new struct members
+	// before it.
+ lastUsedHostIdx uint64 + localDC string + localRack string + hosts []cowHostList + disableDCFailover bool +} + +func RackAwareRoundRobinPolicy(localDC string, localRack string, opts ...dcAwarePolicyOption) HostSelectionPolicy { + p := &rackAwareRR{localDC: localDC, localRack: localRack, hosts: make([]cowHostList, 3), disableDCFailover: false} + for _, opt := range opts { + opt(p) + } + return p +} + +func (d *rackAwareRR) Init(*Session) {} +func (d *rackAwareRR) Reset() {} +func (d *rackAwareRR) KeyspaceChanged(KeyspaceUpdateEvent) {} +func (d *rackAwareRR) SetPartitioner(p string) {} + +func (d *rackAwareRR) IsOperational(session *Session) error { + if session.cfg.disableInit || session.cfg.disableControlConn { + return nil + } + hosts := session.hostSource.getHostsList() + for _, host := range hosts { + if !session.cfg.filterHost(host) && host.DataCenter() == d.localDC && host.Rack() == d.localRack { + // Policy can work properly only if there is at least one host from target DC+Rack + // No need to check host status, since it could be down due to the outage + // We only need to make sure that policy is not misconfigured with wrong DC+Rack + return nil + } + } + return fmt.Errorf("gocql: rack %s/%s was not found in the topology - probable Rack aware policy misconfiguration", d.localDC, d.localRack) +} + +func (d *rackAwareRR) MaxHostTier() uint { + return 2 +} + +func (d *rackAwareRR) setDCFailoverDisabled() { + d.disableDCFailover = true +} + +func (d *rackAwareRR) HostTier(host *HostInfo) uint { + if host.DataCenter() == d.localDC { + if host.Rack() == d.localRack { + return 0 + } else { + return 1 + } + } else { + return 2 + } +} + +func (d *rackAwareRR) IsLocal(host *HostInfo) bool { + return d.HostTier(host) == 0 +} + +func (d *rackAwareRR) AddHost(host *HostInfo) { + dist := d.HostTier(host) + d.hosts[dist].add(host) +} + +func (d *rackAwareRR) RemoveHost(host *HostInfo) { + dist := d.HostTier(host) + d.hosts[dist].remove(host) +} + +func (d *rackAwareRR) HostUp(host *HostInfo) { d.AddHost(host) } +func (d *rackAwareRR) HostDown(host *HostInfo) { d.RemoveHost(host) } + +func (d *rackAwareRR) Pick(q ExecutableQuery) NextHost { + nextStartOffset := atomic.AddUint64(&d.lastUsedHostIdx, 1) + if d.disableDCFailover { + return roundRobbin(int(nextStartOffset), d.hosts[0].get(), d.hosts[1].get()) + } + return roundRobbin(int(nextStartOffset), d.hosts[0].get(), d.hosts[1].get(), d.hosts[2].get()) +} + +// ReadyPolicy defines a policy for when a HostSelectionPolicy can be used. After +// each host connects during session initialization, the Ready method will be +// called. If you only need a single Host to be up you can wrap a +// HostSelectionPolicy policy with SingleHostReadyPolicy. 
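// Editor's sketch (illustrative, not part of this diff): a rack-aware policy
// wrapped with SingleHostReadyPolicy (defined below) so the session becomes
// usable as soon as a single host is up. DC and rack names are placeholders.
//
//	cluster.PoolConfig.HostSelectionPolicy = gocql.SingleHostReadyPolicy(
//		gocql.RackAwareRoundRobinPolicy("dc1", "rack1"),
//	)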
+type ReadyPolicy interface { + Ready() bool +} + +// SingleHostReadyPolicy wraps a HostSelectionPolicy and returns Ready after a +// single host has been added via HostUp +func SingleHostReadyPolicy(p HostSelectionPolicy) *singleHostReadyPolicy { + return &singleHostReadyPolicy{ + HostSelectionPolicy: p, + } +} + +type singleHostReadyPolicy struct { + HostSelectionPolicy + ready bool + readyMux sync.Mutex +} + +func (s *singleHostReadyPolicy) HostUp(host *HostInfo) { + s.HostSelectionPolicy.HostUp(host) + + s.readyMux.Lock() + s.ready = true + s.readyMux.Unlock() +} + +func (s *singleHostReadyPolicy) Ready() bool { + s.readyMux.Lock() + ready := s.ready + s.readyMux.Unlock() + if !ready { + return false + } + + // in case the wrapped policy is also a ReadyPolicy, defer to that + if rdy, ok := s.HostSelectionPolicy.(ReadyPolicy); ok { + return rdy.Ready() + } + return true +} + +// ConvictionPolicy interface is used by gocql to determine if a host should be +// marked as DOWN based on the error and host info +type ConvictionPolicy interface { + // Implementations should return `true` if the host should be convicted, `false` otherwise. + AddFailure(error error, host *HostInfo) bool + //Implementations should clear out any convictions or state regarding the host. + Reset(host *HostInfo) +} + +// SimpleConvictionPolicy implements a ConvictionPolicy which convicts all hosts +// regardless of error +type SimpleConvictionPolicy struct { +} + +func (e *SimpleConvictionPolicy) AddFailure(error error, host *HostInfo) bool { + return true +} + +func (e *SimpleConvictionPolicy) Reset(host *HostInfo) {} + +// ReconnectionPolicy interface is used by gocql to determine if reconnection +// can be attempted after connection error. The interface allows gocql users +// to implement their own logic to determine how to attempt reconnection. +type ReconnectionPolicy interface { + GetInterval(currentRetry int) time.Duration + GetMaxRetries() int +} + +// NoReconnectionPolicy is a policy to have no retry. +// +// Examples of usage: +// +// cluster.InitialReconnectionPolicy = &NoReconnectionPolicy{} +type NoReconnectionPolicy struct { +} + +func (c *NoReconnectionPolicy) GetInterval(currentRetry int) time.Duration { + return time.Duration(0) +} + +func (c *NoReconnectionPolicy) GetMaxRetries() int { + return 1 +} + +// ConstantReconnectionPolicy has simple logic for returning a fixed reconnection interval. +// +// Examples of usage: +// +// cluster.ReconnectionPolicy = &gocql.ConstantReconnectionPolicy{MaxRetries: 10, Interval: 8 * time.Second} +type ConstantReconnectionPolicy struct { + MaxRetries int + Interval time.Duration +} + +func (c *ConstantReconnectionPolicy) GetInterval(currentRetry int) time.Duration { + return c.Interval +} + +func (c *ConstantReconnectionPolicy) GetMaxRetries() int { + return c.MaxRetries +} + +// ExponentialReconnectionPolicy returns a growing reconnection interval. 
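// Editor's sketch (illustrative, not part of this diff): an exponential
// reconnection schedule of roughly 1s, 2s, 4s, ... capped at MaxInterval, with
// the jitter added by getExponentialTime above. The values are illustrative.
//
//	cluster.ReconnectionPolicy = &gocql.ExponentialReconnectionPolicy{
//		MaxRetries:      10,
//		InitialInterval: 1 * time.Second,
//		MaxInterval:     30 * time.Second,
//	}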
+type ExponentialReconnectionPolicy struct { + MaxRetries int + InitialInterval time.Duration + MaxInterval time.Duration +} + +func (e *ExponentialReconnectionPolicy) GetInterval(currentRetry int) time.Duration { + max := e.MaxInterval + if max < e.InitialInterval { + max = math.MaxInt16 * time.Second + } + return getExponentialTime(e.InitialInterval, max, currentRetry) +} + +func (e *ExponentialReconnectionPolicy) GetMaxRetries() int { + return e.MaxRetries +} + +type SpeculativeExecutionPolicy interface { + Attempts() int + Delay() time.Duration +} + +type NonSpeculativeExecution struct{} + +func (sp NonSpeculativeExecution) Attempts() int { return 0 } // No additional attempts +func (sp NonSpeculativeExecution) Delay() time.Duration { return 1 } // The delay. Must be positive to be used in a ticker. + +type SimpleSpeculativeExecution struct { + NumAttempts int + TimeoutDelay time.Duration +} + +func (sp *SimpleSpeculativeExecution) Attempts() int { return sp.NumAttempts } +func (sp *SimpleSpeculativeExecution) Delay() time.Duration { return sp.TimeoutDelay } diff --git a/vendor/github.com/gocql/gocql/prepared_cache.go b/vendor/github.com/gocql/gocql/prepared_cache.go new file mode 100644 index 0000000..1b1aca7 --- /dev/null +++ b/vendor/github.com/gocql/gocql/prepared_cache.go @@ -0,0 +1,78 @@ +package gocql + +import ( + "bytes" + "sync" + + "github.com/gocql/gocql/internal/lru" +) + +const defaultMaxPreparedStmts = 1000 + +// preparedLRU is the prepared statement cache +type preparedLRU struct { + mu sync.Mutex + lru *lru.Cache +} + +func (p *preparedLRU) clear() { + p.mu.Lock() + defer p.mu.Unlock() + + for p.lru.Len() > 0 { + p.lru.RemoveOldest() + } +} + +func (p *preparedLRU) add(key string, val *inflightPrepare) { + p.mu.Lock() + defer p.mu.Unlock() + p.lru.Add(key, val) +} + +func (p *preparedLRU) remove(key string) bool { + p.mu.Lock() + defer p.mu.Unlock() + return p.lru.Remove(key) +} + +func (p *preparedLRU) execIfMissing(key string, fn func(lru *lru.Cache) *inflightPrepare) (*inflightPrepare, bool) { + p.mu.Lock() + defer p.mu.Unlock() + + val, ok := p.lru.Get(key) + if ok { + return val.(*inflightPrepare), true + } + + return fn(p.lru), false +} + +func (p *preparedLRU) keyFor(hostID, keyspace, statement string) string { + // TODO: we should just use a struct for the key in the map + return hostID + keyspace + statement +} + +func (p *preparedLRU) evictPreparedID(key string, id []byte) { + p.mu.Lock() + defer p.mu.Unlock() + + val, ok := p.lru.Get(key) + if !ok { + return + } + + ifp, ok := val.(*inflightPrepare) + if !ok { + return + } + + select { + case <-ifp.done: + if bytes.Equal(id, ifp.preparedStatment.id) { + p.lru.Remove(key) + } + default: + } + +} diff --git a/vendor/github.com/gocql/gocql/query_executor.go b/vendor/github.com/gocql/gocql/query_executor.go new file mode 100644 index 0000000..3543839 --- /dev/null +++ b/vendor/github.com/gocql/gocql/query_executor.go @@ -0,0 +1,238 @@ +package gocql + +import ( + "context" + "errors" + "sync" + "time" +) + +type ExecutableQuery interface { + borrowForExecution() // Used to ensure that the query stays alive for lifetime of a particular execution goroutine. + releaseAfterExecution() // Used when a goroutine finishes its execution attempts, either with ok result or an error. 
+ execute(ctx context.Context, conn *Conn) *Iter + attempt(keyspace string, end, start time.Time, iter *Iter, host *HostInfo) + retryPolicy() RetryPolicy + speculativeExecutionPolicy() SpeculativeExecutionPolicy + GetRoutingKey() ([]byte, error) + Keyspace() string + Table() string + IsIdempotent() bool + IsLWT() bool + GetCustomPartitioner() Partitioner + + withContext(context.Context) ExecutableQuery + + RetryableQuery + + GetSession() *Session +} + +type queryExecutor struct { + pool *policyConnPool + policy HostSelectionPolicy +} + +func (q *queryExecutor) attemptQuery(ctx context.Context, qry ExecutableQuery, conn *Conn) *Iter { + start := time.Now() + iter := qry.execute(ctx, conn) + end := time.Now() + + qry.attempt(q.pool.keyspace, end, start, iter, conn.host) + + return iter +} + +func (q *queryExecutor) speculate(ctx context.Context, qry ExecutableQuery, sp SpeculativeExecutionPolicy, + hostIter NextHost, results chan *Iter) *Iter { + ticker := time.NewTicker(sp.Delay()) + defer ticker.Stop() + + for i := 0; i < sp.Attempts(); i++ { + select { + case <-ticker.C: + qry.borrowForExecution() // ensure liveness in case of executing Query to prevent races with Query.Release(). + go q.run(ctx, qry, hostIter, results) + case <-ctx.Done(): + return &Iter{err: ctx.Err()} + case iter := <-results: + return iter + } + } + + return nil +} + +func (q *queryExecutor) executeQuery(qry ExecutableQuery) (*Iter, error) { + hostIter := q.policy.Pick(qry) + + // check if the query is not marked as idempotent, if + // it is, we force the policy to NonSpeculative + sp := qry.speculativeExecutionPolicy() + if !qry.IsIdempotent() || sp.Attempts() == 0 { + return q.do(qry.Context(), qry, hostIter), nil + } + + // When speculative execution is enabled, we could be accessing the host iterator from multiple goroutines below. + // To ensure we don't call it concurrently, we wrap the returned NextHost function here to synchronize access to it. + var mu sync.Mutex + origHostIter := hostIter + hostIter = func() SelectedHost { + mu.Lock() + defer mu.Unlock() + return origHostIter() + } + + ctx, cancel := context.WithCancel(qry.Context()) + defer cancel() + + results := make(chan *Iter, 1) + + // Launch the main execution + qry.borrowForExecution() // ensure liveness in case of executing Query to prevent races with Query.Release(). + go q.run(ctx, qry, hostIter, results) + + // The speculative executions are launched _in addition_ to the main + // execution, on a timer. So Speculation{2} would make 3 executions running + // in total. 
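// Editor's sketch (illustrative, not part of this diff): enabling speculative
// execution for a query — with NumAttempts: 2, up to three executions may run
// in parallel, as described above. The statement and delay are placeholders.
//
//	q := session.Query(`SELECT id FROM users WHERE id = ?`, id).
//		Idempotent(true). // speculation is only applied to idempotent queries
//		SetSpeculativeExecutionPolicy(&gocql.SimpleSpeculativeExecution{
//			NumAttempts:  2,
//			TimeoutDelay: 100 * time.Millisecond,
//		})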
+ if iter := q.speculate(ctx, qry, sp, hostIter, results); iter != nil { + return iter, nil + } + + select { + case iter := <-results: + return iter, nil + case <-ctx.Done(): + return &Iter{err: ctx.Err()}, nil + } +} + +func (q *queryExecutor) do(ctx context.Context, qry ExecutableQuery, hostIter NextHost) *Iter { + rt := qry.retryPolicy() + if rt == nil { + rt = &SimpleRetryPolicy{3} + } + + lwtRT, isRTSupportsLWT := rt.(LWTRetryPolicy) + + var getShouldRetry func(qry RetryableQuery) bool + var getRetryType func(error) RetryType + + if isRTSupportsLWT && qry.IsLWT() { + getShouldRetry = lwtRT.AttemptLWT + getRetryType = lwtRT.GetRetryTypeLWT + } else { + getShouldRetry = rt.Attempt + getRetryType = rt.GetRetryType + } + + var potentiallyExecuted bool + + execute := func(qry ExecutableQuery, selectedHost SelectedHost) (iter *Iter, retry RetryType) { + host := selectedHost.Info() + if host == nil || !host.IsUp() { + return &Iter{ + err: &QueryError{ + err: ErrHostDown, + potentiallyExecuted: potentiallyExecuted, + }, + }, RetryNextHost + } + pool, ok := q.pool.getPool(host) + if !ok { + return &Iter{ + err: &QueryError{ + err: ErrNoPool, + potentiallyExecuted: potentiallyExecuted, + }, + }, RetryNextHost + } + conn := pool.Pick(selectedHost.Token(), qry) + if conn == nil { + return &Iter{ + err: &QueryError{ + err: ErrNoConnectionsInPool, + potentiallyExecuted: potentiallyExecuted, + }, + }, RetryNextHost + } + iter = q.attemptQuery(ctx, qry, conn) + iter.host = selectedHost.Info() + // Update host + if iter.err == nil { + return iter, RetryType(255) + } + + switch { + case errors.Is(iter.err, context.Canceled), + errors.Is(iter.err, context.DeadlineExceeded): + selectedHost.Mark(nil) + potentiallyExecuted = true + retry = Rethrow + default: + selectedHost.Mark(iter.err) + retry = RetryType(255) // Don't enforce retry and get it from retry policy + } + + var qErr *QueryError + if errors.As(iter.err, &qErr) { + potentiallyExecuted = potentiallyExecuted && qErr.PotentiallyExecuted() + qErr.potentiallyExecuted = potentiallyExecuted + qErr.isIdempotent = qry.IsIdempotent() + iter.err = qErr + } else { + iter.err = &QueryError{ + err: iter.err, + potentiallyExecuted: potentiallyExecuted, + isIdempotent: qry.IsIdempotent(), + } + } + return iter, retry + } + + var lastErr error + selectedHost := hostIter() + for selectedHost != nil { + iter, retryType := execute(qry, selectedHost) + if iter.err == nil { + return iter + } + lastErr = iter.err + + // Exit if retry policy decides to not retry anymore + if retryType == RetryType(255) { + if !getShouldRetry(qry) { + return iter + } + retryType = getRetryType(iter.err) + } + + // If query is unsuccessful, check the error with RetryPolicy to retry + switch retryType { + case Retry: + // retry on the same host + continue + case Rethrow, Ignore: + return iter + case RetryNextHost: + // retry on the next host + selectedHost = hostIter() + continue + default: + // Undefined? 
Return nil and error, this will panic in the requester + return &Iter{err: ErrUnknownRetryType} + } + } + if lastErr != nil { + return &Iter{err: lastErr} + } + return &Iter{err: ErrNoConnections} +} + +func (q *queryExecutor) run(ctx context.Context, qry ExecutableQuery, hostIter NextHost, results chan<- *Iter) { + select { + case results <- q.do(ctx, qry, hostIter): + case <-ctx.Done(): + } + qry.releaseAfterExecution() +} diff --git a/vendor/github.com/gocql/gocql/recreate.go b/vendor/github.com/gocql/gocql/recreate.go new file mode 100644 index 0000000..5ff9f8a --- /dev/null +++ b/vendor/github.com/gocql/gocql/recreate.go @@ -0,0 +1,544 @@ +//go:build !cassandra +// +build !cassandra + +// Copyright (C) 2017 ScyllaDB + +package gocql + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "io" + "sort" + "strconv" + "strings" + "text/template" +) + +// ToCQL returns a CQL query that ca be used to recreate keyspace with all +// user defined types, tables, indexes, functions, aggregates and views associated +// with this keyspace. +func (km *KeyspaceMetadata) ToCQL() (string, error) { + // Be aware that `CreateStmts` is not only a cache for ToCQL, + // but it also can be populated from response to `DESCRIBE KEYSPACE %s WITH INTERNALS` + if len(km.CreateStmts) != 0 { + return km.CreateStmts, nil + } + + var sb strings.Builder + + if err := km.keyspaceToCQL(&sb); err != nil { + return "", err + } + + sortedTypes := km.typesSortedTopologically() + for _, tm := range sortedTypes { + if err := km.userTypeToCQL(&sb, tm); err != nil { + return "", err + } + } + + for _, tm := range km.Tables { + if err := km.tableToCQL(&sb, km.Name, tm); err != nil { + return "", err + } + } + + for _, im := range km.Indexes { + if err := km.indexToCQL(&sb, im); err != nil { + return "", err + } + } + + for _, fm := range km.Functions { + if err := km.functionToCQL(&sb, km.Name, fm); err != nil { + return "", err + } + } + + for _, am := range km.Aggregates { + if err := km.aggregateToCQL(&sb, am); err != nil { + return "", err + } + } + + for _, vm := range km.Views { + if err := km.viewToCQL(&sb, vm); err != nil { + return "", err + } + } + + km.CreateStmts = sb.String() + return km.CreateStmts, nil +} + +func (km *KeyspaceMetadata) typesSortedTopologically() []*TypeMetadata { + sortedTypes := make([]*TypeMetadata, 0, len(km.Types)) + for _, tm := range km.Types { + sortedTypes = append(sortedTypes, tm) + } + sort.Slice(sortedTypes, func(i, j int) bool { + for _, ft := range sortedTypes[j].FieldTypes { + if strings.Contains(ft, sortedTypes[i].Name) { + return true + } + } + return false + }) + return sortedTypes +} + +var tableCQLTemplate = template.Must(template.New("table"). + Funcs(map[string]interface{}{ + "escape": cqlHelpers.escape, + "tableColumnToCQL": cqlHelpers.tableColumnToCQL, + "tablePropertiesToCQL": cqlHelpers.tablePropertiesToCQL, + }). + Parse(` +CREATE TABLE {{ .KeyspaceName }}.{{ .Tm.Name }} ( + {{ tableColumnToCQL .Tm }} +) WITH {{ tablePropertiesToCQL .Tm.ClusteringColumns .Tm.Options .Tm.Flags .Tm.Extensions }}; +`)) + +func (km *KeyspaceMetadata) tableToCQL(w io.Writer, kn string, tm *TableMetadata) error { + if err := tableCQLTemplate.Execute(w, map[string]interface{}{ + "Tm": tm, + "KeyspaceName": kn, + }); err != nil { + return err + } + return nil +} + +var functionTemplate = template.Must(template.New("functions"). + Funcs(map[string]interface{}{ + "escape": cqlHelpers.escape, + "zip": cqlHelpers.zip, + "stripFrozen": cqlHelpers.stripFrozen, + }). 
+ Parse(` +CREATE FUNCTION {{ .keyspaceName }}.{{ .fm.Name }} ( + {{- range $i, $args := zip .fm.ArgumentNames .fm.ArgumentTypes }} + {{- if ne $i 0 }}, {{ end }} + {{- (index $args 0) }} + {{ stripFrozen (index $args 1) }} + {{- end -}}) + {{ if .fm.CalledOnNullInput }}CALLED{{ else }}RETURNS NULL{{ end }} ON NULL INPUT + RETURNS {{ .fm.ReturnType }} + LANGUAGE {{ .fm.Language }} + AS $${{ .fm.Body }}$$; +`)) + +func (km *KeyspaceMetadata) functionToCQL(w io.Writer, keyspaceName string, fm *FunctionMetadata) error { + if err := functionTemplate.Execute(w, map[string]interface{}{ + "fm": fm, + "keyspaceName": keyspaceName, + }); err != nil { + return err + } + return nil +} + +var viewTemplate = template.Must(template.New("views"). + Funcs(map[string]interface{}{ + "zip": cqlHelpers.zip, + "partitionKeyString": cqlHelpers.partitionKeyString, + "tablePropertiesToCQL": cqlHelpers.tablePropertiesToCQL, + }). + Parse(` +CREATE MATERIALIZED VIEW {{ .vm.KeyspaceName }}.{{ .vm.ViewName }} AS + SELECT {{ if .vm.IncludeAllColumns }}*{{ else }} + {{- range $i, $col := .vm.OrderedColumns }} + {{- if ne $i 0 }}, {{ end }} + {{ $col }} + {{- end }} + {{- end }} + FROM {{ .vm.KeyspaceName }}.{{ .vm.BaseTableName }} + WHERE {{ .vm.WhereClause }} + PRIMARY KEY ({{ partitionKeyString .vm.PartitionKey .vm.ClusteringColumns }}) + WITH {{ tablePropertiesToCQL .vm.ClusteringColumns .vm.Options .flags .vm.Extensions }}; +`)) + +func (km *KeyspaceMetadata) viewToCQL(w io.Writer, vm *ViewMetadata) error { + if err := viewTemplate.Execute(w, map[string]interface{}{ + "vm": vm, + "flags": []string{}, + }); err != nil { + return err + } + return nil +} + +var aggregatesTemplate = template.Must(template.New("aggregate"). + Funcs(map[string]interface{}{ + "stripFrozen": cqlHelpers.stripFrozen, + }). + Parse(` +CREATE AGGREGATE {{ .Keyspace }}.{{ .Name }}( + {{- range $i, $arg := .ArgumentTypes }} + {{- if ne $i 0 }}, {{ end }} + {{ stripFrozen $arg }} + {{- end -}}) + SFUNC {{ .StateFunc.Name }} + STYPE {{ stripFrozen .StateType }} + {{- if ne .FinalFunc.Name "" }} + FINALFUNC {{ .FinalFunc.Name }} + {{- end -}} + {{- if ne .InitCond "" }} + INITCOND {{ .InitCond }} + {{- end -}} +; +`)) + +func (km *KeyspaceMetadata) aggregateToCQL(w io.Writer, am *AggregateMetadata) error { + if err := aggregatesTemplate.Execute(w, am); err != nil { + return err + } + return nil +} + +var typeCQLTemplate = template.Must(template.New("types"). + Funcs(map[string]interface{}{ + "zip": cqlHelpers.zip, + }). 
+ Parse(` +CREATE TYPE {{ .Keyspace }}.{{ .Name }} ( + {{- range $i, $fields := zip .FieldNames .FieldTypes }} {{- if ne $i 0 }},{{ end }} + {{ index $fields 0 }} {{ index $fields 1 }} + {{- end }} +); +`)) + +func (km *KeyspaceMetadata) userTypeToCQL(w io.Writer, tm *TypeMetadata) error { + if err := typeCQLTemplate.Execute(w, tm); err != nil { + return err + } + return nil +} + +func (km *KeyspaceMetadata) indexToCQL(w io.Writer, im *IndexMetadata) error { + // Scylla doesn't support any custom indexes + if im.Kind == IndexKindCustom { + return nil + } + + options := im.Options + indexTarget := options["target"] + + // secondary index + si := struct { + ClusteringKeys []string `json:"ck"` + PartitionKeys []string `json:"pk"` + }{} + + if err := json.Unmarshal([]byte(indexTarget), &si); err == nil { + indexTarget = fmt.Sprintf("(%s), %s", + strings.Join(si.PartitionKeys, ","), + strings.Join(si.ClusteringKeys, ","), + ) + } + + _, err := fmt.Fprintf(w, "\nCREATE INDEX %s ON %s.%s (%s);\n", + im.Name, + im.KeyspaceName, + im.TableName, + indexTarget, + ) + if err != nil { + return err + } + + return nil +} + +var keyspaceCQLTemplate = template.Must(template.New("keyspace"). + Funcs(map[string]interface{}{ + "escape": cqlHelpers.escape, + "fixStrategy": cqlHelpers.fixStrategy, + }). + Parse(`CREATE KEYSPACE {{ .Name }} WITH replication = { + 'class': {{ escape ( fixStrategy .StrategyClass) }} + {{- range $key, $value := .StrategyOptions }}, + {{ escape $key }}: {{ escape $value }} + {{- end }} +}{{ if not .DurableWrites }} AND durable_writes = 'false'{{ end }}; +`)) + +func (km *KeyspaceMetadata) keyspaceToCQL(w io.Writer) error { + if err := keyspaceCQLTemplate.Execute(w, km); err != nil { + return err + } + return nil +} + +func contains(in []string, v string) bool { + for _, e := range in { + if e == v { + return true + } + } + return false +} + +type toCQLHelpers struct{} + +var cqlHelpers = toCQLHelpers{} + +func (h toCQLHelpers) zip(a []string, b []string) [][]string { + m := make([][]string, len(a)) + for i := range a { + m[i] = []string{a[i], b[i]} + } + return m +} + +func (h toCQLHelpers) escape(e interface{}) string { + switch v := e.(type) { + case int, float64: + return fmt.Sprint(v) + case bool: + if v { + return "true" + } + return "false" + case string: + return "'" + strings.ReplaceAll(v, "'", "''") + "'" + case []byte: + return string(v) + } + return "" +} + +func (h toCQLHelpers) stripFrozen(v string) string { + return strings.TrimSuffix(strings.TrimPrefix(v, "frozen<"), ">") +} +func (h toCQLHelpers) fixStrategy(v string) string { + return strings.TrimPrefix(v, "org.apache.cassandra.locator.") +} + +func (h toCQLHelpers) fixQuote(v string) string { + return strings.ReplaceAll(v, `"`, `'`) +} + +func (h toCQLHelpers) tableOptionsToCQL(ops TableMetadataOptions) ([]string, error) { + opts := map[string]interface{}{ + "bloom_filter_fp_chance": ops.BloomFilterFpChance, + "comment": ops.Comment, + "crc_check_chance": ops.CrcCheckChance, + "default_time_to_live": ops.DefaultTimeToLive, + "gc_grace_seconds": ops.GcGraceSeconds, + "max_index_interval": ops.MaxIndexInterval, + "memtable_flush_period_in_ms": ops.MemtableFlushPeriodInMs, + "min_index_interval": ops.MinIndexInterval, + "speculative_retry": ops.SpeculativeRetry, + } + + var err error + opts["caching"], err = json.Marshal(ops.Caching) + if err != nil { + return nil, err + } + + opts["compaction"], err = json.Marshal(ops.Compaction) + if err != nil { + return nil, err + } + + opts["compression"], err = 
json.Marshal(ops.Compression) + if err != nil { + return nil, err + } + + cdc, err := json.Marshal(ops.CDC) + if err != nil { + return nil, err + } + + if string(cdc) != "null" { + opts["cdc"] = cdc + } + + if ops.InMemory { + opts["in_memory"] = ops.InMemory + } + + out := make([]string, 0, len(opts)) + for key, opt := range opts { + out = append(out, fmt.Sprintf("%s = %s", key, h.fixQuote(h.escape(opt)))) + } + + sort.Strings(out) + return out, nil +} + +func (h toCQLHelpers) tableExtensionsToCQL(extensions map[string]interface{}) ([]string, error) { + exts := map[string]interface{}{} + + if blob, ok := extensions["scylla_encryption_options"]; ok { + encOpts := &scyllaEncryptionOptions{} + if err := encOpts.UnmarshalBinary(blob.([]byte)); err != nil { + return nil, err + } + + var err error + exts["scylla_encryption_options"], err = json.Marshal(encOpts) + if err != nil { + return nil, err + } + + } + + out := make([]string, 0, len(exts)) + for key, ext := range exts { + out = append(out, fmt.Sprintf("%s = %s", key, h.fixQuote(h.escape(ext)))) + } + + sort.Strings(out) + return out, nil +} + +func (h toCQLHelpers) tablePropertiesToCQL(cks []*ColumnMetadata, opts TableMetadataOptions, + flags []string, extensions map[string]interface{}) (string, error) { + var sb strings.Builder + + var properties []string + + compactStorage := len(flags) > 0 && (contains(flags, TableFlagDense) || + contains(flags, TableFlagSuper) || + !contains(flags, TableFlagCompound)) + + if compactStorage { + properties = append(properties, "COMPACT STORAGE") + } + + if len(cks) > 0 { + var inner []string + for _, col := range cks { + inner = append(inner, fmt.Sprintf("%s %s", col.Name, col.ClusteringOrder)) + } + properties = append(properties, fmt.Sprintf("CLUSTERING ORDER BY (%s)", strings.Join(inner, ", "))) + } + + options, err := h.tableOptionsToCQL(opts) + if err != nil { + return "", err + } + properties = append(properties, options...) + + exts, err := h.tableExtensionsToCQL(extensions) + if err != nil { + return "", err + } + properties = append(properties, exts...) 
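// Editor's note (illustrative values, not part of this diff): at this point
// properties holds clauses such as "CLUSTERING ORDER BY (ck ASC)" and
// "comment = ''"; the join below renders them in the WITH-clause layout
//
//	CLUSTERING ORDER BY (ck ASC)
//	 AND comment = ''
//	 AND gc_grace_seconds = 864000
//
// matching the CREATE statements produced by the templates above.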
+ + sb.WriteString(strings.Join(properties, "\n AND ")) + return sb.String(), nil +} + +func (h toCQLHelpers) tableColumnToCQL(tm *TableMetadata) string { + var sb strings.Builder + + var columns []string + for _, cn := range tm.OrderedColumns { + cm := tm.Columns[cn] + column := fmt.Sprintf("%s %s", cn, cm.Type) + if cm.Kind == ColumnStatic { + column += " static" + } + columns = append(columns, column) + } + if len(tm.PartitionKey) == 1 && len(tm.ClusteringColumns) == 0 && len(columns) > 0 { + columns[0] += " PRIMARY KEY" + } + + sb.WriteString(strings.Join(columns, ",\n ")) + + if len(tm.PartitionKey) > 1 || len(tm.ClusteringColumns) > 0 { + sb.WriteString(",\n PRIMARY KEY (") + sb.WriteString(h.partitionKeyString(tm.PartitionKey, tm.ClusteringColumns)) + sb.WriteRune(')') + } + + return sb.String() +} + +func (h toCQLHelpers) partitionKeyString(pks, cks []*ColumnMetadata) string { + var sb strings.Builder + + if len(pks) > 1 { + sb.WriteRune('(') + for i, pk := range pks { + if i != 0 { + sb.WriteString(", ") + } + sb.WriteString(pk.Name) + } + sb.WriteRune(')') + } else { + sb.WriteString(pks[0].Name) + } + + if len(cks) > 0 { + sb.WriteString(", ") + for i, ck := range cks { + if i != 0 { + sb.WriteString(", ") + } + sb.WriteString(ck.Name) + } + } + + return sb.String() +} + +type scyllaEncryptionOptions struct { + CipherAlgorithm string `json:"cipher_algorithm"` + SecretKeyStrength int `json:"secret_key_strength"` + KeyProvider string `json:"key_provider"` + SecretKeyFile string `json:"secret_key_file"` +} + +// UnmarshalBinary deserializes blob into scyllaEncryptionOptions. +// Format: +// - 4 bytes - size of KV map +// Size times: +// - 4 bytes - length of key +// - len_of_key bytes - key +// - 4 bytes - length of value +// - len_of_value bytes - value +func (enc *scyllaEncryptionOptions) UnmarshalBinary(data []byte) error { + size := binary.LittleEndian.Uint32(data[0:4]) + + m := make(map[string]string, size) + + off := uint32(4) + for i := uint32(0); i < size; i++ { + keyLen := binary.LittleEndian.Uint32(data[off : off+4]) + off += 4 + + key := string(data[off : off+keyLen]) + off += keyLen + + valueLen := binary.LittleEndian.Uint32(data[off : off+4]) + off += 4 + + value := string(data[off : off+valueLen]) + off += valueLen + + m[key] = value + } + + enc.CipherAlgorithm = m["cipher_algorithm"] + enc.KeyProvider = m["key_provider"] + enc.SecretKeyFile = m["secret_key_file"] + if secretKeyStrength, ok := m["secret_key_strength"]; ok { + sks, err := strconv.Atoi(secretKeyStrength) + if err != nil { + return err + } + enc.SecretKeyStrength = sks + } + + return nil +} diff --git a/vendor/github.com/gocql/gocql/ring_describer.go b/vendor/github.com/gocql/gocql/ring_describer.go new file mode 100644 index 0000000..cad929d --- /dev/null +++ b/vendor/github.com/gocql/gocql/ring_describer.go @@ -0,0 +1,275 @@ +package gocql + +import ( + "context" + "errors" + "fmt" + "net" + "sync" +) + +// Polls system.peers at a specific interval to find new hosts +type ringDescriber struct { + control controlConnection + cfg *ClusterConfig + logger StdLogger + mu sync.RWMutex + prevHosts []*HostInfo + prevPartitioner string + + // hosts are the set of all hosts in the cassandra ring that we know of. + // key of map is host_id. + hosts map[string]*HostInfo + // hostIPToUUID maps host native address to host_id. 
+	hostIPToUUID map[string]string
+}
+
+func (r *ringDescriber) setControlConn(c controlConnection) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	r.control = c
+}
+
+// Ask the control node for the local host info
+func (r *ringDescriber) getLocalHostInfo(conn ConnInterface) (*HostInfo, error) {
+	iter := conn.querySystem(context.TODO(), qrySystemLocal)
+
+	if iter == nil {
+		return nil, errNoControl
+	}
+
+	host, err := hostInfoFromIter(iter, nil, r.cfg.Port, r.cfg.translateAddressPort)
+	if err != nil {
+		return nil, fmt.Errorf("could not retrieve local host info: %w", err)
+	}
+	return host, nil
+}
+
+// Ask the control node for host info on all its known peers
+func (r *ringDescriber) getClusterPeerInfo(localHost *HostInfo, c ConnInterface) ([]*HostInfo, error) {
+	var iter *Iter
+	if c.getIsSchemaV2() {
+		iter = c.querySystem(context.TODO(), qrySystemPeersV2)
+	} else {
+		iter = c.querySystem(context.TODO(), qrySystemPeers)
+	}
+
+	if iter == nil {
+		return nil, errNoControl
+	}
+
+	rows, err := iter.SliceMap()
+	if err != nil {
+		// TODO(zariel): make typed error
+		return nil, fmt.Errorf("unable to fetch peer host info: %s", err)
+	}
+
+	return getPeersFromQuerySystemPeers(rows, r.cfg.Port, r.cfg.translateAddressPort, r.logger)
+}
+
+func getPeersFromQuerySystemPeers(querySystemPeerRows []map[string]interface{}, port int, translateAddressPort func(addr net.IP, port int) (net.IP, int), logger StdLogger) ([]*HostInfo, error) {
+	var peers []*HostInfo
+
+	for _, row := range querySystemPeerRows {
+		// extract all available info about the peer
+		host, err := hostInfoFromMap(row, &HostInfo{port: port}, translateAddressPort)
+		if err != nil {
+			return nil, err
+		} else if !isValidPeer(host) {
+			// If it's not a valid peer, log it and skip it
+			logger.Printf("Found invalid peer '%s'. "+
+				"Likely due to a gossip or snitch issue, this host will be ignored", host)
+			continue
+		} else if isZeroToken(host) {
+			continue
+		}
+
+		peers = append(peers, host)
+	}
+
+	return peers, nil
+}
+
+// Return true if the host is a valid peer
+func isValidPeer(host *HostInfo) bool {
+	return !(len(host.RPCAddress()) == 0 ||
+		host.hostId == "" ||
+		host.dataCenter == "" ||
+		host.rack == "")
+}
+
+func isZeroToken(host *HostInfo) bool {
+	return len(host.tokens) == 0
+}
+
+// GetHostsFromSystem returns a list of hosts found via queries to system.local and system.peers
+func (r *ringDescriber) GetHostsFromSystem() ([]*HostInfo, string, error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.control == nil {
+		return r.prevHosts, r.prevPartitioner, errNoControl
+	}
+
+	ch := r.control.getConn()
+	localHost, err := r.getLocalHostInfo(ch.conn)
+	if err != nil {
+		return r.prevHosts, r.prevPartitioner, err
+	}
+
+	peerHosts, err := r.getClusterPeerInfo(localHost, ch.conn)
+	if err != nil {
+		return r.prevHosts, r.prevPartitioner, err
+	}
+
+	var hosts []*HostInfo
+	if !isZeroToken(localHost) {
+		hosts = []*HostInfo{localHost}
+	}
+	hosts = append(hosts, peerHosts...)
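+	// All zero-token hosts were filtered out above, so any host that remains
+	// owns tokens; a cluster uses a single partitioner, which is why reading
+	// it from the first host below is representative of the whole ring.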
+ + var partitioner string + if len(hosts) > 0 { + partitioner = hosts[0].Partitioner() + } + + r.prevHosts = hosts + r.prevPartitioner = partitioner + + return hosts, partitioner, nil +} + +// Given an ip/port return HostInfo for the specified ip/port +func (r *ringDescriber) getHostInfo(hostID UUID) (*HostInfo, error) { + var host *HostInfo + for _, table := range []string{"system.peers", "system.local"} { + ch := r.control.getConn() + var iter *Iter + if ch.host.HostID() == hostID.String() { + host = ch.host + iter = nil + } + + if table == "system.peers" { + if ch.conn.getIsSchemaV2() { + iter = ch.conn.querySystem(context.TODO(), qrySystemPeersV2) + } else { + iter = ch.conn.querySystem(context.TODO(), qrySystemPeers) + } + } else { + iter = ch.conn.query(context.TODO(), fmt.Sprintf("SELECT * FROM %s", table)) + } + + if iter != nil { + rows, err := iter.SliceMap() + if err != nil { + return nil, err + } + + for _, row := range rows { + h, err := hostInfoFromMap(row, &HostInfo{port: r.cfg.Port}, r.cfg.translateAddressPort) + if err != nil { + return nil, err + } + + if h.HostID() == hostID.String() { + host = h + break + } + } + } + } + + if host == nil { + return nil, errors.New("unable to fetch host info: invalid control connection") + } else if host.invalidConnectAddr() { + return nil, fmt.Errorf("host ConnectAddress invalid ip=%v: %v", host.connectAddress, host) + } + + return host, nil +} + +func (r *ringDescriber) getHostByIP(ip string) (*HostInfo, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + hi, ok := r.hostIPToUUID[ip] + return r.hosts[hi], ok +} + +func (r *ringDescriber) getHost(hostID string) *HostInfo { + r.mu.RLock() + host := r.hosts[hostID] + r.mu.RUnlock() + return host +} + +func (r *ringDescriber) getHostsList() []*HostInfo { + r.mu.RLock() + hosts := make([]*HostInfo, 0, len(r.hosts)) + for _, host := range r.hosts { + hosts = append(hosts, host) + } + r.mu.RUnlock() + return hosts +} + +func (r *ringDescriber) getHostsMap() map[string]*HostInfo { + r.mu.RLock() + hosts := make(map[string]*HostInfo, len(r.hosts)) + for k, v := range r.hosts { + hosts[k] = v + } + r.mu.RUnlock() + return hosts +} + +func (r *ringDescriber) addOrUpdate(host *HostInfo) *HostInfo { + if existingHost, ok := r.addHostIfMissing(host); ok { + existingHost.update(host) + host = existingHost + } + return host +} + +func (r *ringDescriber) addHostIfMissing(host *HostInfo) (*HostInfo, bool) { + if host.invalidConnectAddr() { + panic(fmt.Sprintf("invalid host: %v", host)) + } + hostID := host.HostID() + + r.mu.Lock() + if r.hosts == nil { + r.hosts = make(map[string]*HostInfo) + } + if r.hostIPToUUID == nil { + r.hostIPToUUID = make(map[string]string) + } + + existing, ok := r.hosts[hostID] + if !ok { + r.hosts[hostID] = host + r.hostIPToUUID[host.nodeToNodeAddress().String()] = hostID + existing = host + } + r.mu.Unlock() + return existing, ok +} + +func (r *ringDescriber) removeHost(hostID string) bool { + r.mu.Lock() + if r.hosts == nil { + r.hosts = make(map[string]*HostInfo) + } + if r.hostIPToUUID == nil { + r.hostIPToUUID = make(map[string]string) + } + + h, ok := r.hosts[hostID] + if ok { + delete(r.hostIPToUUID, h.nodeToNodeAddress().String()) + } + delete(r.hosts, hostID) + r.mu.Unlock() + return ok +} diff --git a/vendor/github.com/gocql/gocql/scylla.go b/vendor/github.com/gocql/gocql/scylla.go new file mode 100644 index 0000000..7cabeae --- /dev/null +++ b/vendor/github.com/gocql/gocql/scylla.go @@ -0,0 +1,874 @@ +package gocql + +import ( + "context" + "crypto/tls" + "errors" + 
"fmt" + "math" + "net" + "strconv" + "strings" + "sync/atomic" + "syscall" + "time" +) + +// scyllaSupported represents Scylla connection options as sent in SUPPORTED +// frame. +// FIXME: Should also follow `cqlProtocolExtension` interface. +type scyllaSupported struct { + shard int + nrShards int + msbIgnore uint64 + partitioner string + shardingAlgorithm string + shardAwarePort uint16 + shardAwarePortSSL uint16 + lwtFlagMask int +} + +// CQL Protocol extension interface for Scylla. +// Each extension is identified by a name and defines a way to serialize itself +// in STARTUP message payload. +type cqlProtocolExtension interface { + name() string + serialize() map[string]string +} + +func findCQLProtoExtByName(exts []cqlProtocolExtension, name string) cqlProtocolExtension { + for i := range exts { + if exts[i].name() == name { + return exts[i] + } + } + return nil +} + +// Top-level keys used for serialization/deserialization of CQL protocol +// extensions in SUPPORTED/STARTUP messages. +// Each key identifies a single extension. +const ( + lwtAddMetadataMarkKey = "SCYLLA_LWT_ADD_METADATA_MARK" + rateLimitError = "SCYLLA_RATE_LIMIT_ERROR" + tabletsRoutingV1 = "TABLETS_ROUTING_V1" +) + +// "tabletsRoutingV1" CQL Protocol Extension. +// This extension, if enabled (properly negotiated), allows Scylla server +// to send a tablet information in `custom_payload`. +// +// Implements cqlProtocolExtension interface. +type tabletsRoutingV1Ext struct { +} + +var _ cqlProtocolExtension = &tabletsRoutingV1Ext{} + +// Factory function to deserialize and create an `tabletsRoutingV1Ext` instance +// from SUPPORTED message payload. +func newTabletsRoutingV1Ext(supported map[string][]string) *tabletsRoutingV1Ext { + if _, found := supported[tabletsRoutingV1]; found { + return &tabletsRoutingV1Ext{} + } + return nil +} + +func (ext *tabletsRoutingV1Ext) serialize() map[string]string { + return map[string]string{ + tabletsRoutingV1: "", + } +} + +func (ext *tabletsRoutingV1Ext) name() string { + return tabletsRoutingV1 +} + +// "Rate limit" CQL Protocol Extension. +// This extension, if enabled (properly negotiated), allows Scylla server +// to send a special kind of error. +// +// Implements cqlProtocolExtension interface. +type rateLimitExt struct { + rateLimitErrorCode int +} + +var _ cqlProtocolExtension = &rateLimitExt{} + +// Factory function to deserialize and create an `rateLimitExt` instance +// from SUPPORTED message payload. +func newRateLimitExt(supported map[string][]string) *rateLimitExt { + const rateLimitErrorCode = "ERROR_CODE" + + if v, found := supported[rateLimitError]; found { + for i := range v { + splitVal := strings.Split(v[i], "=") + if splitVal[0] == rateLimitErrorCode { + var ( + err error + errorCode int + ) + if errorCode, err = strconv.Atoi(splitVal[1]); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", rateLimitErrorCode, splitVal[1], err) + return nil + } + } + return &rateLimitExt{ + rateLimitErrorCode: errorCode, + } + } + } + } + return nil +} + +func (ext *rateLimitExt) serialize() map[string]string { + return map[string]string{ + rateLimitError: "", + } +} + +func (ext *rateLimitExt) name() string { + return rateLimitError +} + +// "LWT prepared statements metadata mark" CQL Protocol Extension. +// This extension, if enabled (properly negotiated), allows Scylla server +// to set a special bit in prepared statements metadata, which would indicate +// whether the statement at hand is LWT statement or not. 
+// +// This is further used to consistently choose primary replicas in a predefined +// order for these queries, which can reduce contention over hot keys and thus +// increase LWT performance. +// +// Implements cqlProtocolExtension interface. +type lwtAddMetadataMarkExt struct { + lwtOptMetaBitMask int +} + +var _ cqlProtocolExtension = &lwtAddMetadataMarkExt{} + +// Factory function to deserialize and create an `lwtAddMetadataMarkExt` instance +// from SUPPORTED message payload. +func newLwtAddMetaMarkExt(supported map[string][]string) *lwtAddMetadataMarkExt { + const lwtOptMetaBitMaskKey = "LWT_OPTIMIZATION_META_BIT_MASK" + + if v, found := supported[lwtAddMetadataMarkKey]; found { + for i := range v { + splitVal := strings.Split(v[i], "=") + if splitVal[0] == lwtOptMetaBitMaskKey { + var ( + err error + bitMask int + ) + if bitMask, err = strconv.Atoi(splitVal[1]); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", lwtOptMetaBitMaskKey, splitVal[1], err) + return nil + } + } + return &lwtAddMetadataMarkExt{ + lwtOptMetaBitMask: bitMask, + } + } + } + } + return nil +} + +func (ext *lwtAddMetadataMarkExt) serialize() map[string]string { + return map[string]string{ + lwtAddMetadataMarkKey: fmt.Sprintf("LWT_OPTIMIZATION_META_BIT_MASK=%d", ext.lwtOptMetaBitMask), + } +} + +func (ext *lwtAddMetadataMarkExt) name() string { + return lwtAddMetadataMarkKey +} + +func parseSupported(supported map[string][]string) scyllaSupported { + const ( + scyllaShard = "SCYLLA_SHARD" + scyllaNrShards = "SCYLLA_NR_SHARDS" + scyllaPartitioner = "SCYLLA_PARTITIONER" + scyllaShardingAlgorithm = "SCYLLA_SHARDING_ALGORITHM" + scyllaShardingIgnoreMSB = "SCYLLA_SHARDING_IGNORE_MSB" + scyllaShardAwarePort = "SCYLLA_SHARD_AWARE_PORT" + scyllaShardAwarePortSSL = "SCYLLA_SHARD_AWARE_PORT_SSL" + ) + + var ( + si scyllaSupported + err error + ) + + if s, ok := supported[scyllaShard]; ok { + if si.shard, err = strconv.Atoi(s[0]); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", scyllaShard, s, err) + } + } + } + if s, ok := supported[scyllaNrShards]; ok { + if si.nrShards, err = strconv.Atoi(s[0]); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", scyllaNrShards, s, err) + } + } + } + if s, ok := supported[scyllaShardingIgnoreMSB]; ok { + if si.msbIgnore, err = strconv.ParseUint(s[0], 10, 64); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", scyllaShardingIgnoreMSB, s, err) + } + } + } + + if s, ok := supported[scyllaPartitioner]; ok { + si.partitioner = s[0] + } + if s, ok := supported[scyllaShardingAlgorithm]; ok { + si.shardingAlgorithm = s[0] + } + if s, ok := supported[scyllaShardAwarePort]; ok { + if shardAwarePort, err := strconv.ParseUint(s[0], 10, 16); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", scyllaShardAwarePort, s, err) + } + } else { + si.shardAwarePort = uint16(shardAwarePort) + } + } + if s, ok := supported[scyllaShardAwarePortSSL]; ok { + if shardAwarePortSSL, err := strconv.ParseUint(s[0], 10, 16); err != nil { + if gocqlDebug { + Logger.Printf("scylla: failed to parse %s value %v: %s", scyllaShardAwarePortSSL, s, err) + } + } else { + si.shardAwarePortSSL = uint16(shardAwarePortSSL) + } + } + + if si.partitioner != "org.apache.cassandra.dht.Murmur3Partitioner" || si.shardingAlgorithm != "biased-token-round-robin" || si.nrShards == 0 || si.msbIgnore == 0 { + if gocqlDebug { + Logger.Printf("scylla: 
unsupported sharding configuration, partitioner=%s, algorithm=%s, no_shards=%d, msb_ignore=%d", + si.partitioner, si.shardingAlgorithm, si.nrShards, si.msbIgnore) + } + return scyllaSupported{} + } + + return si +} + +func parseCQLProtocolExtensions(supported map[string][]string) []cqlProtocolExtension { + exts := []cqlProtocolExtension{} + + lwtExt := newLwtAddMetaMarkExt(supported) + if lwtExt != nil { + exts = append(exts, lwtExt) + } + + rateLimitExt := newRateLimitExt(supported) + if rateLimitExt != nil { + exts = append(exts, rateLimitExt) + } + + tabletsExt := newTabletsRoutingV1Ext(supported) + if tabletsExt != nil { + exts = append(exts, tabletsExt) + } + + return exts +} + +// isScyllaConn checks if conn is suitable for scyllaConnPicker. +func (conn *Conn) isScyllaConn() bool { + return conn.getScyllaSupported().nrShards != 0 +} + +// scyllaConnPicker is a specialised ConnPicker that selects connections based +// on token trying to get connection to a shard containing the given token. +// A list of excess connections is maintained to allow for lazy closing of +// connections to already opened shards. Keeping excess connections open helps +// reaching equilibrium faster since the likelihood of hitting the same shard +// decreases with the number of connections to the shard. +// +// scyllaConnPicker keeps track of the details about the shard-aware port. +// When used as a Dialer, it connects to the shard-aware port instead of the +// regular port (if the node supports it). For each subsequent connection +// it tries to make, the shard that it aims to connect to is chosen +// in a round-robin fashion. +type scyllaConnPicker struct { + address string + hostId string + shardAwareAddress string + conns []*Conn + excessConns []*Conn + nrConns int + nrShards int + msbIgnore uint64 + pos uint64 + lastAttemptedShard int + shardAwarePortDisabled bool + + // Used to disable new connections to the shard-aware port temporarily + disableShardAwarePortUntil *atomic.Value +} + +func newScyllaConnPicker(conn *Conn) *scyllaConnPicker { + addr := conn.Address() + hostId := conn.host.hostId + + if conn.scyllaSupported.nrShards == 0 { + panic(fmt.Sprintf("scylla: %s not a sharded connection", addr)) + } + + if gocqlDebug { + Logger.Printf("scylla: %s new conn picker sharding options %+v", addr, conn.scyllaSupported) + } + + var shardAwarePort uint16 + if conn.session.connCfg.tlsConfig != nil { + shardAwarePort = conn.scyllaSupported.shardAwarePortSSL + } else { + shardAwarePort = conn.scyllaSupported.shardAwarePort + } + + var shardAwareAddress string + if shardAwarePort != 0 { + tIP, tPort := conn.session.cfg.translateAddressPort(conn.host.UntranslatedConnectAddress(), int(shardAwarePort)) + shardAwareAddress = net.JoinHostPort(tIP.String(), strconv.Itoa(tPort)) + } + + return &scyllaConnPicker{ + address: addr, + hostId: hostId, + shardAwareAddress: shardAwareAddress, + nrShards: conn.scyllaSupported.nrShards, + msbIgnore: conn.scyllaSupported.msbIgnore, + lastAttemptedShard: 0, + shardAwarePortDisabled: conn.session.cfg.DisableShardAwarePort, + + disableShardAwarePortUntil: new(atomic.Value), + } +} + +func (p *scyllaConnPicker) Pick(t Token, qry ExecutableQuery) *Conn { + if len(p.conns) == 0 { + return nil + } + + if t == nil { + return p.leastBusyConn() + } + + mmt, ok := t.(int64Token) + // double check if that's murmur3 token + if !ok { + return nil + } + + idx := -1 + + for _, conn := range p.conns { + if conn == nil { + continue + } + + if qry != nil && conn.isTabletSupported() { + tablets 
:= conn.session.getTablets() + + // Search for tablets with Keyspace and Table from the Query + l, r := tablets.findTablets(qry.Keyspace(), qry.Table()) + + if l != -1 { + tablet := tablets.findTabletForToken(mmt, l, r) + + for _, replica := range tablet.replicas { + if replica.hostId.String() == p.hostId { + idx = replica.shardId + } + } + } + } + + break + } + + if idx == -1 { + idx = p.shardOf(mmt) + } + + if c := p.conns[idx]; c != nil { + // We have this shard's connection + // so let's give it to the caller. + // But only if it's not loaded too much and load is well distributed. + if qry != nil && qry.IsLWT() { + return c + } + return p.maybeReplaceWithLessBusyConnection(c) + } + return p.leastBusyConn() +} + +func (p *scyllaConnPicker) maybeReplaceWithLessBusyConnection(c *Conn) *Conn { + if !isHeavyLoaded(c) { + return c + } + alternative := p.leastBusyConn() + if alternative == nil || alternative.AvailableStreams()*120 > c.AvailableStreams()*100 { + return c + } else { + return alternative + } +} + +func isHeavyLoaded(c *Conn) bool { + return c.streams.NumStreams/2 > c.AvailableStreams() +} + +func (p *scyllaConnPicker) leastBusyConn() *Conn { + var ( + leastBusyConn *Conn + streamsAvailable int + ) + idx := int(atomic.AddUint64(&p.pos, 1)) + // find the conn which has the most available streams, this is racy + for i := range p.conns { + if conn := p.conns[(idx+i)%len(p.conns)]; conn != nil { + if streams := conn.AvailableStreams(); streams > streamsAvailable { + leastBusyConn = conn + streamsAvailable = streams + } + } + } + return leastBusyConn +} + +func (p *scyllaConnPicker) shardOf(token int64Token) int { + shards := uint64(p.nrShards) + z := uint64(token+math.MinInt64) << p.msbIgnore + lo := z & 0xffffffff + hi := (z >> 32) & 0xffffffff + mul1 := lo * shards + mul2 := hi * shards + sum := (mul1 >> 32) + mul2 + return int(sum >> 32) +} + +func (p *scyllaConnPicker) Put(conn *Conn) { + var ( + nrShards = conn.scyllaSupported.nrShards + shard = conn.scyllaSupported.shard + ) + + if nrShards == 0 { + panic(fmt.Sprintf("scylla: %s not a sharded connection", p.address)) + } + + if nrShards != len(p.conns) { + if nrShards != p.nrShards { + panic(fmt.Sprintf("scylla: %s invalid number of shards", p.address)) + } + conns := p.conns + p.conns = make([]*Conn, nrShards, nrShards) + copy(p.conns, conns) + } + + if c := p.conns[shard]; c != nil { + if conn.addr == p.shardAwareAddress { + // A connection made to the shard-aware port resulted in duplicate + // connection to the same shard being made. Because this is never + // intentional, it suggests that a NAT or AddressTranslator + // changes the source port along the way, therefore we can't trust + // the shard-aware port to return connection to the shard + // that we requested. Fall back to non-shard-aware port for some time. 
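+				// ("some time" here is scyllaShardAwarePortFallbackDuration,
+				// 5 minutes, declared further down in this file.)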
+ Logger.Printf( + "scylla: %s connection to shard-aware address %s resulted in wrong shard being assigned; please check that you are not behind a NAT or AddressTranslater which changes source ports; falling back to non-shard-aware port for %v", + p.address, + p.shardAwareAddress, + scyllaShardAwarePortFallbackDuration, + ) + until := time.Now().Add(scyllaShardAwarePortFallbackDuration) + p.disableShardAwarePortUntil.Store(until) + + // Connections to shard-aware port do not influence how shards + // are chosen for the non-shard-aware port, therefore it can be + // closed immediately + closeConns(conn) + } else { + p.excessConns = append(p.excessConns, conn) + if gocqlDebug { + Logger.Printf("scylla: %s put shard %d excess connection total: %d missing: %d excess: %d", p.address, shard, p.nrConns, p.nrShards-p.nrConns, len(p.excessConns)) + } + } + } else { + p.conns[shard] = conn + p.nrConns++ + if gocqlDebug { + Logger.Printf("scylla: %s put shard %d connection total: %d missing: %d", p.address, shard, p.nrConns, p.nrShards-p.nrConns) + } + } + + if p.shouldCloseExcessConns() { + p.closeExcessConns() + } +} + +func (p *scyllaConnPicker) shouldCloseExcessConns() bool { + const maxExcessConnsFactor = 10 + + if p.nrConns >= p.nrShards { + return true + } + return len(p.excessConns) > maxExcessConnsFactor*p.nrShards +} + +func (p *scyllaConnPicker) Remove(conn *Conn) { + shard := conn.scyllaSupported.shard + + if conn.scyllaSupported.nrShards == 0 { + // It is possible for Remove to be called before the connection is added to the pool. + // Ignoring these connections here is safe. + if gocqlDebug { + Logger.Printf("scylla: %s has unknown sharding state, ignoring it", p.address) + } + return + } + if gocqlDebug { + Logger.Printf("scylla: %s remove shard %d connection", p.address, shard) + } + + if p.conns[shard] != nil { + p.conns[shard] = nil + p.nrConns-- + } +} + +func (p *scyllaConnPicker) InFlight() int { + result := 0 + for _, conn := range p.conns { + if conn != nil { + result = result + (conn.streams.InUse()) + } + } + return result +} + +func (p *scyllaConnPicker) Size() (int, int) { + return p.nrConns, p.nrShards - p.nrConns +} + +func (p *scyllaConnPicker) Close() { + p.closeConns() + p.closeExcessConns() +} + +func (p *scyllaConnPicker) closeConns() { + if len(p.conns) == 0 { + if gocqlDebug { + Logger.Printf("scylla: %s no connections to close", p.address) + } + return + } + + conns := p.conns + p.conns = nil + p.nrConns = 0 + + if gocqlDebug { + Logger.Printf("scylla: %s closing %d connections", p.address, len(conns)) + } + go closeConns(conns...) +} + +func (p *scyllaConnPicker) closeExcessConns() { + if len(p.excessConns) == 0 { + if gocqlDebug { + Logger.Printf("scylla: %s no excess connections to close", p.address) + } + return + } + + conns := p.excessConns + p.excessConns = nil + + if gocqlDebug { + Logger.Printf("scylla: %s closing %d excess connections", p.address, len(conns)) + } + go closeConns(conns...) +} + +// Closing must be done outside of hostConnPool lock. If holding a lock +// a deadlock can occur when closing one of the connections returns error on close. +// See scylladb/gocql#53. +func closeConns(conns ...*Conn) { + for _, conn := range conns { + if conn != nil { + conn.Close() + } + } +} + +// NextShard returns the shardID to connect to. +// nrShard specifies how many shards the host has. +// If nrShards is zero, the caller shouldn't use shard-aware port. 
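+//
+// For example (an illustrative walk-through): with nrShards = 4 and
+// lastAttemptedShard = 1, shards are probed in the order 2, 3, 0, 1, and the
+// first shard without an open connection is returned together with nrShards.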
+func (p *scyllaConnPicker) NextShard() (shardID, nrShards int) { + if p.shardAwarePortDisabled { + return 0, 0 + } + + disableUntil, _ := p.disableShardAwarePortUntil.Load().(time.Time) + if time.Now().Before(disableUntil) { + // There is suspicion that the shard-aware-port is not reachable + // or misconfigured, fall back to the non-shard-aware port + return 0, 0 + } + + // Find the shard without a connection + // It's important to start counting from 1 here because we want + // to consider the next shard after the previously attempted one + for i := 1; i <= p.nrShards; i++ { + shardID := (p.lastAttemptedShard + i) % p.nrShards + if p.conns == nil || p.conns[shardID] == nil { + p.lastAttemptedShard = shardID + return shardID, p.nrShards + } + } + + // We did not find an unallocated shard + // We will dial the non-shard-aware port + return 0, 0 +} + +// ShardDialer is like HostDialer but is shard-aware. +// If the driver wants to connect to a specific shard, it will call DialShard, +// otherwise it will call DialHost. +type ShardDialer interface { + HostDialer + + // DialShard establishes a connection to the specified shard ID out of nrShards. + // The returned connection must be directly usable for CQL protocol, + // specifically DialShard is responsible also for setting up the TLS session if needed. + DialShard(ctx context.Context, host *HostInfo, shardID, nrShards int) (*DialedHost, error) +} + +// A dialer which dials a particular shard +type scyllaDialer struct { + dialer Dialer + logger StdLogger + tlsConfig *tls.Config + cfg *ClusterConfig +} + +const scyllaShardAwarePortFallbackDuration time.Duration = 5 * time.Minute + +func (sd *scyllaDialer) DialHost(ctx context.Context, host *HostInfo) (*DialedHost, error) { + ip := host.ConnectAddress() + port := host.Port() + + if !validIpAddr(ip) { + return nil, fmt.Errorf("host missing connect ip address: %v", ip) + } else if port == 0 { + return nil, fmt.Errorf("host missing port: %v", port) + } + + addr := host.HostnameAndPort() + conn, err := sd.dialer.DialContext(ctx, "tcp", addr) + if err != nil { + return nil, err + } + return WrapTLS(ctx, conn, addr, sd.tlsConfig) +} + +func (sd *scyllaDialer) DialShard(ctx context.Context, host *HostInfo, shardID, nrShards int) (*DialedHost, error) { + ip := host.ConnectAddress() + port := host.Port() + + if !validIpAddr(ip) { + return nil, fmt.Errorf("host missing connect ip address: %v", ip) + } else if port == 0 { + return nil, fmt.Errorf("host missing port: %v", port) + } + + iter := newScyllaPortIterator(shardID, nrShards) + + addr := host.HostnameAndPort() + + var shardAwarePort uint16 + if sd.tlsConfig != nil { + shardAwarePort = host.ScyllaShardAwarePortTLS() + } else { + shardAwarePort = host.ScyllaShardAwarePort() + } + + var shardAwareAddress string + if shardAwarePort != 0 { + tIP, tPort := sd.cfg.translateAddressPort(host.UntranslatedConnectAddress(), int(shardAwarePort)) + shardAwareAddress = net.JoinHostPort(tIP.String(), strconv.Itoa(tPort)) + } + + if gocqlDebug { + sd.logger.Printf("scylla: connecting to shard %d", shardID) + } + + conn, err := sd.dialShardAware(ctx, addr, shardAwareAddress, iter) + if err != nil { + return nil, err + } + + return WrapTLS(ctx, conn, addr, sd.tlsConfig) +} + +func (sd *scyllaDialer) dialShardAware(ctx context.Context, addr, shardAwareAddr string, iter *scyllaPortIterator) (net.Conn, error) { + for { + port, ok := iter.Next() + if !ok { + // We exhausted ports to connect from. Try the non-shard-aware port. 
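+			// The iterator (see newScyllaPortIterator below) walks source
+			// ports p >= 0x8000 with p % nrShards == shardID; e.g. for shard 3
+			// of 4 it yields 32771, 32775, 32779, ... up to 65535.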
+			return sd.dialer.DialContext(ctx, "tcp", addr)
+		}
+
+		ctxWithPort := context.WithValue(ctx, scyllaSourcePortCtx{}, port)
+		conn, err := sd.dialer.DialContext(ctxWithPort, "tcp", shardAwareAddr)
+
+		if isLocalAddrInUseErr(err) {
+			// This indicates that the source port is already in use
+			// We can immediately retry with another source port for this shard
+			continue
+		} else if err != nil {
+			conn, err := sd.dialer.DialContext(ctx, "tcp", addr)
+			if err == nil {
+				// We failed to connect to the shard-aware port, but succeeded
+				// in connecting to the non-shard-aware port. This might
+				// indicate that the shard-aware port is just not reachable,
+				// but we may also be unlucky and the node became reachable
+				// just after we tried the first connection.
+				// We can't avoid false positives here, so I'm putting it
+				// behind a debug flag.
+				if gocqlDebug {
+					sd.logger.Printf(
+						"scylla: %s couldn't be reached on its shard-aware address %s although the non-shard-aware address is available; this might be an issue with ",
+						addr,
+						shardAwareAddr,
+					)
+				}
+			}
+			return conn, err
+		}
+		return conn, err
+	}
+}
+
+// ErrScyllaSourcePortAlreadyInUse is an error value which can be returned from
+// a custom dialer implementation to indicate that the requested source port
+// to dial from is already in use
+var ErrScyllaSourcePortAlreadyInUse = errors.New("scylla: source port is already in use")
+
+func isLocalAddrInUseErr(err error) bool {
+	return errors.Is(err, syscall.EADDRINUSE) || errors.Is(err, ErrScyllaSourcePortAlreadyInUse)
+}
+
+// ScyllaShardAwareDialer wraps a net.Dialer, but uses a source port specified by gocql when connecting.
+//
+// Unlike in the case of standard native transport ports, gocql can choose which shard will handle
+// a new connection by connecting from a specific source port. If you are using your own net.Dialer
+// in ClusterConfig, you can use ScyllaShardAwareDialer to "upgrade" it so that it connects
+// from the source port chosen by gocql.
+//
+// Please note that ScyllaShardAwareDialer overwrites the LocalAddr field in order to choose
+// the right source port for connection.
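+//
+// A minimal usage sketch (the contact point and timeout are hypothetical):
+//
+//	cluster := gocql.NewCluster("192.168.1.1")
+//	cluster.Dialer = &gocql.ScyllaShardAwareDialer{
+//		Dialer: net.Dialer{Timeout: 5 * time.Second},
+//	}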
+type ScyllaShardAwareDialer struct { + net.Dialer +} + +func (d *ScyllaShardAwareDialer) DialContext(ctx context.Context, network, addr string) (conn net.Conn, err error) { + sourcePort := ScyllaGetSourcePort(ctx) + if sourcePort == 0 { + return d.Dialer.DialContext(ctx, network, addr) + } + dialerWithLocalAddr := d.Dialer + dialerWithLocalAddr.LocalAddr, err = net.ResolveTCPAddr(network, fmt.Sprintf(":%d", sourcePort)) + if err != nil { + return nil, err + } + + return dialerWithLocalAddr.DialContext(ctx, network, addr) +} + +type scyllaPortIterator struct { + currentPort int + shardCount int +} + +const ( + scyllaPortBasedBalancingMin = 0x8000 + scyllaPortBasedBalancingMax = 0xFFFF +) + +func newScyllaPortIterator(shardID, shardCount int) *scyllaPortIterator { + if shardCount == 0 { + panic("shardCount cannot be 0") + } + + // Find the smallest port p such that p >= min and p % shardCount == shardID + port := scyllaPortBasedBalancingMin - scyllaShardForSourcePort(scyllaPortBasedBalancingMin, shardCount) + shardID + if port < scyllaPortBasedBalancingMin { + port += shardCount + } + + return &scyllaPortIterator{ + currentPort: port, + shardCount: shardCount, + } +} + +func (spi *scyllaPortIterator) Next() (uint16, bool) { + if spi == nil { + return 0, false + } + + p := spi.currentPort + + if p > scyllaPortBasedBalancingMax { + return 0, false + } + + spi.currentPort += spi.shardCount + return uint16(p), true +} + +func scyllaShardForSourcePort(sourcePort uint16, shardCount int) int { + return int(sourcePort) % shardCount +} + +type scyllaSourcePortCtx struct{} + +// ScyllaGetSourcePort returns the source port that should be used when connecting to a node. +// +// Unlike in the case standard native transport ports, gocql can choose which shard will handle +// a new connection at the shard-aware port by connecting from a specific source port. Therefore, +// if you are using a custom Dialer and your nodes expose shard-aware ports, your dialer should +// use the source port specified by gocql. +// +// If this function returns 0, then your dialer can use any source port. +// +// If you aren't using a custom dialer, gocql will use a default one which uses appropriate source port. +// If you are using net.Dialer, consider wrapping it in a gocql.ScyllaShardAwareDialer. 
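+//
+// A custom dialer would typically follow this sketch (myDialer and its inner
+// field are hypothetical):
+//
+//	func (d *myDialer) DialContext(ctx context.Context, network, addr string) (net.Conn, error) {
+//		sourcePort := gocql.ScyllaGetSourcePort(ctx)
+//		if sourcePort == 0 {
+//			return d.inner.DialContext(ctx, network, addr)
+//		}
+//		// bind the local end of the connection to sourcePort, then dial,
+//		// the way ScyllaShardAwareDialer.DialContext above does
+//		...
+//	}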
+func ScyllaGetSourcePort(ctx context.Context) uint16 { + sourcePort, _ := ctx.Value(scyllaSourcePortCtx{}).(uint16) + return sourcePort +} + +// Returns a partitioner specific to the table, or "nil" +// if the cluster-global partitioner should be used +func scyllaGetTablePartitioner(session *Session, keyspaceName, tableName string) (Partitioner, error) { + isCdc, err := scyllaIsCdcTable(session, keyspaceName, tableName) + if err != nil { + return nil, err + } + if isCdc { + return scyllaCDCPartitioner{}, nil + } + + return nil, nil +} diff --git a/vendor/github.com/gocql/gocql/scylla_cdc.go b/vendor/github.com/gocql/gocql/scylla_cdc.go new file mode 100644 index 0000000..c4f8d57 --- /dev/null +++ b/vendor/github.com/gocql/gocql/scylla_cdc.go @@ -0,0 +1,93 @@ +package gocql + +import ( + "encoding/binary" + "math" + "strings" +) + +// cdc partitioner + +const ( + scyllaCDCPartitionerName = "CDCPartitioner" + scyllaCDCPartitionerFullName = "com.scylladb.dht.CDCPartitioner" + + scyllaCDCPartitionKeyLength = 16 + scyllaCDCVersionMask = 0x0F + scyllaCDCMinSupportedVersion = 1 + scyllaCDCMaxSupportedVersion = 1 + + scyllaCDCMinToken = int64Token(math.MinInt64) + scyllaCDCLogTableNameSuffix = "_scylla_cdc_log" + scyllaCDCExtensionName = "cdc" +) + +type scyllaCDCPartitioner struct{} + +var _ Partitioner = scyllaCDCPartitioner{} + +func (p scyllaCDCPartitioner) Name() string { + return scyllaCDCPartitionerName +} + +func (p scyllaCDCPartitioner) Hash(partitionKey []byte) Token { + if len(partitionKey) < 8 { + // The key is too short to extract any sensible token, + // so return the min token instead + if gocqlDebug { + Logger.Printf("scylla: cdc partition key too short: %d < 8", len(partitionKey)) + } + return scyllaCDCMinToken + } + + upperQword := binary.BigEndian.Uint64(partitionKey[0:]) + + if gocqlDebug { + // In debug mode, do some more checks + + if len(partitionKey) != scyllaCDCPartitionKeyLength { + // The token has unrecognized format, but the first quadword + // should be the token value that we want + Logger.Printf("scylla: wrong size of cdc partition key: %d", len(partitionKey)) + } + + lowerQword := binary.BigEndian.Uint64(partitionKey[8:]) + version := lowerQword & scyllaCDCVersionMask + if version < scyllaCDCMinSupportedVersion || version > scyllaCDCMaxSupportedVersion { + // We don't support this version yet, + // the token may be wrong + Logger.Printf( + "scylla: unsupported version: %d is not in range [%d, %d]", + version, + scyllaCDCMinSupportedVersion, + scyllaCDCMaxSupportedVersion, + ) + } + } + + return int64Token(upperQword) +} + +func (p scyllaCDCPartitioner) ParseString(str string) Token { + return parseInt64Token(str) +} + +func scyllaIsCdcTable(session *Session, keyspaceName, tableName string) (bool, error) { + if !strings.HasSuffix(tableName, scyllaCDCLogTableNameSuffix) { + // Not a CDC table, use the default partitioner + return false, nil + } + + // Get the table metadata to see if it has the cdc partitioner set + keyspaceMeta, err := session.KeyspaceMetadata(keyspaceName) + if err != nil { + return false, err + } + + tableMeta, ok := keyspaceMeta.Tables[tableName] + if !ok { + return false, ErrNoMetadata + } + + return tableMeta.Options.Partitioner == scyllaCDCPartitionerFullName, nil +} diff --git a/vendor/github.com/gocql/gocql/serialization/ascii/marshal.go b/vendor/github.com/gocql/gocql/serialization/ascii/marshal.go new file mode 100644 index 0000000..d365de1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/ascii/marshal.go @@ -0,0 +1,28 @@ 
+package ascii + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case string: + return EncString(v) + case *string: + return EncStringR(v) + case []byte: + return EncBytes(v) + case *[]byte: + return EncBytesR(v) + default: + // Custom types (type MyString string) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/ascii/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/ascii/marshal_utils.go new file mode 100644 index 0000000..2213b45 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/ascii/marshal_utils.go @@ -0,0 +1,61 @@ +package ascii + +import ( + "fmt" + "reflect" +) + +func EncString(v string) ([]byte, error) { + return encString(v), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v), nil +} + +func EncBytes(v []byte) ([]byte, error) { + return v, nil +} + +func EncBytesR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return *v, nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.String: + return encString(v.String()), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal ascii: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return EncBytes(v.Bytes()) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal ascii: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal ascii: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encString(v string) []byte { + if v == "" { + return make([]byte, 0) + } + return []byte(v) +} diff --git a/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal.go new file mode 100644 index 0000000..79a5fff --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal.go @@ -0,0 +1,33 @@ +package ascii + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + case *[]byte: + return DecBytes(data, v) + case **[]byte: + return DecBytesR(data, v) + default: + // Custom types (type MyString string) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
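+		// e.g. with `type MyString string`: a *MyString argument is handled by
+		// DecReflect, while a **MyString (which can be set to nil for a CQL
+		// null) is handled by DecReflectR.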
+		rv := reflect.ValueOf(value)
+		rt := rv.Type()
+		if rt.Kind() != reflect.Ptr {
+			return fmt.Errorf("failed to unmarshal ascii: unsupported value type (%T)(%[1]v)", v)
+		}
+		if rt.Elem().Kind() != reflect.Ptr {
+			return DecReflect(data, rv)
+		}
+		return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal_utils.go
new file mode 100644
index 0000000..e4be4e0
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/ascii/unmarshal_utils.go
@@ -0,0 +1,166 @@
+package ascii
+
+import (
+	"fmt"
+	"reflect"
+)
+
+func errInvalidData(p []byte) error {
+	for i := range p {
+		if p[i] > 127 {
+			return fmt.Errorf("failed to unmarshal ascii: invalid character %s", string(p[i]))
+		}
+	}
+	return nil
+}
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal ascii: can not unmarshal into nil reference (%T)(%[1]v)", v)
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decString(p)
+	return errInvalidData(p)
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decStringR(p)
+	return errInvalidData(p)
+}
+
+func DecBytes(p []byte, v *[]byte) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decBytes(p)
+	return errInvalidData(p)
+}
+
+func DecBytesR(p []byte, v **[]byte) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decBytesR(p)
+	return errInvalidData(p)
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.String:
+		v.SetString(decString(p))
+	case reflect.Slice:
+		if v.Type().Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal ascii: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		v.SetBytes(decBytes(p))
+	default:
+		return fmt.Errorf("failed to unmarshal ascii: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+	return errInvalidData(p)
+}
+
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	switch ev := v.Type().Elem().Elem(); ev.Kind() {
+	case reflect.String:
+		return decReflectStringR(p, v)
+	case reflect.Slice:
+		if ev.Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal ascii: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		return decReflectBytesR(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal ascii: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectStringR(p []byte, v reflect.Value) error {
+	if len(p) == 0 {
+		if p == nil {
+			v.Elem().Set(reflect.Zero(v.Elem().Type()))
+		} else {
+			v.Elem().Set(reflect.New(v.Type().Elem().Elem()))
+		}
+		return nil
+	}
+	val := reflect.New(v.Type().Elem().Elem())
+	val.Elem().SetString(string(p))
+	v.Elem().Set(val)
+	return errInvalidData(p)
+}
+
+func decReflectBytesR(p []byte, v reflect.Value) error {
+	if len(p) == 0 {
+		if p == nil {
+			v.Elem().Set(reflect.Zero(v.Elem().Type()))
+		} else {
+			val := reflect.New(v.Type().Elem().Elem())
+			val.Elem().SetBytes(make([]byte, 0))
+			v.Elem().Set(val)
+		}
+		return nil
+	}
+	tmp := make([]byte, len(p))
+	copy(tmp, p)
+
+	val := reflect.New(v.Type().Elem().Elem())
+	val.Elem().SetBytes(tmp)
+	v.Elem().Set(val)
+	return errInvalidData(p)
+}
+
+func decString(p []byte) string {
+	if len(p) == 0 {
+		return ""
+	}
+	return string(p)
+}
+
+func decStringR(p []byte) *string {
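+	// nil input (a CQL null) yields a nil *string, while a present but empty
+	// value yields a pointer to the empty string.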
+ if len(p) == 0 { + if p == nil { + return nil + } + return new(string) + } + tmp := string(p) + return &tmp +} + +func decBytes(p []byte) []byte { + if len(p) == 0 { + if p == nil { + return nil + } + return make([]byte, 0) + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return tmp +} + +func decBytesR(p []byte) *[]byte { + if len(p) == 0 { + if p == nil { + return nil + } + tmp := make([]byte, 0) + return &tmp + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return &tmp +} diff --git a/vendor/github.com/gocql/gocql/serialization/bigint/marshal.go b/vendor/github.com/gocql/gocql/serialization/bigint/marshal.go new file mode 100644 index 0000000..8a92295 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/bigint/marshal.go @@ -0,0 +1,74 @@ +package bigint + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int16: + return EncInt16(v) + case int32: + return EncInt32(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
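+		// e.g. with `type MyInt int`: a MyInt value goes through EncReflect,
+		// a *MyInt through EncReflectR (a nil *MyInt encodes as a CQL null).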
+ rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/bigint/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/bigint/marshal_utils.go new file mode 100644 index 0000000..b01111b --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/bigint/marshal_utils.go @@ -0,0 +1,206 @@ +package bigint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var ( + maxBigInt = big.NewInt(math.MaxInt64) + minBigInt = big.NewInt(math.MinInt64) +) + +func EncInt8(v int8) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, 255, 255, 255, byte(v)}, nil + } + return []byte{0, 0, 0, 0, 0, 0, 0, byte(v)}, nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt8(*v) +} + +func EncInt16(v int16) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, 255, 255, byte(v >> 8), byte(v)}, nil + } + return []byte{0, 0, 0, 0, 0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt16(*v) +} + +func EncInt32(v int32) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil + } + return []byte{0, 0, 0, 0, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt32(*v) +} + +func EncInt64(v int64) ([]byte, error) { + return encInt64(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncInt(v int) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt(*v) +} + +func EncUint8(v uint8) ([]byte, error) { + return []byte{0, 0, 0, 0, 0, 0, 0, v}, nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint8(*v) +} + +func EncUint16(v uint16) ([]byte, error) { + return []byte{0, 0, 0, 0, 0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint16(*v) +} + +func EncUint32(v uint32) ([]byte, error) { + return []byte{0, 0, 0, 0, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint32(*v) +} + +func EncUint64(v uint64) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUint64R(v *uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint64(*v) +} + +func EncUint(v uint) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint(*v) +} + +func EncBigInt(v big.Int) ([]byte, error) { + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal bigint: value (%T)(%s) out of range", v, v.String()) + } + return encInt64(v.Int64()), nil +} + +func EncBigIntR(v 
*big.Int) ([]byte, error) { + if v == nil { + return nil, nil + } + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal bigint: value (%T)(%s) out of range", v, v.String()) + } + return encInt64(v.Int64()), nil +} + +func EncString(v string) ([]byte, error) { + if v == "" { + return nil, nil + } + + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal bigint: can not marshal (%T)(%[1]v) %s", v, err) + } + return encInt64(n), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Int, reflect.Int64, reflect.Int32, reflect.Int16, reflect.Int8: + return EncInt64(v.Int()) + case reflect.Uint, reflect.Uint64, reflect.Uint32, reflect.Uint16, reflect.Uint8: + return EncUint64(v.Uint()) + case reflect.String: + val := v.String() + if val == "" { + return nil, nil + } + n, err := strconv.ParseInt(val, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal bigint: can not marshal (%T)(%[1]v) %s", v.Interface(), err) + } + return encInt64(n), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal bigint: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal bigint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encInt64(v int64) []byte { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} diff --git a/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal.go new file mode 100644 index 0000000..eb26dad --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal.go @@ -0,0 +1,81 @@ +package bigint + +import ( + "fmt" + "math/big" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int8: + return DecInt8(data, v) + case *int16: + return DecInt16(data, v) + case *int32: + return DecInt32(data, v) + case *int64: + return DecInt64(data, v) + case *int: + return DecInt(data, v) + + case *uint8: + return DecUint8(data, v) + case *uint16: + return DecUint16(data, v) + case *uint32: + return DecUint32(data, v) + case *uint64: + return DecUint64(data, v) + case *uint: + return DecUint(data, v) + + case *big.Int: + return DecBigInt(data, v) + case *string: + return DecString(data, v) + + case **int8: + return DecInt8R(data, v) + case **int16: + return DecInt16R(data, v) + case **int32: + return DecInt32R(data, v) + case **int64: + return DecInt64R(data, v) + case **int: + return DecIntR(data, v) + + case **uint8: + return DecUint8R(data, v) + case **uint16: + return DecUint16R(data, v) + case **uint32: + return DecUint32R(data, v) + case **uint64: + return DecUint64R(data, v) + case **uint: + return DecUintR(data, v) + + case **big.Int: + return DecBigIntR(data, v) + case **string: + return DecStringR(data, v) + default: + + // Custom types (type MyInt int) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
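+		// e.g. with `type MyInt int`: pass a *MyInt to decode in place, or a
+		// **MyInt if a CQL null should come back as a nil pointer.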
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal bigint: unsupported value type (%T)(%[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal_utils.go new file mode 100644 index 0000000..d324483 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/bigint/unmarshal_utils.go @@ -0,0 +1,841 @@ +package bigint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal bigint: the length of the data should be 0 or 8") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal bigint: can not unmarshal into nil reference (%T)(%[1]v))", v) +} + +func DecInt8(p []byte, v *int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int8, the data should be in the int8 range") + } + *v = int8(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt8R(p []byte, v **int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int8) + } + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int8, the data should be in the int8 range") + } + tmp := int8(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt16(p []byte, v *int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int16, the data should be in the int16 range") + } + *v = int16(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt16R(p []byte, v **int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int16) + } + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int16, the data should be in the int16 range") + } + tmp := int16(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt32(p []byte, v *int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int32, the data should be in the int32 range") + } + *v = int32(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt32R(p []byte, v **int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int32) + } + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into int32, the data should be in the int32 range") + } + tmp := int32(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func 
DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decInt64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case 8: + val := decInt64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt(p []byte, v *int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = int(p[0])<<56 | int(p[1])<<48 | int(p[2])<<40 | int(p[3])<<32 | int(p[4])<<24 | int(p[5])<<16 | int(p[6])<<8 | int(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecIntR(p []byte, v **int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int) + } + case 8: + val := int(p[0])<<56 | int(p[1])<<48 | int(p[2])<<40 | int(p[3])<<32 | int(p[4])<<24 | int(p[5])<<16 | int(p[6])<<8 | int(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint8(p []byte, v *uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint8, the data should be in the uint8 range") + } + *v = p[7] + default: + return errWrongDataLen + } + return nil +} + +func DecUint8R(p []byte, v **uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint8) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint8, the data should be in the uint8 range") + } + val := p[7] + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint16(p []byte, v *uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint16, the data should be in the uint16 range") + } + *v = uint16(p[6])<<8 | uint16(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint16R(p []byte, v **uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint16) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint16, the data should be in the uint16 range") + } + val := uint16(p[6])<<8 | uint16(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint32(p []byte, v *uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint32, the data should be in the uint32 range") + } + *v = uint32(p[4])<<24 | uint32(p[5])<<16 | uint32(p[6])<<8 | uint32(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint32R(p []byte, v **uint32) error { + if v 
== nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint32) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into uint32, the data should be in the uint32 range") + } + val := uint32(p[4])<<24 | uint32(p[5])<<16 | uint32(p[6])<<8 | uint32(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint64(p []byte, v *uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decUint64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUint64R(p []byte, v **uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint64) + } + case 8: + val := decUint64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint(p []byte, v *uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = uint(p[0])<<56 | uint(p[1])<<48 | uint(p[2])<<40 | uint(p[3])<<32 | uint(p[4])<<24 | uint(p[5])<<16 | uint(p[6])<<8 | uint(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUintR(p []byte, v **uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint) + } + case 8: + val := uint(p[0])<<56 | uint(p[1])<<48 | uint(p[2])<<40 | uint(p[3])<<32 | uint(p[4])<<24 | uint(p[5])<<16 | uint(p[6])<<8 | uint(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "0" + } + case 8: + *v = strconv.FormatInt(decInt64(p), 10) + default: + return errWrongDataLen + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := "0" + *v = &val + } + case 8: + val := strconv.FormatInt(decInt64(p), 10) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecBigInt(p []byte, v *big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + v.SetInt64(0) + case 8: + v.SetInt64(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecBigIntR(p []byte, v **big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(big.Int) + } + case 8: + *v = big.NewInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal bigint: can not unmarshal into nil reference (%T)(%[1]v))", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int8: + return decReflectInt8(p, v) + case reflect.Int16: + return decReflectInt16(p, v) + case reflect.Int32: + return decReflectInt32(p, v) + case reflect.Int64, reflect.Int: + return decReflectInts(p, v) + case reflect.Uint8: + return decReflectUint8(p, v) + case reflect.Uint16: + return decReflectUint16(p, v) + case reflect.Uint32: + return decReflectUint32(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUints(p, v) + case reflect.String: + return decReflectString(p, v) + default: + 
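		// (Editor's note, not part of the upstream file: named types whose
		// underlying kind is neither an integer nor a string, for example a
		// hypothetical `type F float64`, fall through to this error branch.)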
return fmt.Errorf("failed to unmarshal bigint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int8 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int16 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int32 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInts(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + v.SetInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint8 range", v.Interface()) + } + v.SetUint(uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint16 range", v.Interface()) + } + v.SetUint(uint64(p[6])<<8 | uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint32 range", v.Interface()) + } + v.SetUint(uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUints(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + v.SetUint(decUint64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0") + } + case 8: + v.SetString(strconv.FormatInt(decInt64(p), 10)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal bigint: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int8: + return decReflectInt8R(p, v) + case reflect.Int16: + return decReflectInt16R(p, v) + case reflect.Int32: 
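		// (Editor's note, not part of the upstream file: every branch of
		// DecReflectR serves doubly-indirected named types, e.g. a hypothetical
		// `var v *MyInt32` passed as &v, so that CQL NULL can be decoded into
		// a nil pointer.)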
+ return decReflectInt32R(p, v) + case reflect.Int64, reflect.Int: + return decReflectIntsR(p, v) + case reflect.Uint8: + return decReflectUint8R(p, v) + case reflect.Uint16: + return decReflectUint16R(p, v) + case reflect.Uint32: + return decReflectUint32R(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUintsR(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal bigint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int8 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int16 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the int32 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetInt(decInt64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint8 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint16 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[6])<<8 | uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 
0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal bigint: to unmarshal into %T, the data should be in the uint32 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUintsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetUint(decUint64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0") + } + v.Elem().Set(val) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(strconv.FormatInt(decInt64(p), 10)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decInt64(p []byte) int64 { + return int64(p[0])<<56 | int64(p[1])<<48 | int64(p[2])<<40 | int64(p[3])<<32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7]) +} + +func decUint64(p []byte) uint64 { + return uint64(p[0])<<56 | uint64(p[1])<<48 | uint64(p[2])<<40 | uint64(p[3])<<32 | uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/blob/marshal.go b/vendor/github.com/gocql/gocql/serialization/blob/marshal.go new file mode 100644 index 0000000..7a646f4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/blob/marshal.go @@ -0,0 +1,28 @@ +package blob + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case string: + return EncString(v) + case *string: + return EncStringR(v) + case []byte: + return EncBytes(v) + case *[]byte: + return EncBytesR(v) + default: + // Custom types (type MyString string) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
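		// (Editor's illustrative sketch, not part of the upstream file: a named
		// type such as a hypothetical `type Payload []byte` matches none of the
		// cases above and is routed through the reflect path below, e.g.
		//
		//	data, err := Marshal(Payload{0xDE, 0xAD}) // EncReflect -> rv.Bytes()
		//
		// with the concrete behaviour defined by EncReflect in marshal_utils.go.)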
+ rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/blob/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/blob/marshal_utils.go new file mode 100644 index 0000000..4b3f59c --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/blob/marshal_utils.go @@ -0,0 +1,61 @@ +package blob + +import ( + "fmt" + "reflect" +) + +func EncString(v string) ([]byte, error) { + return encString(v), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v), nil +} + +func EncBytes(v []byte) ([]byte, error) { + return v, nil +} + +func EncBytesR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return *v, nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.String: + return encString(v.String()), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal blob: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return EncBytes(v.Bytes()) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal blob: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal blob: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encString(v string) []byte { + if v == "" { + return make([]byte, 0) + } + return []byte(v) +} diff --git a/vendor/github.com/gocql/gocql/serialization/blob/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/blob/unmarshal.go new file mode 100644 index 0000000..70cd59e --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/blob/unmarshal.go @@ -0,0 +1,35 @@ +package blob + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + case *[]byte: + return DecBytes(data, v) + case **[]byte: + return DecBytesR(data, v) + case *interface{}: + return DecInterface(data, v) + default: + // Custom types (type MyString string) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
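		// (Editor's illustrative sketch, not part of the upstream file: the
		// single-pointer forms decode in place, while the double-pointer forms
		// can represent CQL NULL as a nil pointer, e.g.
		//
		//	var s string
		//	_ = Unmarshal(data, &s)  // DecString: NULL and empty both yield ""
		//	var sp *string
		//	_ = Unmarshal(data, &sp) // DecStringR: NULL yields sp == nil
		// )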
+	rv := reflect.ValueOf(value)
+	rt := rv.Type()
+	if rt.Kind() != reflect.Ptr {
+		return fmt.Errorf("failed to unmarshal blob: unsupported value type (%T)(%[1]v)", v)
+	}
+	if rt.Elem().Kind() != reflect.Ptr {
+		return DecReflect(data, rv)
+	}
+	return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/blob/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/blob/unmarshal_utils.go
new file mode 100644
index 0000000..89039b4
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/blob/unmarshal_utils.go
@@ -0,0 +1,167 @@
+package blob
+
+import (
+	"fmt"
+	"reflect"
+)
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal blob: can not unmarshal into nil reference(%T)(%[1]v)", v)
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decString(p)
+	return nil
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decStringR(p)
+	return nil
+}
+
+func DecBytes(p []byte, v *[]byte) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decBytes(p)
+	return nil
+}
+
+func DecBytesR(p []byte, v **[]byte) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decBytesR(p)
+	return nil
+}
+
+func DecInterface(p []byte, v *interface{}) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	*v = decBytes(p)
+	return nil
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.String:
+		v.SetString(decString(p))
+	case reflect.Slice:
+		if v.Type().Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal blob: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		v.SetBytes(decBytes(p))
+	case reflect.Interface:
+		v.Set(reflect.ValueOf(decBytes(p)))
+	default:
+		return fmt.Errorf("failed to unmarshal blob: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+	return nil
+}
+
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	switch ev := v.Type().Elem().Elem(); ev.Kind() {
+	case reflect.String:
+		return decReflectStringR(p, v)
+	case reflect.Slice:
+		if ev.Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal blob: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		return decReflectBytesR(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal blob: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectStringR(p []byte, v reflect.Value) error {
+	if len(p) == 0 {
+		if p == nil {
+			v.Elem().Set(reflect.Zero(v.Type().Elem()))
+		} else {
+			v.Elem().Set(reflect.New(v.Type().Elem().Elem()))
+		}
+		return nil
+	}
+	val := reflect.New(v.Type().Elem().Elem())
+	val.Elem().SetString(string(p))
+	v.Elem().Set(val)
+	return nil
+}
+
+func decReflectBytesR(p []byte, v reflect.Value) error {
+	if len(p) == 0 {
+		if p == nil {
+			v.Elem().Set(reflect.Zero(v.Elem().Type()))
+		} else {
+			val := reflect.New(v.Type().Elem().Elem())
+			val.Elem().SetBytes(make([]byte, 0))
+			v.Elem().Set(val)
+		}
+		return nil
+	}
+	tmp := make([]byte, len(p))
+	copy(tmp, p)
+
+	val := reflect.New(v.Type().Elem().Elem())
+	val.Elem().SetBytes(tmp)
+	v.Elem().Set(val)
+	return nil
+}
+
+func decString(p []byte) string {
+	if len(p) == 0 {
+		return ""
+	}
+	return string(p)
+}
+
+func decStringR(p []byte) *string {
+	if len(p) == 0 {
+		if p == nil {
+			return nil
+		}
+		return new(string)
+	}
+	tmp := string(p)
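	// (Editor's note, not part of the upstream file: the string conversion
	// above copies p, and decBytes/decBytesR below copy into fresh slices,
	// presumably so that decoded values never alias the driver's internal
	// frame buffer.)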
+ return &tmp +} + +func decBytes(p []byte) []byte { + if len(p) == 0 { + if p == nil { + return nil + } + return make([]byte, 0) + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return tmp +} + +func decBytesR(p []byte) *[]byte { + if len(p) == 0 { + if p == nil { + return nil + } + tmp := make([]byte, 0) + return &tmp + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return &tmp +} diff --git a/vendor/github.com/gocql/gocql/serialization/boolean/marshal.go b/vendor/github.com/gocql/gocql/serialization/boolean/marshal.go new file mode 100644 index 0000000..36c9b29 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/boolean/marshal.go @@ -0,0 +1,24 @@ +package boolean + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case bool: + return EncBool(v) + case *bool: + return EncBoolR(v) + default: + // Custom types (type MyBool bool) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/boolean/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/boolean/marshal_utils.go new file mode 100644 index 0000000..1166cd3 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/boolean/marshal_utils.go @@ -0,0 +1,45 @@ +package boolean + +import ( + "fmt" + "reflect" +) + +func EncBool(v bool) ([]byte, error) { + return encBool(v), nil +} + +func EncBoolR(v *bool) ([]byte, error) { + if v == nil { + return nil, nil + } + return encBool(*v), nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Bool: + return encBool(v.Bool()), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal boolean: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal boolean: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encBool(v bool) []byte { + if v { + return []byte{1} + } + return []byte{0} +} diff --git a/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal.go new file mode 100644 index 0000000..0bf746c --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal.go @@ -0,0 +1,29 @@ +package boolean + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *bool: + return DecBool(data, v) + case **bool: + return DecBoolR(data, v) + default: + // Custom types (type MyBool bool) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
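		// (Editor's illustrative sketch, not part of the upstream file: a CQL
		// boolean is a single byte on the wire and any non-zero byte decodes
		// as true, e.g.
		//
		//	var b bool
		//	_ = Unmarshal([]byte{1}, &b) // b == true
		//	_ = Unmarshal(nil, &b)       // NULL decodes as false
		// )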
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal boolean: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal_utils.go new file mode 100644 index 0000000..53c21da --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/boolean/unmarshal_utils.go @@ -0,0 +1,108 @@ +package boolean + +import ( + "fmt" + "reflect" +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal boolean: the length of the data should be 0 or 1") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal boolean: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecBool(p []byte, v *bool) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = false + case 1: + *v = decBool(p) + default: + return errWrongDataLen + } + return nil +} + +func DecBoolR(p []byte, v **bool) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(bool) + } + case 1: + val := decBool(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Bool: + return decReflectBool(p, v) + default: + return fmt.Errorf("failed to unmarshal boolean: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Bool: + return decReflectBoolR(p, v) + default: + return fmt.Errorf("failed to unmarshal boolean: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectBool(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetBool(false) + case 1: + v.SetBool(decBool(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectBoolR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Elem().Set(reflect.Zero(v.Type().Elem())) + } else { + val := reflect.New(v.Type().Elem().Elem()) + v.Elem().Set(val) + } + case 1: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetBool(decBool(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decBool(p []byte) bool { + return p[0] != 0 +} diff --git a/vendor/github.com/gocql/gocql/serialization/counter/marshal.go b/vendor/github.com/gocql/gocql/serialization/counter/marshal.go new file mode 100644 index 0000000..7193787 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/counter/marshal.go @@ -0,0 +1,74 @@ +package counter + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int16: + return EncInt16(v) + case int32: + return EncInt32(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return 
EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/counter/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/counter/marshal_utils.go new file mode 100644 index 0000000..339ce6c --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/counter/marshal_utils.go @@ -0,0 +1,206 @@ +package counter + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var ( + maxBigInt = big.NewInt(math.MaxInt64) + minBigInt = big.NewInt(math.MinInt64) +) + +func EncInt8(v int8) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, 255, 255, 255, byte(v)}, nil + } + return []byte{0, 0, 0, 0, 0, 0, 0, byte(v)}, nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt8(*v) +} + +func EncInt16(v int16) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, 255, 255, byte(v >> 8), byte(v)}, nil + } + return []byte{0, 0, 0, 0, 0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt16(*v) +} + +func EncInt32(v int32) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, 255, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil + } + return []byte{0, 0, 0, 0, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt32(*v) +} + +func EncInt64(v int64) ([]byte, error) { + return encInt64(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncInt(v int) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt(*v) +} + +func EncUint8(v uint8) ([]byte, error) { + return []byte{0, 0, 0, 0, 0, 0, 0, v}, nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint8(*v) +} + +func EncUint16(v uint16) ([]byte, error) { + return []byte{0, 0, 0, 0, 0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint16(*v) +} + +func EncUint32(v uint32) ([]byte, error) { + return []byte{0, 0, 0, 0, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint32(*v) +} + +func EncUint64(v uint64) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func 
EncUint64R(v *uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint64(*v) +} + +func EncUint(v uint) ([]byte, error) { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint(*v) +} + +func EncBigInt(v big.Int) ([]byte, error) { + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal counter: value (%T)(%s) out of range", v, v.String()) + } + return encInt64(v.Int64()), nil +} + +func EncBigIntR(v *big.Int) ([]byte, error) { + if v == nil { + return nil, nil + } + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal counter: value (%T)(%s) out of range", v, v.String()) + } + return encInt64(v.Int64()), nil +} + +func EncString(v string) ([]byte, error) { + if v == "" { + return nil, nil + } + + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal counter: can not marshal %#v %s", v, err) + } + return encInt64(n), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Int, reflect.Int64, reflect.Int32, reflect.Int16, reflect.Int8: + return EncInt64(v.Int()) + case reflect.Uint, reflect.Uint64, reflect.Uint32, reflect.Uint16, reflect.Uint8: + return EncUint64(v.Uint()) + case reflect.String: + val := v.String() + if val == "" { + return nil, nil + } + n, err := strconv.ParseInt(val, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal counter: can not marshal (%T)(%[1]v) %s", v.Interface(), err) + } + return encInt64(n), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal counter: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal counter: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encInt64(v int64) []byte { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} diff --git a/vendor/github.com/gocql/gocql/serialization/counter/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/counter/unmarshal.go new file mode 100644 index 0000000..db18450 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/counter/unmarshal.go @@ -0,0 +1,81 @@ +package counter + +import ( + "fmt" + "math/big" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int8: + return DecInt8(data, v) + case *int16: + return DecInt16(data, v) + case *int32: + return DecInt32(data, v) + case *int64: + return DecInt64(data, v) + case *int: + return DecInt(data, v) + + case *uint8: + return DecUint8(data, v) + case *uint16: + return DecUint16(data, v) + case *uint32: + return DecUint32(data, v) + case *uint64: + return DecUint64(data, v) + case *uint: + return DecUint(data, v) + + case *big.Int: + return DecBigInt(data, v) + case *string: + return DecString(data, v) + + case **int8: + return DecInt8R(data, v) + case **int16: + return DecInt16R(data, v) + case 
**int32: + return DecInt32R(data, v) + case **int64: + return DecInt64R(data, v) + case **int: + return DecIntR(data, v) + + case **uint8: + return DecUint8R(data, v) + case **uint16: + return DecUint16R(data, v) + case **uint32: + return DecUint32R(data, v) + case **uint64: + return DecUint64R(data, v) + case **uint: + return DecUintR(data, v) + + case **big.Int: + return DecBigIntR(data, v) + case **string: + return DecStringR(data, v) + default: + + // Custom types (type MyInt int) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal counter: unsupported value type (%T)(%[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/counter/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/counter/unmarshal_utils.go new file mode 100644 index 0000000..59194dc --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/counter/unmarshal_utils.go @@ -0,0 +1,841 @@ +package counter + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal counter: the length of the data should be 0 or 8") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal counter: can not unmarshal into nil reference (%T)(%[1]v))", v) +} + +func DecInt8(p []byte, v *int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into int8, the data should be in the int8 range") + } + *v = int8(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt8R(p []byte, v **int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int8) + } + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into int8, the data should be in the int8 range") + } + tmp := int8(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt16(p []byte, v *int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into int16, the data should be in the int16 range") + } + *v = int16(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt16R(p []byte, v **int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int16) + } + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into int16, the data should be in the int16 range") + } + tmp := int16(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt32(p []byte, v *int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to 
unmarshal counter: to unmarshal into int32, the data should be in the int32 range") + } + *v = int32(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt32R(p []byte, v **int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int32) + } + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into int32, the data should be in the int32 range") + } + tmp := int32(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decInt64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case 8: + val := decInt64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt(p []byte, v *int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = int(p[0])<<56 | int(p[1])<<48 | int(p[2])<<40 | int(p[3])<<32 | int(p[4])<<24 | int(p[5])<<16 | int(p[6])<<8 | int(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecIntR(p []byte, v **int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int) + } + case 8: + val := int(p[0])<<56 | int(p[1])<<48 | int(p[2])<<40 | int(p[3])<<32 | int(p[4])<<24 | int(p[5])<<16 | int(p[6])<<8 | int(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint8(p []byte, v *uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint8, the data should be in the uint8 range") + } + *v = p[7] + default: + return errWrongDataLen + } + return nil +} + +func DecUint8R(p []byte, v **uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint8) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint8, the data should be in the uint8 range") + } + val := p[7] + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint16(p []byte, v *uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint16, the data should be in the uint16 range") + } + *v = uint16(p[6])<<8 | uint16(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint16R(p []byte, v **uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint16) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint16, the data should 
be in the uint16 range") + } + val := uint16(p[6])<<8 | uint16(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint32(p []byte, v *uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint32, the data should be in the uint32 range") + } + *v = uint32(p[4])<<24 | uint32(p[5])<<16 | uint32(p[6])<<8 | uint32(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint32R(p []byte, v **uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint32) + } + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into uint32, the data should be in the uint32 range") + } + val := uint32(p[4])<<24 | uint32(p[5])<<16 | uint32(p[6])<<8 | uint32(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint64(p []byte, v *uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decUint64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUint64R(p []byte, v **uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint64) + } + case 8: + val := decUint64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint(p []byte, v *uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = uint(p[0])<<56 | uint(p[1])<<48 | uint(p[2])<<40 | uint(p[3])<<32 | uint(p[4])<<24 | uint(p[5])<<16 | uint(p[6])<<8 | uint(p[7]) + default: + return errWrongDataLen + } + return nil +} + +func DecUintR(p []byte, v **uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint) + } + case 8: + val := uint(p[0])<<56 | uint(p[1])<<48 | uint(p[2])<<40 | uint(p[3])<<32 | uint(p[4])<<24 | uint(p[5])<<16 | uint(p[6])<<8 | uint(p[7]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "0" + } + case 8: + *v = strconv.FormatInt(decInt64(p), 10) + default: + return errWrongDataLen + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := "0" + *v = &val + } + case 8: + val := strconv.FormatInt(decInt64(p), 10) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecBigInt(p []byte, v *big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + v.SetInt64(0) + case 8: + v.SetInt64(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecBigIntR(p []byte, v **big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(big.Int) + } + case 8: + *v = big.NewInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to 
unmarshal counter: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int8: + return decReflectInt8(p, v) + case reflect.Int16: + return decReflectInt16(p, v) + case reflect.Int32: + return decReflectInt32(p, v) + case reflect.Int64, reflect.Int: + return decReflectInts(p, v) + case reflect.Uint8: + return decReflectUint8(p, v) + case reflect.Uint16: + return decReflectUint16(p, v) + case reflect.Uint32: + return decReflectUint32(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUints(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal counter: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int8 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int16 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int32 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInts(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + v.SetInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint8 range", v.Interface()) + } + v.SetUint(uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint16 range", v.Interface()) + } + v.SetUint(uint64(p[6])<<8 | uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint32 range", v.Interface()) + } + v.SetUint(uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUints(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 8: + v.SetUint(decUint64(p)) + default: + return errWrongDataLen 
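		// (Editor's note, not part of the upstream file: like bigint, a
		// counter is an 8-byte big-endian two's-complement integer on the
		// wire; see decInt64/decUint64 at the bottom of this file.)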
+ } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0") + } + case 8: + v.SetString(strconv.FormatInt(decInt64(p), 10)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal counter: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int8: + return decReflectInt8R(p, v) + case reflect.Int16: + return decReflectInt16R(p, v) + case reflect.Int32: + return decReflectInt32R(p, v) + case reflect.Int64, reflect.Int: + return decReflectIntsR(p, v) + case reflect.Uint8: + return decReflectUint8R(p, v) + case reflect.Uint16: + return decReflectUint16R(p, v) + case reflect.Uint32: + return decReflectUint32R(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUintsR(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal counter: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int8 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt16 || val < math.MinInt16 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int16 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectInt32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := decInt64(p) + if val > math.MaxInt32 || val < math.MinInt32 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the int32 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetInt(decInt64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 || p[6] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint8 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint16R(p []byte, v reflect.Value) error { + switch len(p) { 
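	// (Editor's note, not part of the upstream file: a zero-length payload is
	// ambiguous: p == nil means CQL NULL and yields a nil pointer, while a
	// non-nil empty slice yields a pointer to the zero value; this is exactly
	// the distinction decReflectNullableR below encodes.)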
+ case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 || p[4] != 0 || p[5] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint16 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[6])<<8 | uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + if p[0] != 0 || p[1] != 0 || p[2] != 0 || p[3] != 0 { + return fmt.Errorf("failed to unmarshal counter: to unmarshal into %T, the data should be in the uint32 range", v.Interface()) + } + newVal.Elem().SetUint(uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUintsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetUint(decUint64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0") + } + v.Elem().Set(val) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(strconv.FormatInt(decInt64(p), 10)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decInt64(p []byte) int64 { + return int64(p[0])<<56 | int64(p[1])<<48 | int64(p[2])<<40 | int64(p[3])<<32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7]) +} + +func decUint64(p []byte) uint64 { + return uint64(p[0])<<56 | uint64(p[1])<<48 | uint64(p[2])<<40 | uint64(p[3])<<32 | uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/cqlint/marshal.go b/vendor/github.com/gocql/gocql/serialization/cqlint/marshal.go new file mode 100644 index 0000000..b4d6bee --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/cqlint/marshal.go @@ -0,0 +1,74 @@ +package cqlint + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int32: + return EncInt32(v) + case int16: + return EncInt16(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + 
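	// (Editor's note, not part of the upstream file: in each pointer case a
	// nil pointer marshals to a nil payload, which the driver writes as CQL
	// NULL; see the Enc...R helpers in marshal_utils.go.)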
return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/cqlint/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/cqlint/marshal_utils.go new file mode 100644 index 0000000..4e71d78 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/cqlint/marshal_utils.go @@ -0,0 +1,249 @@ +package cqlint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var ( + maxBigInt = big.NewInt(math.MaxInt32) + minBigInt = big.NewInt(math.MinInt32) +) + +func EncInt8(v int8) ([]byte, error) { + if v < 0 { + return []byte{255, 255, 255, byte(v)}, nil + } + return []byte{0, 0, 0, byte(v)}, nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt8(*v) +} + +func EncInt16(v int16) ([]byte, error) { + if v < 0 { + return []byte{255, 255, byte(v >> 8), byte(v)}, nil + } + return []byte{0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt16(*v) +} + +func EncInt32(v int32) ([]byte, error) { + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt32(*v) +} + +func EncInt64(v int64) ([]byte, error) { + if v > math.MaxInt32 || v < math.MinInt32 { + return nil, fmt.Errorf("failed to marshal int: value %#v out of range", v) + } + return encInt64(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncInt(v int) ([]byte, error) { + if v > math.MaxInt32 || v < math.MinInt32 { + return nil, fmt.Errorf("failed to marshal int: value %#v out of range", v) + } + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt(*v) +} + +func EncUint8(v uint8) ([]byte, error) { + return []byte{0, 0, 0, v}, nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint8(*v) +} + +func EncUint16(v uint16) ([]byte, error) { + return []byte{0, 0, byte(v >> 8), byte(v)}, nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint16(*v) +} + +func EncUint32(v uint32) ([]byte, error) { + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint32(*v) +} + +func EncUint64(v uint64) ([]byte, error) { + if v > math.MaxUint32 { + return nil, fmt.Errorf("failed to marshal int: value %#v out of range", v) + } + return encUint64(v), nil +} + +func EncUint64R(v *uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint64(*v) +} + +func EncUint(v uint) ([]byte, error) { + if v > math.MaxUint32 { + return nil, fmt.Errorf("failed to marshal int: value %#v out of range", v) + } + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return 
+		return nil, nil
+	}
+	return EncUint(*v)
+}
+
+func EncBigInt(v big.Int) ([]byte, error) {
+	if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 {
+		return nil, fmt.Errorf("failed to marshal int: value (%T)(%s) out of range", v, v.String())
+	}
+	return encInt64(v.Int64()), nil
+}
+
+func EncBigIntR(v *big.Int) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 {
+		return nil, fmt.Errorf("failed to marshal int: value (%T)(%s) out of range", v, v.String())
+	}
+	return encInt64(v.Int64()), nil
+}
+
+func EncString(v string) ([]byte, error) {
+	if v == "" {
+		return nil, nil
+	}
+
+	n, err := strconv.ParseInt(v, 10, 32)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal int: can not marshal (%T)(%[1]v) %s", v, err)
+	}
+	return encInt64(n), nil
+}
+
+func EncStringR(v *string) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncString(*v)
+}
+
+func EncReflect(v reflect.Value) ([]byte, error) {
+	switch v.Type().Kind() {
+	case reflect.Int8:
+		val := v.Int()
+		if val < 0 {
+			return []byte{255, 255, 255, byte(val)}, nil
+		}
+		return []byte{0, 0, 0, byte(val)}, nil
+	case reflect.Int16:
+		val := v.Int()
+		if val < 0 {
+			return []byte{255, 255, byte(val >> 8), byte(val)}, nil
+		}
+		return []byte{0, 0, byte(val >> 8), byte(val)}, nil
+	case reflect.Int32:
+		return encInt64(v.Int()), nil
+	case reflect.Int, reflect.Int64:
+		val := v.Int()
+		if val > math.MaxInt32 || val < math.MinInt32 {
+			return nil, fmt.Errorf("failed to marshal int: value (%T)(%[1]v) out of range", v.Interface())
+		}
+		return encInt64(val), nil
+	case reflect.Uint8:
+		return []byte{0, 0, 0, byte(v.Uint())}, nil
+	case reflect.Uint16:
+		val := v.Uint()
+		return []byte{0, 0, byte(val >> 8), byte(val)}, nil
+	case reflect.Uint32:
+		return encUint64(v.Uint()), nil
+	case reflect.Uint, reflect.Uint64:
+		val := v.Uint()
+		if val > math.MaxUint32 {
+			return nil, fmt.Errorf("failed to marshal int: value (%T)(%[1]v) out of range", v.Interface())
+		}
+		return encUint64(val), nil
+	case reflect.String:
+		val := v.String()
+		if val == "" {
+			return nil, nil
+		}
+
+		n, err := strconv.ParseInt(val, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("failed to marshal int: can not marshal (%T)(%[1]v) %s", v.Interface(), err)
+		}
+		return encInt64(n), nil
+	case reflect.Struct:
+		if v.Type().String() == "gocql.unsetColumn" {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("failed to marshal int: unsupported value type (%T)(%[1]v)", v.Interface())
+	default:
+		return nil, fmt.Errorf("failed to marshal int: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func EncReflectR(v reflect.Value) ([]byte, error) {
+	if v.IsNil() {
+		return nil, nil
+	}
+	return EncReflect(v.Elem())
+}
+
+func encInt64(v int64) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encUint64(v uint64) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal.go
new file mode 100644
index 0000000..3d0e611
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal.go
@@ -0,0 +1,81 @@
+package cqlint
+
+import (
+	"fmt"
+	"math/big"
+	"reflect"
+)
+
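+// Editor's note (a sketch of the convention used below, not upstream text):
+// zero-length column data decodes to the zero value through a single pointer
+// (*int32 -> 0), while a double pointer additionally distinguishes NULL from
+// empty data, e.g. var v *int32; Unmarshal(nil, &v) leaves v == nil.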
+func Unmarshal(data []byte, value interface{}) error {
+	switch v := value.(type) {
+	case nil:
+		return nil
+
+	case *int8:
+		return DecInt8(data, v)
+	case *int16:
+		return DecInt16(data, v)
+	case *int32:
+		return DecInt32(data, v)
+	case *int64:
+		return DecInt64(data, v)
+	case *int:
+		return DecInt(data, v)
+
+	case *uint8:
+		return DecUint8(data, v)
+	case *uint16:
+		return DecUint16(data, v)
+	case *uint32:
+		return DecUint32(data, v)
+	case *uint64:
+		return DecUint64(data, v)
+	case *uint:
+		return DecUint(data, v)
+
+	case *big.Int:
+		return DecBigInt(data, v)
+	case *string:
+		return DecString(data, v)
+
+	case **int8:
+		return DecInt8R(data, v)
+	case **int16:
+		return DecInt16R(data, v)
+	case **int32:
+		return DecInt32R(data, v)
+	case **int64:
+		return DecInt64R(data, v)
+	case **int:
+		return DecIntR(data, v)
+
+	case **uint8:
+		return DecUint8R(data, v)
+	case **uint16:
+		return DecUint16R(data, v)
+	case **uint32:
+		return DecUint32R(data, v)
+	case **uint64:
+		return DecUint64R(data, v)
+	case **uint:
+		return DecUintR(data, v)
+
+	case **big.Int:
+		return DecBigIntR(data, v)
+	case **string:
+		return DecStringR(data, v)
+	default:
+
+		// Custom types (type MyInt int) can be deserialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.ValueOf(value)
+		rt := rv.Type()
+		if rt.Kind() != reflect.Ptr {
+			return fmt.Errorf("failed to unmarshal int: unsupported value type (%T)(%[1]v)", value)
+		}
+		if rt.Elem().Kind() != reflect.Ptr {
+			return DecReflect(data, rv)
+		}
+		return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal_utils.go
new file mode 100644
index 0000000..eb72cd6
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqlint/unmarshal_utils.go
@@ -0,0 +1,772 @@
+package cqlint
+
+import (
+	"fmt"
+	"math"
+	"math/big"
+	"reflect"
+	"strconv"
+)
+
+const (
+	negInt64 = int64(-1) << 32
+	negInt   = int(-1) << 32
+)
+
+var errWrongDataLen = fmt.Errorf("failed to unmarshal int: the length of the data should be 0 or 4")
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal int: can not unmarshal into nil reference (%T)(%[1]v))", v)
+}
+
+func DecInt8(p []byte, v *int8) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		val := decInt32(p)
+		if val > math.MaxInt8 || val < math.MinInt8 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into int8, the data should be in the int8 range")
+		}
+		*v = int8(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt8R(p []byte, v **int8) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int8)
+		}
+	case 4:
+		val := decInt32(p)
+		if val > math.MaxInt8 || val < math.MinInt8 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into int8, the data should be in the int8 range")
+		}
+		tmp := int8(val)
+		*v = &tmp
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt16(p []byte, v *int16) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		val := decInt32(p)
+		if val > math.MaxInt16 || val < math.MinInt16 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into int16, the data should be in the int16 range")
+		}
+		*v = int16(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt16R(p []byte, v **int16) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int16)
+		}
+	case 4:
+		val := decInt32(p)
+		if val > math.MaxInt16 || val < math.MinInt16 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into int16, the data should be in the int16 range")
+		}
+		tmp := int16(val)
+		*v = &tmp
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt32(p []byte, v *int32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decInt32(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt32R(p []byte, v **int32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int32)
+		}
+	case 4:
+		tmp := decInt32(p)
+		*v = &tmp
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt64(p []byte, v *int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decInt64(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt64R(p []byte, v **int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int64)
+		}
+	case 4:
+		val := decInt64(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt(p []byte, v *int) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decInt(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecIntR(p []byte, v **int) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int)
+		}
+	case 4:
+		val := decInt(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint8(p []byte, v *uint8) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		if p[0] != 0 || p[1] != 0 || p[2] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into uint8, the data should be in the uint8 range")
+		}
+		*v = p[3]
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint8R(p []byte, v **uint8) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint8)
+		}
+	case 4:
+		if p[0] != 0 || p[1] != 0 || p[2] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into uint8, the data should be in the uint8 range")
+		}
+		val := p[3]
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint16(p []byte, v *uint16) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		if p[0] != 0 || p[1] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into uint16, the data should be in the uint16 range")
+		}
+		*v = uint16(p[2])<<8 | uint16(p[3])
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint16R(p []byte, v **uint16) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint16)
+		}
+	case 4:
+		if p[0] != 0 || p[1] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into uint16, the data should be in the uint16 range")
+		}
+		val := uint16(p[2])<<8 | uint16(p[3])
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint32(p []byte, v *uint32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3])
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint32R(p []byte, v **uint32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint32)
+		}
+	case 4:
+		val := uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3])
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint64(p []byte, v *uint64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decUint64(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint64R(p []byte, v **uint64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint64)
+		}
+	case 4:
+		val := decUint64(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint(p []byte, v *uint) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = uint(p[0])<<24 | uint(p[1])<<16 | uint(p[2])<<8 | uint(p[3])
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUintR(p []byte, v **uint) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint)
+		}
+	case 4:
+		val := uint(p[0])<<24 | uint(p[1])<<16 | uint(p[2])<<8 | uint(p[3])
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = ""
+		} else {
+			*v = "0"
+		}
+	case 4:
+		*v = strconv.FormatInt(decInt64(p), 10)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			val := "0"
+			*v = &val
+		}
+	case 4:
+		val := strconv.FormatInt(decInt64(p), 10)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecBigInt(p []byte, v *big.Int) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		v.SetInt64(0)
+	case 4:
+		v.SetInt64(decInt64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecBigIntR(p []byte, v **big.Int) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = big.NewInt(0)
+		}
+	case 4:
+		*v = big.NewInt(decInt64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal int: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface())
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.Int8:
+		return decReflectInt8(p, v)
+	case reflect.Int16:
+		return decReflectInt16(p, v)
+	case reflect.Int32, reflect.Int64, reflect.Int:
+		return decReflectInts(p, v)
+	case reflect.Uint8:
+		return decReflectUint8(p, v)
+	case reflect.Uint16:
+		return decReflectUint16(p, v)
+	case reflect.Uint32, reflect.Uint64, reflect.Uint:
+		return decReflectUints(p, v)
+	case reflect.String:
+		return decReflectString(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal int: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
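+// Editor's note (illustrative, assuming a custom type such as `type MyInt int32`):
+// DecReflect above handles *MyInt by writing through one level of indirection,
+// whereas DecReflectR below handles **MyInt so that NULL data can set the
+// inner pointer itself to nil.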
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal int: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface())
+	}
+
+	switch v.Type().Elem().Elem().Kind() {
+	case reflect.Int8:
+		return decReflectInt8R(p, v)
+	case reflect.Int16:
+		return decReflectInt16R(p, v)
+	case reflect.Int32, reflect.Int64, reflect.Int:
+		return decReflectIntsR(p, v)
+	case reflect.Uint8:
+		return decReflectUint8R(p, v)
+	case reflect.Uint16:
+		return decReflectUint16R(p, v)
+	case reflect.Uint32, reflect.Uint64, reflect.Uint:
+		return decReflectUintsR(p, v)
+	case reflect.String:
+		return decReflectStringR(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal int: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectInt8(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(0)
+	case 4:
+		val := decInt64(p)
+		if val > math.MaxInt8 || val < math.MinInt8 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the int8 range", v.Interface())
+		}
+		v.SetInt(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectInt16(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(0)
+	case 4:
+		val := decInt64(p)
+		if val > math.MaxInt16 || val < math.MinInt16 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the int16 range", v.Interface())
+		}
+		v.SetInt(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectInts(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(0)
+	case 4:
+		v.SetInt(decInt64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint8(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetUint(0)
+	case 4:
+		if p[0] != 0 || p[1] != 0 || p[2] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the uint8 range", v.Interface())
+		}
+		v.SetUint(uint64(p[3]))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint16(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetUint(0)
+	case 4:
+		if p[0] != 0 || p[1] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the uint16 range", v.Interface())
+		}
+		v.SetUint(uint64(p[2])<<8 | uint64(p[3]))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUints(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetUint(0)
+	case 4:
+		v.SetUint(decUint64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectString(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		if p == nil {
+			v.SetString("")
+		} else {
+			v.SetString("0")
+		}
+	case 4:
+		v.SetString(strconv.FormatInt(decInt64(p), 10))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectNullableR(p []byte, v reflect.Value) reflect.Value {
+	if p == nil {
+		return reflect.Zero(v.Elem().Type())
+	}
+	return reflect.New(v.Type().Elem().Elem())
+}
+
+func decReflectInt8R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		val := decInt64(p)
+		if val > math.MaxInt8 || val < math.MinInt8 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the int8 range", v.Interface())
+		}
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetInt(val)
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectInt16R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		val := decInt64(p)
+		if val > math.MaxInt16 || val < math.MinInt16 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the int16 range", v.Interface())
+		}
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetInt(val)
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectIntsR(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetInt(decInt64(p))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint8R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		if p[0] != 0 || p[1] != 0 || p[2] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the uint8 range", v.Interface())
+		}
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetUint(uint64(p[3]))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint16R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		if p[0] != 0 || p[1] != 0 {
+			return fmt.Errorf("failed to unmarshal int: to unmarshal into (%T), the data should be in the uint16 range", v.Interface())
+		}
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetUint(uint64(p[2])<<8 | uint64(p[3]))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUintsR(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetUint(decUint64(p))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectStringR(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		var val reflect.Value
+		if p == nil {
+			val = reflect.Zero(v.Type().Elem())
+		} else {
+			val = reflect.New(v.Type().Elem().Elem())
+			val.Elem().SetString("0")
+		}
+		v.Elem().Set(val)
+	case 4:
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetString(strconv.FormatInt(decInt64(p), 10))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decInt32(p []byte) int32 {
+	return int32(p[0])<<24 | int32(p[1])<<16 | int32(p[2])<<8 | int32(p[3])
+}
+
+func decInt64(p []byte) int64 {
+	if p[0] > math.MaxInt8 {
+		return negInt64 | int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+	}
+	return int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+}
+
+func decInt(p []byte) int {
+	if p[0] > math.MaxInt8 {
+		return negInt | int(p[0])<<24 | int(p[1])<<16 | int(p[2])<<8 | int(p[3])
+	}
+	return int(p[0])<<24 | int(p[1])<<16 | int(p[2])<<8 | int(p[3])
+}
+
+func decUint64(p []byte) uint64 {
+	return uint64(p[0])<<24 | uint64(p[1])<<16 | uint64(p[2])<<8 | uint64(p[3])
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqltime/marshal.go b/vendor/github.com/gocql/gocql/serialization/cqltime/marshal.go
new file mode 100644
index 0000000..e33ca4d
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqltime/marshal.go
@@ -0,0 +1,30 @@
+package cqltime
+
+import (
+	"reflect"
+	"time"
+)
+
+func Marshal(value interface{}) ([]byte, error) {
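+	// Editor's note (a sketch of the `time` semantics, not upstream text):
+	// CQL `time` is a signed 8-byte count of nanoseconds since midnight, so
+	// encodable values lie in [0, 86399999999999]; for example,
+	// Marshal(int64(3600000000000)) would represent 01:00:00.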
+	switch v := value.(type) {
+	case nil:
+		return nil, nil
+	case int64:
+		return EncInt64(v)
+	case *int64:
+		return EncInt64R(v)
+	case time.Duration:
+		return EncDuration(v)
+	case *time.Duration:
+		return EncDurationR(v)
+
+	default:
+		// Custom types (type MyTime int64) can be serialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.TypeOf(value)
+		if rv.Kind() != reflect.Ptr {
+			return EncReflect(reflect.ValueOf(v))
+		}
+		return EncReflectR(reflect.ValueOf(v))
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqltime/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/cqltime/marshal_utils.go
new file mode 100644
index 0000000..bfc4681
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqltime/marshal_utils.go
@@ -0,0 +1,76 @@
+package cqltime
+
+import (
+	"fmt"
+	"reflect"
+	"time"
+)
+
+const (
+	maxValInt64 int64         = 86399999999999
+	minValInt64 int64         = 0
+	maxValDur   time.Duration = 86399999999999
+	minValDur   time.Duration = 0
+)
+
+var (
+	errOutRangeInt64 = fmt.Errorf("failed to marshal time: the (int64) should be in the range 0 to 86399999999999")
+	errOutRangeDur   = fmt.Errorf("failed to marshal time: the (time.Duration) should be in the range 0 to 86399999999999")
+)
+
+func EncInt64(v int64) ([]byte, error) {
+	if v > maxValInt64 || v < minValInt64 {
+		return nil, errOutRangeInt64
+	}
+	return encInt64(v), nil
+}
+
+func EncInt64R(v *int64) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncInt64(*v)
+}
+
+func EncDuration(v time.Duration) ([]byte, error) {
+	if v > maxValDur || v < minValDur {
+		return nil, errOutRangeDur
+	}
+	return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}, nil
+}
+
+func EncDurationR(v *time.Duration) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncDuration(*v)
+}
+
+func EncReflect(v reflect.Value) ([]byte, error) {
+	switch v.Kind() {
+	case reflect.Int64:
+		val := v.Int()
+		if val > maxValInt64 || val < minValInt64 {
+			return nil, fmt.Errorf("failed to marshal time: the (%T) should be in the range 0 to 86399999999999", v.Interface())
+		}
+		return encInt64(val), nil
+	case reflect.Struct:
+		if v.Type().String() == "gocql.unsetColumn" {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("failed to marshal time: unsupported value type (%T)(%[1]v)", v.Interface())
+	default:
+		return nil, fmt.Errorf("failed to marshal time: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func EncReflectR(v reflect.Value) ([]byte, error) {
+	if v.IsNil() {
+		return nil, nil
+	}
+	return EncReflect(v.Elem())
+}
+
+func encInt64(v int64) []byte {
+	return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal.go
new file mode 100644
index 0000000..cafef21
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal.go
@@ -0,0 +1,36 @@
+package cqltime
+
+import (
+	"fmt"
+	"reflect"
+	"time"
+)
+
+func Unmarshal(data []byte, value interface{}) error {
+	switch v := value.(type) {
+	case nil:
+		return nil
+
+	case *int64:
+		return DecInt64(data, v)
+	case **int64:
+		return DecInt64R(data, v)
+	case *time.Duration:
+		return DecDuration(data, v)
+	case **time.Duration:
+		return DecDurationR(data, v)
+
+	default:
+
+		// Custom types (type MyTime int64) can be deserialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.ValueOf(value)
+		rt := rv.Type()
+		if rt.Kind() != reflect.Ptr {
+			return fmt.Errorf("failed to unmarshal time: unsupported value type (%T)(%[1]v)", value)
+		}
+		if rt.Elem().Kind() != reflect.Ptr {
+			return DecReflect(data, rv)
+		}
+		return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal_utils.go
new file mode 100644
index 0000000..cc26f54
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/cqltime/unmarshal_utils.go
@@ -0,0 +1,171 @@
+package cqltime
+
+import (
+	"fmt"
+	"reflect"
+	"time"
+)
+
+var (
+	errWrongDataLen      = fmt.Errorf("failed to unmarshal time: the length of the data should be 0 or 8")
+	errDataOutRangeInt64 = fmt.Errorf("failed to unmarshal time: (int64) the data should be in the range 0 to 86399999999999")
+	errDataOutRangeDur   = fmt.Errorf("failed to unmarshal time: (time.Duration) the data should be in the range 0 to 86399999999999")
+)
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal time: can not unmarshal into nil reference (%T)(%[1]v))", v)
+}
+
+func DecInt64(p []byte, v *int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 8:
+		*v = decInt64(p)
+		if *v > maxValInt64 || *v < minValInt64 {
+			return errDataOutRangeInt64
+		}
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt64R(p []byte, v **int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int64)
+		}
+	case 8:
+		val := decInt64(p)
+		if val > maxValInt64 || val < minValInt64 {
+			return errDataOutRangeInt64
+		}
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecDuration(p []byte, v *time.Duration) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 8:
+		*v = decDur(p)
+		if *v > maxValDur || *v < minValDur {
+			return errDataOutRangeDur
+		}
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecDurationR(p []byte, v **time.Duration) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(time.Duration)
+		}
+	case 8:
+		val := decDur(p)
+		if val > maxValDur || val < minValDur {
+			return errDataOutRangeDur
+		}
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal time: can not unmarshal into nil reference (%T)(%[1]v))", v.Interface())
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.Int64, reflect.Int:
+		return decReflectInt64(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal time: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectInt64(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(0)
+	case 8:
+		val := decInt64(p)
+		if val > maxValInt64 || val < minValInt64 {
+			return fmt.Errorf("failed to unmarshal time: (%T) the data should be in the range 0 to 86399999999999", v.Interface())
+		}
+		v.SetInt(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal time: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface())
+	}
+
+	switch v.Type().Elem().Elem().Kind() {
+	case reflect.Int64, reflect.Int:
+		return decReflectIntsR(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal time: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectIntsR(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		if p == nil {
+			v.Elem().Set(reflect.Zero(v.Elem().Type()))
+		} else {
+			v.Elem().Set(reflect.New(v.Type().Elem().Elem()))
+		}
+	case 8:
+		vv := decInt64(p)
+		if vv > maxValInt64 || vv < minValInt64 {
+			return fmt.Errorf("failed to unmarshal time: (%T) the data should be in the range 0 to 86399999999999", v.Interface())
+		}
+		val := reflect.New(v.Type().Elem().Elem())
+		val.Elem().SetInt(vv)
+		v.Elem().Set(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decInt64(p []byte) int64 {
+	return int64(p[0])<<56 | int64(p[1])<<48 | int64(p[2])<<40 | int64(p[3])<<32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7])
+}
+
+func decDur(p []byte) time.Duration {
+	return time.Duration(p[0])<<56 | time.Duration(p[1])<<48 | time.Duration(p[2])<<40 | time.Duration(p[3])<<32 | time.Duration(p[4])<<24 | time.Duration(p[5])<<16 | time.Duration(p[6])<<8 | time.Duration(p[7])
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/date/marshal.go b/vendor/github.com/gocql/gocql/serialization/date/marshal.go
new file mode 100644
index 0000000..0115c1a
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/date/marshal.go
@@ -0,0 +1,42 @@
+package date
+
+import (
+	"reflect"
+	"time"
+)
+
+func Marshal(value interface{}) ([]byte, error) {
+	switch v := value.(type) {
+	case nil:
+		return nil, nil
+	case int32:
+		return EncInt32(v)
+	case int64:
+		return EncInt64(v)
+	case uint32:
+		return EncUint32(v)
+	case string:
+		return EncString(v)
+	case time.Time:
+		return EncTime(v)
+
+	case *int32:
+		return EncInt32R(v)
+	case *int64:
+		return EncInt64R(v)
+	case *uint32:
+		return EncUint32R(v)
+	case *string:
+		return EncStringR(v)
+	case *time.Time:
+		return EncTimeR(v)
+	default:
+		// Custom types (type MyDate uint32) can be serialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
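+		// Editor's note (hypothetical example, not part of the package):
+		// a named type such as `type MyDate uint32` takes this reflect path,
+		// so Marshal(MyDate(19000)) encodes the same bytes as uint32(19000);
+		// 19000 is an arbitrary illustrative raw day value.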
+		rv := reflect.TypeOf(value)
+		if rv.Kind() != reflect.Ptr {
+			return EncReflect(reflect.ValueOf(v))
+		}
+		return EncReflectR(reflect.ValueOf(v))
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/date/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/date/marshal_utils.go
new file mode 100644
index 0000000..e18cc1b
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/date/marshal_utils.go
@@ -0,0 +1,223 @@
+package date
+
+import (
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const (
+	millisecondsInADay int64 = 24 * 60 * 60 * 1000
+	centerEpoch        int64 = 1 << 31
+	maxYear            int   = 5881580
+	minYear            int   = -5877641
+	maxMilliseconds    int64 = 185542587100800000
+	minMilliseconds    int64 = -185542587187200000
+)
+
+var (
+	maxDate = time.Date(5881580, 07, 11, 0, 0, 0, 0, time.UTC)
+	minDate = time.Date(-5877641, 06, 23, 0, 0, 0, 0, time.UTC)
+)
+
+func errWrongStringFormat(v interface{}) error {
+	return fmt.Errorf(`failed to marshal date: the (%T)(%[1]v) should have format "2006-01-02"`, v)
+}
+
+func EncInt32(v int32) ([]byte, error) {
+	return encInt32(v), nil
+}
+
+func EncInt32R(v *int32) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return encInt32(*v), nil
+}
+
+func EncInt64(v int64) ([]byte, error) {
+	if v > maxMilliseconds || v < minMilliseconds {
+		return nil, fmt.Errorf("failed to marshal date: the (int64)(%v) value out of range", v)
+	}
+	return encInt64(days(v)), nil
+}
+
+func EncInt64R(v *int64) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncInt64(*v)
+}
+
+func EncUint32(v uint32) ([]byte, error) {
+	return encUint32(v), nil
+}
+
+func EncUint32R(v *uint32) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return encUint32(*v), nil
+}
+
+func EncTime(v time.Time) ([]byte, error) {
+	if v.After(maxDate) || v.Before(minDate) {
+		return nil, fmt.Errorf("failed to marshal date: the (%T)(%s) value should be in the range from -5877641-06-23 to 5881580-07-11", v, v.Format("2006-01-02"))
+	}
+	return encTime(v), nil
+}
+
+func EncTimeR(v *time.Time) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncTime(*v)
+}
+
+func EncString(v string) ([]byte, error) {
+	if v == "" {
+		return nil, nil
+	}
+	var err error
+	var y, m, d int
+	var t time.Time
+	switch ps := strings.Split(v, "-"); len(ps) {
+	case 3:
+		if y, err = strconv.Atoi(ps[0]); err != nil {
+			return nil, errWrongStringFormat(v)
+		}
+		if m, err = strconv.Atoi(ps[1]); err != nil {
+			return nil, errWrongStringFormat(v)
+		}
+		if d, err = strconv.Atoi(ps[2]); err != nil {
+			return nil, errWrongStringFormat(v)
+		}
+	case 4:
+		if y, err = strconv.Atoi(ps[1]); err != nil || ps[0] != "" {
+			return nil, errWrongStringFormat(v)
+		}
+		y = -y
+		if m, err = strconv.Atoi(ps[2]); err != nil {
+			return nil, errWrongStringFormat(v)
+		}
+		if d, err = strconv.Atoi(ps[3]); err != nil {
+			return nil, errWrongStringFormat(v)
+		}
+	default:
+		return nil, errWrongStringFormat(v)
+	}
+	if y > maxYear || y < minYear {
+		return nil, fmt.Errorf("failed to marshal date: the (%T)(%[1]v) value should be in the range from -5877641-06-23 to 5881580-07-11", v)
+	}
+	t = time.Date(y, time.Month(m), d, 0, 0, 0, 0, time.UTC)
+	if t.After(maxDate) || t.Before(minDate) {
+		return nil, fmt.Errorf("failed to marshal date: the (%T)(%[1]v) value should be in the range from -5877641-06-23 to 5881580-07-11", v)
+	}
+	return encTime(t), nil
+}
+
+func EncStringR(v *string) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncString(*v)
+}
+
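+// Editor's note (examples only): EncString above accepts "2006-01-02"-style
+// dates, including negative years written with a leading minus such as
+// "-5877641-06-23"; both forms are validated against the supported date
+// range before being converted to the day-count encoding.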
+func EncReflect(v reflect.Value) ([]byte, error) {
+	switch v.Kind() {
+	case reflect.Int32:
+		return encInt64(v.Int()), nil
+	case reflect.Int64:
+		val := v.Int()
+		if val > maxMilliseconds || val < minMilliseconds {
+			return nil, fmt.Errorf("failed to marshal date: the value (%T)(%[1]v) out of range", v.Interface())
+		}
+		return encInt64(days(val)), nil
+	case reflect.Uint32:
+		val := v.Uint()
+		return []byte{byte(val >> 24), byte(val >> 16), byte(val >> 8), byte(val)}, nil
+	case reflect.String:
+		return encReflectString(v)
+	case reflect.Struct:
+		if v.Type().String() == "gocql.unsetColumn" {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("failed to marshal date: unsupported value type (%T)(%[1]v)", v.Interface())
+	default:
+		return nil, fmt.Errorf("failed to marshal date: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func EncReflectR(v reflect.Value) ([]byte, error) {
+	if v.IsNil() {
+		return nil, nil
+	}
+	return EncReflect(v.Elem())
+}
+
+func encReflectString(v reflect.Value) ([]byte, error) {
+	val := v.String()
+	if val == "" {
+		return nil, nil
+	}
+	var err error
+	var y, m, d int
+	var t time.Time
+	ps := strings.Split(val, "-")
+	switch len(ps) {
+	case 3:
+		if y, err = strconv.Atoi(ps[0]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+		if m, err = strconv.Atoi(ps[1]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+		if d, err = strconv.Atoi(ps[2]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+	case 4:
+		if y, err = strconv.Atoi(ps[1]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+		y = -y
+		if m, err = strconv.Atoi(ps[2]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+		if d, err = strconv.Atoi(ps[3]); err != nil {
+			return nil, errWrongStringFormat(v.Interface())
+		}
+	default:
+		return nil, errWrongStringFormat(v.Interface())
+	}
+	if y > maxYear || y < minYear {
+		return nil, fmt.Errorf("failed to marshal date: the (%T)(%[1]v) value should be in the range from -5877641-06-23 to 5881580-07-11", v.Interface())
+	}
+	t = time.Date(y, time.Month(m), d, 0, 0, 0, 0, time.UTC)
+	if t.After(maxDate) || t.Before(minDate) {
+		return nil, fmt.Errorf("failed to marshal date: the (%T)(%[1]v) value should be in the range from -5877641-06-23 to 5881580-07-11", v.Interface())
+	}
+	return encTime(t), nil
+}
+
+func encInt64(v int64) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encInt32(v int32) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encUint32(v uint32) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encTime(v time.Time) []byte {
+	d := days(v.UnixMilli())
+	return []byte{byte(d >> 24), byte(d >> 16), byte(d >> 8), byte(d)}
+}
+
+func days(v int64) int64 {
+	return v/millisecondsInADay + centerEpoch
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/date/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/date/unmarshal.go
new file mode 100644
index 0000000..bca27cb
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/date/unmarshal.go
@@ -0,0 +1,49 @@
+package date
+
+import (
+	"fmt"
+	"reflect"
+	"time"
+)
+
+func Unmarshal(data []byte, value interface{}) error {
+	switch v := value.(type) {
+	case nil:
+		return nil
+
+	case *int32:
+		return DecInt32(data, v)
+	case *int64:
+		return DecInt64(data, v)
+	case *uint32:
+		return DecUint32(data, v)
+	case *string:
+		return DecString(data, v)
+	case *time.Time:
+		return DecTime(data, v)
+
+	case **int32:
+		return DecInt32R(data, v)
+	case **int64:
+		return DecInt64R(data, v)
+	case **uint32:
+		return DecUint32R(data, v)
+	case **string:
+		return DecStringR(data, v)
+	case **time.Time:
+		return DecTimeR(data, v)
+	default:
+
+		// Custom types (type MyDate uint32) can be deserialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.ValueOf(value)
+		rt := rv.Type()
+		if rt.Kind() != reflect.Ptr {
+			return fmt.Errorf("failed to unmarshal date: unsupported value type (%T)(%[1]v)", value)
+		}
+		if rt.Elem().Kind() != reflect.Ptr {
+			return DecReflect(data, rv)
+		}
+		return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/date/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/date/unmarshal_utils.go
new file mode 100644
index 0000000..75cb6d1
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/date/unmarshal_utils.go
@@ -0,0 +1,401 @@
+package date
+
+import (
+	"fmt"
+	"math"
+	"reflect"
+	"time"
+)
+
+const (
+	negInt64       = int64(-1) << 32
+	zeroDate       = "-5877641-06-23"
+	zeroMS   int64 = -185542587187200000
+)
+
+var errWrongDataLen = fmt.Errorf("failed to unmarshal date: the length of the data should be 0 or 4")
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal date: can not unmarshal into nil reference (%T)(%[1]v))", v)
+}
+
+func DecInt32(p []byte, v *int32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decInt32(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt32R(p []byte, v **int32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(int32)
+		}
+	case 4:
+		val := decInt32(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt64(p []byte, v *int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = zeroMS
+	case 4:
+		*v = decMilliseconds(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecInt64R(p []byte, v **int64) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			val := zeroMS
+			*v = &val
+		}
+	case 4:
+		val := decMilliseconds(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint32(p []byte, v *uint32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = 0
+	case 4:
+		*v = decUint32(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecUint32R(p []byte, v **uint32) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = new(uint32)
+		}
+	case 4:
+		val := decUint32(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = ""
+		} else {
+			*v = zeroDate
+		}
+	case 4:
+		*v = decString(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			val := zeroDate
+			*v = &val
+		}
+	case 4:
+		val := decString(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
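+// Editor's note (a sketch of the `date` wire format, not upstream text):
+// the 4 bytes hold an unsigned day count whose midpoint 1<<31 is the Unix
+// epoch, so []byte{0x80, 0x00, 0x00, 0x00} decodes to 1970-01-01, and
+// zero-length data falls back to the minimal representable date.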
+func DecTime(p []byte, v *time.Time) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		*v = minDate
+	case 4:
+		*v = decTime(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecTimeR(p []byte, v **time.Time) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			val := minDate
+			*v = &val
+		}
+	case 4:
+		val := decTime(p)
+		*v = &val
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal date: can not unmarshal into nil reference (%T)(%[1]v))", v.Interface())
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.Int32:
+		return decReflectInt32(p, v)
+	case reflect.Int64:
+		return decReflectInt64(p, v)
+	case reflect.Uint32:
+		return decReflectUint32(p, v)
+	case reflect.String:
+		return decReflectString(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal date: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectInt32(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(0)
+	case 4:
+		v.SetInt(decInt64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectInt64(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetInt(zeroMS)
+	case 4:
+		v.SetInt(decMilliseconds(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint32(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetUint(0)
+	case 4:
+		v.SetUint(decUint64(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectString(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		if p == nil {
+			v.SetString("")
+		} else {
+			v.SetString(zeroDate)
+		}
+	case 4:
+		v.SetString(decString(p))
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal date: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface())
+	}
+
+	switch v.Type().Elem().Elem().Kind() {
+	case reflect.Int32:
+		return decReflectInt32R(p, v)
+	case reflect.Int64:
+		return decReflectInt64R(p, v)
+	case reflect.Uint32:
+		return decReflectUint32R(p, v)
+	case reflect.String:
+		return decReflectStringR(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal date: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectInt32R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetInt(decInt64(p))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectInt64R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		var val reflect.Value
+		if p == nil {
+			val = reflect.Zero(v.Type().Elem())
+		} else {
+			val = reflect.New(v.Type().Elem().Elem())
+			val.Elem().SetInt(zeroMS)
+		}
+		v.Elem().Set(val)
+	case 4:
+		val := reflect.New(v.Type().Elem().Elem())
+		val.Elem().SetInt(decMilliseconds(p))
+		v.Elem().Set(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectUint32R(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.Elem().Set(decReflectNullableR(p, v))
+	case 4:
+		newVal := reflect.New(v.Type().Elem().Elem())
+		newVal.Elem().SetUint(decUint64(p))
+		v.Elem().Set(newVal)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectStringR(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		var val reflect.Value
+		if p == nil {
+			val = reflect.Zero(v.Type().Elem())
+		} else {
+			val = reflect.New(v.Type().Elem().Elem())
+			val.Elem().SetString(zeroDate)
+		}
+		v.Elem().Set(val)
+	case 4:
+		val := reflect.New(v.Type().Elem().Elem())
+		val.Elem().SetString(decString(p))
+		v.Elem().Set(val)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func decReflectNullableR(p []byte, v reflect.Value) reflect.Value {
+	if p == nil {
+		return reflect.Zero(v.Elem().Type())
+	}
+	return reflect.New(v.Type().Elem().Elem())
+}
+
+func decInt32(p []byte) int32 {
+	return int32(p[0])<<24 | int32(p[1])<<16 | int32(p[2])<<8 | int32(p[3])
+}
+
+func decInt64(p []byte) int64 {
+	if p[0] > math.MaxInt8 {
+		return negInt64 | int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+	}
+	return int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+}
+
+func decMilliseconds(p []byte) int64 {
+	return (int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3]) - centerEpoch) * millisecondsInADay
+}
+
+func decUint32(p []byte) uint32 {
+	return uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3])
+}
+
+func decUint64(p []byte) uint64 {
+	return uint64(p[0])<<24 | uint64(p[1])<<16 | uint64(p[2])<<8 | uint64(p[3])
+}
+
+func decString(p []byte) string {
+	return decTime(p).Format("2006-01-02")
+}
+
+func decTime(p []byte) time.Time {
+	return time.UnixMilli(decMilliseconds(p)).UTC()
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/decimal/marshal.go b/vendor/github.com/gocql/gocql/serialization/decimal/marshal.go
new file mode 100644
index 0000000..dbf95a2
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/decimal/marshal.go
@@ -0,0 +1,29 @@
+package decimal
+
+import (
+	"gopkg.in/inf.v0"
+	"reflect"
+)
+
+func Marshal(value interface{}) ([]byte, error) {
+	switch v := value.(type) {
+	case nil:
+		return nil, nil
+	case inf.Dec:
+		return EncInfDec(v)
+	case *inf.Dec:
+		return EncInfDecR(v)
+	case string:
+		return EncString(v)
+	case *string:
+		return EncStringR(v)
+	default:
+		// Custom types (type MyString string) can be serialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.TypeOf(value)
+		if rv.Kind() != reflect.Ptr {
+			return EncReflect(reflect.ValueOf(v))
+		}
+		return EncReflectR(reflect.ValueOf(v))
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/decimal/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/decimal/marshal_utils.go
new file mode 100644
index 0000000..6384af7
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/decimal/marshal_utils.go
@@ -0,0 +1,141 @@
+package decimal
+
+import (
+	"fmt"
+	"gopkg.in/inf.v0"
+	"math/big"
+	"reflect"
+	"strconv"
+	"strings"
+
+	"github.com/gocql/gocql/serialization/varint"
+)
+
+func EncInfDec(v inf.Dec) ([]byte, error) {
+	sign := v.Sign()
+	if sign == 0 {
+		return []byte{0, 0, 0, 0, 0}, nil
+	}
+	return append(encScale(v.Scale()), varint.EncBigIntRS(v.UnscaledBig())...), nil
+}
+
+func EncInfDecR(v *inf.Dec) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return encInfDecR(v), nil
+}
+
+// EncString encodes a decimal string which should contain `scale` and `unscaled` strings separated by `;`.
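+// Editor's note (illustrative): "3;1234" stands for unscaled 1234 with
+// scale 3, i.e. the decimal value 1.234.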
+func EncString(v string) ([]byte, error) {
+	if v == "" {
+		return nil, nil
+	}
+	vs := strings.Split(v, ";")
+	if len(vs) != 2 {
+		return nil, fmt.Errorf("failed to marshal decimal: invalid decimal string %s", v)
+	}
+	scale, err := strconv.ParseInt(vs[0], 10, 32)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal decimal: invalid decimal scale string %s", vs[0])
+	}
+	unscaleData, err := encUnscaledString(vs[1])
+	if err != nil {
+		return nil, err
+	}
+	return append(encScale64(scale), unscaleData...), nil
+}
+
+func EncStringR(v *string) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncString(*v)
+}
+
+func EncReflect(v reflect.Value) ([]byte, error) {
+	switch v.Type().Kind() {
+	case reflect.String:
+		return encReflectString(v)
+	case reflect.Struct:
+		if v.Type().String() == "gocql.unsetColumn" {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("failed to marshal decimal: unsupported value type (%T)(%[1]v)", v.Interface())
+	default:
+		return nil, fmt.Errorf("failed to marshal decimal: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func EncReflectR(v reflect.Value) ([]byte, error) {
+	if v.IsNil() {
+		return nil, nil
+	}
+	return EncReflect(v.Elem())
+}
+
+func encReflectString(v reflect.Value) ([]byte, error) {
+	val := v.String()
+	if val == "" {
+		return nil, nil
+	}
+	vs := strings.Split(val, ";")
+	if len(vs) != 2 {
+		return nil, fmt.Errorf("failed to marshal decimal: invalid decimal string (%T)(%[1]v)", v.Interface())
+	}
+	scale, err := strconv.ParseInt(vs[0], 10, 32)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal decimal: invalid decimal scale string (%T)(%s)", v.Interface(), vs[0])
+	}
+	unscaledData, err := encUnscaledString(vs[1])
+	if err != nil {
+		return nil, err
+	}
+	return append(encScale64(scale), unscaledData...), nil
+}
+
+func encInfDecR(v *inf.Dec) []byte {
+	sign := v.Sign()
+	if sign == 0 {
+		return []byte{0, 0, 0, 0, 0}
+	}
+	return append(encScale(v.Scale()), varint.EncBigIntRS(v.UnscaledBig())...)
+}
+
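+// Editor's note (a sketch of the layout used above, not upstream text):
+// a decimal value is written as a 4-byte big-endian scale followed by a
+// two's-complement varint of the unscaled integer, so 1.234 could be
+// encoded as scale 3 plus the varint bytes of 1234 (0x04, 0xd2).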
+func encScale(v inf.Scale) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encScale64(v int64) []byte {
+	return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}
+}
+
+func encUnscaledString(v string) ([]byte, error) {
+	switch {
+	case len(v) == 0:
+		return nil, nil
+	case len(v) <= 18:
+		n, err := strconv.ParseInt(v, 10, 64)
+		if err != nil {
+			return nil, fmt.Errorf("failed to marshal decimal: invalid unscaled string %s, %s", v, err)
+		}
+		return varint.EncInt64Ext(n), nil
+	case len(v) <= 20:
+		n, err := strconv.ParseInt(v, 10, 64)
+		if err == nil {
+			return varint.EncInt64Ext(n), nil
+		}
+
+		t, ok := new(big.Int).SetString(v, 10)
+		if !ok {
+			return nil, fmt.Errorf("failed to marshal decimal: invalid unscaled string %s", v)
+		}
+		return varint.EncBigIntRS(t), nil
+	default:
+		t, ok := new(big.Int).SetString(v, 10)
+		if !ok {
+			return nil, fmt.Errorf("failed to marshal decimal: invalid unscaled string %s", v)
+		}
+		return varint.EncBigIntRS(t), nil
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal.go
new file mode 100644
index 0000000..6433f0d
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal.go
@@ -0,0 +1,34 @@
+package decimal
+
+import (
+	"fmt"
+	"gopkg.in/inf.v0"
+	"reflect"
+)
+
+func Unmarshal(data []byte, value interface{}) error {
+	switch v := value.(type) {
+	case nil:
+		return nil
+	case *inf.Dec:
+		return DecInfDec(data, v)
+	case **inf.Dec:
+		return DecInfDecR(data, v)
+	case *string:
+		return DecString(data, v)
+	case **string:
+		return DecStringR(data, v)
+	default:
+		// Custom types (type MyString string) can be deserialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
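+		// Editor's note (hypothetical example): a named string type such as
+		// `type MyDecimal string` would be routed through DecReflect /
+		// DecReflectR below.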
+		rv := reflect.ValueOf(value)
+		rt := rv.Type()
+		if rt.Kind() != reflect.Ptr {
+			return fmt.Errorf("failed to unmarshal decimal: unsupported value type (%T)(%#[1]v)", value)
+		}
+		if rt.Elem().Kind() != reflect.Ptr {
+			return DecReflect(data, rv)
+		}
+		return DecReflectR(data, rv)
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_ints.go b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_ints.go
new file mode 100644
index 0000000..f219100
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_ints.go
@@ -0,0 +1,80 @@
+package decimal
+
+import (
+	"gopkg.in/inf.v0"
+)
+
+const (
+	neg8     = int64(-1) << 8
+	neg16    = int64(-1) << 16
+	neg24    = int64(-1) << 24
+	neg32    = int64(-1) << 32
+	neg40    = int64(-1) << 40
+	neg48    = int64(-1) << 48
+	neg56    = int64(-1) << 56
+	neg32Int = int(-1) << 32
+)
+
+func decScale(p []byte) inf.Scale {
+	return inf.Scale(p[0])<<24 | inf.Scale(p[1])<<16 | inf.Scale(p[2])<<8 | inf.Scale(p[3])
+}
+
+func decScaleInt64(p []byte) int64 {
+	if p[0] > 127 {
+		return neg32 | int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+	}
+	return int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3])
+}
+
+func dec1toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg8 | int64(p[4])
+	}
+	return int64(p[4])
+}
+
+func dec2toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg16 | int64(p[4])<<8 | int64(p[5])
+	}
+	return int64(p[4])<<8 | int64(p[5])
+}
+
+func dec3toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg24 | int64(p[4])<<16 | int64(p[5])<<8 | int64(p[6])
+	}
+	return int64(p[4])<<16 | int64(p[5])<<8 | int64(p[6])
+}
+
+func dec4toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7])
+	}
+	return int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7])
+}
+
+func dec5toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg40 | int64(p[4])<<32 | int64(p[5])<<24 | int64(p[6])<<16 | int64(p[7])<<8 | int64(p[8])
+	}
+	return int64(p[4])<<32 | int64(p[5])<<24 | int64(p[6])<<16 | int64(p[7])<<8 | int64(p[8])
+}
+
+func dec6toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg48 | int64(p[4])<<40 | int64(p[5])<<32 | int64(p[6])<<24 | int64(p[7])<<16 | int64(p[8])<<8 | int64(p[9])
+	}
+	return int64(p[4])<<40 | int64(p[5])<<32 | int64(p[6])<<24 | int64(p[7])<<16 | int64(p[8])<<8 | int64(p[9])
+}
+
+func dec7toInt64(p []byte) int64 {
+	if p[4] > 127 {
+		return neg56 | int64(p[4])<<48 | int64(p[5])<<40 | int64(p[6])<<32 | int64(p[7])<<24 | int64(p[8])<<16 | int64(p[9])<<8 | int64(p[10])
+	}
+	return int64(p[4])<<48 | int64(p[5])<<40 | int64(p[6])<<32 | int64(p[7])<<24 | int64(p[8])<<16 | int64(p[9])<<8 | int64(p[10])
+}
+
+func dec8toInt64(p []byte) int64 {
+	return int64(p[4])<<56 | int64(p[5])<<48 | int64(p[6])<<40 | int64(p[7])<<32 | int64(p[8])<<24 | int64(p[9])<<16 | int64(p[10])<<8 | int64(p[11])
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_utils.go
new file mode 100644
index 0000000..557f6ce
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/decimal/unmarshal_utils.go
@@ -0,0 +1,323 @@
+package decimal
+
+import (
+	"fmt"
+	"gopkg.in/inf.v0"
+	"reflect"
+	"strconv"
+
+	"github.com/gocql/gocql/serialization/varint"
+)
+
+var errWrongDataLen = fmt.Errorf("failed to unmarshal decimal: the length of the data should be 0 or at least 5")
+
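+// Editor's note (a sketch of the check below, not upstream text): the
+// unscaled varint must be minimally encoded, so a leading 0x00 byte followed
+// by a byte <= 0x7f (or a leading 0xff followed by a byte > 0x7f) is treated
+// as broken data.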
+func errBrokenData(p []byte) error {
+	if p[4] == 0 && p[5] <= 127 || p[4] == 255 && p[5] > 127 {
+		return fmt.Errorf("failed to unmarshal decimal: the data is broken")
+	}
+	return nil
+}
+
+func errNilReference(v interface{}) error {
+	return fmt.Errorf("failed to unmarshal decimal: can not unmarshal into nil reference(%T)(%[1]v)", v)
+}
+
+func DecInfDec(p []byte, v *inf.Dec) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		v.SetScale(0).SetUnscaled(0)
+		return nil
+	case 1, 2, 3, 4:
+		return errWrongDataLen
+	case 5:
+		v.SetScale(decScale(p)).SetUnscaled(dec1toInt64(p))
+		return nil
+	case 6:
+		v.SetScale(decScale(p)).SetUnscaled(dec2toInt64(p))
+	case 7:
+		v.SetScale(decScale(p)).SetUnscaled(dec3toInt64(p))
+	case 8:
+		v.SetScale(decScale(p)).SetUnscaled(dec4toInt64(p))
+	case 9:
+		v.SetScale(decScale(p)).SetUnscaled(dec5toInt64(p))
+	case 10:
+		v.SetScale(decScale(p)).SetUnscaled(dec6toInt64(p))
+	case 11:
+		v.SetScale(decScale(p)).SetUnscaled(dec7toInt64(p))
+	case 12:
+		v.SetScale(decScale(p)).SetUnscaled(dec8toInt64(p))
+	default:
+		v.SetScale(decScale(p)).SetUnscaledBig(varint.Dec2BigInt(p[4:]))
+	}
+	return errBrokenData(p)
+}
+
+func DecInfDecR(p []byte, v **inf.Dec) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = inf.NewDec(0, 0)
+		}
+		return nil
+	case 1, 2, 3, 4:
+		return errWrongDataLen
+	case 5:
+		*v = inf.NewDec(dec1toInt64(p), decScale(p))
+		return nil
+	case 6:
+		*v = inf.NewDec(dec2toInt64(p), decScale(p))
+	case 7:
+		*v = inf.NewDec(dec3toInt64(p), decScale(p))
+	case 8:
+		*v = inf.NewDec(dec4toInt64(p), decScale(p))
+	case 9:
+		*v = inf.NewDec(dec5toInt64(p), decScale(p))
+	case 10:
+		*v = inf.NewDec(dec6toInt64(p), decScale(p))
+	case 11:
+		*v = inf.NewDec(dec7toInt64(p), decScale(p))
+	case 12:
+		*v = inf.NewDec(dec8toInt64(p), decScale(p))
+	default:
+		*v = inf.NewDecBig(varint.Dec2BigInt(p[4:]), decScale(p))
+	}
+	return errBrokenData(p)
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = ""
+		} else {
+			*v = "0;0"
+		}
+		return nil
+	case 1, 2, 3, 4:
+		return errWrongDataLen
+	case 5:
+		*v = decString5(p)
+		return nil
+	case 6:
+		*v = decString6(p)
+	case 7:
+		*v = decString7(p)
+	case 8:
+		*v = decString8(p)
+	case 9:
+		*v = decString9(p)
+	case 10:
+		*v = decString10(p)
+	case 11:
+		*v = decString11(p)
+	case 12:
+		*v = decString12(p)
+	default:
+		*v = decString(p)
+	}
+	return errBrokenData(p)
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			tmp := "0;0"
+			*v = &tmp
+		}
+		return nil
+	case 1, 2, 3, 4:
+		return errWrongDataLen
+	case 5:
+		tmp := decString5(p)
+		*v = &tmp
+		return nil
+	case 6:
+		tmp := decString6(p)
+		*v = &tmp
+	case 7:
+		tmp := decString7(p)
+		*v = &tmp
+	case 8:
+		tmp := decString8(p)
+		*v = &tmp
+	case 9:
+		tmp := decString9(p)
+		*v = &tmp
+	case 10:
+		tmp := decString10(p)
+		*v = &tmp
+	case 11:
+		tmp := decString11(p)
+		*v = &tmp
+	case 12:
+		tmp := decString12(p)
+		*v = &tmp
+	default:
+		tmp := decString(p)
+		*v = &tmp
+	}
+	return errBrokenData(p)
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return fmt.Errorf("failed to unmarshal decimal: can not unmarshal into nil reference (%T)(%#[1]v)", v.Interface())
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.String:
+		return decReflectString(p, v)
+	default:
fmt.Errorf("failed to unmarshal decimal: unsupported value type (%T)(%#[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal decimal: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal decimal: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0;0") + } + return nil + case 1, 2, 3, 4: + return errWrongDataLen + case 5: + v.SetString(decString5(p)) + return nil + case 6: + v.SetString(decString6(p)) + case 7: + v.SetString(decString7(p)) + case 8: + v.SetString(decString8(p)) + case 9: + v.SetString(decString9(p)) + case 10: + v.SetString(decString10(p)) + case 11: + v.SetString(decString11(p)) + case 12: + v.SetString(decString12(p)) + default: + v.SetString(decString(p)) + } + return errBrokenData(p) +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0;0") + } + v.Elem().Set(val) + return nil + case 1, 2, 3, 4: + return errWrongDataLen + case 5: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString5(p)) + v.Elem().Set(newVal) + return nil + case 6: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString6(p)) + v.Elem().Set(newVal) + case 7: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString7(p)) + v.Elem().Set(newVal) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString8(p)) + v.Elem().Set(newVal) + case 9: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString9(p)) + v.Elem().Set(newVal) + case 10: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString10(p)) + v.Elem().Set(newVal) + case 11: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString11(p)) + v.Elem().Set(newVal) + case 12: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString12(p)) + v.Elem().Set(newVal) + default: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(decString(p)) + v.Elem().Set(newVal) + } + return errBrokenData(p) +} + +func decString5(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec1toInt64(p), 10) +} + +func decString6(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec2toInt64(p), 10) +} + +func decString7(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec3toInt64(p), 10) +} +func decString8(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec4toInt64(p), 10) +} +func decString9(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec5toInt64(p), 10) +} +func decString10(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec6toInt64(p), 10) +} +func decString11(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec7toInt64(p), 
10) +} +func decString12(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + strconv.FormatInt(dec8toInt64(p), 10) +} + +func decString(p []byte) string { + return strconv.FormatInt(decScaleInt64(p), 10) + ";" + varint.Dec2BigInt(p[4:]).String() +} diff --git a/vendor/github.com/gocql/gocql/serialization/double/marshal.go b/vendor/github.com/gocql/gocql/serialization/double/marshal.go new file mode 100644 index 0000000..d3fdf2f --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/double/marshal.go @@ -0,0 +1,24 @@ +package double + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case float64: + return EncFloat64(v) + case *float64: + return EncFloat64R(v) + default: + // Custom types (type MyFloat float64) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/double/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/double/marshal_utils.go new file mode 100644 index 0000000..5ae206d --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/double/marshal_utils.go @@ -0,0 +1,59 @@ +package double + +import ( + "fmt" + "reflect" + "unsafe" +) + +func EncFloat64(v float64) ([]byte, error) { + return encFloat64(v), nil +} + +func EncFloat64R(v *float64) ([]byte, error) { + if v == nil { + return nil, nil + } + return encFloat64R(v), nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Float64: + return encFloat64(v.Float()), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal double: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal double: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encFloat64(v float64) []byte { + return encUint64(floatToUint(v)) +} + +func encFloat64R(v *float64) []byte { + return encUint64(floatToUintR(v)) +} + +func encUint64(v uint64) []byte { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} + +func floatToUint(v float64) uint64 { + return *(*uint64)(unsafe.Pointer(&v)) +} + +func floatToUintR(v *float64) uint64 { + return *(*uint64)(unsafe.Pointer(v)) +} diff --git a/vendor/github.com/gocql/gocql/serialization/double/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/double/unmarshal.go new file mode 100644 index 0000000..f0712a1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/double/unmarshal.go @@ -0,0 +1,29 @@ +package double + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *float64: + return DecFloat64(data, v) + case **float64: + return DecFloat64R(data, v) + default: + // Custom types (type MyFloat float64) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal double: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/double/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/double/unmarshal_utils.go new file mode 100644 index 0000000..e6bf5cd --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/double/unmarshal_utils.go @@ -0,0 +1,126 @@ +package double + +import ( + "fmt" + "reflect" + "unsafe" +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal double: the length of the data should be 0 or 8") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal double: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecFloat64(p []byte, v *float64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decFloat64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecFloat64R(p []byte, v **float64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(float64) + } + case 8: + *v = decFloat64R(p) + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Float64: + return decReflectFloat32(p, v) + default: + return fmt.Errorf("failed to unmarshal double: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Float64: + return decReflectFloat32R(p, v) + default: + return fmt.Errorf("failed to unmarshal double: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectFloat32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetFloat(0) + case 8: + v.SetFloat(decFloat64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectFloat32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetFloat(decFloat64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decFloat64(p []byte) float64 { + return uint64ToFloat(decUint64(p)) +} + +func decFloat64R(p []byte) *float64 { + return uint64ToFloatR(decUint64(p)) +} + +func uint64ToFloat(v uint64) float64 { + return *(*float64)(unsafe.Pointer(&v)) +} + +func uint64ToFloatR(v uint64) *float64 { + return (*float64)(unsafe.Pointer(&v)) +} + +func decUint64(p []byte) uint64 { + return uint64(p[0])<<56 | uint64(p[1])<<48 | uint64(p[2])<<40 | uint64(p[3])<<32 | uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/duration/duration.go b/vendor/github.com/gocql/gocql/serialization/duration/duration.go new file mode 100644 index 0000000..823fd87 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/duration/duration.go @@ -0,0 +1,17 @@ +package 
duration
+
+type Duration struct {
+	Months      int32
+	Days        int32
+	Nanoseconds int64
+}
+
+func (d Duration) Valid() bool {
+	if d.Months >= 0 && d.Days >= 0 && d.Nanoseconds >= 0 {
+		return true
+	}
+	if d.Months <= 0 && d.Days <= 0 && d.Nanoseconds <= 0 {
+		return true
+	}
+	return false
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/duration/marshal.go b/vendor/github.com/gocql/gocql/serialization/duration/marshal.go
new file mode 100644
index 0000000..470fe49
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/duration/marshal.go
@@ -0,0 +1,38 @@
+package duration
+
+import (
+	"reflect"
+	"time"
+)
+
+func Marshal(value interface{}) ([]byte, error) {
+	switch v := value.(type) {
+	case nil:
+		return nil, nil
+	case int64:
+		return EncInt64(v)
+	case time.Duration:
+		return EncDur(v)
+	case string:
+		return EncString(v)
+	case Duration:
+		return EncDuration(v)
+
+	case *int64:
+		return EncInt64R(v)
+	case *time.Duration:
+		return EncDurR(v)
+	case *string:
+		return EncStringR(v)
+	case *Duration:
+		return EncDurationR(v)
+	default:
+		// Custom types (type MyDate uint32) can be serialized only via `reflect` package.
+		// Later, when generic-based serialization is introduced we can do that via generics.
+		rv := reflect.TypeOf(value)
+		if rv.Kind() != reflect.Ptr {
+			return EncReflect(reflect.ValueOf(v))
+		}
+		return EncReflectR(reflect.ValueOf(v))
+	}
+}
diff --git a/vendor/github.com/gocql/gocql/serialization/duration/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/duration/marshal_utils.go
new file mode 100644
index 0000000..be79a9d
--- /dev/null
+++ b/vendor/github.com/gocql/gocql/serialization/duration/marshal_utils.go
@@ -0,0 +1,217 @@
+package duration
+
+import (
+	"fmt"
+	"reflect"
+	"time"
+)
+
+const (
+	vintPrefix1 byte = 128
+	vintPrefix2 byte = 192
+	vintPrefix3 byte = 224
+	vintPrefix4 byte = 240
+	vintPrefix5 byte = 248
+	vintPrefix6 byte = 252
+	vintPrefix7 byte = 254
+	vintPrefix8 byte = 255
+
+	nanoDayPos = 24 * 60 * 60 * 1000 * 1000 * 1000
+	nanoDayNeg = -nanoDayPos
+)
+
+func EncInt64(v int64) ([]byte, error) {
+	return encInt64(v), nil
+}
+
+func EncInt64R(v *int64) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return encInt64(*v), nil
+}
+
+func EncDur(v time.Duration) ([]byte, error) {
+	return encDur(v), nil
+}
+
+func EncDurR(v *time.Duration) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return encDur(*v), nil
+}
+
+func EncString(v string) ([]byte, error) {
+	if v == "" {
+		return nil, nil
+	}
+	d, err := time.ParseDuration(v)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal duration: the (string)(%s) has an invalid format, %v", v, err)
+	}
+	return encDur(d), nil
+}
+
+func EncStringR(v *string) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	return EncString(*v)
+}
+
+func EncDuration(v Duration) ([]byte, error) {
+	if !v.Valid() {
+		return nil, fmt.Errorf("failed to marshal duration: the (Duration) values of months (%d), days (%d) and nanoseconds (%d) should have the same sign", v.Months, v.Days, v.Nanoseconds)
+	}
+	return append(append(encVint32(encIntZigZag32(v.Months)), encVint32(encIntZigZag32(v.Days))...), encVint64(encIntZigZag64(v.Nanoseconds))...), nil
+}
+
+func EncDurationR(v *Duration) ([]byte, error) {
+	if v == nil {
+		return nil, nil
+	}
+	if !v.Valid() {
+		return nil, fmt.Errorf("failed to marshal duration: the (*Duration) values of months (%d), days (%d) and nanoseconds (%d) should have the same sign", v.Months, v.Days, v.Nanoseconds)
+	}
+	return
append(append(encVint32(encIntZigZag32(v.Months)), encVint32(encIntZigZag32(v.Days))...), encVint64(encIntZigZag64(v.Nanoseconds))...), nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Int64: + return encInt64(v.Int()), nil + case reflect.String: + val := v.String() + if val == "" { + return nil, nil + } + d, err := time.ParseDuration(val) + if err != nil { + return nil, fmt.Errorf("failed to marshal duration: the (%T)(%[1]v) have invalid format, %v", v, err) + } + return encDur(d), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal duration: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal duration: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encDur(v time.Duration) []byte { + if v < nanoDayPos && v > nanoDayNeg { + return encNanos(encIntZigZagDur(v)) + } + n := v % nanoDayPos + return encDaysNanos(encIntZigZag32(int32((v-n)/nanoDayPos)), encIntZigZagDur(n)) +} + +func encInt64(v int64) []byte { + if v < nanoDayPos && v > nanoDayNeg { + return encNanos(encIntZigZag64(v)) + } + n := v % nanoDayPos + return encDaysNanos(encIntZigZag32(int32((v-n)/nanoDayPos)), encIntZigZag64(n)) +} + +func encIntZigZag32(v int32) uint32 { + return uint32((v >> 31) ^ (v << 1)) +} + +func encIntZigZag64(v int64) uint64 { + return uint64((v >> 63) ^ (v << 1)) +} + +func encIntZigZagDur(v time.Duration) uint64 { + return uint64((v >> 63) ^ (v << 1)) +} + +func encVint32(v uint32) []byte { + switch { + case byte(v>>28) != 0: + return []byte{vintPrefix4, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>21) != 0: + return []byte{vintPrefix3 | byte(v>>24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>14) != 0: + return []byte{vintPrefix2 | byte(v>>16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{vintPrefix1 | byte(v>>8), byte(v)} + default: + return []byte{byte(v)} + } +} + +func encVint64(v uint64) []byte { + switch { + case byte(v>>56) != 0: + return []byte{vintPrefix8, byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>49) != 0: + return []byte{vintPrefix7 | byte(v>>56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>42) != 0: + return []byte{vintPrefix6 | byte(v>>48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>35) != 0: + return []byte{vintPrefix5 | byte(v>>40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>28) != 0: + return []byte{vintPrefix4 | byte(v>>32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>21) != 0: + return []byte{vintPrefix3 | byte(v>>24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>14) != 0: + return []byte{vintPrefix2 | byte(v>>16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{vintPrefix1 | byte(v>>8), byte(v)} + default: + return []byte{byte(v)} + } +} + +func encDaysNanos(d uint32, n uint64) []byte { + return append(encDays(d), encVint64(n)...) 
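+	// Wire-format note: a CQL duration is three zig-zag encoded vints
+	// (months, days, nanoseconds). encDays and encNanos (below) prepend the
+	// literal zero bytes for the omitted leading fields: encDays starts with
+	// one 0x00 (months == 0), encNanos with two (months and days == 0).
+	// For example, time.Hour has zigzag(3600000000000) == 7200000000000 ==
+	// 0x68C61714000, which encVint64 emits behind the vintPrefix6 lead
+	// byte, so the full encoding is 00 00 FC 06 8C 61 71 40 00.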
+} + +func encDays(v uint32) []byte { + switch { + case byte(v>>28) != 0: + return []byte{0, vintPrefix4, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>21) != 0: + return []byte{0, vintPrefix3 | byte(v>>24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>14) != 0: + return []byte{0, vintPrefix2 | byte(v>>16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{0, vintPrefix1 | byte(v>>8), byte(v)} + default: + return []byte{0, byte(v)} + } +} + +func encNanos(v uint64) []byte { + switch { + case byte(v>>56) != 0: + return []byte{0, 0, vintPrefix8, byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>49) != 0: + return []byte{0, 0, vintPrefix7 | byte(v>>56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>42) != 0: + return []byte{0, 0, vintPrefix6 | byte(v>>48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>35) != 0: + return []byte{0, 0, vintPrefix5 | byte(v>>40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>28) != 0: + return []byte{0, 0, vintPrefix4 | byte(v>>32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>21) != 0: + return []byte{0, 0, vintPrefix3 | byte(v>>24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>14) != 0: + return []byte{0, 0, vintPrefix2 | byte(v>>16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{0, 0, vintPrefix1 | byte(v>>8), byte(v)} + default: + return []byte{0, 0, byte(v)} + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/duration/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/duration/unmarshal.go new file mode 100644 index 0000000..010b0ca --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/duration/unmarshal.go @@ -0,0 +1,45 @@ +package duration + +import ( + "fmt" + "reflect" + "time" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int64: + return DecInt64(data, v) + case *string: + return DecString(data, v) + case *time.Duration: + return DecDur(data, v) + case *Duration: + return DecDuration(data, v) + + case **int64: + return DecInt64R(data, v) + case **string: + return DecStringR(data, v) + case **time.Duration: + return DecDurR(data, v) + case **Duration: + return DecDurationR(data, v) + default: + + // Custom types (type MyDate uint32) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
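+	// A minimal usage sketch, assuming the package is imported as
+	// "duration"; MyNanos is a hypothetical caller-defined type:
+	//
+	//	type MyNanos int64
+	//
+	//	var n MyNanos
+	//	err := duration.Unmarshal(data, &n)
+	//
+	// The reflect path only accepts payloads whose months component is zero,
+	// since an int64 nanosecond count can not represent calendar months;
+	// days are folded into nanoseconds.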
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal duration: unsupported value type (%T)(%[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/duration/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/duration/unmarshal_utils.go new file mode 100644 index 0000000..465db48 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/duration/unmarshal_utils.go @@ -0,0 +1,765 @@ +package duration + +import ( + "fmt" + "math" + "reflect" + "time" +) + +const ( + maxDays = (math.MaxInt64 - math.MaxInt64%nanoDayPos) / nanoDayPos + minDays = -maxDays + maxDaysNanos = maxDays * nanoDayPos + minDaysNanos = minDays * nanoDayPos + zeroDuration = "0s" +) + +var ( + errWrongDataLen = fmt.Errorf("failed to unmarshal duration: the length of the data should be 0 or 3-19") + errBrokenData = fmt.Errorf("failed to unmarshal duration: the data is broken") + errInvalidSign = fmt.Errorf("failed to unmarshal duration: the data values of months, days and nanoseconds should have the same sign") +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal duration: can not unmarshal into nil reference (%T)(%[1]v))", v) +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + *v = 0 + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (int64) the months value should be 0") + } + if p[1] == 0 { + var ok bool + if *v, ok = decNanos64(p); !ok { + return errBrokenData + } + } else { + d, n, ok := decDaysNanos64(p) + if !ok { + return errBrokenData + } + if !validSignDateNanos(d, n) { + return errInvalidSign + } + if *v, ok = daysToNanos(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (int64) the data value should be in int64 range") + } + } + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*int64) the months value should be 0") + } + if p[1] == 0 { + n, ok := decNanos64(p) + if !ok { + return errBrokenData + } + *v = &n + } else { + d, n, ok := decDaysNanos64(p) + if !ok { + return errBrokenData + } + if !validSignDateNanos(d, n) { + return errInvalidSign + } + if n, ok = daysToNanos(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*int64) the data value should be in int64 range") + } + *v = &n + } + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + if p == nil { + *v = "" + } else { + *v = zeroDuration + } + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (string) the months value should be 0") + } + if p[1] == 0 { + n, ok := decNanosDur(p) + if !ok { + return errBrokenData + } + *v = n.String() + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = 
daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (string) the data value should be in int64 range") + } + *v = n.String() + } + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + if p == nil { + *v = nil + } else { + val := zeroDuration + *v = &val + } + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*string) the months value should be 0") + } + var val string + if p[1] == 0 { + n, ok := decNanosDur(p) + if !ok { + return errBrokenData + } + val = n.String() + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*string) the data value should be in int64 range") + } + val = n.String() + } + *v = &val + } + return nil +} + +func DecDur(p []byte, v *time.Duration) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + *v = 0 + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (time.Duration) the months value should be 0") + } + if p[1] == 0 { + var ok bool + if *v, ok = decNanosDur(p); !ok { + return errBrokenData + } + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (time.Duration) the data value should be in int64 range") + } + *v = n + } + } + return nil +} + +func DecDurR(p []byte, v **time.Duration) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + if p == nil { + *v = nil + } else { + *v = new(time.Duration) + } + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*time.Duration) the months value should be 0") + } + if p[1] == 0 { + n, ok := decNanosDur(p) + if !ok { + return errBrokenData + } + *v = &n + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (*time.Duration) the data value should be in int64 range") + } + *v = &n + } + } + return nil +} + +func DecDuration(p []byte, v *Duration) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + *v = Duration{} + case l < 3: + return errWrongDataLen + default: + var ok bool + v.Months, v.Days, v.Nanoseconds, ok = decVints(p) + if !ok { + return errBrokenData + } + if !v.Valid() { + return errInvalidSign + } + } + return nil +} + +func DecDurationR(p []byte, v **Duration) error { + if v == nil { + return errNilReference(v) + } + switch l := len(p); { + case l == 0: + if p == nil { + *v = nil + } else { + *v = new(Duration) + } + case l < 3: + return errWrongDataLen + default: + var ok bool + var val Duration + val.Months, val.Days, val.Nanoseconds, ok = decVints(p) + if !ok { + return errBrokenData + } + if !val.Valid() { + return errInvalidSign + } + *v = &val + } + return nil +} + +func DecReflect(p []byte, v 
reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal duration: can not unmarshal into nil reference (%T)(%[1]v))", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int64: + return decReflectInt64(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal duration: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt64(p []byte, v reflect.Value) error { + switch l := len(p); { + case l == 0: + v.SetInt(0) + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the months value should be 0", v.Interface()) + } + if p[1] == 0 { + n, ok := decNanos64(p) + if !ok { + return errBrokenData + } + v.SetInt(n) + } else { + d, n, ok := decDaysNanos64(p) + if !ok { + return errBrokenData + } + if !validSignDateNanos(d, n) { + return errInvalidSign + } + if n, ok = daysToNanos(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the data value should be in int64 range", v.Interface()) + } + v.SetInt(n) + } + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch l := len(p); { + case l == 0: + if p == nil { + v.SetString("") + } else { + v.SetString(zeroDuration) + } + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the months value should be 0", v.Interface()) + } + var val string + if p[1] == 0 { + n, ok := decNanosDur(p) + if !ok { + return errBrokenData + } + val = n.String() + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the data value should be in int64 range", v.Interface()) + } + val = n.String() + } + v.SetString(val) + } + return nil +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal duration: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int64: + return decReflectInt64R(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal duration: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt64R(p []byte, v reflect.Value) error { + switch l := len(p); { + case l == 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + } + v.Elem().Set(val) + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the months value should be 0", v.Interface()) + } + val := reflect.New(v.Type().Elem().Elem()) + if p[1] == 0 { + n, ok := decNanos64(p) + if !ok { + return errBrokenData + } + val.Elem().SetInt(n) + } else { + d, n, ok := decDaysNanos64(p) + if !ok { + return errBrokenData + } + if !validSignDateNanos(d, n) { + return errInvalidSign + } + if n, ok = daysToNanos(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the data value should be in int64 range", v.Interface()) + } + val.Elem().SetInt(n) + } + v.Elem().Set(val) + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch 
l := len(p); { + case l == 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(zeroDuration) + } + v.Elem().Set(val) + case l < 3: + return errWrongDataLen + default: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the months value should be 0", v.Interface()) + } + val := reflect.New(v.Type().Elem().Elem()) + if p[1] == 0 { + n, ok := decNanosDur(p) + if !ok { + return errBrokenData + } + val.Elem().SetString(n.String()) + } else { + d, n, ok := decDaysNanosDur(p) + if !ok { + return errBrokenData + } + if !validDateNanosDur(d, n) { + return errInvalidSign + } + if n, ok = daysToNanosDur(d, n); !ok { + return fmt.Errorf("failed to unmarshal duration: to unmarshal into (%T) the data value should be in int64 range", v.Interface()) + } + val.Elem().SetString(n.String()) + } + v.Elem().Set(val) + } + return nil +} + +func validSignDateNanos(d int64, n int64) bool { + if d >= 0 && n >= 0 { + return true + } + if d <= 0 && n <= 0 { + return true + } + return false +} + +func daysToNanos(d int64, n int64) (int64, bool) { + if d > maxDays || d < minDays { + return 0, false + } + d *= nanoDayPos + if (d > 0 && math.MaxInt64-d < n) || (d < 0 && math.MinInt64-d > n) { + return 0, false + } + return n + d, true +} + +func daysToNanosDur(d time.Duration, n time.Duration) (time.Duration, bool) { + if d > maxDays || d < minDays { + return 0, false + } + d *= nanoDayPos + if (d > 0 && math.MaxInt64-d < n) || (d < 0 && math.MinInt64-d > n) { + return 0, false + } + return n + d, true +} + +func validDateNanosDur(d time.Duration, n time.Duration) bool { + if d >= 0 && n >= 0 { + return true + } + if d <= 0 && n <= 0 { + return true + } + return false +} + +func decVints(p []byte) (int32, int32, int64, bool) { + m, read := decVint32(p, 0) + if read == 0 { + return 0, 0, 0, false + } + d, read := decVint32(p, read) + if read == 0 { + return 0, 0, 0, false + } + n, read := decVint64(p, read) + if read == 0 { + return 0, 0, 0, false + } + return decZigZag32(m), decZigZag32(d), decZigZag64(n), true +} + +func decDaysNanos64(p []byte) (int64, int64, bool) { + d, read := decVint3264(p, 1) + if read == 0 { + return 0, 0, false + } + n, read := decVint64(p, read) + if read == 0 { + return 0, 0, false + } + return decZigZag64(d), decZigZag64(n), true +} + +func decNanos64(p []byte) (int64, bool) { + n, read := decVint64(p, 2) + if read == 0 { + return 0, false + } + return decZigZag64(n), true +} + +func decNanosDur(p []byte) (time.Duration, bool) { + n, read := decVint64(p, 2) + if read == 0 { + return 0, false + } + return decZigZagDur(n), true +} + +func decDaysNanosDur(p []byte) (time.Duration, time.Duration, bool) { + d, read := decVint3264(p, 1) + if read == 0 { + return 0, 0, false + } + n, read := decVint64(p, read) + if read == 0 { + return 0, 0, false + } + return decZigZagDur(d), decZigZagDur(n), true +} + +func decVint64(p []byte, s int) (uint64, int) { + vintLen := decVintLen(p[s:]) + if vintLen+s != len(p) { + return 0, 0 + } + switch vintLen { + case 9: + return dec9Vint64(p[s:]), s + 9 + case 8: + return dec8Vint64(p[s:]), s + 8 + case 7: + return dec7Vint64(p[s:]), s + 7 + case 6: + return dec6Vint64(p[s:]), s + 6 + case 5: + return dec5Vint64(p[s:]), s + 5 + case 4: + return dec4Vint64(p[s:]), s + 4 + case 3: + return dec3Vint64(p[s:]), s + 3 + case 2: + return dec2Vint64(p[s:]), s + 2 + case 1: + return dec1Vint64(p[s:]), s + 1 + case 0: + return 0, 
s + 1 + default: + return 0, 0 + } +} + +func decVint32(p []byte, s int) (uint32, int) { + vintLen := decVintLen(p[s:]) + if vintLen+s >= len(p) { + return 0, 0 + } + switch vintLen { + case 5: + if p[s] != vintPrefix4 { + return 0, 0 + } + return dec5Vint32(p[s:]), s + 5 + case 4: + return dec4Vint32(p[s:]), s + 4 + case 3: + return dec3Vint32(p[s:]), s + 3 + case 2: + return dec2Vint32(p[s:]), s + 2 + case 1: + return dec1Vint32(p[s:]), s + 1 + case 0: + return 0, s + 1 + default: + return 0, 0 + } +} + +func decVint3264(p []byte, s int) (uint64, int) { + vintLen := decVintLen(p[s:]) + if vintLen+s >= len(p) { + return 0, 0 + } + switch vintLen { + case 5: + if p[s] != vintPrefix4 { + return 0, 0 + } + return dec5Vint64(p[s:]), s + 5 + case 4: + return dec4Vint64(p[s:]), s + 4 + case 3: + return dec3Vint64(p[s:]), s + 3 + case 2: + return dec2Vint64(p[s:]), s + 2 + case 1: + return dec1Vint64(p[s:]), s + 1 + case 0: + return 0, s + 1 + default: + return 0, 0 + } +} + +func decVintLen(p []byte) int { + switch { + case p[0] == 255: + return 9 + case p[0]>>1 == 127: + return 8 + case p[0]>>2 == 63: + return 7 + case p[0]>>3 == 31: + return 6 + case p[0]>>4 == 15: + return 5 + case p[0]>>5 == 7: + return 4 + case p[0]>>6 == 3: + return 3 + case p[0]>>7 == 1: + return 2 + default: + return 1 + } +} + +func decZigZag32(n uint32) int32 { + return int32((n >> 1) ^ -(n & 1)) +} + +func decZigZag64(n uint64) int64 { + return int64((n >> 1) ^ -(n & 1)) +} + +func decZigZagDur(n uint64) time.Duration { + return time.Duration((n >> 1) ^ -(n & 1)) +} + +func dec5Vint32(p []byte) uint32 { + return uint32(p[1])<<24 | uint32(p[2])<<16 | uint32(p[3])<<8 | uint32(p[4]) +} + +func dec4Vint32(p []byte) uint32 { + return uint32(p[0]&^vintPrefix3)<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3]) +} + +func dec3Vint32(p []byte) uint32 { + return uint32(p[0]&^vintPrefix2)<<16 | uint32(p[1])<<8 | uint32(p[2]) +} + +func dec2Vint32(p []byte) uint32 { + return uint32(p[0]&^vintPrefix1)<<8 | uint32(p[1]) +} + +func dec1Vint32(p []byte) uint32 { + return uint32(p[0]) +} + +func dec9Vint64(p []byte) uint64 { + return uint64(p[1])<<56 | uint64(p[2])<<48 | uint64(p[3])<<40 | uint64(p[4])<<32 | uint64(p[5])<<24 | uint64(p[6])<<16 | uint64(p[7])<<8 | uint64(p[8]) +} + +func dec8Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix7)<<56 | uint64(p[1])<<48 | uint64(p[2])<<40 | uint64(p[3])<<32 | uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7]) +} + +func dec7Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix6)<<48 | uint64(p[1])<<40 | uint64(p[2])<<32 | uint64(p[3])<<24 | uint64(p[4])<<16 | uint64(p[5])<<8 | uint64(p[6]) +} + +func dec6Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix5)<<40 | uint64(p[1])<<32 | uint64(p[2])<<24 | uint64(p[3])<<16 | uint64(p[4])<<8 | uint64(p[5]) +} + +func dec5Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix4)<<32 | uint64(p[1])<<24 | uint64(p[2])<<16 | uint64(p[3])<<8 | uint64(p[4]) +} + +func dec4Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix3)<<24 | uint64(p[1])<<16 | uint64(p[2])<<8 | uint64(p[3]) +} + +func dec3Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix2)<<16 | uint64(p[1])<<8 | uint64(p[2]) +} + +func dec2Vint64(p []byte) uint64 { + return uint64(p[0]&^vintPrefix1)<<8 | uint64(p[1]) +} + +func dec1Vint64(p []byte) uint64 { + return uint64(p[0]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/float/marshal.go b/vendor/github.com/gocql/gocql/serialization/float/marshal.go new file mode 
100644 index 0000000..91f4141 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/float/marshal.go @@ -0,0 +1,24 @@ +package float + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case float32: + return EncFloat32(v) + case *float32: + return EncFloat32R(v) + default: + // Custom types (type MyFloat float32) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/float/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/float/marshal_utils.go new file mode 100644 index 0000000..92e9787 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/float/marshal_utils.go @@ -0,0 +1,59 @@ +package float + +import ( + "fmt" + "reflect" + "unsafe" +) + +func EncFloat32(v float32) ([]byte, error) { + return encFloat32(v), nil +} + +func EncFloat32R(v *float32) ([]byte, error) { + if v == nil { + return nil, nil + } + return encFloat32R(v), nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Float32: + return encFloat32(float32(v.Float())), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal float: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal float: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encFloat32(v float32) []byte { + return encUint32(floatToUint(v)) +} + +func encFloat32R(v *float32) []byte { + return encUint32(floatToUintR(v)) +} + +func encUint32(v uint32) []byte { + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} + +func floatToUint(v float32) uint32 { + return *(*uint32)(unsafe.Pointer(&v)) +} + +func floatToUintR(v *float32) uint32 { + return *(*uint32)(unsafe.Pointer(v)) +} diff --git a/vendor/github.com/gocql/gocql/serialization/float/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/float/unmarshal.go new file mode 100644 index 0000000..1d809b3 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/float/unmarshal.go @@ -0,0 +1,29 @@ +package float + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *float32: + return DecFloat32(data, v) + case **float32: + return DecFloat32R(data, v) + default: + // Custom types (type MyFloat float32) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
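+	// A minimal usage sketch; MyFloat is a hypothetical caller-defined type:
+	//
+	//	type MyFloat float32
+	//
+	//	var f MyFloat
+	//	err := float.Unmarshal([]byte{0x40, 0x48, 0xf5, 0xc3}, &f)
+	//	// f == 3.14 (the nearest float32), since 0x4048F5C3 is the
+	//	// big-endian IEEE 754 single-precision encoding of that value.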
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal float: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/float/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/float/unmarshal_utils.go new file mode 100644 index 0000000..d4ad55e --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/float/unmarshal_utils.go @@ -0,0 +1,126 @@ +package float + +import ( + "fmt" + "reflect" + "unsafe" +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal float: the length of the data should be 0 or 4") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal float: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecFloat32(p []byte, v *float32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 4: + *v = decFloat32(p) + default: + return errWrongDataLen + } + return nil +} + +func DecFloat32R(p []byte, v **float32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(float32) + } + case 4: + *v = decFloat32R(p) + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Float32: + return decReflectFloat32(p, v) + default: + return fmt.Errorf("failed to unmarshal float: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Float32: + return decReflectFloat32R(p, v) + default: + return fmt.Errorf("failed to unmarshal float: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectFloat32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetFloat(0) + case 4: + v.SetFloat(float64(decFloat32(p))) + default: + return errWrongDataLen + } + return nil +} + +func decReflectFloat32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 4: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetFloat(float64(decFloat32(p))) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decFloat32(p []byte) float32 { + return uint32ToFloat(decUint32(p)) +} + +func decFloat32R(p []byte) *float32 { + return uint32ToFloatR(decUint32(p)) +} + +func uint32ToFloat(v uint32) float32 { + return *(*float32)(unsafe.Pointer(&v)) +} + +func uint32ToFloatR(v uint32) *float32 { + return (*float32)(unsafe.Pointer(&v)) +} + +func decUint32(p []byte) uint32 { + return uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/inet/marshal.go b/vendor/github.com/gocql/gocql/serialization/inet/marshal.go new file mode 100644 index 0000000..035ab8a --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/inet/marshal.go @@ -0,0 +1,41 @@ +package inet + +import ( + "net" + "reflect" +) + +func Marshal(value interface{}) 
([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case []byte: + return EncBytes(v) + case *[]byte: + return EncBytesR(v) + case net.IP: + return EncNetIP(v) + case *net.IP: + return EncNetIPr(v) + case [4]byte: + return EncArray4(v) + case *[4]byte: + return EncArray4R(v) + case [16]byte: + return EncArray16(v) + case *[16]byte: + return EncArray16R(v) + case string: + return EncString(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyIP []byte) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/inet/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/inet/marshal_utils.go new file mode 100644 index 0000000..cb5d18b --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/inet/marshal_utils.go @@ -0,0 +1,192 @@ +package inet + +import ( + "fmt" + "net" + "reflect" +) + +func EncBytes(v []byte) ([]byte, error) { + switch len(v) { + case 0: + if v == nil { + return nil, nil + } + return make([]byte, 0), nil + case 4: + tmp := make([]byte, 4) + copy(tmp, v) + return tmp, nil + case 16: + tmp := make([]byte, 16) + copy(tmp, v) + return tmp, nil + default: + return nil, fmt.Errorf("failed to marshal inet: the ([]byte) length can be 0,4,16") + } +} + +func EncBytesR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncBytes(*v) +} + +func EncNetIP(v net.IP) ([]byte, error) { + switch len(v) { + case 0: + if v == nil { + return nil, nil + } + return make([]byte, 0), nil + case 4, 16: + t := v.To4() + if t == nil { + return v.To16(), nil + } + return t, nil + default: + return nil, fmt.Errorf("failed to marshal inet: the (net.IP) length can be 0,4,16") + } +} + +func EncNetIPr(v *net.IP) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncNetIP(*v) +} + +func EncArray16(v [16]byte) ([]byte, error) { + tmp := make([]byte, 16) + copy(tmp, v[:]) + return tmp, nil +} + +func EncArray16R(v *[16]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncArray16(*v) +} + +func EncArray4(v [4]byte) ([]byte, error) { + tmp := make([]byte, 4) + copy(tmp, v[:]) + return tmp, nil +} + +func EncArray4R(v *[4]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncArray4(*v) +} + +func EncString(v string) ([]byte, error) { + if len(v) == 0 { + return nil, nil + } + b := net.ParseIP(v) + if b != nil { + t := b.To4() + if t == nil { + return b.To16(), nil + } + return t, nil + } + return nil, fmt.Errorf("failed to marshal inet: invalid IP string %s", v) +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Array: + if l := v.Len(); v.Type().Elem().Kind() != reflect.Uint8 || (l != 16 && l != 4) { + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } + nv := reflect.New(v.Type()) + nv.Elem().Set(v) + return nv.Elem().Bytes(), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(v) + case reflect.String: + return encReflectString(v) 
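+	// Note on the Array branch above: reflect.Value.Bytes panics on an
+	// unaddressable byte array, so the value is first copied into a freshly
+	// allocated (and therefore addressable) reflect.New value before its
+	// bytes are taken.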
+ case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + switch ev := v.Elem(); ev.Kind() { + case reflect.Array: + if l := v.Len(); ev.Type().Elem().Kind() != reflect.Uint8 || (l != 16 && l != 4) { + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return v.Elem().Bytes(), nil + case reflect.Slice: + if ev.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(ev) + case reflect.String: + return encReflectString(ev) + default: + return nil, fmt.Errorf("failed to marshal inet: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func encReflectString(v reflect.Value) ([]byte, error) { + val := v.String() + if len(val) == 0 { + return nil, nil + } + b := net.ParseIP(val) + if b != nil { + t := b.To4() + if t == nil { + return b.To16(), nil + } + return t, nil + } + return nil, fmt.Errorf("failed to marshal inet: invalid IP string (%T)(%[1]v)", v.Interface()) +} + +func encReflectBytes(v reflect.Value) ([]byte, error) { + val := v.Bytes() + switch len(val) { + case 0: + if val == nil { + return nil, nil + } + return make([]byte, 0), nil + case 4: + tmp := make([]byte, 4) + copy(tmp, val) + return tmp, nil + case 16: + tmp := make([]byte, 16) + copy(tmp, val) + return tmp, nil + default: + return nil, fmt.Errorf("failed to marshal inet: the (%T) length can be 0,4,16", v.Interface()) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/inet/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/inet/unmarshal.go new file mode 100644 index 0000000..35f0bf6 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/inet/unmarshal.go @@ -0,0 +1,46 @@ +package inet + +import ( + "fmt" + "net" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *[]byte: + return DecBytes(data, v) + case **[]byte: + return DecBytesR(data, v) + case *net.IP: + return DecNetIP(data, v) + case **net.IP: + return DecNetIPr(data, v) + case *[4]byte: + return DecArray4(data, v) + case **[4]byte: + return DecArray4R(data, v) + case *[16]byte: + return DecArray16(data, v) + case **[16]byte: + return DecArray16R(data, v) + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + default: + // Custom types (type MyIP []byte) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
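+	// A minimal usage sketch, assuming the package is imported as "inet";
+	// MyIP is a hypothetical caller-defined type:
+	//
+	//	type MyIP []byte
+	//
+	//	var ip MyIP
+	//	err := inet.Unmarshal([]byte{192, 168, 0, 1}, &ip)
+	//	// ip == MyIP{192, 168, 0, 1}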
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/inet/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/inet/unmarshal_utils.go new file mode 100644 index 0000000..90c560c --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/inet/unmarshal_utils.go @@ -0,0 +1,514 @@ +package inet + +import ( + "fmt" + "net" + "net/netip" + "reflect" + "unsafe" +) + +var ( + errWrongDataLen = fmt.Errorf("failed to unmarshal inet: the length of the data can be 0,4,16") + + digits = getDigits() +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal inet: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecBytes(p []byte, v *[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = make([]byte, 0) + } + case 4: + *v = make([]byte, 4) + copy(*v, p) + case 16: + *v = make([]byte, 16) + copy(*v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecBytesR(p []byte, v **[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := make([]byte, 0) + *v = &tmp + } + case 4: + *v = &[]byte{0, 0, 0, 0} + copy(**v, p) + case 16: + *v = &[]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + copy(**v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecNetIP(p []byte, v *net.IP) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = make(net.IP, 0) + } + case 4: + *v = make(net.IP, 4) + copy(*v, p) + case 16: + *v = make(net.IP, 16) + copy(*v, p) + if v4 := v.To4(); v4 != nil { + *v = v4 + } + default: + return errWrongDataLen + } + return nil +} + +func DecNetIPr(p []byte, v **net.IP) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := make(net.IP, 0) + *v = &tmp + } + case 4: + *v = &net.IP{0, 0, 0, 0} + copy(**v, p) + case 16: + *v = &net.IP{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + copy(**v, p) + if v4 := (*v).To4(); v4 != nil { + **v = v4 + } + default: + return errWrongDataLen + } + return nil +} + +func DecArray4(p []byte, v *[4]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = [4]byte{} + case 4: + *v = [4]byte{} + copy(v[:], p) + case 16: + if !isFist10Zeros(p) { + return fmt.Errorf("failed to unmarshal inet: can not unmarshal ipV6 into [4]byte") + } + *v = [4]byte{} + copy(v[:], p[12:16]) + default: + return errWrongDataLen + } + return nil +} + +func DecArray4R(p []byte, v **[4]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = &[4]byte{} + } + case 4: + *v = &[4]byte{} + copy((*v)[:], p) + case 16: + if !isFist10Zeros(p) { + return fmt.Errorf("failed to unmarshal inet: can not unmarshal ipV6 into [4]byte") + } + *v = &[4]byte{} + copy((*v)[:], p[12:16]) + default: + return errWrongDataLen + } + return nil +} + +func DecArray16(p []byte, v *[16]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = [16]byte{} + case 4, 16: + *v = 
[16]byte{}
+		copy(v[:], p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecArray16R(p []byte, v **[16]byte) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			*v = &[16]byte{}
+		}
+	case 4, 16:
+		*v = &[16]byte{}
+		copy((*v)[:], p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecString(p []byte, v *string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = ""
+		} else {
+			*v = "0.0.0.0"
+		}
+	case 4:
+		*v = decString4(p)
+	case 16:
+		*v = decString16(p)
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecStringR(p []byte, v **string) error {
+	if v == nil {
+		return errNilReference(v)
+	}
+	switch len(p) {
+	case 0:
+		if p == nil {
+			*v = nil
+		} else {
+			tmp := "0.0.0.0"
+			*v = &tmp
+		}
+	case 4:
+		tmp := decString4(p)
+		*v = &tmp
+	case 16:
+		tmp := decString16(p)
+		*v = &tmp
+	default:
+		return errWrongDataLen
+	}
+	return nil
+}
+
+func DecReflect(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	switch v = v.Elem(); v.Kind() {
+	case reflect.Array:
+		if v.Type().Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		switch v.Len() {
+		case 4:
+			return decReflectArray4(p, v)
+		case 16:
+			return decReflectArray16(p, v)
+		default:
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+	case reflect.Slice:
+		if v.Type().Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		return decReflectBytes(p, v)
+	case reflect.String:
+		return decReflectString(p, v)
+	default:
+		return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func DecReflectR(p []byte, v reflect.Value) error {
+	if v.IsNil() {
+		return errNilReference(v)
+	}
+
+	ev := v.Elem()
+	switch evt := ev.Type().Elem(); evt.Kind() {
+	case reflect.Array:
+		if evt.Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		switch ev.Len() {
+		case 4:
+			return decReflectArray4R(p, ev)
+		case 16:
+			return decReflectArray16R(p, ev)
+		default:
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+	case reflect.Slice:
+		if evt.Elem().Kind() != reflect.Uint8 {
+			return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+		}
+		return decReflectBytesR(p, ev)
+	case reflect.String:
+		return decReflectStringR(p, ev)
+	default:
+		return fmt.Errorf("failed to unmarshal inet: unsupported value type (%T)(%[1]v)", v.Interface())
+	}
+}
+
+func decReflectArray4(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case 0:
+		v.SetZero()
+	case 4:
+		val := reflect.New(v.Type())
+		copy((*[4]byte)(val.UnsafePointer())[:], p)
+		v.Set(val.Elem())
+	case 16:
+		if !isFist10Zeros(p) {
+			return fmt.Errorf("failed to unmarshal inet: can not unmarshal ipV6 into (%T)", v.Interface())
+		}
+		val := reflect.New(v.Type())
+		copy((*[4]byte)(val.UnsafePointer())[:], p[12:16])
+		v.Set(val.Elem())
+	default:
+		return fmt.Errorf("failed to unmarshal inet: to unmarshal into (%T) the length of the data can be 0,4,16", v.Interface())
+	}
+	return nil
+}
+
+func decReflectArray16(p []byte, v reflect.Value) error {
+	switch len(p) {
+	case
0: + v.SetZero() + case 4, 16: + val := reflect.New(v.Type()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val.Elem()) + default: + return fmt.Errorf("failed to unmarshal inet: to unmarshal into (%T) the length of the data can be 0,4,16", v.Interface()) + } + return nil +} + +func decReflectBytes(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetBytes(nil) + } else { + v.SetBytes(make([]byte, 0)) + } + case 4: + tmp := make([]byte, 4) + copy(tmp, p) + v.SetBytes(tmp) + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + v.SetBytes(tmp) + default: + return errWrongDataLen + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0.0.0.0") + } + case 4: + v.SetString(decString4(p)) + case 16: + v.SetString(decString16(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectArray4R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + v.Set(val) + } + case 4: + val := reflect.New(v.Type().Elem()) + copy((*[4]byte)(val.UnsafePointer())[:], p) + v.Set(val) + case 16: + if !isFist10Zeros(p) { + return fmt.Errorf("failed to unmarshal inet: can not unmarshal ipV6 into (%T)", v.Interface()) + } + val := reflect.New(v.Type().Elem()) + copy((*[4]byte)(val.UnsafePointer())[:], p[12:16]) + v.Set(val) + default: + return fmt.Errorf("failed to unmarshal inet: to unmarshal into (%T) the length of the data can be 0,4,16", v.Interface()) + } + return nil +} + +func decReflectArray16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + v.Set(val) + } + case 4, 16: + val := reflect.New(v.Type().Elem()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val) + default: + return fmt.Errorf("failed to unmarshal inet: to unmarshal into (%T) the length of the data can be 0,4,16", v.Interface()) + } + return nil +} + +func decReflectBytesR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(make([]byte, 0)) + v.Set(val) + } + case 4: + tmp := make([]byte, 4) + copy(tmp, p) + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(tmp) + v.Set(val) + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(tmp) + v.Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetString("0.0.0.0") + v.Set(val) + } + case 4: + val := reflect.New(v.Type().Elem()) + val.Elem().SetString(decString4(p)) + v.Set(val) + case 16: + val := reflect.New(v.Type().Elem()) + val.Elem().SetString(decString16(p)) + v.Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decString4(p []byte) string { + out := make([]byte, 0, 15) + for _, x := range p { + out = append(out, digits[x]...) 
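+ // digits[x] is the precomputed "<x>." chunk built by getDigits (below),
+ // so the loop always emits one trailing '.', which the return below slices off.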
+ } + return string(out[:len(out)-1]) +} + +func decString16(p []byte) string { + if isV4MappedToV6(p) { + return decString4(p[12:16]) + } + return netip.AddrFrom16(*(*[16]byte)(unsafe.Pointer(&p[0]))).String() +} + +func getDigits() []string { + out := make([]string, 256) + for i := range out { + out[i] = fmt.Sprintf("%d.", i) + } + return out +} + +func isV4MappedToV6(p []byte) bool { + return p[0] == 0 && p[1] == 0 && p[2] == 0 && p[3] == 0 && p[4] == 0 && + p[5] == 0 && p[6] == 0 && p[7] == 0 && p[8] == 0 && p[9] == 0 && p[10] == 255 && p[11] == 255 +} + +func isFist10Zeros(p []byte) bool { + return p[0] == 0 && p[1] == 0 && p[2] == 0 && p[3] == 0 && p[4] == 0 && + p[5] == 0 && p[6] == 0 && p[7] == 0 && p[8] == 0 && p[9] == 0 +} diff --git a/vendor/github.com/gocql/gocql/serialization/smallint/marshal.go b/vendor/github.com/gocql/gocql/serialization/smallint/marshal.go new file mode 100644 index 0000000..6ddca7d --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/smallint/marshal.go @@ -0,0 +1,74 @@ +package smallint + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int32: + return EncInt32(v) + case int16: + return EncInt16(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
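+ //
+ // A minimal usage sketch (MySmall is a hypothetical caller-side type, not part of this package);
+ // out-of-range values are rejected inside EncReflect:
+ //
+ //	type MySmall int32
+ //	b, _ := Marshal(MySmall(258))       // -> []byte{0x01, 0x02}
+ //	_, err := Marshal(MySmall(1 << 20)) // -> error: value out of range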
+ rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/smallint/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/smallint/marshal_utils.go new file mode 100644 index 0000000..0164ba4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/smallint/marshal_utils.go @@ -0,0 +1,247 @@ +package smallint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var ( + maxBigInt = big.NewInt(math.MaxInt16) + minBigInt = big.NewInt(math.MinInt16) +) + +func EncInt8(v int8) ([]byte, error) { + if v < 0 { + return []byte{255, byte(v)}, nil + } + return []byte{0, byte(v)}, nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt8(*v) +} + +func EncInt16(v int16) ([]byte, error) { + return encInt16(v), nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt16(*v) +} + +func EncInt32(v int32) ([]byte, error) { + if v > math.MaxInt16 || v < math.MinInt16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return []byte{byte(v >> 8), byte(v)}, nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt32(*v) +} + +func EncInt64(v int64) ([]byte, error) { + if v > math.MaxInt16 || v < math.MinInt16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return encInt64(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncInt(v int) ([]byte, error) { + if v > math.MaxInt16 || v < math.MinInt16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return []byte{byte(v >> 8), byte(v)}, nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt(*v) +} + +func EncUint8(v uint8) ([]byte, error) { + return []byte{0, v}, nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint8(*v) +} + +func EncUint16(v uint16) ([]byte, error) { + return []byte{byte(v >> 8), byte(v)}, nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint16(*v) +} + +func EncUint32(v uint32) ([]byte, error) { + if v > math.MaxUint16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return []byte{byte(v >> 8), byte(v)}, nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint32(*v) +} + +func EncUint64(v uint64) ([]byte, error) { + if v > math.MaxUint16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return encUint64(v), nil +} + +func EncUint64R(v *uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint64(*v) +} + +func EncUint(v uint) ([]byte, error) { + if v > math.MaxUint16 { + return nil, fmt.Errorf("failed to marshal smallint: value %#v out of range", v) + } + return []byte{byte(v >> 8), byte(v)}, nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint(*v) +} + +func EncBigInt(v big.Int) ([]byte, error) { + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal smallint: value (%T)(%s) out of range", v, v.String()) + } + return 
encInt64(v.Int64()), nil +} + +func EncBigIntR(v *big.Int) ([]byte, error) { + if v == nil { + return nil, nil + } + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, fmt.Errorf("failed to marshal smallint: value (%T)(%s) out of range", v, v.String()) + } + return encInt64(v.Int64()), nil +} + +func EncString(v string) ([]byte, error) { + if v == "" { + return nil, nil + } + + n, err := strconv.ParseInt(v, 10, 16) + if err != nil { + return nil, fmt.Errorf("failed to marshal smallint: can not marshal (%T)(%[1]v) %s", v, err) + } + return encInt64(n), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Type().Kind() { + case reflect.Int8: + val := v.Int() + if val < 0 { + return []byte{255, byte(val)}, nil + } + return []byte{0, byte(val)}, nil + case reflect.Int16: + return encInt64(v.Int()), nil + case reflect.Int, reflect.Int64, reflect.Int32: + val := v.Int() + if val > math.MaxInt16 || val < math.MinInt16 { + return nil, fmt.Errorf("failed to marshal smallint: custom type value (%T)(%[1]v) out of range", v.Interface()) + } + return encInt64(val), nil + case reflect.Uint8: + return []byte{0, byte(v.Uint())}, nil + case reflect.Uint16: + return encUint64(v.Uint()), nil + case reflect.Uint, reflect.Uint64, reflect.Uint32: + val := v.Uint() + if val > math.MaxUint16 { + return nil, fmt.Errorf("failed to marshal smallint: custom type value (%T)(%[1]v) out of range", v.Interface()) + } + return encUint64(val), nil + case reflect.String: + val := v.String() + if val == "" { + return nil, nil + } + + n, err := strconv.ParseInt(val, 10, 16) + if err != nil { + return nil, fmt.Errorf("failed to marshal smallint: can not marshal (%T)(%[1]v), %s", v.Interface(), err) + } + return encInt64(n), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal smallint: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal smallint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encInt16(v int16) []byte { + return []byte{byte(v >> 8), byte(v)} +} + +func encInt64(v int64) []byte { + return []byte{byte(v >> 8), byte(v)} +} + +func encUint64(v uint64) []byte { + return []byte{byte(v >> 8), byte(v)} +} diff --git a/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal.go new file mode 100644 index 0000000..4383763 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal.go @@ -0,0 +1,81 @@ +package smallint + +import ( + "fmt" + "math/big" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int8: + return DecInt8(data, v) + case *int16: + return DecInt16(data, v) + case *int32: + return DecInt32(data, v) + case *int64: + return DecInt64(data, v) + case *int: + return DecInt(data, v) + + case *uint8: + return DecUint8(data, v) + case *uint16: + return DecUint16(data, v) + case *uint32: + return DecUint32(data, v) + case *uint64: + return DecUint64(data, v) + case *uint: + return DecUint(data, v) + + case *big.Int: + return DecBigInt(data, v) + case *string: + return DecString(data, v) + + case **int8: + 
return DecInt8R(data, v) + case **int16: + return DecInt16R(data, v) + case **int32: + return DecInt32R(data, v) + case **int64: + return DecInt64R(data, v) + case **int: + return DecIntR(data, v) + + case **uint8: + return DecUint8R(data, v) + case **uint16: + return DecUint16R(data, v) + case **uint32: + return DecUint32R(data, v) + case **uint64: + return DecUint64R(data, v) + case **uint: + return DecUintR(data, v) + + case **big.Int: + return DecBigIntR(data, v) + case **string: + return DecStringR(data, v) + default: + + // Custom types (type MyInt int) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal smallint: unsupported value type (%T)(%[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal_utils.go new file mode 100644 index 0000000..e16d248 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/smallint/unmarshal_utils.go @@ -0,0 +1,704 @@ +package smallint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +const ( + negInt32 = int32(-1) << 16 + negInt64 = int64(-1) << 16 + negInt = int(-1) << 16 +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal smallint: the length of the data should be 0 or 2") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal smallint: can not unmarshal into nil reference (%T)(%[1]v))", v) +} + +func DecInt8(p []byte, v *int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + val := decInt16(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into int8, the data should be in the int8 range") + } + *v = int8(val) + default: + return errWrongDataLen + } + return nil +} + +func DecInt8R(p []byte, v **int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int8) + } + case 2: + val := decInt16(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into int8, the data should be in the int8 range") + } + tmp := int8(val) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecInt16(p []byte, v *int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decInt16(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt16R(p []byte, v **int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int16) + } + case 2: + val := decInt16(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt32(p []byte, v *int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decInt32(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt32R(p []byte, v **int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int32) + } + case 2: + val := decInt32(p) + *v = &val 
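+ // decInt32 (below) sign-extends the two big-endian bytes via the negInt32 mask,
+ // so e.g. []byte{0xFF, 0xFE} decodes to -2.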
+ default: + return errWrongDataLen + } + return nil +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decInt64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case 2: + val := decInt64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt(p []byte, v *int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decInt(p) + default: + return errWrongDataLen + } + return nil +} + +func DecIntR(p []byte, v **int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int) + } + case 2: + val := decInt(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint8(p []byte, v *uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into uint8, the data should be in the uint8 range") + } + *v = p[1] + default: + return errWrongDataLen + } + return nil +} + +func DecUint8R(p []byte, v **uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint8) + } + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into uint8, the data should be in the uint8 range") + } + val := p[1] + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint16(p []byte, v *uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decUint16(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUint16R(p []byte, v **uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint16) + } + case 2: + val := decUint16(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint32(p []byte, v *uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decUint32(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUint32R(p []byte, v **uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint32) + } + case 2: + val := decUint32(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint64(p []byte, v *uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decUint64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUint64R(p []byte, v **uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint64) + } + case 2: + val := decUint64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint(p []byte, v *uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 2: + *v = decUint(p) + default: + return errWrongDataLen + } + return nil +} + +func DecUintR(p []byte, v **uint) 
error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint) + } + case 2: + val := decUint(p) + *v = &val + default: + return errWrongDataLen + } + return nil +}
+ +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "0" + } + case 2: + *v = strconv.FormatInt(decInt64(p), 10) + default: + return errWrongDataLen + } + return nil +}
+ +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := "0" + *v = &val + } + case 2: + val := strconv.FormatInt(decInt64(p), 10) + *v = &val + default: + return errWrongDataLen + } + return nil +}
+ +func DecBigInt(p []byte, v *big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + v.SetInt64(0) + case 2: + v.SetInt64(decInt64(p)) + default: + return errWrongDataLen + } + return nil +}
+ +func DecBigIntR(p []byte, v **big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = big.NewInt(0) + } + case 2: + *v = big.NewInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +}
+ +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal smallint: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int8: + return decReflectInt8(p, v) + case reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: + return decReflectInts(p, v) + case reflect.Uint8: + return decReflectUint8(p, v) + case reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint: + return decReflectUints(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal smallint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal smallint: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int8: + return decReflectInt8R(p, v) + case reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: + return decReflectIntsR(p, v) + case reflect.Uint8: + return decReflectUint8R(p, v) + case reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint: + return decReflectUintsR(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal smallint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func decReflectInt8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 2: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into (%T), the data should be in the int8 range", v.Interface()) + } + v.SetInt(val) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectInts(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 2: + v.SetInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectUint8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into (%T), the
data should be in the uint8 range", v.Interface()) + } + v.SetUint(uint64(p[1])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUints(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 2: + v.SetUint(decUint64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p != nil { + v.SetString("0") + } else { + v.SetString("") + } + case 2: + v.SetString(strconv.FormatInt(decInt64(p), 10)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decReflectInt8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 2: + val := decInt64(p) + if val > math.MaxInt8 || val < math.MinInt8 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into (%T), the data should be in the int8 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(val) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 2: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetInt(decInt64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUint8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal smallint: to unmarshal into (%T), the data should be in the uint8 range", v.Interface()) + } + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(uint64(p[1])) + v.Elem().Set(newVal) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUintsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 2: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetUint(decUint64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0") + } + v.Elem().Set(val) + case 2: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(strconv.FormatInt(decInt64(p), 10)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decInt16(p []byte) int16 { + return int16(p[0])<<8 | int16(p[1]) +} + +func decInt32(p []byte) int32 { + if p[0] > math.MaxInt8 { + return negInt32 | int32(p[0])<<8 | int32(p[1]) + } + return int32(p[0])<<8 | int32(p[1]) +} + +func decInt64(p []byte) int64 { + if p[0] > math.MaxInt8 { + return negInt64 | int64(p[0])<<8 | int64(p[1]) + } + return int64(p[0])<<8 | int64(p[1]) +} + +func decInt(p []byte) int { + if p[0] > math.MaxInt8 { + return negInt | int(p[0])<<8 | int(p[1]) + } + return int(p[0])<<8 | int(p[1]) +} + +func decUint16(p []byte) uint16 { + return uint16(p[0])<<8 | uint16(p[1]) +} + +func decUint32(p []byte) uint32 { + return uint32(p[0])<<8 | uint32(p[1]) +} + +func decUint64(p []byte) uint64 { + return uint64(p[0])<<8 | uint64(p[1]) 
+} + +func decUint(p []byte) uint { + return uint(p[0])<<8 | uint(p[1]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/text/marshal.go b/vendor/github.com/gocql/gocql/serialization/text/marshal.go new file mode 100644 index 0000000..a3b50e2 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/text/marshal.go @@ -0,0 +1,28 @@ +package text + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case string: + return EncString(v) + case *string: + return EncStringR(v) + case []byte: + return EncBytes(v) + case *[]byte: + return EncBytesR(v) + default: + // Custom types (type MyString string) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/text/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/text/marshal_utils.go new file mode 100644 index 0000000..39f144b --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/text/marshal_utils.go @@ -0,0 +1,61 @@ +package text + +import ( + "fmt" + "reflect" +) + +func EncString(v string) ([]byte, error) { + return encString(v), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v), nil +} + +func EncBytes(v []byte) ([]byte, error) { + return v, nil +} + +func EncBytesR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return *v, nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.String: + return encString(v.String()), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return EncBytes(v.Bytes()) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encString(v string) []byte { + if v == "" { + return make([]byte, 0) + } + return []byte(v) +} diff --git a/vendor/github.com/gocql/gocql/serialization/text/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/text/unmarshal.go new file mode 100644 index 0000000..b7e9d61 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/text/unmarshal.go @@ -0,0 +1,35 @@ +package text + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + case *[]byte: + return DecBytes(data, v) + case **[]byte: + return DecBytesR(data, v) + case *interface{}: + return DecInterface(data, v) + default: + // Custom types (type MyString string) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
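+ //
+ // A minimal sketch (MyText is a hypothetical caller-side type): a *MyText target
+ // is routed to DecReflect, a **MyText target to DecReflectR:
+ //
+ //	type MyText string
+ //	var s MyText
+ //	_ = Unmarshal([]byte("hello"), &s) // s == "hello"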
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal text: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/text/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/text/unmarshal_utils.go new file mode 100644 index 0000000..d03e968 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/text/unmarshal_utils.go @@ -0,0 +1,167 @@ +package text + +import ( + "fmt" + "reflect" +) +
+ +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal text: can not unmarshal into nil reference(%T)(%[1]v)", v) +}
+ +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + *v = decString(p) + return nil +}
+ +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + *v = decStringR(p) + return nil +}
+ +func DecBytes(p []byte, v *[]byte) error { + if v == nil { + return errNilReference(v) + } + *v = decBytes(p) + return nil +}
+ +func DecBytesR(p []byte, v **[]byte) error { + if v == nil { + return errNilReference(v) + } + *v = decBytesR(p) + return nil +}
+ +func DecInterface(p []byte, v *interface{}) error { + if v == nil { + return errNilReference(v) + } + *v = decBytes(p) + return nil +}
+ +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.String: + v.SetString(decString(p)) + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } + v.SetBytes(decBytes(p)) + case reflect.Interface: + v.Set(reflect.ValueOf(decBytes(p))) + default: + return fmt.Errorf("failed to unmarshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return nil +}
+ +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch ev := v.Type().Elem().Elem(); ev.Kind() { + case reflect.String: + return decReflectStringR(p, v) + case reflect.Slice: + if ev.Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytesR(p, v) + default: + return fmt.Errorf("failed to unmarshal text: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func decReflectStringR(p []byte, v reflect.Value) error { + if len(p) == 0 { + if p == nil { + v.Elem().Set(reflect.Zero(v.Type().Elem())) + } else { + v.Elem().Set(reflect.New(v.Type().Elem().Elem())) + } + return nil + } + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(string(p)) + v.Elem().Set(val) + return nil +}
+ +func decReflectBytesR(p []byte, v reflect.Value) error { + if len(p) == 0 { + if p == nil { + v.Elem().Set(reflect.Zero(v.Elem().Type())) + } else { + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetBytes(make([]byte, 0)) + v.Elem().Set(val) + } + return nil + } + tmp := make([]byte, len(p)) + copy(tmp, p) + + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetBytes(tmp) + v.Elem().Set(val) + return nil +}
+ +func decString(p []byte) string { + if len(p) == 0 { + return "" + } + return string(p) +}
+ +func decStringR(p []byte) *string { + if len(p) == 0 { + if p == nil { + return nil + } + return new(string) + } + tmp := string(p)
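+ // string(p) copies the bytes, so the returned *string does not alias the input buffer.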
+ return &tmp +} + +func decBytes(p []byte) []byte { + if len(p) == 0 { + if p == nil { + return nil + } + return make([]byte, 0) + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return tmp +} + +func decBytesR(p []byte) *[]byte { + if len(p) == 0 { + if p == nil { + return nil + } + tmp := make([]byte, 0) + return &tmp + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return &tmp +} diff --git a/vendor/github.com/gocql/gocql/serialization/timestamp/marshal.go b/vendor/github.com/gocql/gocql/serialization/timestamp/marshal.go new file mode 100644 index 0000000..50288a2 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timestamp/marshal.go @@ -0,0 +1,30 @@ +package timestamp + +import ( + "reflect" + "time" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int64: + return EncInt64(v) + case *int64: + return EncInt64R(v) + case time.Time: + return EncTime(v) + case *time.Time: + return EncTimeR(v) + + default: + // Custom types (type MyTime int64) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/timestamp/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/timestamp/marshal_utils.go new file mode 100644 index 0000000..8a7e5b1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timestamp/marshal_utils.go @@ -0,0 +1,64 @@ +package timestamp + +import ( + "fmt" + "reflect" + "time" +) + +var ( + maxTimestamp = time.Date(292278994, 8, 17, 7, 12, 55, 807*1000000, time.UTC) + zeroTimestamp = time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC) + minTimestamp = time.Date(-292275055, 5, 16, 16, 47, 4, 192*1000000, time.UTC) +) + +func EncInt64(v int64) ([]byte, error) { + return encInt64(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncTime(v time.Time) ([]byte, error) { + if v.After(maxTimestamp) || v.Before(minTimestamp) { + return nil, fmt.Errorf("failed to marshal timestamp: the (%T)(%s) value should be in the range from -292275055-05-16T16:47:04.192Z to 292278994-08-17T07:12:55.807", v, v.Format(time.RFC3339Nano)) + } + ms := v.Unix()*1e3 + int64(v.Nanosecond())/1e6 + return []byte{byte(ms >> 56), byte(ms >> 48), byte(ms >> 40), byte(ms >> 32), byte(ms >> 24), byte(ms >> 16), byte(ms >> 8), byte(ms)}, nil +} + +func EncTimeR(v *time.Time) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncTime(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Int64: + return encInt64(v.Int()), nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal timestamp: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal timestamp: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encInt64(v int64) []byte { + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} diff --git 
a/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal.go new file mode 100644 index 0000000..9d9c92a --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal.go @@ -0,0 +1,36 @@ +package timestamp + +import ( + "fmt" + "reflect" + "time" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int64: + return DecInt64(data, v) + case **int64: + return DecInt64R(data, v) + case *time.Time: + return DecTime(data, v) + case **time.Time: + return DecTimeR(data, v) + default: + + // Custom types (type MyTime int64) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal timestamp: unsupported value type (%T)(%[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal_utils.go new file mode 100644 index 0000000..c1eaab1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timestamp/unmarshal_utils.go @@ -0,0 +1,151 @@ +package timestamp + +import ( + "fmt" + "reflect" + "time" +) + +var ( + errWrongDataLen = fmt.Errorf("failed to unmarshal timestamp: the length of the data should be 0 or 8") +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal timestamp: can not unmarshal into nil reference (%T)(%[1]v))", v) +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 8: + *v = decInt64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case 8: + val := decInt64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecTime(p []byte, v *time.Time) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = zeroTimestamp + case 8: + *v = decTime(p) + default: + return errWrongDataLen + } + return nil +} + +func DecTimeR(p []byte, v **time.Time) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := zeroTimestamp + *v = &val + } + case 8: + val := decTime(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal timestamp: can not unmarshal into nil reference (%T)(%[1]v))", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int64: + return decReflectInt64(p, v) + default: + return fmt.Errorf("failed to unmarshal timestamp: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInt64(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 8: + v.SetInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal timestamp: can not unmarshal into nil 
reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int64: + return decReflectIntsR(p, v) + default: + return fmt.Errorf("failed to unmarshal timestamp: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Elem().Set(reflect.Zero(v.Elem().Type())) + } else { + v.Elem().Set(reflect.New(v.Type().Elem().Elem())) + } + case 8: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetInt(decInt64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decInt64(p []byte) int64 { + return int64(p[0])<<56 | int64(p[1])<<48 | int64(p[2])<<40 | int64(p[3])<<32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7]) +} + +func decTime(p []byte) time.Time { + msec := decInt64(p) + return time.Unix(msec/1e3, (msec%1e3)*1e6).UTC() +} diff --git a/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal.go b/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal.go new file mode 100644 index 0000000..da6b78b --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal.go @@ -0,0 +1,32 @@ +package timeuuid + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case [16]byte: + return EncArray(v) + case *[16]byte: + return EncArrayR(v) + case []byte: + return EncSlice(v) + case *[]byte: + return EncSliceR(v) + case string: + return EncString(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyUUID [16]byte) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
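+ //
+ // A minimal sketch (MyUUID here is a hypothetical caller-side type): fixed-size
+ // [16]byte values pass through unchanged, strings go through the UUID parser:
+ //
+ //	type MyUUID [16]byte
+ //	b, _ := Marshal(MyUUID{}) // -> 16 zero bytes via EncReflect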
+ rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal_utils.go new file mode 100644 index 0000000..35d0a58 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timeuuid/marshal_utils.go @@ -0,0 +1,248 @@ +package timeuuid + +import ( + "fmt" + "reflect" + "strings" +) +
+ +func EncArray(v [16]byte) ([]byte, error) { + return v[:], nil +}
+ +func EncArrayR(v *[16]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return v[:], nil +}
+ +func EncSlice(v []byte) ([]byte, error) { + switch len(v) { + case 0: + if v == nil { + return nil, nil + } + return make([]byte, 0), nil + case 16: + return v, nil + default: + return nil, fmt.Errorf("failed to marshal timeuuid: the ([]byte) length should be 0 or 16") + } +}
+ +func EncSliceR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncSlice(*v) +}
+ +func EncString(v string) ([]byte, error) { + return encString(v) +}
+ +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v) +}
+ +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Array: + if v.Type().Elem().Kind() != reflect.Uint8 || v.Len() != 16 { + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + nv := reflect.New(v.Type()) + nv.Elem().Set(v) + return nv.Elem().Bytes(), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(v) + case reflect.String: + return encReflectString(v) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + switch ev := v.Elem(); ev.Kind() { + case reflect.Array: + if ev.Type().Elem().Kind() != reflect.Uint8 || ev.Len() != 16 { + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return v.Elem().Bytes(), nil + case reflect.Slice: + if ev.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(ev) + case reflect.String: + return encReflectString(ev) + default: + return nil, fmt.Errorf("failed to marshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func encReflectBytes(rv reflect.Value) ([]byte, error) { + switch rv.Len() { + case 0: + if rv.IsNil() { + return nil, nil + } + return make([]byte, 0), nil + case 16: + return rv.Bytes(), nil + default: + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) length should be 0 or 16", rv.Interface()) + } +}
+ +// encReflectString encodes uuid strings via reflect package. +// The following code was taken from the `Parse` function of the "github.com/google/uuid" package.
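+// Accepted input lengths mirror uuid.Parse: 0 (encoded as null), 32 (bare hex),
+// 36 (canonical), 38 (braced), and 45 (URN form); the all-zeros UUID string is
+// encoded as a zero-length value. For example:
+//
+//	"00112233-4455-6677-8899-aabbccddeeff" -> the corresponding 16 bytes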
+func encReflectString(v reflect.Value) ([]byte, error) { + s := v.String() + if s == zeroUUID { + return make([]byte, 0), nil + } + switch len(s) { + case 45: // urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if !strings.EqualFold(s[:9], "urn:uuid:") { + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) have invalid urn prefix: %q", v.Interface(), s[:9]) + } + s = s[9:] + case 38: // {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} + s = s[1:] + case 36: // xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + case 32: // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + var ok bool + data := make([]byte, 16) + for i := range data { + data[i], ok = xtob(s[i*2], s[i*2+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) have invalid UUID format: %q", v.Interface(), s) + } + } + return data, nil + case 0: + return nil, nil + default: + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) length can be 0,32,36,38,45", v.Interface()) + }
+ + // s is now at least 36 bytes long + // it must be of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-' { + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) have invalid UUID format: %q", v.Interface(), s) + } + data := make([]byte, 16) + for i, x := range [16]int{ + 0, 2, 4, 6, + 9, 11, + 14, 16, + 19, 21, + 24, 26, 28, 30, 32, 34, + } { + b, ok := xtob(s[x], s[x+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal timeuuid: the (%T) have invalid UUID format: %q", v.Interface(), b) + } + data[i] = b + } + return data, nil +}
+ +// encString encodes uuid strings. +// The following code was taken from the `Parse` function of the "github.com/google/uuid" package. +func encString(s string) ([]byte, error) { + if s == zeroUUID { + return make([]byte, 0), nil + } + switch len(s) { + case 45: // urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if !strings.EqualFold(s[:9], "urn:uuid:") { + return nil, fmt.Errorf("failed to marshal timeuuid: (string) have invalid urn prefix: %q", s[:9]) + } + s = s[9:] + case 38: // {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} + s = s[1:] + case 36: // xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + case 32: // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + var ok bool + data := make([]byte, 16) + for i := range data { + data[i], ok = xtob(s[i*2], s[i*2+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal timeuuid: the (string) have invalid UUID format: %q", s) + } + } + return data, nil + case 0: + return nil, nil + default: + return nil, fmt.Errorf("failed to marshal timeuuid: the (string) length can be 0,32,36,38,45") + }
+ + // s is now at least 36 bytes long + // it must be of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-' { + return nil, fmt.Errorf("failed to marshal timeuuid: the (string) have invalid UUID format: %q", s) + } + data := make([]byte, 16) + for i, x := range [16]int{ + 0, 2, 4, 6, + 9, 11, + 14, 16, + 19, 21, + 24, 26, 28, 30, 32, 34, + } { + b, ok := xtob(s[x], s[x+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal timeuuid: the (string) have invalid UUID format: %q", b) + } + data[i] = b + } + return data, nil +}
+ +// xtob converts hex characters x1 and x2 into a byte. +// The following code was taken from the "github.com/google/uuid" package. +func xtob(x1, x2 byte) (byte, bool) { + b1 := xvalues[x1] + b2 := xvalues[x2] + return (b1 << 4) | b2, b1 != 255 && b2 != 255 +}
+ +// xvalues returns the value of a byte as a hexadecimal digit or 255.
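+// For example, xvalues['7'] == 7, xvalues['a'] == xvalues['A'] == 10, and a
+// non-hex byte such as 'g' maps to 255, which xtob reports as !ok.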
+// The following code was taken from the "github.com/google/uuid" package.
+var xvalues = [256]byte{
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+ 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+} diff --git a/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal.go new file mode 100644 index 0000000..48289bf --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal.go @@ -0,0 +1,41 @@ +package timeuuid + +import ( + "fmt" + "reflect" + "time" +) +
+ +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *[16]byte: + return DecArray(data, v) + case **[16]byte: + return DecArrayR(data, v) + case *[]byte: + return DecSlice(data, v) + case **[]byte: + return DecSliceR(data, v) + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + case *time.Time: + return DecTime(data, v) + case **time.Time: + return DecTimeR(data, v) + default: + // Custom types (type MyUUID [16]byte) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics.
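+ //
+ // A minimal sketch (MyID is a hypothetical caller-side type; data holds the
+ // serialized column value):
+ //
+ //	type MyID [16]byte
+ //	var id MyID
+ //	_ = Unmarshal(data, &id) // data must be empty/nil or exactly 16 bytes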
+ rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v) + } + if rv.Type().Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal_utils.go new file mode 100644 index 0000000..d6b6ad3 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/timeuuid/unmarshal_utils.go @@ -0,0 +1,357 @@ +package timeuuid + +import ( + "fmt" + "reflect" + "time" +) + +const ( + hexString = "0123456789abcdef" + zeroUUID = "00000000-0000-0000-0000-000000000000" +) + +var ( + offsets = [...]int{0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34} + timeBase = time.Date(1582, time.October, 15, 0, 0, 0, 0, time.UTC).Unix() + + errWrongDataLen = fmt.Errorf("failed to unmarshal timeuuid: the length of the data should be 0 or 16") +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal timeuuid: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecArray(p []byte, v *[16]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = [16]byte{} + case 16: + copy(v[:], p) + default: + return errWrongDataLen + } + return nil +} + +func DecArrayR(p []byte, v **[16]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new([16]byte) + } + case 16: + *v = &[16]byte{} + copy((*v)[:], p) + default: + return errWrongDataLen + } + return nil +} + +func DecSlice(p []byte, v *[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = make([]byte, 0) + } + case 16: + *v = make([]byte, 16) + copy(*v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecSliceR(p []byte, v **[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := make([]byte, 0) + *v = &tmp + } + case 16: + *v = &[]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + copy(**v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = zeroUUID + } + case 16: + *v = decString(p) + default: + return errWrongDataLen + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := zeroUUID + *v = &tmp + } + case 16: + tmp := decString(p) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecTime(p []byte, v *time.Time) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = time.Time{} + case 16: + *v = decTime(p) + default: + return errWrongDataLen + } + return nil +} + +func DecTimeR(p []byte, v **time.Time) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(time.Time) + } + case 16: + val := decTime(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); 
v.Kind() { + case reflect.Array: + if v.Type().Elem().Kind() != reflect.Uint8 || v.Len() != 16 { + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectArray(p, v) + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytes(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + ev := v.Elem() + switch evt := ev.Type().Elem(); evt.Kind() { + case reflect.Array: + if evt.Elem().Kind() != reflect.Uint8 || ev.Len() != 16 { + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectArrayR(p, ev) + case reflect.Slice: + if evt.Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytesR(p, ev) + case reflect.String: + return decReflectStringR(p, ev) + default: + return fmt.Errorf("failed to unmarshal timeuuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +}
+ +func decReflectArray(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetZero() + case 16: + val := reflect.New(v.Type()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val.Elem()) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectBytes(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetBytes(nil) + } else { + v.SetBytes(make([]byte, 0)) + } + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + v.SetBytes(tmp) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString(zeroUUID) + } + case 16: + v.SetString(decString(p)) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectArrayR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + v.Set(val) + } + case 16: + val := reflect.New(v.Type().Elem()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectBytesR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(make([]byte, 0)) + v.Set(val) + } + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(tmp) + v.Set(val) + default: + return errWrongDataLen + } + return nil +}
+ +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetString(zeroUUID) + v.Set(val) + } + case 16: + val := reflect.New(v.Type().Elem()) + val.Elem().SetString(decString(p)) + v.Set(val) + default: + return errWrongDataLen + } + return nil +}
+ +func decString(p []byte) string { + r := make([]byte, 36) + for i, b := range p { + r[offsets[i]] = hexString[b>>4] + r[offsets[i]+1] = hexString[b&0xF] + } + r[8]
= '-' + r[13] = '-' + r[18] = '-' + r[23] = '-' + return string(r) +} + +func decTime(u []byte) time.Time { + ts := decTimestamp(u) + sec := ts / 1e7 + nsec := (ts % 1e7) * 100 + return time.Unix(sec+timeBase, nsec).UTC() +} + +func decTimestamp(u []byte) int64 { + return int64(uint64(u[0])<<24|uint64(u[1])<<16| + uint64(u[2])<<8|uint64(u[3])) + + int64(uint64(u[4])<<40|uint64(u[5])<<32) + + int64(uint64(u[6]&0x0F)<<56|uint64(u[7])<<48) +} diff --git a/vendor/github.com/gocql/gocql/serialization/tinyint/marshal.go b/vendor/github.com/gocql/gocql/serialization/tinyint/marshal.go new file mode 100644 index 0000000..1cfb85c --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/tinyint/marshal.go @@ -0,0 +1,74 @@ +package tinyint + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int32: + return EncInt32(v) + case int16: + return EncInt16(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
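+ // For example (a sketch; MyTiny is a hypothetical caller-defined type): + // + //	type MyTiny int16 + //	b, err := Marshal(MyTiny(5))  // one byte: []byte{0x05}, via EncReflect + //	_, err = Marshal(MyTiny(300)) // error: out of the tinyint (int8) range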
+ rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/tinyint/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/tinyint/marshal_utils.go new file mode 100644 index 0000000..664d117 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/tinyint/marshal_utils.go @@ -0,0 +1,227 @@ +package tinyint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +var ( + maxBigInt = big.NewInt(math.MaxInt8) + minBigInt = big.NewInt(math.MinInt8) +) + +func EncInt8(v int8) ([]byte, error) { + return []byte{byte(v)}, nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt8(*v) +} + +func EncInt16(v int16) ([]byte, error) { + if v > math.MaxInt8 || v < math.MinInt8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt16(*v) +} + +func EncInt32(v int32) ([]byte, error) { + if v > math.MaxInt8 || v < math.MinInt8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt32(*v) +} + +func EncInt64(v int64) ([]byte, error) { + if v > math.MaxInt8 || v < math.MinInt8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64(*v) +} + +func EncInt(v int) ([]byte, error) { + if v > math.MaxInt8 || v < math.MinInt8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt(*v) +} + +func EncUint8(v uint8) ([]byte, error) { + return []byte{v}, nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint8(*v) +} + +func EncUint16(v uint16) ([]byte, error) { + if v > math.MaxUint8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint16(*v) +} + +func EncUint32(v uint32) ([]byte, error) { + if v > math.MaxUint8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint32(*v) +} + +func EncUint64(v uint64) ([]byte, error) { + if v > math.MaxUint8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncUint64R(v *uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint64(*v) +} + +func EncUint(v uint) ([]byte, error) { + if v > math.MaxUint8 { + return nil, fmt.Errorf("failed to marshal tinyint: value %#v out of range", v) + } + return []byte{byte(v)}, nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncUint(*v) +} + +func EncBigInt(v big.Int) ([]byte, error) { + if v.Cmp(maxBigInt) == 1 || v.Cmp(minBigInt) == -1 { + return nil, 
fmt.Errorf("failed to marshal tinyint: value (%T)(%s) out of range", v, v.String()) + } + return []byte{byte(v.Int64())}, nil +} + +func EncBigIntR(v *big.Int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncBigInt(*v) +} + +func EncString(v string) ([]byte, error) { + if v == "" { + return nil, nil + } + + n, err := strconv.ParseInt(v, 10, 8) + if err != nil { + return nil, fmt.Errorf("failed to marshal tinyint: can not marshal (%T)(%[1]v) %s", v, err) + } + return []byte{byte(n)}, nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Int8: + return []byte{byte(v.Int())}, nil + case reflect.Int, reflect.Int64, reflect.Int32, reflect.Int16: + val := v.Int() + if val > math.MaxInt8 || val < math.MinInt8 { + return nil, fmt.Errorf("failed to marshal tinyint: value (%T)(%[1]v) out of range", v.Interface()) + } + return []byte{byte(val)}, nil + case reflect.Uint8: + return []byte{byte(v.Uint())}, nil + case reflect.Uint, reflect.Uint64, reflect.Uint32, reflect.Uint16: + val := v.Uint() + if val > math.MaxUint8 { + return nil, fmt.Errorf("failed to marshal tinyint: value (%T)(%[1]v) out of range", v.Interface()) + } + return []byte{byte(val)}, nil + case reflect.String: + val := v.String() + if val == "" { + return nil, nil + } + + n, err := strconv.ParseInt(val, 10, 8) + if err != nil { + return nil, fmt.Errorf("failed to marshal tinyint: can not marshal (%T)(%[1]v) %s", v.Interface(), err) + } + return []byte{byte(n)}, nil + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal tinyint: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal tinyint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} diff --git a/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal.go new file mode 100644 index 0000000..3e1f719 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal.go @@ -0,0 +1,81 @@ +package tinyint + +import ( + "fmt" + "math/big" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int8: + return DecInt8(data, v) + case *int16: + return DecInt16(data, v) + case *int32: + return DecInt32(data, v) + case *int64: + return DecInt64(data, v) + case *int: + return DecInt(data, v) + + case *uint8: + return DecUint8(data, v) + case *uint16: + return DecUint16(data, v) + case *uint32: + return DecUint32(data, v) + case *uint64: + return DecUint64(data, v) + case *uint: + return DecUint(data, v) + + case *big.Int: + return DecBigInt(data, v) + case *string: + return DecString(data, v) + + case **int8: + return DecInt8R(data, v) + case **int16: + return DecInt16R(data, v) + case **int32: + return DecInt32R(data, v) + case **int64: + return DecInt64R(data, v) + case **int: + return DecIntR(data, v) + + case **uint8: + return DecUint8R(data, v) + case **uint16: + return DecUint16R(data, v) + case **uint32: + return DecUint32R(data, v) + case **uint64: + return DecUint64R(data, v) + case **uint: + return DecUintR(data, v) + + case **big.Int: + return DecBigIntR(data, v) + case **string: 
+ return DecStringR(data, v) + default: + + // Custom types (type MyInt int) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal tinyint: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal_utils.go new file mode 100644 index 0000000..7f1c9ab --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/tinyint/unmarshal_utils.go @@ -0,0 +1,604 @@ +package tinyint + +import ( + "fmt" + "math" + "math/big" + "reflect" + "strconv" +) + +const ( + negInt16 = int16(-1) << 8 + negInt32 = int32(-1) << 8 + negInt64 = int64(-1) << 8 + negInt = int(-1) << 8 +) + +var errWrongDataLen = fmt.Errorf("failed to unmarshal tinyint: the length of the data should be less than or equal to 1") + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal tinyint: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecInt8(p []byte, v *int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = int8(p[0]) + default: + return errWrongDataLen + } + return nil +} + +func DecInt8R(p []byte, v **int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int8) + } + case 1: + val := int8(p[0]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt16(p []byte, v *int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = decInt16(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt16R(p []byte, v **int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int16) + } + case 1: + val := decInt16(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt32(p []byte, v *int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = decInt32(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt32R(p []byte, v **int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int32) + } + case 1: + val := decInt32(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = decInt64(p) + default: + return errWrongDataLen + } + return nil +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + case 1: + val := decInt64(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecInt(p []byte, v *int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = decInt(p) + default: + return errWrongDataLen + } + return nil +} + +func DecIntR(p []byte, v **int) error { + if v == nil { + return
errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int) + } + case 1: + val := decInt(p) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint8(p []byte, v *uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = p[0] + default: + return errWrongDataLen + } + return nil +} + +func DecUint8R(p []byte, v **uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint8) + } + case 1: + val := p[0] + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint16(p []byte, v *uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = uint16(p[0]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint16R(p []byte, v **uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint16) + } + case 1: + val := uint16(p[0]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint32(p []byte, v *uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = uint32(p[0]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint32R(p []byte, v **uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint32) + } + case 1: + val := uint32(p[0]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint64(p []byte, v *uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = uint64(p[0]) + default: + return errWrongDataLen + } + return nil +} + +func DecUint64R(p []byte, v **uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint64) + } + case 1: + val := uint64(p[0]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecUint(p []byte, v *uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = uint(p[0]) + default: + return errWrongDataLen + } + return nil +} + +func DecUintR(p []byte, v **uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint) + } + case 1: + val := uint(p[0]) + *v = &val + default: + return errWrongDataLen + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "0" + } + case 1: + *v = strconv.FormatInt(decInt64(p), 10) + default: + return errWrongDataLen + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := "0" + *v = &val + } + case 1: + *v = new(string) + **v = strconv.FormatInt(decInt64(p), 10) + default: + return errWrongDataLen + } + return nil +} + +func DecBigInt(p []byte, v *big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + v.SetInt64(0) + case 1: + v.SetInt64(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecBigIntR(p []byte, 
v **big.Int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = big.NewInt(0) + } + case 1: + *v = big.NewInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: + return decReflectInts(p, v) + case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint: + return decReflectUints(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal tinyint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: + return decReflectIntsR(p, v) + case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint: + return decReflectUintsR(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal tinyint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectInts(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 1: + v.SetInt(decInt64(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUints(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + case 1: + v.SetUint(uint64(p[0])) + default: + return errWrongDataLen + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0") + } + case 1: + v.SetString(strconv.FormatInt(decInt64(p), 10)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 1: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetInt(decInt64(p)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectUintsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 1: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetUint(uint64(p[0])) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0") + } + v.Elem().Set(val) + case 1: + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(strconv.FormatInt(decInt64(p), 10)) + v.Elem().Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decInt16(p []byte) int16 { + if p[0] > math.MaxInt8 { + return negInt16 | int16(p[0]) + } + return int16(p[0]) +} + +func decInt32(p []byte) int32 { + if p[0] > math.MaxInt8 { + return negInt32 | int32(p[0]) + } + return int32(p[0]) +} + +func decInt64(p []byte) int64 { + if p[0] > math.MaxInt8 { + return negInt64 
| int64(p[0]) + } + return int64(p[0]) +} + +func decInt(p []byte) int { + if p[0] > math.MaxInt8 { + return negInt | int(p[0]) + } + return int(p[0]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/uuid/marshal.go b/vendor/github.com/gocql/gocql/serialization/uuid/marshal.go new file mode 100644 index 0000000..222c333 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/uuid/marshal.go @@ -0,0 +1,32 @@ +package uuid + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case [16]byte: + return EncArray(v) + case *[16]byte: + return EncArrayR(v) + case []byte: + return EncSlice(v) + case *[]byte: + return EncSliceR(v) + case string: + return EncString(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyUUID [16]byte) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/uuid/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/uuid/marshal_utils.go new file mode 100644 index 0000000..9a50aae --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/uuid/marshal_utils.go @@ -0,0 +1,242 @@ +package uuid + +import ( + "fmt" + "reflect" + "strings" +) + +func EncArray(v [16]byte) ([]byte, error) { + return v[:], nil +} + +func EncArrayR(v *[16]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return v[:], nil +} + +func EncSlice(v []byte) ([]byte, error) { + switch len(v) { + case 0: + if v == nil { + return nil, nil + } + return make([]byte, 0), nil + case 16: + return v, nil + default: + return nil, fmt.Errorf("failed to marshal uuid: the ([]byte) length should be 0 or 16") + } +} + +func EncSliceR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncSlice(*v) +} + +func EncString(v string) ([]byte, error) { + return encString(v) +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v) +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.Array: + if v.Type().Elem().Kind() != reflect.Uint8 || v.Len() != 16 { + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + nv := reflect.New(v.Type()) + nv.Elem().Set(v) + return nv.Elem().Bytes(), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(v) + case reflect.String: + return encReflectString(v) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + switch ev := v.Elem(); ev.Kind() { + case reflect.Array: + if ev.Type().Elem().Kind() != reflect.Uint8 || ev.Len() != 16 { + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return v.Elem().Bytes(), nil + case reflect.Slice: + if ev.Type().Elem().Kind() !=
reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return encReflectBytes(ev) + case reflect.String: + return encReflectString(ev) + default: + return nil, fmt.Errorf("failed to marshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func encReflectBytes(rv reflect.Value) ([]byte, error) { + switch rv.Len() { + case 0: + if rv.IsNil() { + return nil, nil + } + return make([]byte, 0), nil + case 16: + return rv.Bytes(), nil + default: + return nil, fmt.Errorf("failed to marshal uuid: the (%T) length should be 0 or 16", rv.Interface()) + } +} + +// encReflectString encodes uuid strings via reflect package. +// The following code was taken from the `Parse` function of the "github.com/google/uuid" package. +func encReflectString(v reflect.Value) ([]byte, error) { + s := v.String() + switch len(s) { + case 45: // urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if !strings.EqualFold(s[:9], "urn:uuid:") { + return nil, fmt.Errorf("failed to marshal uuid: the (%T) has an invalid urn prefix: %q", v.Interface(), s[:9]) + } + s = s[9:] + case 38: // {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} + s = s[1:] + case 36: // xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + case 32: // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + var ok bool + data := make([]byte, 16) + for i := range data { + data[i], ok = xtob(s[i*2], s[i*2+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal uuid: the (%T) has an invalid UUID format: %q", v.Interface(), s) + } + } + return data, nil + case 0: + return nil, nil + default: + return nil, fmt.Errorf("failed to marshal uuid: the (%T) length should be 0, 32, 36, 38, or 45", v.Interface()) + } + + // s is now at least 36 bytes long + // it must be of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-' { + return nil, fmt.Errorf("failed to marshal uuid: the (%T) has an invalid UUID format: %q", v.Interface(), s) + } + data := make([]byte, 16) + for i, x := range [16]int{ + 0, 2, 4, 6, + 9, 11, + 14, 16, + 19, 21, + 24, 26, 28, 30, 32, 34, + } { + b, ok := xtob(s[x], s[x+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal uuid: the (%T) has an invalid UUID format: %q", v.Interface(), s) + } + data[i] = b + } + return data, nil +} + +// encString encodes uuid strings. +// The following code was taken from the `Parse` function of the "github.com/google/uuid" package.
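+// For example (a sketch; the literal is a hypothetical value): encString("f47ac10b-58cc-0372-8567-0e02b2c3d479") parses the 36-char canonical form into 16 bytes; the urn:uuid:-prefixed (45), braced (38) and bare-hex (32) forms are accepted too, and "" encodes as nil (treated as unset).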
+func encString(s string) ([]byte, error) { + switch len(s) { + case 45: // urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if !strings.EqualFold(s[:9], "urn:uuid:") { + return nil, fmt.Errorf("failed to marshal uuid: (string) has an invalid urn prefix: %q", s[:9]) + } + s = s[9:] + case 38: // {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} + s = s[1:] + case 36: // xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + case 32: // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + var ok bool + data := make([]byte, 16) + for i := range data { + data[i], ok = xtob(s[i*2], s[i*2+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal uuid: the (string) has an invalid UUID format: %q", s) + } + } + return data, nil + case 0: + return nil, nil + default: + return nil, fmt.Errorf("failed to marshal uuid: the (string) length should be 0, 32, 36, 38, or 45") + } + + // s is now at least 36 bytes long + // it must be of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + if s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-' { + return nil, fmt.Errorf("failed to marshal uuid: the (string) has an invalid UUID format: %q", s) + } + data := make([]byte, 16) + for i, x := range [16]int{ + 0, 2, 4, 6, + 9, 11, + 14, 16, + 19, 21, + 24, 26, 28, 30, 32, 34, + } { + b, ok := xtob(s[x], s[x+1]) + if !ok { + return nil, fmt.Errorf("failed to marshal uuid: the (string) has an invalid UUID format: %q", s) + } + data[i] = b + } + return data, nil +} + +// xtob converts hex characters x1 and x2 into a byte. +// The following code was taken from the "github.com/google/uuid" package. +func xtob(x1, x2 byte) (byte, bool) { + b1 := xvalues[x1] + b2 := xvalues[x2] + return (b1 << 4) | b2, b1 != 255 && b2 != 255 +} + +// xvalues returns the value of a byte as a hexadecimal digit or 255. +// The following code was taken from the "github.com/google/uuid" package.
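+// For example (a sketch): xtob('f', 'f') yields (0xff, true), while xtob('g', '0') yields ok == false, because 'g' maps to 255 below. Only '0'..'9' (indexes 48..57), 'A'..'F' (65..70) and 'a'..'f' (97..102) hold digit values; every other byte stays 255 (invalid).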
+var xvalues = [256]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, + 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +} diff --git a/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal.go new file mode 100644 index 0000000..1ade8b9 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal.go @@ -0,0 +1,36 @@ +package uuid + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *[16]byte: + return DecArray(data, v) + case **[16]byte: + return DecArrayR(data, v) + case *[]byte: + return DecSlice(data, v) + case **[]byte: + return DecSliceR(data, v) + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + default: + // Custom types (type MyUUID [16]byte) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics.
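+ // For example (a sketch; MyUUID is a hypothetical caller-defined type): + // + //	type MyUUID [16]byte + //	var u MyUUID + //	err := Unmarshal(data, &u) // *MyUUID is a single pointer -> DecReflect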
+ rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v) + } + if rv.Type().Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal_utils.go new file mode 100644 index 0000000..d5d5606 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/uuid/unmarshal_utils.go @@ -0,0 +1,302 @@ +package uuid + +import ( + "fmt" + "reflect" +) + +const hexString = "0123456789abcdef" + +var ( + offsets = [...]int{0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34} + errWrongDataLen = fmt.Errorf("failed to unmarshal uuid: the length of the data should be 0 or 16") +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal uuid: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecArray(p []byte, v *[16]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = [16]byte{} + case 16: + copy(v[:], p) + default: + return errWrongDataLen + } + return nil +} + +func DecArrayR(p []byte, v **[16]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new([16]byte) + } + case 16: + *v = &[16]byte{} + copy((*v)[:], p) + default: + return errWrongDataLen + } + return nil +} + +func DecSlice(p []byte, v *[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = make([]byte, 0) + } + case 16: + *v = make([]byte, 16) + copy(*v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecSliceR(p []byte, v **[]byte) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := make([]byte, 0) + *v = &tmp + } + case 16: + *v = &[]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + copy(**v, p) + default: + return errWrongDataLen + } + return nil +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "00000000-0000-0000-0000-000000000000" + } + case 16: + *v = decString(p) + default: + return errWrongDataLen + } + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + tmp := "00000000-0000-0000-0000-000000000000" + *v = &tmp + } + case 16: + tmp := decString(p) + *v = &tmp + default: + return errWrongDataLen + } + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Array: + if v.Type().Elem().Kind() != reflect.Uint8 || v.Len() != 16 { + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectArray(p, v) + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytes(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func DecReflectR(p []byte, v 
reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + ev := v.Elem() + switch evt := ev.Type().Elem(); evt.Kind() { + case reflect.Array: + if evt.Elem().Kind() != reflect.Uint8 || evt.Len() != 16 { + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectArrayR(p, ev) + case reflect.Slice: + if evt.Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytesR(p, ev) + case reflect.String: + return decReflectStringR(p, ev) + default: + return fmt.Errorf("failed to unmarshal uuid: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectArray(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetZero() + case 16: + val := reflect.New(v.Type()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val.Elem()) + default: + return errWrongDataLen + } + return nil +} + +func decReflectBytes(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetBytes(nil) + } else { + v.SetBytes(make([]byte, 0)) + } + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + v.SetBytes(tmp) + default: + return errWrongDataLen + } + return nil +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("00000000-0000-0000-0000-000000000000") + } + case 16: + v.SetString(decString(p)) + default: + return errWrongDataLen + } + return nil +} + +func decReflectArrayR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + v.Set(val) + } + case 16: + val := reflect.New(v.Type().Elem()) + copy((*[16]byte)(val.UnsafePointer())[:], p) + v.Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectBytesR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(make([]byte, 0)) + v.Set(val) + } + case 16: + tmp := make([]byte, 16) + copy(tmp, p) + val := reflect.New(v.Type().Elem()) + val.Elem().SetBytes(tmp) + v.Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.Set(reflect.Zero(v.Type())) + } else { + val := reflect.New(v.Type().Elem()) + val.Elem().SetString("00000000-0000-0000-0000-000000000000") + v.Set(val) + } + case 16: + val := reflect.New(v.Type().Elem()) + val.Elem().SetString(decString(p)) + v.Set(val) + default: + return errWrongDataLen + } + return nil +} + +func decString(p []byte) string { + r := make([]byte, 36) + for i, b := range p { + r[offsets[i]] = hexString[b>>4] + r[offsets[i]+1] = hexString[b&0xF] + } + r[8] = '-' + r[13] = '-' + r[18] = '-' + r[23] = '-' + return string(r) +} diff --git a/vendor/github.com/gocql/gocql/serialization/varchar/marshal.go b/vendor/github.com/gocql/gocql/serialization/varchar/marshal.go new file mode 100644 index 0000000..0dcbbff --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varchar/marshal.go @@ -0,0 +1,28 @@ +package varchar + +import ( + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case string: + return EncString(v) + case *string: + return EncStringR(v) +
case []byte: + return EncBytes(v) + case *[]byte: + return EncBytesR(v) + default: + // Custom types (type MyString string) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(rv) + } + return EncReflectR(rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varchar/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/varchar/marshal_utils.go new file mode 100644 index 0000000..cfe87c5 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varchar/marshal_utils.go @@ -0,0 +1,61 @@ +package varchar + +import ( + "fmt" + "reflect" +) + +func EncString(v string) ([]byte, error) { + return encString(v), nil +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return encString(*v), nil +} + +func EncBytes(v []byte) ([]byte, error) { + return v, nil +} + +func EncBytesR(v *[]byte) ([]byte, error) { + if v == nil { + return nil, nil + } + return *v, nil +} + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Kind() { + case reflect.String: + return encString(v.String()), nil + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return nil, fmt.Errorf("failed to marshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return EncBytes(v.Bytes()) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + default: + return nil, fmt.Errorf("failed to marshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encString(v string) []byte { + if v == "" { + return make([]byte, 0) + } + return []byte(v) +} diff --git a/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal.go new file mode 100644 index 0000000..9aae7f8 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal.go @@ -0,0 +1,35 @@ +package varchar + +import ( + "fmt" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + case *string: + return DecString(data, v) + case **string: + return DecStringR(data, v) + case *[]byte: + return DecBytes(data, v) + case **[]byte: + return DecBytesR(data, v) + case *interface{}: + return DecInterface(data, v) + default: + // Custom types (type MyString string) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
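+ // For example (a sketch; MyString is a hypothetical caller-defined type): + // + //	type MyString string + //	var s MyString + //	err := Unmarshal(data, &s) // *MyString -> DecReflect, which sets s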
+ rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal varchar: unsupported value type (%T)(%[1]v)", v) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal_utils.go new file mode 100644 index 0000000..85f8ca1 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varchar/unmarshal_utils.go @@ -0,0 +1,167 @@ +package varchar + +import ( + "fmt" + "reflect" +) + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal varchar: can not unmarshal into nil reference(%T)(%[1]v)", v) +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + *v = decString(p) + return nil +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + *v = decStringR(p) + return nil +} + +func DecBytes(p []byte, v *[]byte) error { + if v == nil { + return errNilReference(v) + } + *v = decBytes(p) + return nil +} + +func DecBytesR(p []byte, v **[]byte) error { + if v == nil { + return errNilReference(v) + } + *v = decBytesR(p) + return nil +} + +func DecInterface(p []byte, v *interface{}) error { + if v == nil { + return errNilReference(v) + } + *v = decBytes(p) + return nil +} + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch v = v.Elem(); v.Kind() { + case reflect.String: + v.SetString(decString(p)) + case reflect.Slice: + if v.Type().Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } + v.SetBytes(decBytes(p)) + case reflect.Interface: + v.Set(reflect.ValueOf(decBytes(p))) + default: + return fmt.Errorf("failed to unmarshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return nil +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return errNilReference(v) + } + + switch ev := v.Type().Elem().Elem(); ev.Kind() { + case reflect.String: + return decReflectStringR(p, v) + case reflect.Slice: + if ev.Elem().Kind() != reflect.Uint8 { + return fmt.Errorf("failed to unmarshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } + return decReflectBytesR(p, v) + default: + return fmt.Errorf("failed to unmarshal varchar: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectStringR(p []byte, v reflect.Value) error { + if len(p) == 0 { + if p == nil { + v.Elem().Set(reflect.Zero(v.Type().Elem())) + } else { + v.Elem().Set(reflect.New(v.Type().Elem().Elem())) + } + return nil + } + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString(string(p)) + v.Elem().Set(val) + return nil +} + +func decReflectBytesR(p []byte, v reflect.Value) error { + if len(p) == 0 { + if p == nil { + v.Elem().Set(reflect.Zero(v.Elem().Type())) + } else { + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetBytes(make([]byte, 0)) + v.Elem().Set(val) + } + return nil + } + tmp := make([]byte, len(p)) + copy(tmp, p) + + val := reflect.New(v.Type().Elem().Elem()) + val.Elem().SetBytes(tmp) + v.Elem().Set(val) + return nil +} + +func decString(p []byte) string { + if len(p) == 0 { + return "" + } + return string(p) +} + +func decStringR(p []byte) *string { + if len(p) == 0 { + if p == nil { + return nil + } + return
new(string) + } + tmp := string(p) + return &tmp +} + +func decBytes(p []byte) []byte { + if len(p) == 0 { + if p == nil { + return nil + } + return make([]byte, 0) + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return tmp +} + +func decBytesR(p []byte) *[]byte { + if len(p) == 0 { + if p == nil { + return nil + } + tmp := make([]byte, 0) + return &tmp + } + tmp := make([]byte, len(p)) + copy(tmp, p) + return &tmp +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/marshal.go b/vendor/github.com/gocql/gocql/serialization/varint/marshal.go new file mode 100644 index 0000000..4561934 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/marshal.go @@ -0,0 +1,74 @@ +package varint + +import ( + "math/big" + "reflect" +) + +func Marshal(value interface{}) ([]byte, error) { + switch v := value.(type) { + case nil: + return nil, nil + case int8: + return EncInt8(v) + case int32: + return EncInt32(v) + case int16: + return EncInt16(v) + case int64: + return EncInt64(v) + case int: + return EncInt(v) + + case uint8: + return EncUint8(v) + case uint16: + return EncUint16(v) + case uint32: + return EncUint32(v) + case uint64: + return EncUint64(v) + case uint: + return EncUint(v) + + case big.Int: + return EncBigInt(v) + case string: + return EncString(v) + + case *int8: + return EncInt8R(v) + case *int16: + return EncInt16R(v) + case *int32: + return EncInt32R(v) + case *int64: + return EncInt64R(v) + case *int: + return EncIntR(v) + + case *uint8: + return EncUint8R(v) + case *uint16: + return EncUint16R(v) + case *uint32: + return EncUint32R(v) + case *uint64: + return EncUint64R(v) + case *uint: + return EncUintR(v) + + case *big.Int: + return EncBigIntR(v) + case *string: + return EncStringR(v) + default: + // Custom types (type MyInt int) can be serialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. 
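+ // For example (a sketch; MyInt is a hypothetical caller-defined type), the varint stays as short as the value allows: + // + //	type MyInt int64 + //	b, _ := Marshal(MyInt(127)) // -> []byte{0x7f} + //	b, _ = Marshal(MyInt(128))  // -> []byte{0x00, 0x80} (leading sign byte)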
+ rv := reflect.TypeOf(value) + if rv.Kind() != reflect.Ptr { + return EncReflect(reflect.ValueOf(v)) + } + return EncReflectR(reflect.ValueOf(v)) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/marshal_custom.go b/vendor/github.com/gocql/gocql/serialization/varint/marshal_custom.go new file mode 100644 index 0000000..bb85284 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/marshal_custom.go @@ -0,0 +1,77 @@ +package varint + +import ( + "fmt" + "math/big" + "reflect" + "strconv" +) + +func EncReflect(v reflect.Value) ([]byte, error) { + switch v.Type().Kind() { + case reflect.Int8: + return EncInt8(int8(v.Int())) + case reflect.Int16: + return EncInt16(int16(v.Int())) + case reflect.Int32: + return EncInt32(int32(v.Int())) + case reflect.Int, reflect.Int64: + return EncInt64(v.Int()) + case reflect.Uint8: + return EncUint8(uint8(v.Uint())) + case reflect.Uint16: + return EncUint16(uint16(v.Uint())) + case reflect.Uint32: + return EncUint32(uint32(v.Uint())) + case reflect.Uint, reflect.Uint64: + return EncUint64(v.Uint()) + case reflect.String: + return encReflectString(v) + case reflect.Struct: + if v.Type().String() == "gocql.unsetColumn" { + return nil, nil + } + return nil, fmt.Errorf("failed to marshal varint: unsupported value type (%T)(%[1]v)", v.Interface()) + + default: + return nil, fmt.Errorf("failed to marshal varint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func EncReflectR(v reflect.Value) ([]byte, error) { + if v.IsNil() { + return nil, nil + } + return EncReflect(v.Elem()) +} + +func encReflectString(v reflect.Value) ([]byte, error) { + val := v.String() + switch { + case len(val) == 0: + return nil, nil + case len(val) <= 18: + n, err := strconv.ParseInt(val, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal varint: can not marshal (%T)(%[1]v), %s", v.Interface(), err) + } + return EncInt64Ext(n), nil + case len(val) <= 20: + n, err := strconv.ParseInt(val, 10, 64) + if err == nil { + return EncInt64Ext(n), nil + } + + t, ok := new(big.Int).SetString(val, 10) + if !ok { + return nil, fmt.Errorf("failed to marshal varint: can not marshal (%T)(%[1]v)", v.Interface()) + } + return EncBigIntRS(t), nil + default: + t, ok := new(big.Int).SetString(val, 10) + if !ok { + return nil, fmt.Errorf("failed to marshal varint: can not marshal (%T)(%[1]v)", v.Interface()) + } + return EncBigIntRS(t), nil + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/marshal_ints.go b/vendor/github.com/gocql/gocql/serialization/varint/marshal_ints.go new file mode 100644 index 0000000..e13bf77 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/marshal_ints.go @@ -0,0 +1,130 @@ +package varint + +func EncInt8(v int8) ([]byte, error) { + return encInt8(v), nil +} + +func EncInt8R(v *int8) ([]byte, error) { + if v == nil { + return nil, nil + } + return encInt8(*v), nil +} + +func EncInt16(v int16) ([]byte, error) { + return encInt16(v), nil +} + +func EncInt16R(v *int16) ([]byte, error) { + if v == nil { + return nil, nil + } + return encInt16(*v), nil +} + +func EncInt32(v int32) ([]byte, error) { + return encInt32(v), nil +} + +func EncInt32R(v *int32) ([]byte, error) { + if v == nil { + return nil, nil + } + return encInt32(*v), nil +} + +func EncInt64(v int64) ([]byte, error) { + return EncInt64Ext(v), nil +} + +func EncInt64R(v *int64) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncInt64Ext(*v), nil +} + +func EncInt(v int) ([]byte, error) { + return 
encInt(v), nil +} + +func EncIntR(v *int) ([]byte, error) { + if v == nil { + return nil, nil + } + return encInt(*v), nil +} + +func encInt8(v int8) []byte { + return []byte{byte(v)} +} + +func encInt16(v int16) []byte { + if v <= maxInt8 && v >= minInt8 { + return []byte{byte(v)} + } + return []byte{byte(v >> 8), byte(v)} +} + +func encInt32(v int32) []byte { + if v <= maxInt8 && v >= minInt8 { + return []byte{byte(v)} + } + if v <= maxInt16 && v >= minInt16 { + return []byte{byte(v >> 8), byte(v)} + } + if v <= maxInt24 && v >= minInt24 { + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + } + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} + +func EncInt64Ext(v int64) []byte { + if v <= maxInt8 && v >= minInt8 { + return []byte{byte(v)} + } + if v <= maxInt16 && v >= minInt16 { + return []byte{byte(v >> 8), byte(v)} + } + if v <= maxInt24 && v >= minInt24 { + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt32 && v >= minInt32 { + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt40 && v >= minInt40 { + return []byte{byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt48 && v >= minInt48 { + return []byte{byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt56 && v >= minInt56 { + return []byte{byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} + +func encInt(v int) []byte { + if v <= maxInt8 && v >= minInt8 { + return []byte{byte(v)} + } + if v <= maxInt16 && v >= minInt16 { + return []byte{byte(v >> 8), byte(v)} + } + if v <= maxInt24 && v >= minInt24 { + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt32 && v >= minInt32 { + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt40 && v >= minInt40 { + return []byte{byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt48 && v >= minInt48 { + return []byte{byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + if v <= maxInt56 && v >= minInt56 { + return []byte{byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + } + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/marshal_uints.go b/vendor/github.com/gocql/gocql/serialization/varint/marshal_uints.go new file mode 100644 index 0000000..343dc1b --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/marshal_uints.go @@ -0,0 +1,135 @@ +package varint + +func EncUint8(v uint8) ([]byte, error) { + return encUint8(v), nil +} + +func EncUint8R(v *uint8) ([]byte, error) { + if v == nil { + return nil, nil + } + return encUint8(*v), nil +} + +func EncUint16(v uint16) ([]byte, error) { + return encUint16(v), nil +} + +func EncUint16R(v *uint16) ([]byte, error) { + if v == nil { + return nil, nil + } + return encUint16(*v), nil +} + +func EncUint32(v uint32) ([]byte, error) { + return encUint32(v), nil +} + +func EncUint32R(v *uint32) ([]byte, error) { + if v == nil { + return nil, nil + } + return encUint32(*v), nil +} + +func EncUint64(v uint64) ([]byte, error) { + return encUint64(v), nil +} + +func EncUint64R(v 
*uint64) ([]byte, error) { + if v == nil { + return nil, nil + } + return encUint64(*v), nil +} + +func EncUint(v uint) ([]byte, error) { + return encUint(v), nil +} + +func EncUintR(v *uint) ([]byte, error) { + if v == nil { + return nil, nil + } + return encUint(*v), nil +} + +func encUint8(v uint8) []byte { + if v > maxInt8 { + return []byte{0, v} + } + return []byte{v} +} + +func encUint16(v uint16) []byte { + switch { + case byte(v>>15) != 0: + return []byte{0, byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{byte(v >> 8), byte(v)} + default: + return []byte{byte(v)} + } +} + +func encUint32(v uint32) []byte { + switch { + case byte(v>>31) != 0: + return []byte{0, byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>23) != 0: + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>15) != 0: + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{byte(v >> 8), byte(v)} + default: + return []byte{byte(v)} + } +} + +func encUint64(v uint64) []byte { + switch { + case byte(v>>63) != 0: + return []byte{0, byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>55) != 0: + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>47) != 0: + return []byte{byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>39) != 0: + return []byte{byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>31) != 0: + return []byte{byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>23) != 0: + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>15) != 0: + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{byte(v >> 8), byte(v)} + default: + return []byte{byte(v)} + } +} + +func encUint(v uint) []byte { + switch { + case byte(v>>63) != 0: + return []byte{0, byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>55) != 0: + return []byte{byte(v >> 56), byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>47) != 0: + return []byte{byte(v >> 48), byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>39) != 0: + return []byte{byte(v >> 40), byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>31) != 0: + return []byte{byte(v >> 32), byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>23) != 0: + return []byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>15) != 0: + return []byte{byte(v >> 16), byte(v >> 8), byte(v)} + case byte(v>>7) != 0: + return []byte{byte(v >> 8), byte(v)} + default: + return []byte{byte(v)} + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/marshal_utils.go b/vendor/github.com/gocql/gocql/serialization/varint/marshal_utils.go new file mode 100644 index 0000000..b21827f --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/marshal_utils.go @@ -0,0 +1,137 @@ +package varint + +import ( + "fmt" + "math" + "math/big" + "strconv" +) + +const ( + maxInt8 = 1<<7 - 1 + maxInt16 = 1<<15 - 1 + maxInt24 = 1<<23 - 1 + maxInt32 = 1<<31 - 1 + maxInt40 = 1<<39 - 1 + maxInt48 = 
1<<47 - 1 + maxInt56 = 1<<55 - 1 + maxInt64 = 1<<63 - 1 + + minInt8 = -1 << 7 + minInt16 = -1 << 15 + minInt24 = -1 << 23 + minInt32 = -1 << 31 + minInt40 = -1 << 39 + minInt48 = -1 << 47 + minInt56 = -1 << 55 +) + +func EncBigInt(v big.Int) ([]byte, error) { + return encBigInt(v), nil +} + +func EncBigIntR(v *big.Int) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncBigIntRS(v), nil +} + +func EncString(v string) ([]byte, error) { + switch { + case len(v) == 0: + return nil, nil + case len(v) <= 18: + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to marshal varint: can not marshal %#v, %s", v, err) + } + return EncInt64Ext(n), nil + case len(v) <= 20: + n, err := strconv.ParseInt(v, 10, 64) + if err == nil { + return EncInt64Ext(n), nil + } + + t, ok := new(big.Int).SetString(v, 10) + if !ok { + return nil, fmt.Errorf("failed to marshal varint: can not marshal %#v", v) + } + return EncBigIntRS(t), nil + default: + t, ok := new(big.Int).SetString(v, 10) + if !ok { + return nil, fmt.Errorf("failed to marshal varint: can not marshal %#v", v) + } + return EncBigIntRS(t), nil + } +} + +func EncStringR(v *string) ([]byte, error) { + if v == nil { + return nil, nil + } + return EncString(*v) +} + +func encBigInt(v big.Int) []byte { + switch v.Sign() { + case 1: + data := v.Bytes() + if data[0] > math.MaxInt8 { + data = append([]byte{0}, data...) + } + return data + case -1: + data := v.Bytes() + add := true + for i := len(data) - 1; i >= 0; i-- { + if !add { + data[i] = 255 - data[i] + } else { + data[i] = 255 - data[i] + 1 + if data[i] != 0 { + add = false + } + } + } + if data[0] < 128 { + data = append([]byte{255}, data...) + } + return data + default: + return []byte{0} + } +} + +// EncBigIntRS encodes a big.Int to []byte. +// This function is shared for use in marshaling `decimal`. +func EncBigIntRS(v *big.Int) []byte { + switch v.Sign() { + case 1: + data := v.Bytes() + if data[0] > math.MaxInt8 { + data = append([]byte{0}, data...) + } + return data + case -1: + data := v.Bytes() + add := true + for i := len(data) - 1; i >= 0; i-- { + if !add { + data[i] = 255 - data[i] + } else { + data[i] = 255 - data[i] + 1 + if data[i] != 0 { + add = false + } + } + } + if data[0] < 128 { + data = append([]byte{255}, data...)
+ } + return data + default: + return []byte{0} + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/unmarshal.go b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal.go new file mode 100644 index 0000000..799d3f6 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal.go @@ -0,0 +1,81 @@ +package varint + +import ( + "fmt" + "math/big" + "reflect" +) + +func Unmarshal(data []byte, value interface{}) error { + switch v := value.(type) { + case nil: + return nil + + case *int8: + return DecInt8(data, v) + case *int16: + return DecInt16(data, v) + case *int32: + return DecInt32(data, v) + case *int64: + return DecInt64(data, v) + case *int: + return DecInt(data, v) + + case *uint8: + return DecUint8(data, v) + case *uint16: + return DecUint16(data, v) + case *uint32: + return DecUint32(data, v) + case *uint64: + return DecUint64(data, v) + case *uint: + return DecUint(data, v) + + case *big.Int: + return DecBigInt(data, v) + case *string: + return DecString(data, v) + + case **int8: + return DecInt8R(data, v) + case **int16: + return DecInt16R(data, v) + case **int32: + return DecInt32R(data, v) + case **int64: + return DecInt64R(data, v) + case **int: + return DecIntR(data, v) + + case **uint8: + return DecUint8R(data, v) + case **uint16: + return DecUint16R(data, v) + case **uint32: + return DecUint32R(data, v) + case **uint64: + return DecUint64R(data, v) + case **uint: + return DecUintR(data, v) + + case **big.Int: + return DecBigIntR(data, v) + case **string: + return DecStringR(data, v) + default: + + // Custom types (type MyInt int) can be deserialized only via `reflect` package. + // Later, when generic-based serialization is introduced we can do that via generics. + rv := reflect.ValueOf(value) + rt := rv.Type() + if rt.Kind() != reflect.Ptr { + return fmt.Errorf("failed to unmarshal varint: unsupported value type (%T)(%#[1]v)", value) + } + if rt.Elem().Kind() != reflect.Ptr { + return DecReflect(data, rv) + } + return DecReflectR(data, rv) + } +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_custom.go b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_custom.go new file mode 100644 index 0000000..12da89a --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_custom.go @@ -0,0 +1,581 @@ +package varint + +import ( + "fmt" + "reflect" + "strconv" +) + +func DecReflect(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal varint: can not unmarshal into nil reference (%T)(%#[1]v)", v.Interface()) + } + + switch v = v.Elem(); v.Kind() { + case reflect.Int8: + return decReflectInt8(p, v) + case reflect.Int16: + return decReflectInt16(p, v) + case reflect.Int32: + return decReflectInt32(p, v) + case reflect.Int64, reflect.Int: + return decReflectInts(p, v) + case reflect.Uint8: + return decReflectUint8(p, v) + case reflect.Uint16: + return decReflectUint16(p, v) + case reflect.Uint32: + return decReflectUint32(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUints(p, v) + case reflect.String: + return decReflectString(p, v) + default: + return fmt.Errorf("failed to unmarshal varint: unsupported value type (%T)(%#[1]v)", v.Interface()) + } +} + +func decReflectInt8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + case 1: + v.SetInt(dec1toInt64(p)) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int8 range", v.Interface()) + } 
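+ // Note: no errBrokenData check on this path. A zero- or one-byte varint
+ // cannot carry a redundant leading 0x00/0xFF pad byte, so int8 payloads
+ // need no canonical-form validation (errBrokenData inspects p[0] and
+ // p[1], which only exist for lengths >= 2).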
+ return nil +} + +func decReflectInt16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + return nil + case 1: + v.SetInt(dec1toInt64(p)) + return nil + case 2: + v.SetInt(dec2toInt64(p)) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int16 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectInt32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + return nil + case 1: + v.SetInt(dec1toInt64(p)) + return nil + case 2: + v.SetInt(dec2toInt64(p)) + case 3: + v.SetInt(dec3toInt64(p)) + case 4: + v.SetInt(dec4toInt64(p)) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int32 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectInts(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetInt(0) + return nil + case 1: + v.SetInt(dec1toInt64(p)) + return nil + case 2: + v.SetInt(dec2toInt64(p)) + case 3: + v.SetInt(dec3toInt64(p)) + case 4: + v.SetInt(dec4toInt64(p)) + case 5: + v.SetInt(dec5toInt64(p)) + case 6: + v.SetInt(dec6toInt64(p)) + case 7: + v.SetInt(dec7toInt64(p)) + case 8: + v.SetInt(dec8toInt64(p)) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int64 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint8(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + return nil + case 1: + v.SetUint(dec1toUint64(p)) + return nil + case 2: + if p[0] == 0 { + v.SetUint(dec2toUint64(p)) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint8 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint8 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint16(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + return nil + case 1: + v.SetUint(dec1toUint64(p)) + return nil + case 2: + v.SetUint(dec2toUint64(p)) + case 3: + if p[0] == 0 { + v.SetUint(dec3toUint64(p)) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint16 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint16 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint32(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + return nil + case 1: + v.SetUint(dec1toUint64(p)) + return nil + case 2: + v.SetUint(dec2toUint64(p)) + case 3: + v.SetUint(dec3toUint64(p)) + case 4: + v.SetUint(dec4toUint64(p)) + case 5: + if p[0] == 0 { + v.SetUint(dec5toUint64(p)) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint32 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint32 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUints(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.SetUint(0) + return nil + case 1: + v.SetUint(dec1toUint64(p)) + return nil + case 2: + v.SetUint(dec2toUint64(p)) + case 3: + v.SetUint(dec3toUint64(p)) + case 4: + v.SetUint(dec4toUint64(p)) + case 5: +
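+ // Lengths 5 through 8 decode directly below; a ninth byte is only legal
+ // as a 0x00 pad. The wire format is signed two's complement, so (as an
+ // illustrative value, derived from the encoder's encUint64) EncUint64(1<<63)
+ // produces nine bytes starting with 0x00; the pad keeps a large uint64
+ // from reading back as a negative varint.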
v.SetUint(dec5toUint64(p)) + case 6: + v.SetUint(dec6toUint64(p)) + case 7: + v.SetUint(dec7toUint64(p)) + case 8: + v.SetUint(dec8toUint64(p)) + case 9: + if p[0] == 0 { + v.SetUint(dec9toUint64(p)) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint64 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint64 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectString(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + if p == nil { + v.SetString("") + } else { + v.SetString("0") + } + return nil + case 1: + v.SetString(strconv.FormatInt(dec1toInt64(p), 10)) + return nil + case 2: + v.SetString(strconv.FormatInt(dec2toInt64(p), 10)) + case 3: + v.SetString(strconv.FormatInt(dec3toInt64(p), 10)) + case 4: + v.SetString(strconv.FormatInt(dec4toInt64(p), 10)) + case 5: + v.SetString(strconv.FormatInt(dec5toInt64(p), 10)) + case 6: + v.SetString(strconv.FormatInt(dec6toInt64(p), 10)) + case 7: + v.SetString(strconv.FormatInt(dec7toInt64(p), 10)) + case 8: + v.SetString(strconv.FormatInt(dec8toInt64(p), 10)) + default: + v.SetString(Dec2BigInt(p).String()) + } + return errBrokenData(p) +} + +func DecReflectR(p []byte, v reflect.Value) error { + if v.IsNil() { + return fmt.Errorf("failed to unmarshal varint: can not unmarshal into nil reference (%T)(%[1]v)", v.Interface()) + } + + switch v.Type().Elem().Elem().Kind() { + case reflect.Int8: + return decReflectInt8R(p, v) + case reflect.Int16: + return decReflectInt16R(p, v) + case reflect.Int32: + return decReflectInt32R(p, v) + case reflect.Int64, reflect.Int: + return decReflectIntsR(p, v) + case reflect.Uint8: + return decReflectUint8R(p, v) + case reflect.Uint16: + return decReflectUint16R(p, v) + case reflect.Uint32: + return decReflectUint32R(p, v) + case reflect.Uint64, reflect.Uint: + return decReflectUintsR(p, v) + case reflect.String: + return decReflectStringR(p, v) + default: + return fmt.Errorf("failed to unmarshal varint: unsupported value type (%T)(%[1]v)", v.Interface()) + } +} + +func decReflectNullableR(p []byte, v reflect.Value) reflect.Value { + if p == nil { + return reflect.Zero(v.Elem().Type()) + } + return reflect.New(v.Type().Elem().Elem()) +} + +func decReflectInt8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec1toInt64(p)) + v.Elem().Set(newVal) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int8 range", v.Interface()) + } + return nil +} + +func decReflectInt16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec1toInt64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec2toInt64(p)) + v.Elem().Set(newVal) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int16 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectInt32R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) +
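+ // reflect.New allocates a fresh value of the pointed-to type so **T
+ // destinations receive a non-nil pointer; only a nil payload maps back
+ // to a nil pointer (via decReflectNullableR in the case 0 branch).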
newVal.Elem().SetInt(dec1toInt64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec2toInt64(p)) + v.Elem().Set(newVal) + case 3: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec3toInt64(p)) + v.Elem().Set(newVal) + case 4: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec4toInt64(p)) + v.Elem().Set(newVal) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int32 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectIntsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec1toInt64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec2toInt64(p)) + v.Elem().Set(newVal) + case 3: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec3toInt64(p)) + v.Elem().Set(newVal) + case 4: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec4toInt64(p)) + v.Elem().Set(newVal) + case 5: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec5toInt64(p)) + v.Elem().Set(newVal) + case 6: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec6toInt64(p)) + v.Elem().Set(newVal) + case 7: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec7toInt64(p)) + v.Elem().Set(newVal) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetInt(dec8toInt64(p)) + v.Elem().Set(newVal) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the int64 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint8R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec1toUint64(p)) + v.Elem().Set(newVal) + return nil + case 2: + if p[0] == 0 { + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec2toUint64(p)) + v.Elem().Set(newVal) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint8 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint8 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint16R(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec1toUint64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec2toUint64(p)) + v.Elem().Set(newVal) + case 3: + if p[0] == 0 { + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec3toUint64(p)) + v.Elem().Set(newVal) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint16 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint16 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUint32R(p []byte, v 
reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec1toUint64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec2toUint64(p)) + v.Elem().Set(newVal) + case 3: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec3toUint64(p)) + v.Elem().Set(newVal) + case 4: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec4toUint64(p)) + v.Elem().Set(newVal) + case 5: + if p[0] == 0 { + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec5toUint64(p)) + v.Elem().Set(newVal) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint32 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint32 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectUintsR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + v.Elem().Set(decReflectNullableR(p, v)) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec1toUint64(p)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec2toUint64(p)) + v.Elem().Set(newVal) + case 3: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec3toUint64(p)) + v.Elem().Set(newVal) + case 4: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec4toUint64(p)) + v.Elem().Set(newVal) + case 5: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec5toUint64(p)) + v.Elem().Set(newVal) + case 6: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec6toUint64(p)) + v.Elem().Set(newVal) + case 7: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec7toUint64(p)) + v.Elem().Set(newVal) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec8toUint64(p)) + v.Elem().Set(newVal) + case 9: + if p[0] == 0 { + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetUint(dec9toUint64(p)) + v.Elem().Set(newVal) + } else { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint64 range", v.Interface()) + } + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into %T, the data value should be in the uint64 range", v.Interface()) + } + return errBrokenData(p) +} + +func decReflectStringR(p []byte, v reflect.Value) error { + switch len(p) { + case 0: + var val reflect.Value + if p == nil { + val = reflect.Zero(v.Type().Elem()) + } else { + val = reflect.New(v.Type().Elem().Elem()) + val.Elem().SetString("0") + } + v.Elem().Set(val) + return nil + case 1: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec1toInt64(p), 10)) + v.Elem().Set(newVal) + return nil + case 2: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec2toInt64(p), 10)) + v.Elem().Set(newVal) + case 3: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec3toInt64(p), 10)) + v.Elem().Set(newVal) + case 4: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec4toInt64(p), 10)) + v.Elem().Set(newVal) + 
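+ // String destinations reuse the integer fast paths: payloads of up to
+ // eight bytes are formatted with strconv from an int64, while anything
+ // longer falls back to Dec2BigInt in the default branch below.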
case 5: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec5toInt64(p), 10)) + v.Elem().Set(newVal) + case 6: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec6toInt64(p), 10)) + v.Elem().Set(newVal) + case 7: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec7toInt64(p), 10)) + v.Elem().Set(newVal) + case 8: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(strconv.FormatInt(dec8toInt64(p), 10)) + v.Elem().Set(newVal) + default: + newVal := reflect.New(v.Type().Elem().Elem()) + newVal.Elem().SetString(Dec2BigInt(p).String()) + v.Elem().Set(newVal) + } + return errBrokenData(p) +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_ints.go b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_ints.go new file mode 100644 index 0000000..e4cff7d --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_ints.go @@ -0,0 +1,456 @@ +package varint + +import ( + "fmt" +) + +const ( + negInt16s8 = int16(-1) << 8 + + negInt32s8 = int32(-1) << 8 + negInt32s16 = int32(-1) << 16 + negInt32s24 = int32(-1) << 24 + + negInt64s8 = int64(-1) << 8 + negInt64s16 = int64(-1) << 16 + negInt64s24 = int64(-1) << 24 + negInt64s32 = int64(-1) << 32 + negInt64s40 = int64(-1) << 40 + negInt64s48 = int64(-1) << 48 + negInt64s56 = int64(-1) << 56 + + negIntS8 = int(-1) << 8 + negIntS16 = int(-1) << 16 + negIntS24 = int(-1) << 24 + negIntS32 = int(-1) << 32 + negIntS40 = int(-1) << 40 + negIntS48 = int(-1) << 48 + negIntS56 = int(-1) << 56 +) + +func DecInt8(p []byte, v *int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + case 1: + *v = dec1toInt8(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int8, the data value should be in the int8 range") + } + return nil +} + +func DecInt8R(p []byte, v **int8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int8) + } + case 1: + val := dec1toInt8(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int8, the data value should be in the int8 range") + } + return nil +} + +func DecInt16(p []byte, v *int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toInt16(p) + return nil + case 2: + *v = dec2toInt16(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int16, the data value should be in the int16 range") + } + return errBrokenData(p) +} + +func DecInt16R(p []byte, v **int16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int16) + } + return nil + case 1: + val := dec1toInt16(p) + *v = &val + return nil + case 2: + val := dec2toInt16(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int16, the data value should be in the int16 range") + } + return errBrokenData(p) +} + +func DecInt32(p []byte, v *int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toInt32(p) + return nil + case 2: + *v = dec2toInt32(p) + case 3: + *v = dec3toInt32(p) + case 4: + *v = dec4toInt32(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int32, the 
data value should be in the int32 range") + } + return errBrokenData(p) +} + +func DecInt32R(p []byte, v **int32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int32) + } + return nil + case 1: + val := dec1toInt32(p) + *v = &val + return nil + case 2: + val := dec2toInt32(p) + *v = &val + case 3: + val := dec3toInt32(p) + *v = &val + case 4: + val := dec4toInt32(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int32, the data value should be in the int32 range") + } + return errBrokenData(p) +} + +func DecInt64(p []byte, v *int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toInt64(p) + return nil + case 2: + *v = dec2toInt64(p) + case 3: + *v = dec3toInt64(p) + case 4: + *v = dec4toInt64(p) + case 5: + *v = dec5toInt64(p) + case 6: + *v = dec6toInt64(p) + case 7: + *v = dec7toInt64(p) + case 8: + *v = dec8toInt64(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int64, the data value should be in the int64 range") + } + return errBrokenData(p) +} + +func DecInt64R(p []byte, v **int64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int64) + } + return nil + case 1: + val := dec1toInt64(p) + *v = &val + return nil + case 2: + val := dec2toInt64(p) + *v = &val + case 3: + val := dec3toInt64(p) + *v = &val + case 4: + val := dec4toInt64(p) + *v = &val + case 5: + val := dec5toInt64(p) + *v = &val + case 6: + val := dec6toInt64(p) + *v = &val + case 7: + val := dec7toInt64(p) + *v = &val + case 8: + val := dec8toInt64(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int64, the data value should be in the int64 range") + } + return errBrokenData(p) +} + +func DecInt(p []byte, v *int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toInt(p) + return nil + case 2: + *v = dec2toInt(p) + case 3: + *v = dec3toInt(p) + case 4: + *v = dec4toInt(p) + case 5: + *v = dec5toInt(p) + case 6: + *v = dec6toInt(p) + case 7: + *v = dec7toInt(p) + case 8: + *v = dec8toInt(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int, the data value should be in the int range") + } + return errBrokenData(p) +} + +func DecIntR(p []byte, v **int) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(int) + } + return nil + case 1: + val := dec1toInt(p) + *v = &val + return nil + case 2: + val := dec2toInt(p) + *v = &val + case 3: + val := dec3toInt(p) + *v = &val + case 4: + val := dec4toInt(p) + *v = &val + case 5: + val := dec5toInt(p) + *v = &val + case 6: + val := dec6toInt(p) + *v = &val + case 7: + val := dec7toInt(p) + *v = &val + case 8: + val := dec8toInt(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into int, the data value should be in the int range") + } + return errBrokenData(p) +} + +func dec1toInt8(p []byte) int8 { + return int8(p[0]) +} + +func dec1toInt16(p []byte) int16 { + if p[0] > 127 { + return negInt16s8 | int16(p[0]) + } + return int16(p[0]) +} + +func dec1toInt32(p []byte) int32 { + if p[0] > 127 { + return negInt32s8 | int32(p[0]) + } + return int32(p[0]) +} + +func dec1toInt64(p []byte) int64 { + if 
p[0] > 127 { + return negInt64s8 | int64(p[0]) + } + return int64(p[0]) +} + +func dec1toInt(p []byte) int { + if p[0] > 127 { + return negIntS8 | int(p[0]) + } + return int(p[0]) +} + +func dec2toInt16(p []byte) int16 { + return int16(p[0])<<8 | int16(p[1]) +} + +func dec2toInt32(p []byte) int32 { + if p[0] > 127 { + return negInt32s16 | int32(p[0])<<8 | int32(p[1]) + } + return int32(p[0])<<8 | int32(p[1]) +} + +func dec2toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s16 | int64(p[0])<<8 | int64(p[1]) + } + return int64(p[0])<<8 | int64(p[1]) +} + +func dec2toInt(p []byte) int { + if p[0] > 127 { + return negIntS16 | int(p[0])<<8 | int(p[1]) + } + return int(p[0])<<8 | int(p[1]) +} + +func dec3toInt32(p []byte) int32 { + if p[0] > 127 { + return negInt32s24 | int32(p[0])<<16 | int32(p[1])<<8 | int32(p[2]) + } + return int32(p[0])<<16 | int32(p[1])<<8 | int32(p[2]) +} + +func dec3toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s24 | int64(p[0])<<16 | int64(p[1])<<8 | int64(p[2]) + } + return int64(p[0])<<16 | int64(p[1])<<8 | int64(p[2]) +} + +func dec3toInt(p []byte) int { + if p[0] > 127 { + return negIntS24 | int(p[0])<<16 | int(p[1])<<8 | int(p[2]) + } + return int(p[0])<<16 | int(p[1])<<8 | int(p[2]) +} + +func dec4toInt32(p []byte) int32 { + return int32(p[0])<<24 | int32(p[1])<<16 | int32(p[2])<<8 | int32(p[3]) +} + +func dec4toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s32 | int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3]) + } + return int64(p[0])<<24 | int64(p[1])<<16 | int64(p[2])<<8 | int64(p[3]) +} + +func dec4toInt(p []byte) int { + if p[0] > 127 { + return negIntS32 | int(p[0])<<24 | int(p[1])<<16 | int(p[2])<<8 | int(p[3]) + } + return int(p[0])<<24 | int(p[1])<<16 | int(p[2])<<8 | int(p[3]) +} + +func dec5toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s40 | int64(p[0])<<32 | int64(p[1])<<24 | int64(p[2])<<16 | int64(p[3])<<8 | int64(p[4]) + } + return int64(p[0])<<32 | int64(p[1])<<24 | int64(p[2])<<16 | int64(p[3])<<8 | int64(p[4]) +} + +func dec5toInt(p []byte) int { + if p[0] > 127 { + return negIntS40 | int(p[0])<<32 | int(p[1])<<24 | int(p[2])<<16 | int(p[3])<<8 | int(p[4]) + } + return int(p[0])<<32 | int(p[1])<<24 | int(p[2])<<16 | int(p[3])<<8 | int(p[4]) +} + +func dec6toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s48 | int64(p[0])<<40 | int64(p[1])<<32 | int64(p[2])<<24 | int64(p[3])<<16 | int64(p[4])<<8 | int64(p[5]) + } + return int64(p[0])<<40 | int64(p[1])<<32 | int64(p[2])<<24 | int64(p[3])<<16 | int64(p[4])<<8 | int64(p[5]) +} + +func dec6toInt(p []byte) int { + if p[0] > 127 { + return negIntS48 | int(p[0])<<40 | int(p[1])<<32 | int(p[2])<<24 | int(p[3])<<16 | int(p[4])<<8 | int(p[5]) + } + return int(p[0])<<40 | int(p[1])<<32 | int(p[2])<<24 | int(p[3])<<16 | int(p[4])<<8 | int(p[5]) +} + +func dec7toInt64(p []byte) int64 { + if p[0] > 127 { + return negInt64s56 | int64(p[0])<<48 | int64(p[1])<<40 | int64(p[2])<<32 | int64(p[3])<<24 | int64(p[4])<<16 | int64(p[5])<<8 | int64(p[6]) + } + return int64(p[0])<<48 | int64(p[1])<<40 | int64(p[2])<<32 | int64(p[3])<<24 | int64(p[4])<<16 | int64(p[5])<<8 | int64(p[6]) +} + +func dec7toInt(p []byte) int { + if p[0] > 127 { + return negIntS56 | int(p[0])<<48 | int(p[1])<<40 | int(p[2])<<32 | int(p[3])<<24 | int(p[4])<<16 | int(p[5])<<8 | int(p[6]) + } + return int(p[0])<<48 | int(p[1])<<40 | int(p[2])<<32 | int(p[3])<<24 | int(p[4])<<16 | int(p[5])<<8 | int(p[6]) +} + +func dec8toInt64(p []byte) int64 { + return int64(p[0])<<56 
| int64(p[1])<<48 | int64(p[2])<<40 | int64(p[3])<<32 | int64(p[4])<<24 | int64(p[5])<<16 | int64(p[6])<<8 | int64(p[7]) +} + +func dec8toInt(p []byte) int { + return int(p[0])<<56 | int(p[1])<<48 | int(p[2])<<40 | int(p[3])<<32 | int(p[4])<<24 | int(p[5])<<16 | int(p[6])<<8 | int(p[7]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_uints.go b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_uints.go new file mode 100644 index 0000000..99301e5 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_uints.go @@ -0,0 +1,457 @@ +package varint + +import ( + "fmt" +) + +func DecUint8(p []byte, v *uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toUint8(p) + return nil + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint8, the data value should be in the uint8 range") + } + *v = dec2toUint8(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint8, the data value should be in the uint8 range") + } + return errBrokenData(p) +} + +func DecUint8R(p []byte, v **uint8) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint8) + } + return nil + case 1: + val := dec1toUint8(p) + *v = &val + return nil + case 2: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint8, the data value should be in the uint8 range") + } + val := dec2toUint8(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint8, the data value should be in the uint8 range") + } + return errBrokenData(p) +} + +func DecUint16(p []byte, v *uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toUint16(p) + return nil + case 2: + *v = dec2toUint16(p) + case 3: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint16, the data value should be in the uint16 range") + } + *v = dec3toUint16(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint16, the data value should be in the uint16 range") + } + return errBrokenData(p) +} + +func DecUint16R(p []byte, v **uint16) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint16) + } + return nil + case 1: + val := dec1toUint16(p) + *v = &val + return nil + case 2: + val := dec2toUint16(p) + *v = &val + case 3: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint16, the data value should be in the uint16 range") + } + val := dec3toUint16(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint16, the data value should be in the uint16 range") + } + return errBrokenData(p) +} + +func DecUint32(p []byte, v *uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toUint32(p) + return nil + case 2: + *v = dec2toUint32(p) + case 3: + *v = dec3toUint32(p) + case 4: + *v = dec4toUint32(p) + case 5: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint32, the data value should be in the uint32 range") + } + *v = dec5toUint32(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint32, the 
data value should be in the uint32 range") + } + return errBrokenData(p) +} + +func DecUint32R(p []byte, v **uint32) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint32) + } + return nil + case 1: + val := dec1toUint32(p) + *v = &val + return nil + case 2: + val := dec2toUint32(p) + *v = &val + case 3: + val := dec3toUint32(p) + *v = &val + case 4: + val := dec4toUint32(p) + *v = &val + case 5: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint32, the data value should be in the uint32 range") + } + val := dec5toUint32(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint32, the data value should be in the uint32 range") + } + return errBrokenData(p) +} + +func DecUint64(p []byte, v *uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toUint64(p) + return nil + case 2: + *v = dec2toUint64(p) + case 3: + *v = dec3toUint64(p) + case 4: + *v = dec4toUint64(p) + case 5: + *v = dec5toUint64(p) + case 6: + *v = dec6toUint64(p) + case 7: + *v = dec7toUint64(p) + case 8: + *v = dec8toUint64(p) + case 9: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint64, the data value should be in the uint64 range") + } + *v = dec9toUint64(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint64, the data value should be in the uint64 range") + } + return errBrokenData(p) +} + +func DecUint64R(p []byte, v **uint64) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint64) + } + return nil + case 1: + val := dec1toUint64(p) + *v = &val + return nil + case 2: + val := dec2toUint64(p) + *v = &val + case 3: + val := dec3toUint64(p) + *v = &val + case 4: + val := dec4toUint64(p) + *v = &val + case 5: + val := dec5toUint64(p) + *v = &val + case 6: + val := dec6toUint64(p) + *v = &val + case 7: + val := dec7toUint64(p) + *v = &val + case 8: + val := dec8toUint64(p) + *v = &val + case 9: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint64, the data value should be in the uint64 range") + } + val := dec9toUint64(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint64, the data value should be in the uint64 range") + } + return errBrokenData(p) +} + +func DecUint(p []byte, v *uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + *v = 0 + return nil + case 1: + *v = dec1toUint(p) + return nil + case 2: + *v = dec2toUint(p) + case 3: + *v = dec3toUint(p) + case 4: + *v = dec4toUint(p) + case 5: + *v = dec5toUint(p) + case 6: + *v = dec6toUint(p) + case 7: + *v = dec7toUint(p) + case 8: + *v = dec8toUint(p) + case 9: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint, the data value should be in the uint range") + } + *v = dec9toUint(p) + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint, the data value should be in the uint range") + } + return errBrokenData(p) +} + +func DecUintR(p []byte, v **uint) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + *v = new(uint) + } + return nil + case 1: + val := dec1toUint(p) + *v = &val + return nil + case 2: + val := 
dec2toUint(p) + *v = &val + case 3: + val := dec3toUint(p) + *v = &val + case 4: + val := dec4toUint(p) + *v = &val + case 5: + val := dec5toUint(p) + *v = &val + case 6: + val := dec6toUint(p) + *v = &val + case 7: + val := dec7toUint(p) + *v = &val + case 8: + val := dec8toUint(p) + *v = &val + case 9: + if p[0] != 0 { + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint, the data value should be in the uint range") + } + val := dec9toUint(p) + *v = &val + default: + return fmt.Errorf("failed to unmarshal varint: to unmarshal into uint, the data value should be in the uint range") + } + return errBrokenData(p) +} + +func dec1toUint8(p []byte) uint8 { + return p[0] +} + +func dec1toUint16(p []byte) uint16 { + return uint16(p[0]) +} + +func dec1toUint32(p []byte) uint32 { + return uint32(p[0]) +} + +func dec1toUint64(p []byte) uint64 { + return uint64(p[0]) +} + +func dec1toUint(p []byte) uint { + return uint(p[0]) +} + +func dec2toUint8(p []byte) uint8 { + return p[1] +} + +func dec2toUint16(p []byte) uint16 { + return uint16(p[0])<<8 | uint16(p[1]) +} + +func dec2toUint32(p []byte) uint32 { + return uint32(p[0])<<8 | uint32(p[1]) +} + +func dec2toUint64(p []byte) uint64 { + return uint64(p[0])<<8 | uint64(p[1]) +} + +func dec2toUint(p []byte) uint { + return uint(p[0])<<8 | uint(p[1]) +} + +func dec3toUint16(p []byte) uint16 { + return uint16(p[1])<<8 | uint16(p[2]) +} + +func dec3toUint32(p []byte) uint32 { + return uint32(p[0])<<16 | uint32(p[1])<<8 | uint32(p[2]) +} + +func dec3toUint64(p []byte) uint64 { + return uint64(p[0])<<16 | uint64(p[1])<<8 | uint64(p[2]) +} + +func dec3toUint(p []byte) uint { + return uint(p[0])<<16 | uint(p[1])<<8 | uint(p[2]) +} + +func dec4toUint32(p []byte) uint32 { + return uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3]) +} + +func dec4toUint64(p []byte) uint64 { + return uint64(p[0])<<24 | uint64(p[1])<<16 | uint64(p[2])<<8 | uint64(p[3]) +} + +func dec4toUint(p []byte) uint { + return uint(p[0])<<24 | uint(p[1])<<16 | uint(p[2])<<8 | uint(p[3]) +} + +func dec5toUint32(p []byte) uint32 { + return uint32(p[1])<<24 | uint32(p[2])<<16 | uint32(p[3])<<8 | uint32(p[4]) +} + +func dec5toUint64(p []byte) uint64 { + return uint64(p[0])<<32 | uint64(p[1])<<24 | uint64(p[2])<<16 | uint64(p[3])<<8 | uint64(p[4]) +} + +func dec5toUint(p []byte) uint { + return uint(p[0])<<32 | uint(p[1])<<24 | uint(p[2])<<16 | uint(p[3])<<8 | uint(p[4]) +} + +func dec6toUint64(p []byte) uint64 { + return uint64(p[0])<<40 | uint64(p[1])<<32 | uint64(p[2])<<24 | uint64(p[3])<<16 | uint64(p[4])<<8 | uint64(p[5]) +} + +func dec6toUint(p []byte) uint { + return uint(p[0])<<40 | uint(p[1])<<32 | uint(p[2])<<24 | uint(p[3])<<16 | uint(p[4])<<8 | uint(p[5]) +} + +func dec7toUint64(p []byte) uint64 { + return uint64(p[0])<<48 | uint64(p[1])<<40 | uint64(p[2])<<32 | uint64(p[3])<<24 | uint64(p[4])<<16 | uint64(p[5])<<8 | uint64(p[6]) +} + +func dec7toUint(p []byte) uint { + return uint(p[0])<<48 | uint(p[1])<<40 | uint(p[2])<<32 | uint(p[3])<<24 | uint(p[4])<<16 | uint(p[5])<<8 | uint(p[6]) +} + +func dec8toUint64(p []byte) uint64 { + return uint64(p[0])<<56 | uint64(p[1])<<48 | uint64(p[2])<<40 | uint64(p[3])<<32 | uint64(p[4])<<24 | uint64(p[5])<<16 | uint64(p[6])<<8 | uint64(p[7]) +} + +func dec8toUint(p []byte) uint { + return uint(p[0])<<56 | uint(p[1])<<48 | uint(p[2])<<40 | uint(p[3])<<32 | uint(p[4])<<24 | uint(p[5])<<16 | uint(p[6])<<8 | uint(p[7]) +} + +func dec9toUint64(p []byte) uint64 { + return uint64(p[1])<<56 | uint64(p[2])<<48 | 
uint64(p[3])<<40 | uint64(p[4])<<32 | uint64(p[5])<<24 | uint64(p[6])<<16 | uint64(p[7])<<8 | uint64(p[8]) +} + +func dec9toUint(p []byte) uint { + return uint(p[1])<<56 | uint(p[2])<<48 | uint(p[3])<<40 | uint(p[4])<<32 | uint(p[5])<<24 | uint(p[6])<<16 | uint(p[7])<<8 | uint(p[8]) +} diff --git a/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_utils.go b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_utils.go new file mode 100644 index 0000000..bda0af4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/serialization/varint/unmarshal_utils.go @@ -0,0 +1,185 @@ +package varint + +import ( + "fmt" + "math/big" + "strconv" +) + +func errBrokenData(p []byte) error { + if p[0] == 0 && p[1] <= 127 || p[0] == 255 && p[1] > 127 { + return fmt.Errorf("failed to unmarshal varint: the data is broken") + } + return nil +} + +func errNilReference(v interface{}) error { + return fmt.Errorf("failed to unmarshal varint: can not unmarshal into nil reference (%#v)", v) +} + +func DecString(p []byte, v *string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = "" + } else { + *v = "0" + } + return nil + case 1: + *v = strconv.FormatInt(dec1toInt64(p), 10) + return nil + case 2: + *v = strconv.FormatInt(dec2toInt64(p), 10) + case 3: + *v = strconv.FormatInt(dec3toInt64(p), 10) + case 4: + *v = strconv.FormatInt(dec4toInt64(p), 10) + case 5: + *v = strconv.FormatInt(dec5toInt64(p), 10) + case 6: + *v = strconv.FormatInt(dec6toInt64(p), 10) + case 7: + *v = strconv.FormatInt(dec7toInt64(p), 10) + case 8: + *v = strconv.FormatInt(dec8toInt64(p), 10) + default: + *v = Dec2BigInt(p).String() + } + return errBrokenData(p) +} + +func DecStringR(p []byte, v **string) error { + if v == nil { + return errNilReference(v) + } + switch len(p) { + case 0: + if p == nil { + *v = nil + } else { + val := "0" + *v = &val + } + return nil + case 1: + val := strconv.FormatInt(dec1toInt64(p), 10) + *v = &val + return nil + case 2: + val := strconv.FormatInt(dec2toInt64(p), 10) + *v = &val + case 3: + val := strconv.FormatInt(dec3toInt64(p), 10) + *v = &val + case 4: + val := strconv.FormatInt(dec4toInt64(p), 10) + *v = &val + case 5: + val := strconv.FormatInt(dec5toInt64(p), 10) + *v = &val + case 6: + val := strconv.FormatInt(dec6toInt64(p), 10) + *v = &val + case 7: + val := strconv.FormatInt(dec7toInt64(p), 10) + *v = &val + case 8: + val := strconv.FormatInt(dec8toInt64(p), 10) + *v = &val + default: + val := Dec2BigInt(p).String() + *v = &val + } + return errBrokenData(p) +} + +func DecBigInt(p []byte, v *big.Int) error { + switch len(p) { + case 0: + v.SetInt64(0) + return nil + case 1: + v.SetInt64(dec1toInt64(p)) + return nil + case 2: + v.SetInt64(dec2toInt64(p)) + case 3: + v.SetInt64(dec3toInt64(p)) + case 4: + v.SetInt64(dec4toInt64(p)) + case 5: + v.SetInt64(dec5toInt64(p)) + case 6: + v.SetInt64(dec6toInt64(p)) + case 7: + v.SetInt64(dec7toInt64(p)) + case 8: + v.SetInt64(dec8toInt64(p)) + default: + dec2ToBigInt(p, v) + } + return errBrokenData(p) +} + +func DecBigIntR(p []byte, v **big.Int) error { + if p != nil { + *v = big.NewInt(0) + return DecBigInt(p, *v) + } + *v = nil + return nil +} + +// Dec2BigInt decodes p to a big.Int. Use for cases with len(p)>=2. +// This function is shared for use when unmarshaling `decimal`.
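+ // For illustration (values derived from the function body below):
+ //
+ //	Dec2BigInt([]byte{0x01, 0x00}) // 256: leading byte <= 127, positive path
+ //	Dec2BigInt([]byte{0xFF, 0x7F}) // -129: leading byte > 127, negative path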
+func Dec2BigInt(p []byte) *big.Int { + // Positive range processing + if p[0] <= 127 { + return new(big.Int).SetBytes(p) + } + // negative range processing + data := make([]byte, len(p)) + copy(data, p) + + add := true + for i := len(data) - 1; i >= 0; i-- { + if !add { + data[i] = 255 - data[i] + } else { + data[i] = 255 - data[i] + 1 + if data[i] != 0 { + add = false + } + } + } + + return new(big.Int).Neg(new(big.Int).SetBytes(data)) +} + +func dec2ToBigInt(p []byte, v *big.Int) { + if p[0] <= 127 { + // Positive range processing + v.SetBytes(p) + } else { + // negative range processing + data := make([]byte, len(p)) + copy(data, p) + + add := true + for i := len(data) - 1; i >= 0; i-- { + if !add { + data[i] = 255 - data[i] + } else { + data[i] = 255 - data[i] + 1 + if data[i] != 0 { + add = false + } + } + } + v.Set(new(big.Int).Neg(new(big.Int).SetBytes(data))) + } +} diff --git a/vendor/github.com/gocql/gocql/session.go b/vendor/github.com/gocql/gocql/session.go new file mode 100644 index 0000000..59e2533 --- /dev/null +++ b/vendor/github.com/gocql/gocql/session.go @@ -0,0 +1,2447 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +import ( + "bytes" + "context" + "encoding/binary" + "errors" + "fmt" + "net" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unicode" + + "github.com/gocql/gocql/debounce" + "github.com/gocql/gocql/internal/lru" +) + +// Session is the interface used by users to interact with the database. +// +// It's safe for concurrent use by multiple goroutines and a typical usage +// scenario is to have one global session object to interact with the +// whole Cassandra cluster. +// +// This type extends the Node interface by adding a convenient query builder +// and automatically sets a default consistency level on all operations +// that do not have a consistency level set. +type Session struct { + cons Consistency + pageSize int + prefetch float64 + routingKeyInfoCache routingKeyInfoLRU + metadataDescriber *metadataDescriber + trace Tracer + queryObserver QueryObserver + batchObserver BatchObserver + connectObserver ConnectObserver + frameObserver FrameHeaderObserver + streamObserver StreamObserver + hostSource *ringDescriber + ringRefresher *debounce.RefreshDebouncer + stmtsLRU *preparedLRU + + connCfg *ConnConfig + + executor *queryExecutor + pool *policyConnPool + policy HostSelectionPolicy + + mu sync.RWMutex + + control controlConnection + + // event handlers + nodeEvents *eventDebouncer + schemaEvents *eventDebouncer + + // ring metadata + useSystemSchema bool + hasAggregatesAndFunctions bool + + cfg ClusterConfig + + ctx context.Context + cancel context.CancelFunc + + // sessionStateMu protects isClosed, isClosing, isInitialized and initErr. + sessionStateMu sync.RWMutex + // isClosed is true once Session.Close is finished. + isClosed bool + // isClosing is true once Session.Close is started. + isClosing bool + // isInitialized is true once Session.init succeeds. + // You can use initialized() to read the value.
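+ // Lifecycle sketch: newSessionCommon builds the struct, init() sets
+ // isInitialized on success, and NewSessionNonBlocking records initErr on
+ // failure; readyCh is closed once initialization has run (on the failure
+ // path of the blocking NewSession the session is discarded instead), and
+ // Ready() and WaitUntilReady() read this state.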
+ isInitialized bool + initErr error + readyCh chan struct{} + + logger StdLogger + + tabletsRoutingV1 bool + + usingTimeoutClause string + warningHandler WarningHandler +} + +var queryPool = &sync.Pool{ + New: func() interface{} { + return &Query{routingInfo: &queryRoutingInfo{}, refCount: 1} + }, +} + +func addrsToHosts(addrs []string, defaultPort int, logger StdLogger) ([]*HostInfo, error) { + var hosts []*HostInfo + for _, hostaddr := range addrs { + resolvedHosts, err := hostInfo(hostaddr, defaultPort) + if err != nil { + // Try other hosts if unable to resolve DNS name + if _, ok := err.(*net.DNSError); ok { + logger.Printf("gocql: dns error: %v\n", err) + continue + } + return nil, err + } + + hosts = append(hosts, resolvedHosts...) + } + if len(hosts) == 0 { + return nil, errors.New("failed to resolve any of the provided hostnames") + } + return hosts, nil +} + +func newSessionCommon(cfg ClusterConfig) (*Session, error) { + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("gocql: unable to create session: cluster config validation failed: %v", err) + } + // TODO: we should take a context in here at some point + ctx, cancel := context.WithCancel(context.TODO()) + + s := &Session{ + cons: cfg.Consistency, + prefetch: 0.25, + cfg: cfg, + pageSize: cfg.PageSize, + stmtsLRU: &preparedLRU{lru: lru.New(cfg.MaxPreparedStmts)}, + connectObserver: cfg.ConnectObserver, + ctx: ctx, + cancel: cancel, + logger: cfg.logger(), + readyCh: make(chan struct{}, 1), + } + + // Close created resources on error otherwise they'll leak + var err error + defer func() { + if err != nil { + s.Close() + } + }() + + s.metadataDescriber = newMetadataDescriber(s) + + s.nodeEvents = newEventDebouncer("NodeEvents", s.handleNodeEvent, s.logger) + s.schemaEvents = newEventDebouncer("SchemaEvents", s.handleSchemaEvent, s.logger) + + s.routingKeyInfoCache.lru = lru.New(cfg.MaxRoutingKeyInfo) + + s.hostSource = &ringDescriber{cfg: &s.cfg, logger: s.logger} + s.ringRefresher = debounce.NewRefreshDebouncer(debounce.RingRefreshDebounceTime, func() error { + return s.refreshRing() + }) + + if cfg.PoolConfig.HostSelectionPolicy == nil { + cfg.PoolConfig.HostSelectionPolicy = RoundRobinHostPolicy() + } + s.pool = cfg.PoolConfig.buildPool(s) + + s.policy = cfg.PoolConfig.HostSelectionPolicy + s.policy.Init(s) + + s.executor = &queryExecutor{ + pool: s.pool, + policy: cfg.PoolConfig.HostSelectionPolicy, + } + + s.queryObserver = cfg.QueryObserver + s.batchObserver = cfg.BatchObserver + s.connectObserver = cfg.ConnectObserver + s.frameObserver = cfg.FrameHeaderObserver + s.streamObserver = cfg.StreamObserver + + //Check the TLS Config before trying to connect to anything external + connCfg, err := connConfig(&s.cfg) + if err != nil { + //TODO: Return a typed error + return nil, fmt.Errorf("gocql: unable to create session: %v", err) + } + s.connCfg = connCfg + if cfg.WarningsHandlerBuilder != nil { + s.warningHandler = cfg.WarningsHandlerBuilder(s) + } + return s, nil +} + +// NewSession wraps an existing Node. 
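+ // Typical construction goes through a ClusterConfig (illustrative
+ // sketch; error handling elided):
+ //
+ //	cluster := NewCluster("127.0.0.1")
+ //	session, err := NewSession(*cluster)
+ //	defer session.Close()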
+func NewSession(cfg ClusterConfig) (*Session, error) { + s, err := newSessionCommon(cfg) + if err != nil { + return nil, err + } + + if err = s.init(); err != nil { + if err == ErrNoConnectionsStarted { + // This error used to be generated inside NewSession & returned directly + // Forward it on up to be backwards compatible + return nil, ErrNoConnectionsStarted + } else { + // TODO(zariel): don't wrap this error in fmt.Errorf, return a typed error + return nil, fmt.Errorf("gocql: unable to create session: %v", err) + } + } + + s.readyCh <- struct{}{} + close(s.readyCh) + + return s, nil +} + +func NewSessionNonBlocking(cfg ClusterConfig) (*Session, error) { + s, err := newSessionCommon(cfg) + if err != nil { + return nil, err + } + + go func() { + if initErr := s.init(); initErr != nil { + s.sessionStateMu.Lock() + s.initErr = fmt.Errorf("gocql: unable to create session: %v", initErr) + s.sessionStateMu.Unlock() + } + + s.readyCh <- struct{}{} + close(s.readyCh) + }() + + return s, nil +} + +func (s *Session) init() error { + if s.cfg.disableInit { + return nil + } + + hosts, err := addrsToHosts(s.cfg.Hosts, s.cfg.Port, s.logger) + if err != nil { + return err + } + + if !s.cfg.disableControlConn { + s.control = createControlConn(s) + reconnectionPolicy := s.cfg.InitialReconnectionPolicy + var lastErr error + for i := 0; i < reconnectionPolicy.GetMaxRetries(); i++ { + lastErr = nil + if i != 0 { + time.Sleep(reconnectionPolicy.GetInterval(i)) + } + + if s.cfg.ProtoVersion == 0 { + proto, err := s.control.discoverProtocol(hosts) + if err != nil { + err = fmt.Errorf("unable to discover protocol version: %v", err) + if gocqlDebug { + s.logger.Println(err.Error()) + } + lastErr = err + continue + } else if proto == 0 { + return errors.New("unable to discover protocol version") + } + + // TODO(zariel): we really only need this in 1 place + s.cfg.ProtoVersion = proto + s.connCfg.ProtoVersion = proto + } + + if err := s.control.connect(hosts); err != nil { + err = fmt.Errorf("unable to create control connection: %v", err) + if gocqlDebug { + s.logger.Println(err.Error()) + } + lastErr = err + continue + } + // connected successfully; stop retrying + break + } + if lastErr != nil { + return fmt.Errorf("unable to connect to the cluster, last error: %v", lastErr.Error()) + } + + conn := s.control.getConn().conn.(*Conn) + conn.mu.Lock() + s.tabletsRoutingV1 = conn.isTabletSupported() + if s.cfg.MetadataSchemaRequestTimeout > time.Duration(0) && conn.isScyllaConn() { + s.usingTimeoutClause = " USING TIMEOUT " + strconv.FormatInt(int64(s.cfg.MetadataSchemaRequestTimeout.Milliseconds()), 10) + "ms" + } + conn.mu.Unlock() + + s.hostSource.setControlConn(s.control) + + if !s.cfg.DisableInitialHostLookup { + var partitioner string + newHosts, partitioner, err := s.hostSource.GetHostsFromSystem() + if err != nil { + return err + } + s.policy.SetPartitioner(partitioner) + filteredHosts := make([]*HostInfo, 0, len(newHosts)) + for _, host := range newHosts { + if !s.cfg.filterHost(host) { + filteredHosts = append(filteredHosts, host) + } + } + + hosts = filteredHosts + + if s.tabletsRoutingV1 { + tablets := TabletInfoList{} + s.metadataDescriber.setTablets(tablets) + } + } + } + + for _, host := range hosts { + // In cases where host lookup is disabled, or when we are in unit tests, + // hosts are not discovered, and we are missing host ID information used + // by internal logic. + // Associate random UUIDs here with all hosts missing this information.
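+ // The random (version 4) UUID is only a placeholder identity: it keeps
+ // host-keyed structures, such as hostMap below, functional when the
+ // system tables were never queried for real host IDs.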
+ if len(host.HostID()) == 0 { + host.SetHostID(MustRandomUUID().String()) + } + } + + hostMap := make(map[string]*HostInfo, len(hosts)) + for _, host := range hosts { + hostMap[host.HostID()] = host + } + + hosts = hosts[:0] + // each host will increment left and decrement it after connecting and once + // there's none left, we'll close connectedCh + var left int64 + // we will receive up to len(hostMap) messages so create a buffer so we + // don't end up stuck in a goroutine if we stopped listening + connectedCh := make(chan struct{}, len(hostMap)) + // we add one here because we don't want to end up closing connectedCh until we're + // done looping and the decrement code might be reached before we've looped + // again + atomic.AddInt64(&left, 1) + for _, host := range hostMap { + host := s.hostSource.addOrUpdate(host) + if s.cfg.filterHost(host) { + continue + } + + atomic.AddInt64(&left, 1) + go func() { + s.pool.addHost(host) + connectedCh <- struct{}{} + + // if there are no hosts left, then close connectedCh to unblock the loop + // below if it's still waiting + if atomic.AddInt64(&left, -1) == 0 { + close(connectedCh) + } + }() + + hosts = append(hosts, host) + } + // once we're done looping we subtract the one we initially added and check + // to see if we should close + if atomic.AddInt64(&left, -1) == 0 { + close(connectedCh) + } + + // before waiting for them to connect, add them all to the policy so we can + // utilize efficiencies by calling AddHosts if the policy supports it + type bulkAddHosts interface { + AddHosts([]*HostInfo) + } + if v, ok := s.policy.(bulkAddHosts); ok { + v.AddHosts(hosts) + } else { + for _, host := range hosts { + s.policy.AddHost(host) + } + } + + readyPolicy, _ := s.policy.(ReadyPolicy) + // now loop over connectedCh until it's closed (meaning we've connected to all) + // or until the policy says we're ready + for range connectedCh { + if readyPolicy != nil && readyPolicy.Ready() { + break + } + } + + // TODO(zariel): we probably don't need this any more as we verify that we + // can connect to one of the endpoints supplied by using the control conn. + // See if there are any connections in the pool + if s.cfg.ReconnectInterval > 0 { + go s.reconnectDownedHosts(s.cfg.ReconnectInterval) + } + + // If we disable the initial host lookup, we still need to check if the + // cluster is using the newer system schema or not... however, if the control + // connection is disabled, we really have no choice, so we just make our + // best guess... + if !s.cfg.disableControlConn && s.cfg.DisableInitialHostLookup { + newer, _ := checkSystemSchema(s.control) + s.useSystemSchema = newer + } else { + version := s.hostSource.getHostsList()[0].Version() + s.useSystemSchema = version.AtLeast(3, 0, 0) + s.hasAggregatesAndFunctions = version.AtLeast(2, 2, 0) + } + + if s.pool.Size() == 0 { + return ErrNoConnectionsStarted + } + + // Invoke KeyspaceChanged to let the policy cache the session keyspace + // parameters. This is used by tokenAwareHostPolicy to discover replicas.
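+ // Priming the policy up front lets replica-aware routing work from the
+ // first query rather than waiting for a later keyspace-change event.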
+ if !s.cfg.disableControlConn && s.cfg.Keyspace != "" { + s.policy.KeyspaceChanged(KeyspaceUpdateEvent{Keyspace: s.cfg.Keyspace}) + } + + if err = s.policy.IsOperational(s); err != nil { + return fmt.Errorf("gocql: unable to create session: %v", err) + } + + s.sessionStateMu.Lock() + s.isInitialized = true + s.sessionStateMu.Unlock() + + return nil +} + +// AwaitSchemaAgreement will wait until schema versions across all nodes in the +// cluster are the same (as seen from the point of view of the control connection). +// The maximum amount of time this takes is governed +// by the MaxWaitSchemaAgreement setting in the configuration (default: 60s). +// AwaitSchemaAgreement returns an error in case schema versions are not the same +// after the timeout specified in MaxWaitSchemaAgreement elapses. +func (s *Session) AwaitSchemaAgreement(ctx context.Context) error { + if s.cfg.disableControlConn { + return errNoControl + } + if err := s.Ready(); err != nil { + return err + } + ch := s.control.getConn() + return (&Iter{err: ch.conn.awaitSchemaAgreement(ctx)}).err +} + +func (s *Session) reconnectDownedHosts(intv time.Duration) { + reconnectTicker := time.NewTicker(intv) + defer reconnectTicker.Stop() + + for { + select { + case <-reconnectTicker.C: + hosts := s.hostSource.getHostsList() + + // Print session.hostSource for debug. + if gocqlDebug { + buf := bytes.NewBufferString("Session.hostSource:") + for _, h := range hosts { + buf.WriteString("[" + h.ConnectAddress().String() + ":" + h.State().String() + "]") + } + s.logger.Println(buf.String()) + } + + for _, h := range hosts { + if h.IsUp() { + continue + } + // we let the pool call handleNodeConnected to change the host state + s.pool.addHost(h) + } + case <-s.ctx.Done(): + return + } + } +} + +// SetConsistency sets the default consistency level for this session. This +// setting can also be changed on a per-query basis and the default value +// is Quorum. +func (s *Session) SetConsistency(cons Consistency) { + s.mu.Lock() + s.cons = cons + s.mu.Unlock() +} + +// SetPageSize sets the default page size for this session. A value <= 0 will +// disable paging. This setting can also be changed on a per-query basis. +func (s *Session) SetPageSize(n int) { + s.mu.Lock() + s.pageSize = n + s.mu.Unlock() +} + +// SetPrefetch sets the default threshold for pre-fetching new pages. If +// there are only p*pageSize rows remaining, the next page will be requested +// automatically. This value can also be changed on a per-query basis and +// the default value is 0.25. +func (s *Session) SetPrefetch(p float64) { + s.mu.Lock() + s.prefetch = p + s.mu.Unlock() +} + +// SetTrace sets the default tracer for this session. This setting can also +// be changed on a per-query basis. +func (s *Session) SetTrace(trace Tracer) { + s.mu.Lock() + s.trace = trace + s.mu.Unlock() +} + +// Query generates a new query object for interacting with the database. +// Further details of the query may be tweaked using the resulting query +// value before the query is executed. Query is automatically prepared +// if it has not previously been executed. +func (s *Session) Query(stmt string, values ...interface{}) *Query { + qry := queryPool.Get().(*Query) + qry.session = s + qry.stmt = stmt + qry.values = values + qry.defaultsFromSession() + qry.routingInfo.lwt = false + return qry +} + +type QueryInfo struct { + Id []byte + Args []ColumnInfo + Rval []ColumnInfo + PKeyColumns []int +} + +// Bind generates a new query object based on the query statement passed in. 
+// The query is automatically prepared if it has not previously been executed. +// The binding callback allows the application to define which query argument +// values will be marshalled as part of the query execution. +// During execution, the meta data of the prepared query will be routed to the +// binding callback, which is responsible for producing the query argument values. +func (s *Session) Bind(stmt string, b func(q *QueryInfo) ([]interface{}, error)) *Query { + qry := queryPool.Get().(*Query) + qry.session = s + qry.stmt = stmt + qry.binding = b + qry.defaultsFromSession() + qry.routingInfo.lwt = false + return qry +} + +// Close closes all connections. The session is unusable after this +// operation. +func (s *Session) Close() { + + s.sessionStateMu.Lock() + if s.isClosing { + s.sessionStateMu.Unlock() + return + } + s.isClosing = true + s.sessionStateMu.Unlock() + + if s.pool != nil { + s.pool.Close() + } + + if s.control != nil { + s.control.close() + } + + if s.nodeEvents != nil { + s.nodeEvents.stop() + } + + if s.schemaEvents != nil { + s.schemaEvents.stop() + } + + if s.ringRefresher != nil { + s.ringRefresher.Stop() + } + + if s.cancel != nil { + s.cancel() + } + + if s.policy != nil { + s.policy.Reset() + } + + s.sessionStateMu.Lock() + s.isClosed = true + s.sessionStateMu.Unlock() +} + +func (s *Session) Closed() bool { + s.sessionStateMu.RLock() + closed := s.isClosed + s.sessionStateMu.RUnlock() + return closed +} + +func (s *Session) initialized() bool { + s.sessionStateMu.RLock() + initialized := s.isInitialized + s.sessionStateMu.RUnlock() + return initialized +} + +func (s *Session) Ready() error { + s.sessionStateMu.RLock() + err := ErrSessionNotReady + if s.isInitialized || s.initErr != nil { + err = s.initErr + } + s.sessionStateMu.RUnlock() + return err +} + +func (s *Session) WaitUntilReady() error { + <-s.readyCh + return s.initErr +} + +func (s *Session) executeQuery(qry *Query) (it *Iter) { + // fail fast + if s.Closed() { + return &Iter{err: ErrSessionClosed} + } + if err := s.Ready(); err != nil { + return &Iter{err: err} + } + + iter, err := s.executor.executeQuery(qry) + if err != nil { + return &Iter{err: err} + } + if iter == nil { + panic("nil iter") + } + + return iter +} + +func (s *Session) removeHost(h *HostInfo) { + s.policy.RemoveHost(h) + hostID := h.HostID() + s.pool.removeHost(hostID) + s.hostSource.removeHost(hostID) +} + +// KeyspaceMetadata returns the schema metadata for the keyspace specified. Returns an error if the keyspace does not exist. 
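+//
+// A minimal usage sketch (the "chat" keyspace name is illustrative):
+//
+//	md, err := session.KeyspaceMetadata("chat")
+//	if err != nil {
+//		// handle error
+//	}
+//	for name := range md.Tables {
+//		fmt.Println("table:", name)
+//	}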
+func (s *Session) KeyspaceMetadata(keyspace string) (*KeyspaceMetadata, error) { + // fail fast + if s.Closed() { + return nil, ErrSessionClosed + } else if err := s.Ready(); err != nil { + return nil, err + } else if keyspace == "" { + return nil, ErrNoKeyspace + } + + return s.metadataDescriber.getSchema(keyspace) +} + +// TabletsMetadata returns the metadata about tablets +func (s *Session) TabletsMetadata() (TabletInfoList, error) { + // fail fast + if s.Closed() { + return nil, ErrSessionClosed + } else if err := s.Ready(); err != nil { + return nil, err + } else if !s.tabletsRoutingV1 { + return nil, ErrTabletsNotUsed + } + + return s.metadataDescriber.getTablets(), nil +} + +func (s *Session) getConn() *Conn { + hosts := s.hostSource.getHostsList() + for _, host := range hosts { + if !host.IsUp() { + continue + } + + pool, ok := s.pool.getPool(host) + if !ok { + continue + } else if conn := pool.Pick(nil, nil); conn != nil { + return conn + } + } + + return nil +} + +func (s *Session) getTablets() TabletInfoList { + return s.metadataDescriber.getTablets() +} + +// returns routing key indexes and type info +func (s *Session) routingKeyInfo(ctx context.Context, stmt string) (*routingKeyInfo, error) { + s.routingKeyInfoCache.mu.Lock() + + entry, cached := s.routingKeyInfoCache.lru.Get(stmt) + if cached { + // done accessing the cache + s.routingKeyInfoCache.mu.Unlock() + // the entry is an inflight struct similar to that used by + // Conn to prepare statements + inflight := entry.(*inflightCachedEntry) + + // wait for any inflight work + inflight.wg.Wait() + + if inflight.err != nil { + return nil, inflight.err + } + + key, _ := inflight.value.(*routingKeyInfo) + + return key, nil + } + + // create a new inflight entry while the data is created + inflight := new(inflightCachedEntry) + inflight.wg.Add(1) + defer inflight.wg.Done() + s.routingKeyInfoCache.lru.Add(stmt, inflight) + s.routingKeyInfoCache.mu.Unlock() + + var ( + info *preparedStatment + partitionKey []*ColumnMetadata + ) + + conn := s.getConn() + if conn == nil { + // TODO: better error? 
+		inflight.err = errors.New("gocql: unable to fetch prepared info: no connection available")
+		return nil, inflight.err
+	}
+
+	// get the query info for the statement
+	info, inflight.err = conn.prepareStatement(ctx, stmt, nil)
+	if inflight.err != nil {
+		// don't cache this error
+		s.routingKeyInfoCache.Remove(stmt)
+		return nil, inflight.err
+	}
+
+	// TODO: it would be nice to mark hosts here but as we are not using the policies
+	// to fetch hosts we can't
+
+	if info.request.colCount == 0 {
+		// no arguments, no routing key, and no error
+		return nil, nil
+	}
+
+	table := info.request.table
+	keyspace := info.request.keyspace
+
+	partitioner, err := scyllaGetTablePartitioner(s, keyspace, table)
+	if err != nil {
+		// propagate the partitioner lookup error to any waiters
+		inflight.err = err
+		// don't cache this error
+		s.routingKeyInfoCache.Remove(stmt)
+		return nil, inflight.err
+	}
+
+	if len(info.request.pkeyColumns) > 0 {
+		// proto v4 doesn't need to calculate primary key columns
+		types := make([]TypeInfo, len(info.request.pkeyColumns))
+		for i, col := range info.request.pkeyColumns {
+			types[i] = info.request.columns[col].TypeInfo
+		}
+
+		routingKeyInfo := &routingKeyInfo{
+			indexes:     info.request.pkeyColumns,
+			types:       types,
+			lwt:         info.request.lwt,
+			partitioner: partitioner,
+			keyspace:    keyspace,
+			table:       table,
+		}
+
+		inflight.value = routingKeyInfo
+		return routingKeyInfo, nil
+	}
+
+	// get the table metadata
+
+	var keyspaceMetadata *KeyspaceMetadata
+	keyspaceMetadata, inflight.err = s.KeyspaceMetadata(info.request.columns[0].Keyspace)
+	if inflight.err != nil {
+		// don't cache this error
+		s.routingKeyInfoCache.Remove(stmt)
+		return nil, inflight.err
+	}
+
+	tableMetadata, found := keyspaceMetadata.Tables[table]
+	if !found {
+		// unlikely that the statement could be prepared and the metadata for
+		// the table couldn't be found, but this may indicate either a bug
+		// in the metadata code, or that the table was just dropped.
+		inflight.err = ErrNoMetadata
+		// don't cache this error
+		s.routingKeyInfoCache.Remove(stmt)
+		return nil, inflight.err
+	}
+
+	partitionKey = tableMetadata.PartitionKey
+
+	size := len(partitionKey)
+	routingKeyInfo := &routingKeyInfo{
+		indexes:     make([]int, size),
+		types:       make([]TypeInfo, size),
+		lwt:         info.request.lwt,
+		partitioner: partitioner,
+		keyspace:    keyspace,
+		table:       table,
+	}
+
+	for keyIndex, keyColumn := range partitionKey {
+		// set an indicator for checking if the mapping is missing
+		routingKeyInfo.indexes[keyIndex] = -1
+
+		// find the column in the query info
+		for argIndex, boundColumn := range info.request.columns {
+			if keyColumn.Name == boundColumn.Name {
+				// there may be many such bound columns, pick the first
+				routingKeyInfo.indexes[keyIndex] = argIndex
+				routingKeyInfo.types[keyIndex] = boundColumn.TypeInfo
+				break
+			}
+		}
+
+		if routingKeyInfo.indexes[keyIndex] == -1 {
+			// missing a routing key column mapping
+			// no routing key, and no error
+			return nil, nil
+		}
+	}
+
+	// cache this result
+	inflight.value = routingKeyInfo
+
+	return routingKeyInfo, nil
+}
+
+func (b *Batch) execute(ctx context.Context, conn *Conn) *Iter {
+	return conn.executeBatch(ctx, b)
+}
+
+// Exec executes a batch operation and returns nil if successful;
+// otherwise an error is returned describing the failure.
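+//
+// A sketch of typical use (the messages table and its columns are hypothetical):
+//
+//	b := session.Batch(UnloggedBatch)
+//	b.Query(`INSERT INTO messages (id, body) VALUES (?, ?)`, id1, "hi")
+//	b.Query(`INSERT INTO messages (id, body) VALUES (?, ?)`, id2, "there")
+//	if err := b.Exec(); err != nil {
+//		// handle error
+//	}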
+func (b *Batch) Exec() error {
+	iter := b.session.executeBatch(b)
+	return iter.Close()
+}
+
+func (s *Session) executeBatch(batch *Batch) *Iter {
+	// fail fast
+	if s.Closed() {
+		return &Iter{err: ErrSessionClosed}
+	}
+	if err := s.Ready(); err != nil {
+		return &Iter{err: err}
+	}
+
+	// Prevent the execution of the batch if greater than the limit
+	// Currently batches have a limit of 65535 queries.
+	// https://datastax-oss.atlassian.net/browse/JAVA-229
+	if batch.Size() > BatchSizeMaximum {
+		return &Iter{err: ErrTooManyStmts}
+	}
+
+	iter, err := s.executor.executeQuery(batch)
+	if err != nil {
+		return &Iter{err: err}
+	}
+
+	return iter
+}
+
+// ExecuteBatch executes a batch operation and returns nil if successful;
+// otherwise an error is returned describing the failure.
+func (s *Session) ExecuteBatch(batch *Batch) error {
+	iter := s.executeBatch(batch)
+	return iter.Close()
+}
+
+// ExecuteBatchCAS executes a batch operation and returns true if successful,
+// along with an iterator to scan additional rows when the batch contains more
+// than one conditional statement.
+// Further scans on the iterator must also remember to include
+// the applied boolean as the first argument to *Iter.Scan.
+func (s *Session) ExecuteBatchCAS(batch *Batch, dest ...interface{}) (applied bool, iter *Iter, err error) {
+	iter = s.executeBatch(batch)
+	if err := iter.checkErrAndNotFound(); err != nil {
+		iter.Close()
+		return false, nil, err
+	}
+
+	if len(iter.Columns()) > 1 {
+		dest = append([]interface{}{&applied}, dest...)
+		iter.Scan(dest...)
+	} else {
+		iter.Scan(&applied)
+	}
+
+	return applied, iter, nil
+}
+
+// MapExecuteBatchCAS executes a batch operation much like ExecuteBatchCAS,
+// however it accepts a map rather than a list of arguments for the initial
+// scan.
+func (s *Session) MapExecuteBatchCAS(batch *Batch, dest map[string]interface{}) (applied bool, iter *Iter, err error) {
+	iter = s.executeBatch(batch)
+	if err := iter.checkErrAndNotFound(); err != nil {
+		iter.Close()
+		return false, nil, err
+	}
+	iter.MapScan(dest)
+	applied = dest["[applied]"].(bool)
+	delete(dest, "[applied]")
+
+	// we usually close here, but instead of closing, just return an error
+	// if MapScan failed. Although Close just returns err, using Close
+	// here might be confusing as we are not actually closing the iter
+	return applied, iter, iter.err
+}
+
+type hostMetrics struct {
+	// Attempts is count of how many times this query has been attempted for this host.
+	// An attempt is either a retry or fetching next page of results.
+	Attempts int
+
+	// TotalLatency is the sum of attempt latencies for this host in nanoseconds.
+	TotalLatency int64
+}
+
+type queryMetrics struct {
+	l sync.RWMutex
+	m map[string]*hostMetrics
+	// totalAttempts is total number of attempts.
+	// Equal to sum of all hostMetrics' Attempts.
+	totalAttempts int
+}
+
+// preFilledQueryMetrics initializes new queryMetrics based on per-host supplied data.
+func preFilledQueryMetrics(m map[string]*hostMetrics) *queryMetrics {
+	qm := &queryMetrics{m: m}
+	for _, hm := range qm.m {
+		qm.totalAttempts += hm.Attempts
+	}
+	return qm
+}
+
+// hostMetrics returns a snapshot of metrics for given host.
+// If the metrics for host don't exist, they are created.
+func (qm *queryMetrics) hostMetrics(host *HostInfo) *hostMetrics {
+	qm.l.Lock()
+	metrics := qm.hostMetricsLocked(host)
+	copied := new(hostMetrics)
+	*copied = *metrics
+	qm.l.Unlock()
+	return copied
+}
+
+// hostMetricsLocked gets or creates host metrics for given host.
+// It must be called only while holding qm.l lock. +func (qm *queryMetrics) hostMetricsLocked(host *HostInfo) *hostMetrics { + metrics, exists := qm.m[host.ConnectAddress().String()] + if !exists { + // if the host is not in the map, it means it's been accessed for the first time + metrics = &hostMetrics{} + qm.m[host.ConnectAddress().String()] = metrics + } + + return metrics +} + +// attempts returns the number of times the query was executed. +func (qm *queryMetrics) attempts() int { + qm.l.Lock() + attempts := qm.totalAttempts + qm.l.Unlock() + return attempts +} + +func (qm *queryMetrics) latency() int64 { + qm.l.Lock() + var ( + attempts int + latency int64 + ) + for _, metric := range qm.m { + attempts += metric.Attempts + latency += metric.TotalLatency + } + qm.l.Unlock() + if attempts > 0 { + return latency / int64(attempts) + } + return 0 +} + +// attempt adds given number of attempts and latency for given host. +// It returns previous total attempts. +// If needsHostMetrics is true, a copy of updated hostMetrics is returned. +func (qm *queryMetrics) attempt(addAttempts int, addLatency time.Duration, + host *HostInfo, needsHostMetrics bool) (int, *hostMetrics) { + qm.l.Lock() + + totalAttempts := qm.totalAttempts + qm.totalAttempts += addAttempts + + updateHostMetrics := qm.hostMetricsLocked(host) + updateHostMetrics.Attempts += addAttempts + updateHostMetrics.TotalLatency += addLatency.Nanoseconds() + + var hostMetricsCopy *hostMetrics + if needsHostMetrics { + hostMetricsCopy = new(hostMetrics) + *hostMetricsCopy = *updateHostMetrics + } + + qm.l.Unlock() + return totalAttempts, hostMetricsCopy +} + +// Query represents a CQL statement that can be executed. +type Query struct { + stmt string + values []interface{} + cons Consistency + pageSize int + routingKey []byte + pageState []byte + prefetch float64 + trace Tracer + observer QueryObserver + session *Session + conn ConnInterface + rt RetryPolicy + spec SpeculativeExecutionPolicy + binding func(q *QueryInfo) ([]interface{}, error) + serialCons Consistency + defaultTimestamp bool + defaultTimestampValue int64 + disableSkipMetadata bool + context context.Context + idempotent bool + customPayload map[string][]byte + metrics *queryMetrics + refCount uint32 + + disableAutoPage bool + + // getKeyspace is field so that it can be overriden in tests + getKeyspace func() string + + // used by control conn queries to prevent triggering a write to systems + // tables in AWS MCS see + skipPrepare bool + + // routingInfo is a pointer because Query can be copied and copyable struct can't hold a mutex. + routingInfo *queryRoutingInfo +} + +type queryRoutingInfo struct { + // mu protects contents of queryRoutingInfo. + mu sync.RWMutex + + // "lwt" denotes the query being an LWT operation + // In effect if the query is of the form "INSERT/UPDATE/DELETE ... IF ..." + // For more details see https://docs.scylladb.com/using-scylla/lwt/ + lwt bool + + // If not nil, represents a custom partitioner for the table. 
+ partitioner Partitioner + + keyspace string + + table string +} + +func (qri *queryRoutingInfo) isLWT() bool { + qri.mu.RLock() + defer qri.mu.RUnlock() + return qri.lwt +} + +func (qri *queryRoutingInfo) getPartitioner() Partitioner { + qri.mu.RLock() + defer qri.mu.RUnlock() + return qri.partitioner +} + +func (q *Query) defaultsFromSession() { + s := q.session + + s.mu.RLock() + q.cons = s.cons + q.pageSize = s.pageSize + q.trace = s.trace + q.observer = s.queryObserver + q.prefetch = s.prefetch + q.rt = s.cfg.RetryPolicy + q.serialCons = s.cfg.SerialConsistency + q.defaultTimestamp = s.cfg.DefaultTimestamp + q.idempotent = s.cfg.DefaultIdempotence + q.metrics = &queryMetrics{m: make(map[string]*hostMetrics)} + + q.spec = &NonSpeculativeExecution{} + s.mu.RUnlock() +} + +// Statement returns the statement that was used to generate this query. +func (q Query) Statement() string { + return q.stmt +} + +// Values returns the values passed in via Bind. +// This can be used by a wrapper type that needs to access the bound values. +func (q Query) Values() []interface{} { + return q.values +} + +// String implements the stringer interface. +func (q Query) String() string { + return fmt.Sprintf("[query statement=%q values=%+v consistency=%s]", q.stmt, q.values, q.cons) +} + +// Attempts returns the number of times the query was executed. +func (q *Query) Attempts() int { + return q.metrics.attempts() +} + +func (q *Query) AddAttempts(i int, host *HostInfo) { + q.metrics.attempt(i, 0, host, false) +} + +// Latency returns the average amount of nanoseconds per attempt of the query. +func (q *Query) Latency() int64 { + return q.metrics.latency() +} + +func (q *Query) AddLatency(l int64, host *HostInfo) { + q.metrics.attempt(0, time.Duration(l)*time.Nanosecond, host, false) +} + +// Consistency sets the consistency level for this query. If no consistency +// level have been set, the default consistency level of the cluster +// is used. +func (q *Query) Consistency(c Consistency) *Query { + q.cons = c + return q +} + +// GetConsistency returns the currently configured consistency level for +// the query. +func (q *Query) GetConsistency() Consistency { + return q.cons +} + +// Same as Consistency but without a return value +func (q *Query) SetConsistency(c Consistency) { + q.cons = c +} + +// CustomPayload sets the custom payload level for this query. +func (q *Query) CustomPayload(customPayload map[string][]byte) *Query { + q.customPayload = customPayload + return q +} + +func (q *Query) Context() context.Context { + if q.context == nil { + return context.Background() + } + return q.context +} + +// Trace enables tracing of this query. Look at the documentation of the +// Tracer interface to learn more about tracing. +func (q *Query) Trace(trace Tracer) *Query { + q.trace = trace + return q +} + +// Observer enables query-level observer on this query. +// The provided observer will be called every time this query is executed. +func (q *Query) Observer(observer QueryObserver) *Query { + q.observer = observer + return q +} + +// PageSize will tell the iterator to fetch the result in pages of size n. +// This is useful for iterating over large result sets, but setting the +// page size too low might decrease the performance. This feature is only +// available in Cassandra 2 and onwards. +func (q *Query) PageSize(n int) *Query { + q.pageSize = n + return q +} + +// DefaultTimestamp will enable the with default timestamp flag on the query. 
+// If enabled, this will replace the server-side assigned
+// timestamp as the default timestamp. Note that a timestamp in the query itself
+// will still override this timestamp. This is entirely optional.
+//
+// Only available on protocol >= 3
+func (q *Query) DefaultTimestamp(enable bool) *Query {
+	q.defaultTimestamp = enable
+	return q
+}
+
+// WithTimestamp will enable the with default timestamp flag on the query
+// like DefaultTimestamp does, but also allows defining the timestamp value.
+// It works the same way as USING TIMESTAMP in the query itself, but
+// should not break prepared query optimization.
+//
+// Only available on protocol >= 3
+func (q *Query) WithTimestamp(timestamp int64) *Query {
+	q.DefaultTimestamp(true)
+	q.defaultTimestampValue = timestamp
+	return q
+}
+
+// RoutingKey sets the routing key to use when a token aware connection
+// pool is used to optimize the routing of this query.
+func (q *Query) RoutingKey(routingKey []byte) *Query {
+	q.routingKey = routingKey
+	return q
+}
+
+func (q *Query) withContext(ctx context.Context) ExecutableQuery {
+	// I really wish go had covariant types
+	return q.WithContext(ctx)
+}
+
+// WithContext returns a shallow copy of q with its context
+// set to ctx.
+//
+// The provided context controls the entire lifetime of executing a
+// query, queries will be canceled and return once the context is
+// canceled.
+func (q *Query) WithContext(ctx context.Context) *Query {
+	q2 := *q
+	q2.context = ctx
+	return &q2
+}
+
+// Deprecated: does nothing; cancel the context passed to WithContext instead
+func (q *Query) Cancel() {
+	// TODO: delete
+}
+
+func (q *Query) execute(ctx context.Context, conn *Conn) *Iter {
+	return conn.executeQuery(ctx, q)
+}
+
+func (q *Query) attempt(keyspace string, end, start time.Time, iter *Iter, host *HostInfo) {
+	latency := end.Sub(start)
+	attempt, metricsForHost := q.metrics.attempt(1, latency, host, q.observer != nil)
+
+	if q.observer != nil {
+		q.observer.ObserveQuery(q.Context(), ObservedQuery{
+			Keyspace:  keyspace,
+			Statement: q.stmt,
+			Values:    q.values,
+			Start:     start,
+			End:       end,
+			Rows:      iter.numRows,
+			Host:      host,
+			Metrics:   metricsForHost,
+			Err:       iter.err,
+			Attempt:   attempt,
+		})
+	}
+}
+
+func (q *Query) retryPolicy() RetryPolicy {
+	return q.rt
+}
+
+// Keyspace returns the keyspace the query will be executed against.
+func (q *Query) Keyspace() string {
+	if q.getKeyspace != nil {
+		return q.getKeyspace()
+	}
+	if q.routingInfo.keyspace != "" {
+		return q.routingInfo.keyspace
+	}
+
+	if q.session == nil {
+		return ""
+	}
+	// TODO(chbannis): this should be parsed from the query or we should let
+	// this be set by users.
+	return q.session.cfg.Keyspace
+}
+
+// Table returns name of the table the query will be executed against.
+func (q *Query) Table() string {
+	return q.routingInfo.table
+}
+
+func (q *Query) GetSession() *Session {
+	return q.session
+}
+
+// GetRoutingKey gets the routing key to use for routing this query. If
+// a routing key has not been explicitly set, then the routing key will
+// be constructed if possible using the keyspace's schema and the query
+// info for this query statement. If the routing key cannot be determined
+// then nil will be returned with no error. On any error condition,
+// an error description will be returned.
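+//
+// GetRoutingKey is normally driven by the token-aware host selection policy;
+// application code rarely needs to call it directly.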
+func (q *Query) GetRoutingKey() ([]byte, error) { + if q.routingKey != nil { + return q.routingKey, nil + } else if q.binding != nil && len(q.values) == 0 { + // If this query was created using session.Bind we wont have the query + // values yet, so we have to pass down to the next policy. + // TODO: Remove this and handle this case + return nil, nil + } + + // try to determine the routing key + routingKeyInfo, err := q.session.routingKeyInfo(q.Context(), q.stmt) + if err != nil { + return nil, err + } + + if routingKeyInfo != nil { + q.routingInfo.mu.Lock() + q.routingInfo.lwt = routingKeyInfo.lwt + q.routingInfo.partitioner = routingKeyInfo.partitioner + q.routingInfo.keyspace = routingKeyInfo.keyspace + q.routingInfo.table = routingKeyInfo.table + q.routingInfo.mu.Unlock() + } + return createRoutingKey(routingKeyInfo, q.values) +} + +func (q *Query) shouldPrepare() bool { + + stmt := strings.TrimLeftFunc(strings.TrimRightFunc(q.stmt, func(r rune) bool { + return unicode.IsSpace(r) || r == ';' + }), unicode.IsSpace) + + var stmtType string + if n := strings.IndexFunc(stmt, unicode.IsSpace); n >= 0 { + stmtType = strings.ToLower(stmt[:n]) + } + if stmtType == "begin" { + if n := strings.LastIndexFunc(stmt, unicode.IsSpace); n >= 0 { + stmtType = strings.ToLower(stmt[n+1:]) + } + } + switch stmtType { + case "select", "insert", "update", "delete", "batch": + return true + } + return false +} + +// SetPrefetch sets the default threshold for pre-fetching new pages. If +// there are only p*pageSize rows remaining, the next page will be requested +// automatically. +func (q *Query) Prefetch(p float64) *Query { + q.prefetch = p + return q +} + +// RetryPolicy sets the policy to use when retrying the query. +func (q *Query) RetryPolicy(r RetryPolicy) *Query { + q.rt = r + return q +} + +// SetSpeculativeExecutionPolicy sets the execution policy +func (q *Query) SetSpeculativeExecutionPolicy(sp SpeculativeExecutionPolicy) *Query { + q.spec = sp + return q +} + +// speculativeExecutionPolicy fetches the policy +func (q *Query) speculativeExecutionPolicy() SpeculativeExecutionPolicy { + return q.spec +} + +// IsIdempotent returns whether the query is marked as idempotent. +// Non-idempotent query won't be retried. +// See "Retries and speculative execution" in package docs for more details. +func (q *Query) IsIdempotent() bool { + return q.idempotent +} + +func (q *Query) IsLWT() bool { + return q.routingInfo.isLWT() +} + +func (q *Query) GetCustomPartitioner() Partitioner { + return q.routingInfo.getPartitioner() +} + +// Idempotent marks the query as being idempotent or not depending on +// the value. +// Non-idempotent query won't be retried. +// See "Retries and speculative execution" in package docs for more details. +func (q *Query) Idempotent(value bool) *Query { + q.idempotent = value + return q +} + +// Bind sets query arguments of query. This can also be used to rebind new query arguments +// to an existing query instance. +func (q *Query) Bind(v ...interface{}) *Query { + q.values = v + q.pageState = nil + return q +} + +// SerialConsistency sets the consistency level for the +// serial phase of conditional updates. That consistency can only be +// either SERIAL or LOCAL_SERIAL and if not present, it defaults to +// SERIAL. This option will be ignored for anything else that a +// conditional update/insert. 
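+//
+// A sketch of a conditional insert (the users table is hypothetical; LocalSerial
+// selects the LOCAL_SERIAL level):
+//
+//	q := session.Query(`INSERT INTO users (id, name) VALUES (?, ?) IF NOT EXISTS`,
+//		id, name).SerialConsistency(LocalSerial)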
+func (q *Query) SerialConsistency(cons SerialConsistency) *Query { + if !cons.IsSerial() { + panic("Serial consistency can only be SERIAL or LOCAL_SERIAL got " + cons.String()) + } + q.serialCons = cons + return q +} + +// PageState sets the paging state for the query to resume paging from a specific +// point in time. Setting this will disable to query paging for this query, and +// must be used for all subsequent pages. +func (q *Query) PageState(state []byte) *Query { + q.pageState = state + q.disableAutoPage = true + return q +} + +// NoSkipMetadata will override the internal result metadata cache so that the driver does not +// send skip_metadata for queries, this means that the result will always contain +// the metadata to parse the rows and will not reuse the metadata from the prepared +// statement. This should only be used to work around cassandra bugs, such as when using +// CAS operations which do not end in Cas. +// +// See https://issues.apache.org/jira/browse/CASSANDRA-11099 +// https://github.com/gocql/gocql/issues/612 +func (q *Query) NoSkipMetadata() *Query { + q.disableSkipMetadata = true + return q +} + +// Exec executes the query without returning any rows. +func (q *Query) Exec() error { + return q.Iter().Close() +} + +func isUseStatement(stmt string) bool { + if len(stmt) < 3 { + return false + } + + return strings.EqualFold(stmt[0:3], "use") +} + +// Iter executes the query and returns an iterator capable of iterating +// over all results. +func (q *Query) Iter() *Iter { + if isUseStatement(q.stmt) { + return &Iter{err: ErrUseStmt} + } + + if !q.disableAutoPage { + return q.executeQuery() + } + + // Retry on empty page if pagination is manual + iter := q.executeQuery() + for iter.err == nil && iter.numRows == 0 && !iter.LastPage() { + q.PageState(iter.PageState()) + iter = q.executeQuery() + } + return iter +} + +func (q *Query) executeQuery() *Iter { + if q.conn != nil { + // if the query was specifically run on a connection then re-use that + // connection when fetching the next results + return q.conn.executeQuery(q.Context(), q) + } + return q.session.executeQuery(q) +} + +// MapScan executes the query, copies the columns of the first selected +// row into the map pointed at by m and discards the rest. If no rows +// were selected, ErrNotFound is returned. +func (q *Query) MapScan(m map[string]interface{}) error { + iter := q.Iter() + if err := iter.checkErrAndNotFound(); err != nil { + return err + } + iter.MapScan(m) + return iter.Close() +} + +// Scan executes the query, copies the columns of the first selected +// row into the values pointed at by dest and discards the rest. If no rows +// were selected, ErrNotFound is returned. +func (q *Query) Scan(dest ...interface{}) error { + iter := q.Iter() + if err := iter.checkErrAndNotFound(); err != nil { + return err + } + iter.Scan(dest...) + return iter.Close() +} + +// ScanCAS executes a lightweight transaction (i.e. an UPDATE or INSERT +// statement containing an IF clause). If the transaction fails because +// the existing values did not match, the previous values will be stored +// in dest. +// +// As for INSERT .. IF NOT EXISTS, previous values will be returned as if +// SELECT * FROM. So using ScanCAS with INSERT is inherently prone to +// column mismatching. Use MapScanCAS to capture them safely. 
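+//
+// A sketch of a conditional update (the users table is hypothetical):
+//
+//	var prevName string
+//	applied, err := session.Query(
+//		`UPDATE users SET name = ? WHERE id = ? IF name = ?`,
+//		newName, id, oldName).ScanCAS(&prevName)
+//	if err != nil {
+//		// handle error
+//	} else if !applied {
+//		// prevName holds the existing value that prevented the update
+//	}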
+func (q *Query) ScanCAS(dest ...interface{}) (applied bool, err error) { + q.disableSkipMetadata = true + iter := q.Iter() + if err := iter.checkErrAndNotFound(); err != nil { + return false, err + } + if len(iter.Columns()) > 1 { + dest = append([]interface{}{&applied}, dest...) + iter.Scan(dest...) + } else { + iter.Scan(&applied) + } + return applied, iter.Close() +} + +// MapScanCAS executes a lightweight transaction (i.e. an UPDATE or INSERT +// statement containing an IF clause). If the transaction fails because +// the existing values did not match, the previous values will be stored +// in dest map. +// +// As for INSERT .. IF NOT EXISTS, previous values will be returned as if +// SELECT * FROM. So using ScanCAS with INSERT is inherently prone to +// column mismatching. MapScanCAS is added to capture them safely. +func (q *Query) MapScanCAS(dest map[string]interface{}) (applied bool, err error) { + q.disableSkipMetadata = true + iter := q.Iter() + if err := iter.checkErrAndNotFound(); err != nil { + return false, err + } + iter.MapScan(dest) + applied = dest["[applied]"].(bool) + delete(dest, "[applied]") + + return applied, iter.Close() +} + +// Release releases a query back into a pool of queries. Released Queries +// cannot be reused. +// +// Example: +// +// qry := session.Query("SELECT * FROM my_table") +// qry.Exec() +// qry.Release() +func (q *Query) Release() { + q.decRefCount() +} + +// reset zeroes out all fields of a query so that it can be safely pooled. +func (q *Query) reset() { + *q = Query{routingInfo: &queryRoutingInfo{}, refCount: 1} +} + +func (q *Query) incRefCount() { + atomic.AddUint32(&q.refCount, 1) +} + +func (q *Query) decRefCount() { + if res := atomic.AddUint32(&q.refCount, ^uint32(0)); res == 0 { + // do release + q.reset() + queryPool.Put(q) + } +} + +func (q *Query) borrowForExecution() { + q.incRefCount() +} + +func (q *Query) releaseAfterExecution() { + q.decRefCount() +} + +// Iter represents an iterator that can be used to iterate over all rows that +// were returned by a query. The iterator might send additional queries to the +// database during the iteration if paging was enabled. +type Iter struct { + err error + pos int + meta resultMetadata + numRows int + next *nextIter + host *HostInfo + + framer framerInterface + closed int32 +} + +// Host returns the host which the query was sent to. +func (iter *Iter) Host() *HostInfo { + return iter.host +} + +// Columns returns the name and type of the selected columns. +func (iter *Iter) Columns() []ColumnInfo { + return iter.meta.columns +} + +type Scanner interface { + // Next advances the row pointer to point at the next row, the row is valid until + // the next call of Next. It returns true if there is a row which is available to be + // scanned into with Scan. + // Next must be called before every call to Scan. + Next() bool + + // Scan copies the current row's columns into dest. If the length of dest does not equal + // the number of columns returned in the row an error is returned. If an error is encountered + // when unmarshalling a column into the value in dest an error is returned and the row is invalidated + // until the next call to Next. + // Next must be called before calling Scan, if it is not an error is returned. + Scan(...interface{}) error + + // Err returns the if there was one during iteration that resulted in iteration being unable to complete. + // Err will also release resources held by the iterator, the Scanner should not used after being called. 
+ Err() error +} + +type iterScanner struct { + iter *Iter + cols [][]byte + valid bool +} + +func (is *iterScanner) Next() bool { + iter := is.iter + if iter.err != nil { + return false + } + + if iter.pos >= iter.numRows { + if iter.next != nil { + is.iter = iter.next.fetch() + return is.Next() + } + return false + } + + for i := 0; i < len(is.cols); i++ { + col, err := iter.readColumn() + if err != nil { + iter.err = err + return false + } + is.cols[i] = col + } + iter.pos++ + is.valid = true + + return true +} + +func scanColumn(p []byte, col ColumnInfo, dest []interface{}) (int, error) { + if dest[0] == nil { + return 1, nil + } + + if col.TypeInfo.Type() == TypeTuple { + // this will panic, actually a bug, please report + tuple := col.TypeInfo.(TupleTypeInfo) + + count := len(tuple.Elems) + // here we pass in a slice of the struct which has the number number of + // values as elements in the tuple + if err := Unmarshal(col.TypeInfo, p, dest[:count]); err != nil { + return 0, err + } + return count, nil + } else { + if err := Unmarshal(col.TypeInfo, p, dest[0]); err != nil { + return 0, err + } + return 1, nil + } +} + +func (is *iterScanner) Scan(dest ...interface{}) error { + if !is.valid { + return errors.New("gocql: Scan called without calling Next") + } + + iter := is.iter + // currently only support scanning into an expand tuple, such that its the same + // as scanning in more values from a single column + if len(dest) != iter.meta.actualColCount { + return fmt.Errorf("gocql: not enough columns to scan into: have %d want %d", len(dest), iter.meta.actualColCount) + } + + // i is the current position in dest, could posible replace it and just use + // slices of dest + i := 0 + var err error + for _, col := range iter.meta.columns { + var n int + n, err = scanColumn(is.cols[i], col, dest[i:]) + if err != nil { + break + } + i += n + } + + is.valid = false + return err +} + +func (is *iterScanner) Err() error { + iter := is.iter + is.iter = nil + is.cols = nil + is.valid = false + return iter.Close() +} + +// Scanner returns a row Scanner which provides an interface to scan rows in a manner which is +// similar to database/sql. The iter should NOT be used again after calling this method. +func (iter *Iter) Scanner() Scanner { + if iter == nil { + return nil + } + + return &iterScanner{iter: iter, cols: make([][]byte, len(iter.meta.columns))} +} + +func (iter *Iter) readColumn() ([]byte, error) { + return iter.framer.ReadBytesInternal() +} + +// Scan consumes the next row of the iterator and copies the columns of the +// current row into the values pointed at by dest. Use nil as a dest value +// to skip the corresponding column. Scan might send additional queries +// to the database to retrieve the next set of rows if paging was enabled. +// +// Scan returns true if the row was successfully unmarshaled or false if the +// end of the result set was reached or if an error occurred. Close should +// be called afterwards to retrieve any potential errors. +func (iter *Iter) Scan(dest ...interface{}) bool { + if iter.err != nil { + return false + } + + if iter.pos >= iter.numRows { + if iter.next != nil { + *iter = *iter.next.fetch() + return iter.Scan(dest...) 
+ } + return false + } + + if iter.next != nil && iter.pos >= iter.next.pos { + iter.next.fetchAsync() + } + + // currently only support scanning into an expand tuple, such that its the same + // as scanning in more values from a single column + if len(dest) != iter.meta.actualColCount { + iter.err = fmt.Errorf("gocql: not enough columns to scan into: have %d want %d", len(dest), iter.meta.actualColCount) + return false + } + + // i is the current position in dest, could posible replace it and just use + // slices of dest + i := 0 + for _, col := range iter.meta.columns { + colBytes, err := iter.readColumn() + if err != nil { + iter.err = err + return false + } + + n, err := scanColumn(colBytes, col, dest[i:]) + if err != nil { + iter.err = err + return false + } + i += n + } + + iter.pos++ + return true +} + +// GetCustomPayload returns any parsed custom payload results if given in the +// response from Cassandra. Note that the result is not a copy. +// +// This additional feature of CQL Protocol v4 +// allows additional results and query information to be returned by +// custom QueryHandlers running in your C* cluster. +// See https://datastax.github.io/java-driver/manual/custom_payloads/ +func (iter *Iter) GetCustomPayload() map[string][]byte { + if iter.framer != nil { + return iter.framer.GetCustomPayload() + } + return nil +} + +// Warnings returns any warnings generated if given in the response from Cassandra. +// +// This is only available starting with CQL Protocol v4. +func (iter *Iter) Warnings() []string { + if iter.framer != nil { + return iter.framer.GetHeaderWarnings() + } + return nil +} + +// Close closes the iterator and returns any errors that happened during +// the query or the iteration. +func (iter *Iter) Close() error { + if atomic.CompareAndSwapInt32(&iter.closed, 0, 1) { + if iter.framer != nil { + iter.framer = nil + } + } + + return iter.err +} + +// WillSwitchPage detects if iterator reached end of current page +// and the next page is available. +func (iter *Iter) WillSwitchPage() bool { + return iter.pos >= iter.numRows && iter.next != nil +} + +// checkErrAndNotFound handle error and NotFound in one method. +func (iter *Iter) checkErrAndNotFound() error { + if iter.err != nil { + return iter.err + } else if iter.numRows == 0 { + return ErrNotFound + } + return nil +} + +// PageState return the current paging state for a query which can be used for +// subsequent queries to resume paging this point. +func (iter *Iter) PageState() []byte { + return iter.meta.pagingState +} + +// LastPage returns true if there are no more pages to fetch. +func (iter *Iter) LastPage() bool { + return len(iter.meta.pagingState) == 0 +} + +// NumRows returns the number of rows in this pagination, it will update when new +// pages are fetched, it is not the value of the total number of rows this iter +// will return unless there is only a single page returned. +func (iter *Iter) NumRows() int { + return iter.numRows +} + +// nextIter holds state for fetching a single page in an iterator. +// single page might be attempted multiple times due to retries. 
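+// fetchAsync starts the fetch in a background goroutine at most once, so the
+// next page is typically ready by the time the current page has been consumed.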
+type nextIter struct { + qry *Query + pos int + oncea sync.Once + once sync.Once + next *Iter +} + +func (n *nextIter) fetchAsync() { + n.oncea.Do(func() { + go n.fetch() + }) +} + +func (n *nextIter) fetch() *Iter { + n.once.Do(func() { + // if the query was specifically run on a connection then re-use that + // connection when fetching the next results + if n.qry.conn != nil { + n.next = n.qry.conn.executeQuery(n.qry.Context(), n.qry) + } else { + n.next = n.qry.session.executeQuery(n.qry) + } + }) + return n.next +} + +type Batch struct { + Type BatchType + Entries []BatchEntry + Cons Consistency + routingKey []byte + CustomPayload map[string][]byte + rt RetryPolicy + spec SpeculativeExecutionPolicy + trace Tracer + observer BatchObserver + session *Session + serialCons Consistency + defaultTimestamp bool + defaultTimestampValue int64 + context context.Context + cancelBatch func() + keyspace string + metrics *queryMetrics + + // routingInfo is a pointer because Query can be copied and copyable struct can't hold a mutex. + routingInfo *queryRoutingInfo +} + +// NewBatch creates a new batch operation using defaults defined in the cluster +// +// Deprecated: use session.Batch instead +func (s *Session) NewBatch(typ BatchType) *Batch { + return s.Batch(typ) +} + +// Batch creates a new batch operation using defaults defined in the cluster +func (s *Session) Batch(typ BatchType) *Batch { + s.mu.RLock() + batch := &Batch{ + Type: typ, + rt: s.cfg.RetryPolicy, + serialCons: s.cfg.SerialConsistency, + trace: s.trace, + observer: s.batchObserver, + session: s, + Cons: s.cons, + defaultTimestamp: s.cfg.DefaultTimestamp, + keyspace: s.cfg.Keyspace, + metrics: &queryMetrics{m: make(map[string]*hostMetrics)}, + spec: &NonSpeculativeExecution{}, + routingInfo: &queryRoutingInfo{}, + } + + s.mu.RUnlock() + return batch +} + +// Trace enables tracing of this batch. Look at the documentation of the +// Tracer interface to learn more about tracing. +func (b *Batch) Trace(trace Tracer) *Batch { + b.trace = trace + return b +} + +// Observer enables batch-level observer on this batch. +// The provided observer will be called every time this batched query is executed. +func (b *Batch) Observer(observer BatchObserver) *Batch { + b.observer = observer + return b +} + +func (b *Batch) Keyspace() string { + return b.keyspace +} + +// Batch has no reasonable eqivalent of Query.Table(). +func (b *Batch) Table() string { + return b.routingInfo.table +} + +func (b *Batch) GetSession() *Session { + return b.session +} + +// Attempts returns the number of attempts made to execute the batch. +func (b *Batch) Attempts() int { + return b.metrics.attempts() +} + +func (b *Batch) AddAttempts(i int, host *HostInfo) { + b.metrics.attempt(i, 0, host, false) +} + +// Latency returns the average number of nanoseconds to execute a single attempt of the batch. +func (b *Batch) Latency() int64 { + return b.metrics.latency() +} + +func (b *Batch) AddLatency(l int64, host *HostInfo) { + b.metrics.attempt(0, time.Duration(l)*time.Nanosecond, host, false) +} + +// GetConsistency returns the currently configured consistency level for the batch +// operation. +func (b *Batch) GetConsistency() Consistency { + return b.Cons +} + +// SetConsistency sets the currently configured consistency level for the batch +// operation. 
+func (b *Batch) SetConsistency(c Consistency) { + b.Cons = c +} + +func (b *Batch) Context() context.Context { + if b.context == nil { + return context.Background() + } + return b.context +} + +func (b *Batch) IsIdempotent() bool { + for _, entry := range b.Entries { + if !entry.Idempotent { + return false + } + } + return true +} + +func (b *Batch) IsLWT() bool { + return b.routingInfo.isLWT() +} + +func (b *Batch) GetCustomPartitioner() Partitioner { + return b.routingInfo.getPartitioner() +} + +func (b *Batch) speculativeExecutionPolicy() SpeculativeExecutionPolicy { + return b.spec +} + +func (b *Batch) SpeculativeExecutionPolicy(sp SpeculativeExecutionPolicy) *Batch { + b.spec = sp + return b +} + +// Query adds the query to the batch operation +func (b *Batch) Query(stmt string, args ...interface{}) *Batch { + b.Entries = append(b.Entries, BatchEntry{Stmt: stmt, Args: args}) + return b +} + +// Bind adds the query to the batch operation and correlates it with a binding callback +// that will be invoked when the batch is executed. The binding callback allows the application +// to define which query argument values will be marshalled as part of the batch execution. +func (b *Batch) Bind(stmt string, bind func(q *QueryInfo) ([]interface{}, error)) { + b.Entries = append(b.Entries, BatchEntry{Stmt: stmt, binding: bind}) +} + +func (b *Batch) retryPolicy() RetryPolicy { + return b.rt +} + +// RetryPolicy sets the retry policy to use when executing the batch operation +func (b *Batch) RetryPolicy(r RetryPolicy) *Batch { + b.rt = r + return b +} + +func (b *Batch) withContext(ctx context.Context) ExecutableQuery { + return b.WithContext(ctx) +} + +// WithContext returns a shallow copy of b with its context +// set to ctx. +// +// The provided context controls the entire lifetime of executing a +// query, queries will be canceled and return once the context is +// canceled. +func (b *Batch) WithContext(ctx context.Context) *Batch { + b2 := *b + b2.context = ctx + return &b2 +} + +// Deprecate: does nothing, cancel the context passed to WithContext +func (*Batch) Cancel() { + // TODO: delete +} + +// Size returns the number of batch statements to be executed by the batch operation. +func (b *Batch) Size() int { + return len(b.Entries) +} + +// SerialConsistency sets the consistency level for the +// serial phase of conditional updates. That consistency can only be +// either SERIAL or LOCAL_SERIAL and if not present, it defaults to +// SERIAL. This option will be ignored for anything else that a +// conditional update/insert. +// +// Only available for protocol 3 and above +func (b *Batch) SerialConsistency(cons SerialConsistency) *Batch { + if !cons.IsSerial() { + panic("Serial consistency can only be SERIAL or LOCAL_SERIAL got " + cons.String()) + } + b.serialCons = cons + return b +} + +// DefaultTimestamp will enable the with default timestamp flag on the query. +// If enable, this will replace the server side assigned +// timestamp as default timestamp. Note that a timestamp in the query itself +// will still override this timestamp. This is entirely optional. +// +// Only available on protocol >= 3 +func (b *Batch) DefaultTimestamp(enable bool) *Batch { + b.defaultTimestamp = enable + return b +} + +// WithTimestamp will enable the with default timestamp flag on the query +// like DefaultTimestamp does. But also allows to define value for timestamp. +// It works the same way as USING TIMESTAMP in the query itself, but +// should not break prepared query optimization. 
+// +// Only available on protocol >= 3 +func (b *Batch) WithTimestamp(timestamp int64) *Batch { + b.DefaultTimestamp(true) + b.defaultTimestampValue = timestamp + return b +} + +func (b *Batch) attempt(keyspace string, end, start time.Time, iter *Iter, host *HostInfo) { + latency := end.Sub(start) + attempt, metricsForHost := b.metrics.attempt(1, latency, host, b.observer != nil) + + if b.observer == nil { + return + } + + statements := make([]string, len(b.Entries)) + values := make([][]interface{}, len(b.Entries)) + + for i, entry := range b.Entries { + statements[i] = entry.Stmt + values[i] = entry.Args + } + + b.observer.ObserveBatch(b.Context(), ObservedBatch{ + Keyspace: keyspace, + Statements: statements, + Values: values, + Start: start, + End: end, + // Rows not used in batch observations // TODO - might be able to support it when using BatchCAS + Host: host, + Metrics: metricsForHost, + Err: iter.err, + Attempt: attempt, + }) +} + +func (b *Batch) GetRoutingKey() ([]byte, error) { + if b.routingKey != nil { + return b.routingKey, nil + } + + if len(b.Entries) == 0 { + return nil, nil + } + + entry := b.Entries[0] + if entry.binding != nil { + // bindings do not have the values let's skip it like Query does. + return nil, nil + } + // try to determine the routing key + routingKeyInfo, err := b.session.routingKeyInfo(b.Context(), entry.Stmt) + if err != nil { + return nil, err + } + if routingKeyInfo != nil { + b.routingInfo.mu.Lock() + b.routingInfo.lwt = routingKeyInfo.lwt + b.routingInfo.partitioner = routingKeyInfo.partitioner + b.routingInfo.mu.Unlock() + } + + return createRoutingKey(routingKeyInfo, entry.Args) +} + +func createRoutingKey(routingKeyInfo *routingKeyInfo, values []interface{}) ([]byte, error) { + if routingKeyInfo == nil { + return nil, nil + } + + if len(routingKeyInfo.indexes) == 1 { + // single column routing key + routingKey, err := Marshal( + routingKeyInfo.types[0], + values[routingKeyInfo.indexes[0]], + ) + if err != nil { + return nil, err + } + return routingKey, nil + } + + // composite routing key + buf := bytes.NewBuffer(make([]byte, 0, 256)) + for i := range routingKeyInfo.indexes { + encoded, err := Marshal( + routingKeyInfo.types[i], + values[routingKeyInfo.indexes[i]], + ) + if err != nil { + return nil, err + } + lenBuf := []byte{0x00, 0x00} + binary.BigEndian.PutUint16(lenBuf, uint16(len(encoded))) + buf.Write(lenBuf) + buf.Write(encoded) + buf.WriteByte(0x00) + } + routingKey := buf.Bytes() + return routingKey, nil +} + +func (b *Batch) borrowForExecution() { + // empty, because Batch has no equivalent of Query.Release() + // that would race with speculative executions. +} + +func (b *Batch) releaseAfterExecution() { + // empty, because Batch has no equivalent of Query.Release() + // that would race with speculative executions. 
+} + +type BatchType byte + +const ( + LoggedBatch BatchType = 0 + UnloggedBatch BatchType = 1 + CounterBatch BatchType = 2 +) + +type BatchEntry struct { + Stmt string + Args []interface{} + Idempotent bool + binding func(q *QueryInfo) ([]interface{}, error) +} + +type ColumnInfo struct { + Keyspace string + Table string + Name string + TypeInfo TypeInfo +} + +func (c ColumnInfo) String() string { + return fmt.Sprintf("[column keyspace=%s table=%s name=%s type=%v]", c.Keyspace, c.Table, c.Name, c.TypeInfo) +} + +// routing key indexes LRU cache +type routingKeyInfoLRU struct { + lru *lru.Cache + mu sync.Mutex +} + +type routingKeyInfo struct { + indexes []int + types []TypeInfo + keyspace string + table string + lwt bool + partitioner Partitioner +} + +func (r *routingKeyInfo) String() string { + return fmt.Sprintf("routing key index=%v types=%v", r.indexes, r.types) +} + +func (r *routingKeyInfoLRU) Remove(key string) { + r.mu.Lock() + r.lru.Remove(key) + r.mu.Unlock() +} + +// Max adjusts the maximum size of the cache and cleans up the oldest records if +// the new max is lower than the previous value. Not concurrency safe. +func (r *routingKeyInfoLRU) Max(max int) { + r.mu.Lock() + for r.lru.Len() > max { + r.lru.RemoveOldest() + } + r.lru.MaxEntries = max + r.mu.Unlock() +} + +type inflightCachedEntry struct { + wg sync.WaitGroup + err error + value interface{} +} + +type ObservedQuery struct { + Keyspace string + Statement string + + // Values holds a slice of bound values for the query. + // Do not modify the values here, they are shared with multiple goroutines. + Values []interface{} + + Start time.Time // time immediately before the query was called + End time.Time // time immediately after the query returned + + // Rows is the number of rows in the current iter. + // In paginated queries, rows from previous scans are not counted. + // Rows is not used in batch queries and remains at the default value + Rows int + + // Host is the informations about the host that performed the query + Host *HostInfo + + // The metrics per this host + Metrics *hostMetrics + + // Err is the error in the query. + // It only tracks network errors or errors of bad cassandra syntax, in particular selects with no match return nil error + Err error + + // Attempt is the index of attempt at executing this query. + // The first attempt is number zero and any retries have non-zero attempt number. + Attempt int +} + +// QueryObserver is the interface implemented by query observers / stat collectors. +type QueryObserver interface { + // ObserveQuery gets called on every query to cassandra, including all queries in an iterator when paging is enabled. + // It doesn't get called if there is no query because the session is closed or there are no connections available. + // The error reported only shows query errors, i.e. if a SELECT is valid but finds no matches it will be nil. + ObserveQuery(context.Context, ObservedQuery) +} + +type ObservedBatch struct { + Keyspace string + Statements []string + + // Values holds a slice of bound values for each statement. + // Values[i] are bound values passed to Statements[i]. + // Do not modify the values here, they are shared with multiple goroutines. + Values [][]interface{} + + Start time.Time // time immediately before the batch query was called + End time.Time // time immediately after the batch query returned + + // Host is the informations about the host that performed the batch + Host *HostInfo + + // Err is the error in the batch query. 
+ // It only tracks network errors or errors of bad cassandra syntax, in particular selects with no match return nil error + Err error + + // The metrics per this host + Metrics *hostMetrics + + // Attempt is the index of attempt at executing this query. + // The first attempt is number zero and any retries have non-zero attempt number. + Attempt int +} + +// BatchObserver is the interface implemented by batch observers / stat collectors. +type BatchObserver interface { + // ObserveBatch gets called on every batch query to cassandra. + // It also gets called once for each query in a batch. + // It doesn't get called if there is no query because the session is closed or there are no connections available. + // The error reported only shows query errors, i.e. if a SELECT is valid but finds no matches it will be nil. + // Unlike QueryObserver.ObserveQuery it does no reporting on rows read. + ObserveBatch(context.Context, ObservedBatch) +} + +type ObservedConnect struct { + // Host is the information about the host about to connect + Host *HostInfo + + Start time.Time // time immediately before the dial is called + End time.Time // time immediately after the dial returned + + // Err is the connection error (if any) + Err error +} + +// ConnectObserver is the interface implemented by connect observers / stat collectors. +type ConnectObserver interface { + // ObserveConnect gets called when a new connection to cassandra is made. + ObserveConnect(ObservedConnect) +} + +type Error struct { + Code int + Message string +} + +func (e Error) Error() string { + return e.Message +} + +var ( + ErrNotFound = errors.New("not found") + ErrUnavailable = errors.New("unavailable") + ErrUnsupported = errors.New("feature not supported") + ErrTooManyStmts = errors.New("too many statements") + ErrUseStmt = errors.New("use statements aren't supported. Please see https://github.com/gocql/gocql for explanation.") + ErrSessionClosed = errors.New("session has been closed") + ErrNoConnections = errors.New("gocql: no hosts available in the pool") + ErrNoKeyspace = errors.New("no keyspace provided") + ErrKeyspaceDoesNotExist = errors.New("keyspace does not exist") + ErrNoMetadata = errors.New("no metadata available") + ErrTabletsNotUsed = errors.New("tablets not used") + ErrSessionNotReady = errors.New("session is not ready yet") +) + +type ErrProtocol struct{ error } + +func NewErrProtocol(format string, args ...interface{}) error { + return ErrProtocol{fmt.Errorf(format, args...)} +} + +// BatchSizeMaximum is the maximum number of statements a batch operation can have. +// This limit is set by cassandra and could change in the future. 
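+// The value matches the unsigned 16-bit statement-count field of the native
+// protocol's BATCH message.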
+const BatchSizeMaximum = 65535 diff --git a/vendor/github.com/gocql/gocql/tablets.go b/vendor/github.com/gocql/gocql/tablets.go new file mode 100644 index 0000000..fd29cb6 --- /dev/null +++ b/vendor/github.com/gocql/gocql/tablets.go @@ -0,0 +1,198 @@ +package gocql + +import ( + "sync" +) + +type ReplicaInfo struct { + hostId UUID + shardId int +} + +type TabletInfo struct { + keyspaceName string + tableName string + firstToken int64 + lastToken int64 + replicas []ReplicaInfo +} + +func (t *TabletInfo) KeyspaceName() string { + return t.keyspaceName +} + +func (t *TabletInfo) FirstToken() int64 { + return t.firstToken +} + +func (t *TabletInfo) LastToken() int64 { + return t.lastToken +} + +func (t *TabletInfo) TableName() string { + return t.tableName +} + +func (t *TabletInfo) Replicas() []ReplicaInfo { + return t.replicas +} + +type TabletInfoList []*TabletInfo + +// Search for place in tablets table with specific Keyspace and Table name +func (t TabletInfoList) findTablets(keyspace string, table string) (int, int) { + l := -1 + r := -1 + for i, tablet := range t { + if tablet.KeyspaceName() == keyspace && tablet.TableName() == table { + if l == -1 { + l = i + } + r = i + } else if l != -1 { + break + } + } + + return l, r +} + +func (t TabletInfoList) addTabletToTabletsList(tablet *TabletInfo) TabletInfoList { + l, r := t.findTablets(tablet.keyspaceName, tablet.tableName) + if l == -1 && r == -1 { + l = 0 + r = 0 + } else { + r = r + 1 + } + + l1, r1 := l, r + l2, r2 := l1, r1 + + // find first overlaping range + for l1 < r1 { + mid := (l1 + r1) / 2 + if t[mid].FirstToken() < tablet.FirstToken() { + l1 = mid + 1 + } else { + r1 = mid + } + } + start := l1 + + if start > l && t[start-1].LastToken() > tablet.FirstToken() { + start = start - 1 + } + + // find last overlaping range + for l2 < r2 { + mid := (l2 + r2) / 2 + if t[mid].LastToken() < tablet.LastToken() { + l2 = mid + 1 + } else { + r2 = mid + } + } + end := l2 + if end < r && t[end].FirstToken() >= tablet.LastToken() { + end = end - 1 + } + if end == len(t) { + end = end - 1 + } + + updated_tablets := t + if start <= end { + // Delete elements from index start to end + updated_tablets = append(t[:start], t[end+1:]...) + } + // Insert tablet element at index start + t = append(updated_tablets[:start], append([]*TabletInfo{tablet}, updated_tablets[start:]...)...) 
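+	// t now holds the new tablet in token order, with every range it
+	// overlapped removed.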
+ return t +} + +// Remove all tablets that have given host as a replica +func (t TabletInfoList) removeTabletsWithHostFromTabletsList(host *HostInfo) TabletInfoList { + filteredTablets := make([]*TabletInfo, 0, len(t)) // Preallocate for efficiency + + for _, tablet := range t { + // Check if any replica matches the given host ID + shouldExclude := false + for _, replica := range tablet.replicas { + if replica.hostId.String() == host.HostID() { + shouldExclude = true + break + } + } + if !shouldExclude { + filteredTablets = append(filteredTablets, tablet) + } + } + + t = filteredTablets + return t +} + +func (t TabletInfoList) removeTabletsWithKeyspaceFromTabletsList(keyspace string) TabletInfoList { + filteredTablets := make([]*TabletInfo, 0, len(t)) + + for _, tablet := range t { + if tablet.keyspaceName != keyspace { + filteredTablets = append(filteredTablets, tablet) + } + } + + t = filteredTablets + return t +} + +func (t TabletInfoList) removeTabletsWithTableFromTabletsList(keyspace string, table string) TabletInfoList { + filteredTablets := make([]*TabletInfo, 0, len(t)) + + for _, tablet := range t { + if !(tablet.keyspaceName == keyspace && tablet.tableName == table) { + filteredTablets = append(filteredTablets, tablet) + } + } + + t = filteredTablets + return t +} + +// Search for place in tablets table for token starting from index l to index r +func (t TabletInfoList) findTabletForToken(token Token, l int, r int) *TabletInfo { + for l < r { + var m int + if r*l > 0 { + m = l + (r-l)/2 + } else { + m = (r + l) / 2 + } + if int64Token(t[m].LastToken()).Less(token) { + l = m + 1 + } else { + r = m + } + } + + return t[l] +} + +// cowTabletList implements a copy on write tablet list, its equivalent type is TabletInfoList +type cowTabletList struct { + list TabletInfoList + mu sync.RWMutex +} + +func (c *cowTabletList) get() TabletInfoList { + c.mu.RLock() + defer c.mu.RUnlock() + return c.list +} + +func (c *cowTabletList) set(tablets TabletInfoList) { + c.mu.Lock() + defer c.mu.Unlock() + + c.list = tablets +} diff --git a/vendor/github.com/gocql/gocql/token.go b/vendor/github.com/gocql/gocql/token.go new file mode 100644 index 0000000..72d0ce4 --- /dev/null +++ b/vendor/github.com/gocql/gocql/token.go @@ -0,0 +1,242 @@ +// Copyright (c) 2015 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gocql + +import ( + "bytes" + "crypto/md5" + "fmt" + "math/big" + "sort" + "strconv" + "strings" + + "github.com/gocql/gocql/internal/murmur" +) + +// a token partitioner +type Partitioner interface { + Name() string + Hash([]byte) Token + ParseString(string) Token +} + +// a Token +type Token interface { + fmt.Stringer + Less(Token) bool +} + +// murmur3 partitioner +type murmur3Partitioner struct{} + +func (p murmur3Partitioner) Name() string { + return "Murmur3Partitioner" +} + +func (p murmur3Partitioner) Hash(partitionKey []byte) Token { + h1 := murmur.Murmur3H1(partitionKey) + return int64Token(h1) +} + +// murmur3 little-endian, 128-bit hash, but returns only h1 +func (p murmur3Partitioner) ParseString(str string) Token { + return parseInt64Token(str) +} + +// int64 token +type int64Token int64 + +func parseInt64Token(str string) int64Token { + val, _ := strconv.ParseInt(str, 10, 64) + return int64Token(val) +} + +func (m int64Token) String() string { + return strconv.FormatInt(int64(m), 10) +} + +func (m int64Token) Less(token Token) bool { + return m < token.(int64Token) +} + +// order preserving partitioner and token +type orderedPartitioner struct{} +type orderedToken string + +func (p orderedPartitioner) Name() string { + return "OrderedPartitioner" +} + +func (p orderedPartitioner) Hash(partitionKey []byte) Token { + // the partition key is the token + return orderedToken(partitionKey) +} + +func (p orderedPartitioner) ParseString(str string) Token { + return orderedToken(str) +} + +func (o orderedToken) String() string { + return string(o) +} + +func (o orderedToken) Less(token Token) bool { + return o < token.(orderedToken) +} + +// random partitioner and token +type randomPartitioner struct{} +type randomToken big.Int + +func (r randomPartitioner) Name() string { + return "RandomPartitioner" +} + +// 2 ** 128 +var maxHashInt, _ = new(big.Int).SetString("340282366920938463463374607431768211456", 10) + +func (p randomPartitioner) Hash(partitionKey []byte) Token { + sum := md5.Sum(partitionKey) + val := new(big.Int) + val.SetBytes(sum[:]) + if sum[0] > 127 { + val.Sub(val, maxHashInt) + val.Abs(val) + } + + return (*randomToken)(val) +} + +func (p randomPartitioner) ParseString(str string) Token { + val := new(big.Int) + val.SetString(str, 10) + return (*randomToken)(val) +} + +func (r *randomToken) String() string { + return (*big.Int)(r).String() +} + +func (r *randomToken) Less(token Token) bool { + return -1 == (*big.Int)(r).Cmp((*big.Int)(token.(*randomToken))) +} + +type hostToken struct { + token Token + host *HostInfo +} + +func (ht hostToken) String() string { + return fmt.Sprintf("{token=%v host=%v}", ht.token, ht.host.HostID()) +} + +// a data structure for organizing the relationship between tokens and hosts +type tokenRing struct { + partitioner Partitioner + + // tokens map token range to primary replica. + // The elements in tokens are sorted by token ascending. + // The range for a given item in tokens starts after preceding range and ends with the token specified in + // token. The end token is part of the range. + // The lowest (i.e. index 0) range wraps around the ring (its preceding range is the one with largest index). 
+ tokens []hostToken + + hosts []*HostInfo +} + +func newTokenRing(partitioner string, hosts []*HostInfo) (*tokenRing, error) { + tokenRing := &tokenRing{ + hosts: hosts, + } + + if strings.HasSuffix(partitioner, "Murmur3Partitioner") { + tokenRing.partitioner = murmur3Partitioner{} + } else if strings.HasSuffix(partitioner, "OrderedPartitioner") { + tokenRing.partitioner = orderedPartitioner{} + } else if strings.HasSuffix(partitioner, "RandomPartitioner") { + tokenRing.partitioner = randomPartitioner{} + } else { + return nil, fmt.Errorf("unsupported partitioner '%s'", partitioner) + } + + for _, host := range hosts { + for _, strToken := range host.Tokens() { + token := tokenRing.partitioner.ParseString(strToken) + tokenRing.tokens = append(tokenRing.tokens, hostToken{token, host}) + } + } + + sort.Sort(tokenRing) + + return tokenRing, nil +} + +func (t *tokenRing) Len() int { + return len(t.tokens) +} + +func (t *tokenRing) Less(i, j int) bool { + return t.tokens[i].token.Less(t.tokens[j].token) +} + +func (t *tokenRing) Swap(i, j int) { + t.tokens[i], t.tokens[j] = t.tokens[j], t.tokens[i] +} + +func (t *tokenRing) String() string { + buf := &bytes.Buffer{} + buf.WriteString("TokenRing(") + if t.partitioner != nil { + buf.WriteString(t.partitioner.Name()) + } + buf.WriteString("){") + sep := "" + for i, th := range t.tokens { + buf.WriteString(sep) + sep = "," + buf.WriteString("\n\t[") + buf.WriteString(strconv.Itoa(i)) + buf.WriteString("]") + buf.WriteString(th.token.String()) + buf.WriteString(":") + buf.WriteString(th.host.ConnectAddress().String()) + } + buf.WriteString("\n}") + return string(buf.Bytes()) +} + +// GetHostForPartitionKey finds host information for given partition key. +// +// It returns two tokens. First is token that exactly corresponds to the partition key (and could be used to +// determine shard, for example), second token is the endToken that corresponds to the host. +func (t *tokenRing) GetHostForPartitionKey(partitionKey []byte) (host *HostInfo, token Token, endToken Token) { + if t == nil { + return nil, nil, nil + } + + token = t.partitioner.Hash(partitionKey) + host, endToken = t.GetHostForToken(token) + return host, token, endToken +} + +func (t *tokenRing) GetHostForToken(token Token) (host *HostInfo, endToken Token) { + if t == nil || len(t.tokens) == 0 { + return nil, nil + } + + // find the primary replica + p := sort.Search(len(t.tokens), func(i int) bool { + return !t.tokens[i].token.Less(token) + }) + + if p == len(t.tokens) { + // wrap around to the first in the ring + p = 0 + } + + v := t.tokens[p] + return v.host, v.token +} diff --git a/vendor/github.com/gocql/gocql/topology.go b/vendor/github.com/gocql/gocql/topology.go new file mode 100644 index 0000000..1048ee3 --- /dev/null +++ b/vendor/github.com/gocql/gocql/topology.go @@ -0,0 +1,294 @@ +package gocql + +import ( + "fmt" + "sort" + "strconv" + "strings" +) + +type hostTokens struct { + // token is end (inclusive) of token range these hosts belong to + token Token + hosts []*HostInfo +} + +// tokenRingReplicas maps token ranges to list of replicas. +// The elements in tokenRingReplicas are sorted by token ascending. +// The range for a given item in tokenRingReplicas starts after preceding range and ends with the token specified in +// token. The end token is part of the range. +// The lowest (i.e. index 0) range wraps around the ring (its preceding range is the one with largest index). 
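The token ring above is what turns a partition key into a primary replica. A minimal in-package sketch follows; the wrapper function is hypothetical, while `GetHostForPartitionKey` is defined in token.go above.

```go
// In-package, hypothetical sketch (token.go defines tokenRing and Token).
package gocql

import "fmt"

// primaryReplica hashes a partition key with the ring's partitioner and
// returns the host owning the token range the hash falls into.
func primaryReplica(ring *tokenRing, partitionKey []byte) (*HostInfo, error) {
	host, token, endToken := ring.GetHostForPartitionKey(partitionKey)
	if host == nil {
		return nil, fmt.Errorf("token ring is empty")
	}
	// token is the key's hash; endToken is the (inclusive) end of the
	// primary replica's range and could be used for shard selection.
	_, _ = token, endToken
	return host, nil
}
```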
+type tokenRingReplicas []hostTokens
+
+func (h tokenRingReplicas) Less(i, j int) bool { return h[i].token.Less(h[j].token) }
+func (h tokenRingReplicas) Len() int           { return len(h) }
+func (h tokenRingReplicas) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h tokenRingReplicas) replicasFor(t Token) *hostTokens {
+	if len(h) == 0 {
+		return nil
+	}
+
+	p := sort.Search(len(h), func(i int) bool {
+		return !h[i].token.Less(t)
+	})
+
+	if p >= len(h) {
+		// rollover
+		p = 0
+	}
+
+	return &h[p]
+}
+
+type placementStrategy interface {
+	replicaMap(tokenRing *tokenRing) tokenRingReplicas
+	replicationFactor(dc string) int
+}
+
+func getReplicationFactorFromOpts(val interface{}) (int, error) {
+	switch v := val.(type) {
+	case int:
+		if v < 0 {
+			return 0, fmt.Errorf("invalid replication_factor %d", v)
+		}
+		return v, nil
+	case string:
+		n, err := strconv.Atoi(v)
+		if err != nil {
+			return 0, fmt.Errorf("invalid replication_factor %q: %v", v, err)
+		} else if n < 0 {
+			return 0, fmt.Errorf("invalid replication_factor %d", n)
+		}
+		return n, nil
+	default:
+		return 0, fmt.Errorf("unknown replication_factor type %T", v)
+	}
+}
+
+func getStrategy(ks *KeyspaceMetadata, logger StdLogger) placementStrategy {
+	switch {
+	case strings.Contains(ks.StrategyClass, "SimpleStrategy"):
+		rf, err := getReplicationFactorFromOpts(ks.StrategyOptions["replication_factor"])
+		if err != nil {
+			logger.Printf("parse rf for keyspace %q: %v", ks.Name, err)
+			return nil
+		}
+		return &simpleStrategy{rf: rf}
+	case strings.Contains(ks.StrategyClass, "NetworkTopologyStrategy"):
+		dcs := make(map[string]int)
+		for dc, rf := range ks.StrategyOptions {
+			if dc == "class" {
+				continue
+			}
+
+			rf, err := getReplicationFactorFromOpts(rf)
+			if err != nil {
+				// Printf (not Println) with all arguments supplied, so the keyspace, DC and error are actually logged.
+				logger.Printf("parse rf for keyspace %q, dc %q: %v", ks.Name, dc, err)
+				// skip DC if the rf is invalid/unsupported, so that we can at least work with other working DCs.
+				continue
+			}
+
+			dcs[dc] = rf
+		}
+		return &networkTopology{dcs: dcs}
+	case strings.Contains(ks.StrategyClass, "LocalStrategy"):
+		return nil
+	default:
+		logger.Printf("unsupported strategy class for keyspace %q: %v", ks.Name, ks.StrategyClass)
+		return nil
+	}
+}
+
+type simpleStrategy struct {
+	rf int
+}
+
+func (s *simpleStrategy) replicationFactor(dc string) int {
+	return s.rf
+}
+
+func (s *simpleStrategy) replicaMap(tokenRing *tokenRing) tokenRingReplicas {
+	tokens := tokenRing.tokens
+	ring := make(tokenRingReplicas, len(tokens))
+
+	for i, th := range tokens {
+		replicas := make([]*HostInfo, 0, s.rf)
+		seen := make(map[*HostInfo]bool)
+
+		for j := 0; j < len(tokens) && len(replicas) < s.rf; j++ {
+			h := tokens[(i+j)%len(tokens)]
+			if !seen[h.host] {
+				replicas = append(replicas, h.host)
+				seen[h.host] = true
+			}
+		}
+
+		ring[i] = hostTokens{th.token, replicas}
+	}
+
+	sort.Sort(ring)
+
+	return ring
+}
+
+type networkTopology struct {
+	dcs map[string]int
+}
+
+func (n *networkTopology) replicationFactor(dc string) int {
+	return n.dcs[dc]
+}
+
+func (n *networkTopology) haveRF(replicaCounts map[string]int) bool {
+	if len(replicaCounts) != len(n.dcs) {
+		return false
+	}
+
+	for dc, rf := range n.dcs {
+		if rf != replicaCounts[dc] {
+			return false
+		}
+	}
+
+	return true
+}
+
+func (n *networkTopology) replicaMap(tokenRing *tokenRing) tokenRingReplicas {
+	dcRacks := make(map[string]map[string]struct{}, len(n.dcs))
+	// skipped hosts in a dc
+	skipped := make(map[string][]*HostInfo, len(n.dcs))
+	// number of replicas per dc
+	replicasInDC := make(map[string]int, len(n.dcs))
+	// dc -> racks
+	seenDCRacks := make(map[string]map[string]struct{}, len(n.dcs))
+
+	for _, h := range tokenRing.hosts {
+		dc := h.DataCenter()
+		rack := h.Rack()
+
+		racks, ok := dcRacks[dc]
+		if !ok {
+			racks = make(map[string]struct{})
+			dcRacks[dc] = racks
+		}
+		racks[rack] = struct{}{}
+	}
+
+	for dc, racks := range dcRacks {
+		replicasInDC[dc] = 0
+		seenDCRacks[dc] = make(map[string]struct{}, len(racks))
+	}
+
+	tokens := tokenRing.tokens
+	replicaRing := make(tokenRingReplicas, 0, len(tokens))
+
+	var totalRF int
+	for _, rf := range n.dcs {
+		totalRF += rf
+	}
+
+	for i, th := range tokenRing.tokens {
+		if rf := n.dcs[th.host.DataCenter()]; rf == 0 {
+			// skip this token since there is no replica in this datacenter.
+			continue
+		}
+
+		for k, v := range skipped {
+			skipped[k] = v[:0]
+		}
+
+		for dc := range n.dcs {
+			replicasInDC[dc] = 0
+			for rack := range seenDCRacks[dc] {
+				delete(seenDCRacks[dc], rack)
+			}
+		}
+
+		replicas := make([]*HostInfo, 0, totalRF)
+		for j := 0; j < len(tokens) && (len(replicas) < totalRF && !n.haveRF(replicasInDC)); j++ {
+			// TODO: ensure we don't add the same host twice
+			p := i + j
+			if p >= len(tokens) {
+				p -= len(tokens)
+			}
+			h := tokens[p].host
+
+			dc := h.DataCenter()
+			rack := h.Rack()
+
+			rf := n.dcs[dc]
+			if rf == 0 {
+				// skip this DC, we don't know about it or its replication factor is zero
+				continue
+			} else if replicasInDC[dc] >= rf {
+				if replicasInDC[dc] > rf {
+					panic(fmt.Sprintf("replica overflow.
rf=%d have=%d in dc %q", rf, replicasInDC[dc], dc)) + } + + // have enough replicas in this DC + continue + } else if _, ok := dcRacks[dc][rack]; !ok { + // dont know about this rack + continue + } + + racks := seenDCRacks[dc] + if _, ok := racks[rack]; ok && len(racks) == len(dcRacks[dc]) { + // we have been through all the racks and dont have RF yet, add this + replicas = append(replicas, h) + replicasInDC[dc]++ + } else if !ok { + if racks == nil { + racks = make(map[string]struct{}, 1) + seenDCRacks[dc] = racks + } + + // new rack + racks[rack] = struct{}{} + replicas = append(replicas, h) + r := replicasInDC[dc] + 1 + + if len(racks) == len(dcRacks[dc]) { + // if we have been through all the racks, drain the rest of the skipped + // hosts until we have RF. The next iteration will skip in the block + // above + skippedHosts := skipped[dc] + var k int + for ; k < len(skippedHosts) && r+k < rf; k++ { + sh := skippedHosts[k] + replicas = append(replicas, sh) + } + r += k + skipped[dc] = skippedHosts[k:] + } + replicasInDC[dc] = r + } else { + // already seen this rack, keep hold of this host incase + // we dont get enough for rf + skipped[dc] = append(skipped[dc], h) + } + } + + if len(replicas) == 0 { + panic(fmt.Sprintf("no replicas for token: %v", th.token)) + } else if !replicas[0].Equal(th.host) { + panic(fmt.Sprintf("first replica is not the primary replica for the token: expected %v got %v", replicas[0].ConnectAddress(), th.host.ConnectAddress())) + } + + replicaRing = append(replicaRing, hostTokens{th.token, replicas}) + } + + dcsWithReplicas := 0 + for _, dc := range n.dcs { + if dc > 0 { + dcsWithReplicas++ + } + } + + if dcsWithReplicas == len(dcRacks) && len(replicaRing) != len(tokens) { + panic(fmt.Sprintf("token map different size to token ring: got %d expected %d", len(replicaRing), len(tokens))) + } + + return replicaRing +} diff --git a/vendor/github.com/gocql/gocql/tracer.go b/vendor/github.com/gocql/gocql/tracer.go new file mode 100644 index 0000000..b0caedb --- /dev/null +++ b/vendor/github.com/gocql/gocql/tracer.go @@ -0,0 +1,219 @@ +package gocql + +import ( + "fmt" + "io" + "sync" + "time" +) + +// Tracer is the interface implemented by query tracers. Tracers have the +// ability to obtain a detailed event log of all events that happened during +// the execution of a query from Cassandra. Gathering this information might +// be essential for debugging and optimizing queries, but this feature should +// not be used on production systems with very high load. +type Tracer interface { + Trace(traceId []byte) +} + +type TraceWriter struct { + session *Session + w io.Writer + mu sync.Mutex + + maxAttempts int + sleepInterval time.Duration +} + +// NewTraceWriter returns a simple Tracer implementation that outputs +// the event log in a textual format. 
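As a usage note for the constructor that follows: a hedged sketch of attaching a `TraceWriter` to a query. `Query.Trace` is the standard gocql query option; the statement and the stdout sink are placeholders.

```go
package example

import (
	"log"
	"os"

	"github.com/gocql/gocql"
)

// traceOne executes a single statement with tracing enabled; the server-side
// event log is fetched and written to stdout by the TraceWriter.
func traceOne(session *gocql.Session) {
	tracer := gocql.NewTraceWriter(session, os.Stdout)
	q := session.Query(`SELECT now() FROM system.local`).Trace(tracer)
	if err := q.Exec(); err != nil {
		log.Printf("traced query failed: %v", err)
	}
}
```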
+func NewTraceWriter(session *Session, w io.Writer) *TraceWriter { + return &TraceWriter{session: session, w: w, maxAttempts: 5, sleepInterval: 3 * time.Millisecond} +} + +func (t *TraceWriter) SetMaxAttempts(maxAttempts int) { + t.mu.Lock() + defer t.mu.Unlock() + + t.maxAttempts = maxAttempts +} + +func (t *TraceWriter) SetSleepInterval(sleepInterval time.Duration) { + t.mu.Lock() + defer t.mu.Unlock() + + t.sleepInterval = sleepInterval +} + +func (t *TraceWriter) Trace(traceId []byte) { + var ( + timestamp time.Time + activity string + source string + elapsed int + thread string + ) + + t.mu.Lock() + defer t.mu.Unlock() + + fetchAttempts := 1 + if t.maxAttempts > 0 { + fetchAttempts = t.maxAttempts + } + + isDone := false + for i := 0; i < fetchAttempts; i++ { + var duration int + + iter := t.session.control.query(`SELECT duration + FROM system_traces.sessions + WHERE session_id = ?`, traceId) + iter.Scan(&duration) + if duration > 0 { + isDone = true + } + + if err := iter.Close(); err != nil { + fmt.Fprintln(t.w, "Error:", err) + return + } + + if isDone || i == fetchAttempts-1 { + break + } + + time.Sleep(t.sleepInterval) + } + if !isDone { + fmt.Fprintln(t.w, "Error: failed to wait tracing to complete. !!! Tracing is incomplete !!!") + } + + var ( + coordinator string + duration int + ) + + iter := t.session.control.query(`SELECT coordinator, duration + FROM system_traces.sessions + WHERE session_id = ?`, traceId) + + iter.Scan(&coordinator, &duration) + if err := iter.Close(); err != nil { + fmt.Fprintln(t.w, "Error:", err) + return + } + + fmt.Fprintf(t.w, "Tracing session %016x (coordinator: %s, duration: %v):\n", + traceId, coordinator, time.Duration(duration)*time.Microsecond) + + iter = t.session.control.query(`SELECT event_id, activity, source, source_elapsed, thread + FROM system_traces.events + WHERE session_id = ?`, traceId) + + for iter.Scan(×tamp, &activity, &source, &elapsed, &thread) { + fmt.Fprintf(t.w, "%s: %s [%s] (source: %s, elapsed: %d)\n", + timestamp.Format("2006/01/02 15:04:05.999999"), activity, thread, source, elapsed) + } + + if err := iter.Close(); err != nil { + fmt.Fprintln(t.w, "Error:", err) + } +} + +type TracerEnhanced struct { + session *Session + traceIDs [][]byte + mu sync.Mutex +} + +func NewTracer(session *Session) *TracerEnhanced { + return &TracerEnhanced{session: session} +} + +func (t *TracerEnhanced) Trace(traceId []byte) { + t.mu.Lock() + defer t.mu.Unlock() + t.traceIDs = append(t.traceIDs, traceId) +} + +func (t *TracerEnhanced) AllTraceIDs() [][]byte { + t.mu.Lock() + defer t.mu.Unlock() + return t.traceIDs +} + +func (t *TracerEnhanced) IsReady(traceId []byte) (bool, error) { + isDone := false + var duration int + + iter := t.session.control.query(`SELECT duration + FROM system_traces.sessions + WHERE session_id = ?`, traceId) + iter.Scan(&duration) + if duration > 0 { + isDone = true + } + + if err := iter.Close(); err != nil { + return false, err + } + + if isDone { + return true, nil + } + + return false, nil +} + +func (t *TracerEnhanced) GetCoordinatorTime(traceId []byte) (string, time.Duration, error) { + var ( + coordinator string + duration int + ) + + iter := t.session.control.query(`SELECT coordinator, duration + FROM system_traces.sessions + WHERE session_id = ?`, traceId) + + iter.Scan(&coordinator, &duration) + if err := iter.Close(); err != nil { + return coordinator, time.Duration(duration) * time.Microsecond, err + } + + return coordinator, time.Duration(duration) * time.Microsecond, nil +} + +type TraceEntry struct { 
+ Timestamp time.Time + Activity string + Source string + Elapsed int + Thread string +} + +func (t *TracerEnhanced) GetActivities(traceId []byte) ([]TraceEntry, error) { + iter := t.session.control.query(`SELECT event_id, activity, source, source_elapsed, thread + FROM system_traces.events + WHERE session_id = ?`, traceId) + + var ( + timestamp time.Time + activity string + source string + elapsed int + thread string + ) + + var activities []TraceEntry + + for iter.Scan(×tamp, &activity, &source, &elapsed, &thread) { + activities = append(activities, TraceEntry{Timestamp: timestamp, Activity: activity, Source: source, Elapsed: elapsed, Thread: thread}) + } + + if err := iter.Close(); err != nil { + return nil, err + } + + return activities, nil +} diff --git a/vendor/github.com/gocql/gocql/uuid.go b/vendor/github.com/gocql/gocql/uuid.go new file mode 100644 index 0000000..9bd9665 --- /dev/null +++ b/vendor/github.com/gocql/gocql/uuid.go @@ -0,0 +1,332 @@ +// Copyright (c) 2012 The gocql Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gocql + +// The uuid package can be used to generate and parse universally unique +// identifiers, a standardized format in the form of a 128 bit number. +// +// http://tools.ietf.org/html/rfc4122 + +import ( + "crypto/rand" + "errors" + "fmt" + "io" + "net" + "strings" + "sync/atomic" + "time" +) + +type UUID [16]byte + +var hardwareAddr []byte +var clockSeq uint32 + +const ( + VariantNCSCompat = 0 + VariantIETF = 2 + VariantMicrosoft = 6 + VariantFuture = 7 +) + +func init() { + if interfaces, err := net.Interfaces(); err == nil { + for _, i := range interfaces { + if i.Flags&net.FlagLoopback == 0 && len(i.HardwareAddr) > 0 { + hardwareAddr = i.HardwareAddr + break + } + } + } + if hardwareAddr == nil { + // If we failed to obtain the MAC address of the current computer, + // we will use a randomly generated 6 byte sequence instead and set + // the multicast bit as recommended in RFC 4122. + hardwareAddr = make([]byte, 6) + _, err := io.ReadFull(rand.Reader, hardwareAddr) + if err != nil { + panic(err) + } + hardwareAddr[0] = hardwareAddr[0] | 0x01 + } + + // initialize the clock sequence with a random number + var clockSeqRand [2]byte + io.ReadFull(rand.Reader, clockSeqRand[:]) + clockSeq = uint32(clockSeqRand[1])<<8 | uint32(clockSeqRand[0]) +} + +// ParseUUID parses a 32 digit hexadecimal number (that might contain hypens) +// representing an UUID. +func ParseUUID(input string) (UUID, error) { + var u UUID + j := 0 + for _, r := range input { + switch { + case r == '-' && j&1 == 0: + continue + case r >= '0' && r <= '9' && j < 32: + u[j/2] |= byte(r-'0') << uint(4-j&1*4) + case r >= 'a' && r <= 'f' && j < 32: + u[j/2] |= byte(r-'a'+10) << uint(4-j&1*4) + case r >= 'A' && r <= 'F' && j < 32: + u[j/2] |= byte(r-'A'+10) << uint(4-j&1*4) + default: + return UUID{}, fmt.Errorf("invalid UUID %q", input) + } + j += 1 + } + if j != 32 { + return UUID{}, fmt.Errorf("invalid UUID %q", input) + } + return u, nil +} + +func ParseUUIDMust(input string) UUID { + uuid, err := ParseUUID(input) + if err != nil { + panic(err) + } + return uuid +} + +// UUIDFromBytes converts a raw byte slice to an UUID. 
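A quick, self-contained sketch of the parsing API above; the literal UUID is an arbitrary version-1 example.

```go
package main

import (
	"fmt"
	"log"

	"github.com/gocql/gocql"
)

func main() {
	// Round-trip: parse the canonical text form, then inspect it.
	u, err := gocql.ParseUUID("486f3de8-8f5c-11ee-b9d1-0242ac120002")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(u.String())  // canonical form back out
	fmt.Println(u.Version()) // 1: a time-based UUID
}
```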
+func UUIDFromBytes(input []byte) (UUID, error) { + var u UUID + if len(input) != 16 { + return u, errors.New("UUIDs must be exactly 16 bytes long") + } + + copy(u[:], input) + return u, nil +} + +func MustRandomUUID() UUID { + uuid, err := RandomUUID() + if err != nil { + panic(err) + } + return uuid +} + +// RandomUUID generates a totally random UUID (version 4) as described in +// RFC 4122. +func RandomUUID() (UUID, error) { + var u UUID + _, err := io.ReadFull(rand.Reader, u[:]) + if err != nil { + return u, err + } + u[6] &= 0x0F // clear version + u[6] |= 0x40 // set version to 4 (random uuid) + u[8] &= 0x3F // clear variant + u[8] |= 0x80 // set to IETF variant + return u, nil +} + +var timeBase = time.Date(1582, time.October, 15, 0, 0, 0, 0, time.UTC).Unix() + +// getTimestamp converts time to UUID (version 1) timestamp. +// It must be an interval of 100-nanoseconds since timeBase. +func getTimestamp(t time.Time) int64 { + utcTime := t.In(time.UTC) + ts := int64(utcTime.Unix()-timeBase)*10000000 + int64(utcTime.Nanosecond()/100) + + return ts +} + +// TimeUUID generates a new time based UUID (version 1) using the current +// time as the timestamp. +func TimeUUID() UUID { + return UUIDFromTime(time.Now()) +} + +// The min and max clock values for a UUID. +// +// Cassandra's TimeUUIDType compares the lsb parts as signed byte arrays. +// Thus, the min value for each byte is -128 and the max is +127. +const ( + minClock = 0x8080 + maxClock = 0x7f7f +) + +// The min and max node values for a UUID. +// +// See explanation about Cassandra's TimeUUIDType comparison logic above. +var ( + minNode = []byte{0x80, 0x80, 0x80, 0x80, 0x80, 0x80} + maxNode = []byte{0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f} +) + +// MinTimeUUID generates a "fake" time based UUID (version 1) which will be +// the smallest possible UUID generated for the provided timestamp. +// +// UUIDs generated by this function are not unique and are mostly suitable only +// in queries to select a time range of a Cassandra's TimeUUID column. +func MinTimeUUID(t time.Time) UUID { + return TimeUUIDWith(getTimestamp(t), minClock, minNode) +} + +// MaxTimeUUID generates a "fake" time based UUID (version 1) which will be +// the biggest possible UUID generated for the provided timestamp. +// +// UUIDs generated by this function are not unique and are mostly suitable only +// in queries to select a time range of a Cassandra's TimeUUID column. +func MaxTimeUUID(t time.Time) UUID { + return TimeUUIDWith(getTimestamp(t), maxClock, maxNode) +} + +// UUIDFromTime generates a new time based UUID (version 1) as described in +// RFC 4122. This UUID contains the MAC address of the node that generated +// the UUID, the given timestamp and a sequence number. +func UUIDFromTime(t time.Time) UUID { + ts := getTimestamp(t) + clock := atomic.AddUint32(&clockSeq, 1) + + return TimeUUIDWith(ts, clock, hardwareAddr) +} + +// TimeUUIDWith generates a new time based UUID (version 1) as described in +// RFC4122 with given parameters. t is the number of 100's of nanoseconds +// since 15 Oct 1582 (60bits). clock is the number of clock sequence (14bits). +// node is a slice to gurarantee the uniqueness of the UUID (up to 6bytes). +// Note: calling this function does not increment the static clock sequence. 
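`MinTimeUUID` and `MaxTimeUUID` above exist precisely for range predicates on timeuuid columns. A hedged sketch against a hypothetical `events` table:

```go
package example

import (
	"time"

	"github.com/gocql/gocql"
)

// eventsBetween selects rows whose timeuuid clustering column falls inside
// the wall-clock window [from, to]. Table and column names are hypothetical.
func eventsBetween(session *gocql.Session, day string, from, to time.Time) *gocql.Iter {
	return session.Query(
		`SELECT id, payload FROM events WHERE day = ? AND id >= ? AND id <= ?`,
		day, gocql.MinTimeUUID(from), gocql.MaxTimeUUID(to),
	).Iter()
}
```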
+func TimeUUIDWith(t int64, clock uint32, node []byte) UUID { + var u UUID + + u[0], u[1], u[2], u[3] = byte(t>>24), byte(t>>16), byte(t>>8), byte(t) + u[4], u[5] = byte(t>>40), byte(t>>32) + u[6], u[7] = byte(t>>56)&0x0F, byte(t>>48) + + u[8] = byte(clock >> 8) + u[9] = byte(clock) + + copy(u[10:], node) + + u[6] |= 0x10 // set version to 1 (time based uuid) + u[8] &= 0x3F // clear variant + u[8] |= 0x80 // set to IETF variant + + return u +} + +// String returns the UUID in it's canonical form, a 32 digit hexadecimal +// number in the form of xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx. +func (u UUID) String() string { + var offsets = [...]int{0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34} + const hexString = "0123456789abcdef" + r := make([]byte, 36) + for i, b := range u { + r[offsets[i]] = hexString[b>>4] + r[offsets[i]+1] = hexString[b&0xF] + } + r[8] = '-' + r[13] = '-' + r[18] = '-' + r[23] = '-' + return string(r) + +} + +// Bytes returns the raw byte slice for this UUID. A UUID is always 128 bits +// (16 bytes) long. +func (u UUID) Bytes() []byte { + return u[:] +} + +// Variant returns the variant of this UUID. This package will only generate +// UUIDs in the IETF variant. +func (u UUID) Variant() int { + x := u[8] + if x&0x80 == 0 { + return VariantNCSCompat + } + if x&0x40 == 0 { + return VariantIETF + } + if x&0x20 == 0 { + return VariantMicrosoft + } + return VariantFuture +} + +// Version extracts the version of this UUID variant. The RFC 4122 describes +// five kinds of UUIDs. +func (u UUID) Version() int { + return int(u[6] & 0xF0 >> 4) +} + +// Node extracts the MAC address of the node who generated this UUID. It will +// return nil if the UUID is not a time based UUID (version 1). +func (u UUID) Node() []byte { + if u.Version() != 1 { + return nil + } + return u[10:] +} + +// Clock extracts the clock sequence of this UUID. It will return zero if the +// UUID is not a time based UUID (version 1). +func (u UUID) Clock() uint32 { + if u.Version() != 1 { + return 0 + } + + // Clock sequence is the lower 14bits of u[8:10] + return uint32(u[8]&0x3F)<<8 | uint32(u[9]) +} + +// Timestamp extracts the timestamp information from a time based UUID +// (version 1). +func (u UUID) Timestamp() int64 { + if u.Version() != 1 { + return 0 + } + return int64(uint64(u[0])<<24|uint64(u[1])<<16| + uint64(u[2])<<8|uint64(u[3])) + + int64(uint64(u[4])<<40|uint64(u[5])<<32) + + int64(uint64(u[6]&0x0F)<<56|uint64(u[7])<<48) +} + +// Time is like Timestamp, except that it returns a time.Time. 
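A short sketch of the introspection accessors this file provides (`Version`, `Time`, `Node`); the output comments are illustrative.

```go
package main

import (
	"fmt"

	"github.com/gocql/gocql"
)

func main() {
	u := gocql.TimeUUID()
	fmt.Println(u.Version())     // 1
	fmt.Println(u.Time())        // timestamp embedded in the UUID, as time.Time
	fmt.Printf("%x\n", u.Node()) // MAC (or random surrogate) of the generator
}
```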
+func (u UUID) Time() time.Time { + if u.Version() != 1 { + return time.Time{} + } + t := u.Timestamp() + sec := t / 1e7 + nsec := (t % 1e7) * 100 + return time.Unix(sec+timeBase, nsec).UTC() +} + +// Marshaling for JSON +func (u UUID) MarshalJSON() ([]byte, error) { + return []byte(`"` + u.String() + `"`), nil +} + +// Unmarshaling for JSON +func (u *UUID) UnmarshalJSON(data []byte) error { + str := strings.Trim(string(data), `"`) + if len(str) > 36 { + return fmt.Errorf("invalid JSON UUID %s", str) + } + + parsed, err := ParseUUID(str) + if err == nil { + copy(u[:], parsed[:]) + } + + return err +} + +func (u UUID) MarshalText() ([]byte, error) { + return []byte(u.String()), nil +} + +func (u *UUID) UnmarshalText(text []byte) (err error) { + *u, err = ParseUUID(string(text)) + return +} diff --git a/vendor/github.com/gocql/gocql/version.go b/vendor/github.com/gocql/gocql/version.go new file mode 100644 index 0000000..2af4fbb --- /dev/null +++ b/vendor/github.com/gocql/gocql/version.go @@ -0,0 +1,24 @@ +package gocql + +import "runtime/debug" + +const ( + mainModule = "github.com/gocql/gocql" +) + +var defaultDriverVersion string + +func init() { + buildInfo, ok := debug.ReadBuildInfo() + if ok { + for _, d := range buildInfo.Deps { + if d.Path == mainModule { + defaultDriverVersion = d.Version + if d.Replace != nil { + defaultDriverVersion = d.Replace.Version + } + break + } + } + } +} diff --git a/vendor/github.com/gocql/gocql/warning_handler.go b/vendor/github.com/gocql/gocql/warning_handler.go new file mode 100644 index 0000000..a913560 --- /dev/null +++ b/vendor/github.com/gocql/gocql/warning_handler.go @@ -0,0 +1,28 @@ +package gocql + +type DefaultWarningHandler struct { + logger StdLogger +} + +func DefaultWarningHandlerBuilder(session *Session) WarningHandler { + return DefaultWarningHandler{ + logger: session.logger, + } +} + +func (d DefaultWarningHandler) HandleWarnings(qry ExecutableQuery, host *HostInfo, warnings []string) { + if d.logger == nil { + return + } + if host != nil && len(host.hostId) > 0 { + d.logger.Printf("[%s] warnings: %v", host.hostId, warnings) + } else { + d.logger.Printf("Cluster warnings: %v", warnings) + } +} + +var _ WarningHandler = DefaultWarningHandler{} + +func NoopWarningHandlerBuilder(session *Session) WarningHandler { + return nil +} diff --git a/vendor/github.com/hailocab/go-hostpool/.gitignore b/vendor/github.com/hailocab/go-hostpool/.gitignore new file mode 100644 index 0000000..0026861 --- /dev/null +++ b/vendor/github.com/hailocab/go-hostpool/.gitignore @@ -0,0 +1,22 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe diff --git a/vendor/github.com/hailocab/go-hostpool/.travis.yml b/vendor/github.com/hailocab/go-hostpool/.travis.yml new file mode 100644 index 0000000..e69de29 diff --git a/vendor/github.com/hailocab/go-hostpool/LICENSE b/vendor/github.com/hailocab/go-hostpool/LICENSE new file mode 100644 index 0000000..f24db89 --- /dev/null +++ b/vendor/github.com/hailocab/go-hostpool/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Bitly + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/vendor/github.com/hailocab/go-hostpool/README.md b/vendor/github.com/hailocab/go-hostpool/README.md
new file mode 100644
index 0000000..7f44372
--- /dev/null
+++ b/vendor/github.com/hailocab/go-hostpool/README.md
@@ -0,0 +1,17 @@
+go-hostpool
+===========
+
+A Go package to intelligently and flexibly pool among multiple hosts from your Go application.
+Host selection can operate in round robin or epsilon greedy mode, and unresponsive hosts are
+avoided.
+Usage example:
+
+```go
+hp := hostpool.NewEpsilonGreedy([]string{"a", "b"}, 0, &hostpool.LinearEpsilonValueCalculator{})
+hostResponse := hp.Get()
+hostname := hostResponse.Host()
+err := doRequest(hostname) // doRequest is your own request function (placeholder)
+hostResponse.Mark(err)
+```
+
+View more detailed documentation on [godoc.org](http://godoc.org/github.com/bitly/go-hostpool)
diff --git a/vendor/github.com/hailocab/go-hostpool/epsilon_greedy.go b/vendor/github.com/hailocab/go-hostpool/epsilon_greedy.go
new file mode 100644
index 0000000..8627aa5
--- /dev/null
+++ b/vendor/github.com/hailocab/go-hostpool/epsilon_greedy.go
@@ -0,0 +1,220 @@
+package hostpool
+
+import (
+	"log"
+	"math/rand"
+	"time"
+)
+
+type epsilonHostPoolResponse struct {
+	standardHostPoolResponse
+	started time.Time
+	ended   time.Time
+}
+
+func (r *epsilonHostPoolResponse) Mark(err error) {
+	r.Do(func() {
+		r.ended = time.Now()
+		doMark(err, r)
+	})
+}
+
+type epsilonGreedyHostPool struct {
+	standardHostPool       // TODO - would be nifty if we could embed HostPool and Locker interfaces
+	epsilon                float32 // this is our exploration factor
+	decayDuration          time.Duration
+	EpsilonValueCalculator // embed the epsilonValueCalculator
+	timer
+	quit chan bool
+}
+
+// Construct an Epsilon Greedy HostPool
+//
+// Epsilon Greedy is an algorithm that allows HostPool not only to track failure state,
+// but also to learn about "better" options in terms of speed, and to pick from available hosts
+// based on how well they perform. This gives a weighted request rate to better
+// performing hosts, while still distributing requests to all hosts (proportionate to their performance).
+// The interface is the same as the standard HostPool, but be sure to mark the HostResponse immediately
+// after executing the request to the host, as that will stop the implicitly running request timer.
+//
+// A good overview of Epsilon Greedy is here http://stevehanov.ca/blog/index.php?id=132
+//
+// To compute the weighting scores, we perform a weighted average of recent response times, over the course of
+// `decayDuration`. decayDuration may be set to 0 to use the default value of 5 minutes
+// We then use the supplied EpsilonValueCalculator to calculate a score from that weighted average response time.
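Before the constructor below, a hedged usage sketch: the decay window and calculator are arbitrary choices, and `doRequest` is a placeholder for the caller's own request function. Marking the response immediately is what stops the implicit timer, as the comment above stresses.

```go
package example

import (
	"time"

	"github.com/hailocab/go-hostpool"
)

// doRequest stands in for the caller's own request logic (placeholder).
func doRequest(host string) error { return nil }

// pickAndMark gets a host, makes the request, and marks the outcome right
// away so the implicit response timer reflects only the request itself.
func pickAndMark(hosts []string) {
	hp := hostpool.NewEpsilonGreedy(hosts, 2*time.Minute, &hostpool.LinearEpsilonValueCalculator{})
	defer hp.Close()

	r := hp.Get()
	err := doRequest(r.Host())
	r.Mark(err) // records success/failure and the observed latency bucket
}
```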
+func NewEpsilonGreedy(hosts []string, decayDuration time.Duration, calc EpsilonValueCalculator) HostPool { + + if decayDuration <= 0 { + decayDuration = defaultDecayDuration + } + stdHP := New(hosts).(*standardHostPool) + p := &epsilonGreedyHostPool{ + standardHostPool: *stdHP, + epsilon: float32(initialEpsilon), + decayDuration: decayDuration, + EpsilonValueCalculator: calc, + timer: &realTimer{}, + quit: make(chan bool), + } + + // allocate structures + for _, h := range p.hostList { + h.epsilonCounts = make([]int64, epsilonBuckets) + h.epsilonValues = make([]int64, epsilonBuckets) + } + go p.epsilonGreedyDecay() + return p +} + +func (p *epsilonGreedyHostPool) Close() { + // No need to do p.quit <- true as close(p.quit) does the trick. + close(p.quit) +} + +func (p *epsilonGreedyHostPool) SetEpsilon(newEpsilon float32) { + p.Lock() + defer p.Unlock() + p.epsilon = newEpsilon +} + +func (p *epsilonGreedyHostPool) SetHosts(hosts []string) { + p.Lock() + defer p.Unlock() + p.standardHostPool.setHosts(hosts) + for _, h := range p.hostList { + h.epsilonCounts = make([]int64, epsilonBuckets) + h.epsilonValues = make([]int64, epsilonBuckets) + } +} + +func (p *epsilonGreedyHostPool) epsilonGreedyDecay() { + durationPerBucket := p.decayDuration / epsilonBuckets + ticker := time.NewTicker(durationPerBucket) + for { + select { + case <-p.quit: + ticker.Stop() + return + case <-ticker.C: + p.performEpsilonGreedyDecay() + } + } +} +func (p *epsilonGreedyHostPool) performEpsilonGreedyDecay() { + p.Lock() + for _, h := range p.hostList { + h.epsilonIndex += 1 + h.epsilonIndex = h.epsilonIndex % epsilonBuckets + h.epsilonCounts[h.epsilonIndex] = 0 + h.epsilonValues[h.epsilonIndex] = 0 + } + p.Unlock() +} + +func (p *epsilonGreedyHostPool) Get() HostPoolResponse { + p.Lock() + defer p.Unlock() + host := p.getEpsilonGreedy() + if host == "" { + return nil + } + + started := time.Now() + return &epsilonHostPoolResponse{ + standardHostPoolResponse: standardHostPoolResponse{host: host, pool: p}, + started: started, + } +} + +func (p *epsilonGreedyHostPool) getEpsilonGreedy() string { + var hostToUse *hostEntry + + // this is our exploration phase + if rand.Float32() < p.epsilon { + p.epsilon = p.epsilon * epsilonDecay + if p.epsilon < minEpsilon { + p.epsilon = minEpsilon + } + return p.getRoundRobin() + } + + // calculate values for each host in the 0..1 range (but not ormalized) + var possibleHosts []*hostEntry + now := time.Now() + var sumValues float64 + for _, h := range p.hostList { + if h.canTryHost(now) { + v := h.getWeightedAverageResponseTime() + if v > 0 { + ev := p.CalcValueFromAvgResponseTime(v) + h.epsilonValue = ev + sumValues += ev + possibleHosts = append(possibleHosts, h) + } + } + } + + if len(possibleHosts) != 0 { + // now normalize to the 0..1 range to get a percentage + for _, h := range possibleHosts { + h.epsilonPercentage = h.epsilonValue / sumValues + } + + // do a weighted random choice among hosts + ceiling := 0.0 + pickPercentage := rand.Float64() + for _, h := range possibleHosts { + ceiling += h.epsilonPercentage + if pickPercentage <= ceiling { + hostToUse = h + break + } + } + } + + if hostToUse == nil { + if len(possibleHosts) != 0 { + log.Println("Failed to randomly choose a host, Dan loses") + } + + return p.getRoundRobin() + } + + if hostToUse.dead { + hostToUse.willRetryHost(p.maxRetryInterval) + } + return hostToUse.host +} + +func (p *epsilonGreedyHostPool) markSuccess(hostR HostPoolResponse) { + // first do the base markSuccess - a little redundant with host lookup 
but cleaner than repeating logic + p.standardHostPool.markSuccess(hostR) + eHostR, ok := hostR.(*epsilonHostPoolResponse) + if !ok { + log.Printf("Incorrect type in eps markSuccess!") // TODO reflection to print out offending type + return + } + host := eHostR.host + duration := p.between(eHostR.started, eHostR.ended) + + p.Lock() + defer p.Unlock() + h, ok := p.hosts[host] + if !ok { + log.Fatalf("host %s not in HostPool %v", host, p.Hosts()) + } + h.epsilonCounts[h.epsilonIndex]++ + h.epsilonValues[h.epsilonIndex] += int64(duration.Seconds() * 1000) +} + +// --- timer: this just exists for testing + +type timer interface { + between(time.Time, time.Time) time.Duration +} + +type realTimer struct{} + +func (rt *realTimer) between(start time.Time, end time.Time) time.Duration { + return end.Sub(start) +} diff --git a/vendor/github.com/hailocab/go-hostpool/epsilon_value_calculators.go b/vendor/github.com/hailocab/go-hostpool/epsilon_value_calculators.go new file mode 100644 index 0000000..9bc3102 --- /dev/null +++ b/vendor/github.com/hailocab/go-hostpool/epsilon_value_calculators.go @@ -0,0 +1,40 @@ +package hostpool + +// --- Value Calculators ----------------- + +import ( + "math" +) + +// --- Definitions ----------------------- + +// Structs implementing this interface are used to convert the average response time for a host +// into a score that can be used to weight hosts in the epsilon greedy hostpool. Lower response +// times should yield higher scores (we want to select the faster hosts more often) The default +// LinearEpsilonValueCalculator just uses the reciprocal of the response time. In practice, any +// decreasing function from the positive reals to the positive reals should work. +type EpsilonValueCalculator interface { + CalcValueFromAvgResponseTime(float64) float64 +} + +type LinearEpsilonValueCalculator struct{} +type LogEpsilonValueCalculator struct{ LinearEpsilonValueCalculator } +type PolynomialEpsilonValueCalculator struct { + LinearEpsilonValueCalculator + Exp float64 // the exponent to which we will raise the value to reweight +} + +// -------- Methods ----------------------- + +func (c *LinearEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 { + return 1.0 / v +} + +func (c *LogEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 { + // we need to add 1 to v so that this will be defined on all positive floats + return c.LinearEpsilonValueCalculator.CalcValueFromAvgResponseTime(math.Log(v + 1.0)) +} + +func (c *PolynomialEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 { + return c.LinearEpsilonValueCalculator.CalcValueFromAvgResponseTime(math.Pow(v, c.Exp)) +} diff --git a/vendor/github.com/hailocab/go-hostpool/host_entry.go b/vendor/github.com/hailocab/go-hostpool/host_entry.go new file mode 100644 index 0000000..dcec9a0 --- /dev/null +++ b/vendor/github.com/hailocab/go-hostpool/host_entry.go @@ -0,0 +1,62 @@ +package hostpool + +import ( + "time" +) + +// --- hostEntry - this is due to get upgraded + +type hostEntry struct { + host string + nextRetry time.Time + retryCount int16 + retryDelay time.Duration + dead bool + epsilonCounts []int64 + epsilonValues []int64 + epsilonIndex int + epsilonValue float64 + epsilonPercentage float64 +} + +func (h *hostEntry) canTryHost(now time.Time) bool { + if !h.dead { + return true + } + if h.nextRetry.Before(now) { + return true + } + return false +} + +func (h *hostEntry) willRetryHost(maxRetryInterval time.Duration) { + h.retryCount += 1 + newDelay := 
h.retryDelay * 2 + if newDelay < maxRetryInterval { + h.retryDelay = newDelay + } else { + h.retryDelay = maxRetryInterval + } + h.nextRetry = time.Now().Add(h.retryDelay) +} + +func (h *hostEntry) getWeightedAverageResponseTime() float64 { + var value float64 + var lastValue float64 + + // start at 1 so we start with the oldest entry + for i := 1; i <= epsilonBuckets; i += 1 { + pos := (h.epsilonIndex + i) % epsilonBuckets + bucketCount := h.epsilonCounts[pos] + // Changing the line below to what I think it should be to get the weights right + weight := float64(i) / float64(epsilonBuckets) + if bucketCount > 0 { + currentValue := float64(h.epsilonValues[pos]) / float64(bucketCount) + value += currentValue * weight + lastValue = currentValue + } else { + value += lastValue * weight + } + } + return value +} diff --git a/vendor/github.com/hailocab/go-hostpool/hostpool.go b/vendor/github.com/hailocab/go-hostpool/hostpool.go new file mode 100644 index 0000000..702ca92 --- /dev/null +++ b/vendor/github.com/hailocab/go-hostpool/hostpool.go @@ -0,0 +1,243 @@ +// A Go package to intelligently and flexibly pool among multiple hosts from your Go application. +// Host selection can operate in round robin or epsilon greedy mode, and unresponsive hosts are +// avoided. A good overview of Epsilon Greedy is here http://stevehanov.ca/blog/index.php?id=132 +package hostpool + +import ( + "log" + "sync" + "time" +) + +// Returns current version +func Version() string { + return "0.1" +} + +// --- Response interfaces and structs ---- + +// This interface represents the response from HostPool. You can retrieve the +// hostname by calling Host(), and after making a request to the host you should +// call Mark with any error encountered, which will inform the HostPool issuing +// the HostPoolResponse of what happened to the request and allow it to update. +type HostPoolResponse interface { + Host() string + Mark(error) + hostPool() HostPool +} + +type standardHostPoolResponse struct { + host string + sync.Once + pool HostPool +} + +// --- HostPool structs and interfaces ---- + +// This is the main HostPool interface. Structs implementing this interface +// allow you to Get a HostPoolResponse (which includes a hostname to use), +// get the list of all Hosts, and use ResetAll to reset state. +type HostPool interface { + Get() HostPoolResponse + // keep the marks separate so we can override independently + markSuccess(HostPoolResponse) + markFailed(HostPoolResponse) + + ResetAll() + // ReturnUnhealthy when called with true will prevent an unhealthy node from + // being returned and will instead return a nil HostPoolResponse. If using + // this feature then you should check the result of Get for nil + ReturnUnhealthy(v bool) + Hosts() []string + SetHosts([]string) + + // Close the hostpool and release all resources. 
+	Close()
+}
+
+type standardHostPool struct {
+	sync.RWMutex
+	hosts             map[string]*hostEntry
+	hostList          []*hostEntry
+	returnUnhealthy   bool
+	initialRetryDelay time.Duration
+	maxRetryInterval  time.Duration
+	nextHostIndex     int
+}
+
+// ------ constants -------------------
+
+const epsilonBuckets = 120
+const epsilonDecay = 0.90 // decay the exploration rate
+const minEpsilon = 0.01   // explore one percent of the time
+const initialEpsilon = 0.3
+const defaultDecayDuration = time.Duration(5) * time.Minute
+
+// Construct a basic HostPool using the hostnames provided
+func New(hosts []string) HostPool {
+	p := &standardHostPool{
+		returnUnhealthy:   true,
+		hosts:             make(map[string]*hostEntry, len(hosts)),
+		hostList:          make([]*hostEntry, len(hosts)),
+		initialRetryDelay: time.Duration(30) * time.Second,
+		maxRetryInterval:  time.Duration(900) * time.Second,
+	}
+
+	for i, h := range hosts {
+		e := &hostEntry{
+			host:       h,
+			retryDelay: p.initialRetryDelay,
+		}
+		p.hosts[h] = e
+		p.hostList[i] = e
+	}
+
+	return p
+}
+
+func (r *standardHostPoolResponse) Host() string {
+	return r.host
+}
+
+func (r *standardHostPoolResponse) hostPool() HostPool {
+	return r.pool
+}
+
+func (r *standardHostPoolResponse) Mark(err error) {
+	r.Do(func() {
+		doMark(err, r)
+	})
+}
+
+func doMark(err error, r HostPoolResponse) {
+	if err == nil {
+		r.hostPool().markSuccess(r)
+	} else {
+		r.hostPool().markFailed(r)
+	}
+}
+
+// return an entry from the HostPool
+func (p *standardHostPool) Get() HostPoolResponse {
+	p.Lock()
+	defer p.Unlock()
+	host := p.getRoundRobin()
+	if host == "" {
+		return nil
+	}
+
+	return &standardHostPoolResponse{host: host, pool: p}
+}
+
+func (p *standardHostPool) getRoundRobin() string {
+	now := time.Now()
+	hostCount := len(p.hostList)
+	for i := range p.hostList {
+		// iterate in sequence from where we last left off
+		currentIndex := (i + p.nextHostIndex) % hostCount
+
+		h := p.hostList[currentIndex]
+		if !h.dead {
+			p.nextHostIndex = currentIndex + 1
+			return h.host
+		}
+		if h.nextRetry.Before(now) {
+			h.willRetryHost(p.maxRetryInterval)
+			p.nextHostIndex = currentIndex + 1
+			return h.host
+		}
+	}
+
+	// if all hosts are down and returnUnhealthy is false, return no host
+	if !p.returnUnhealthy {
+		return ""
+	}
+
+	// all hosts are down.
re-add them + p.doResetAll() + p.nextHostIndex = 0 + return p.hostList[0].host +} + +func (p *standardHostPool) ResetAll() { + p.Lock() + defer p.Unlock() + p.doResetAll() +} + +func (p *standardHostPool) SetHosts(hosts []string) { + p.Lock() + defer p.Unlock() + p.setHosts(hosts) +} + +func (p *standardHostPool) ReturnUnhealthy(v bool) { + p.Lock() + defer p.Unlock() + p.returnUnhealthy = v +} + +func (p *standardHostPool) setHosts(hosts []string) { + p.hosts = make(map[string]*hostEntry, len(hosts)) + p.hostList = make([]*hostEntry, len(hosts)) + + for i, h := range hosts { + e := &hostEntry{ + host: h, + retryDelay: p.initialRetryDelay, + } + p.hosts[h] = e + p.hostList[i] = e + } +} + +// this actually performs the logic to reset, +// and should only be called when the lock has +// already been acquired +func (p *standardHostPool) doResetAll() { + for _, h := range p.hosts { + h.dead = false + } +} + +func (p *standardHostPool) Close() { + for _, h := range p.hosts { + h.dead = true + } +} + +func (p *standardHostPool) markSuccess(hostR HostPoolResponse) { + host := hostR.Host() + p.Lock() + defer p.Unlock() + + h, ok := p.hosts[host] + if !ok { + log.Fatalf("host %s not in HostPool %v", host, p.Hosts()) + } + h.dead = false +} + +func (p *standardHostPool) markFailed(hostR HostPoolResponse) { + host := hostR.Host() + p.Lock() + defer p.Unlock() + h, ok := p.hosts[host] + if !ok { + log.Fatalf("host %s not in HostPool %v", host, p.Hosts()) + } + if !h.dead { + h.dead = true + h.retryCount = 0 + h.retryDelay = p.initialRetryDelay + h.nextRetry = time.Now().Add(h.retryDelay) + } + +} +func (p *standardHostPool) Hosts() []string { + hosts := make([]string, 0, len(p.hosts)) + for host := range p.hosts { + hosts = append(hosts, host) + } + return hosts +} diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE new file mode 100644 index 0000000..87d5574 --- /dev/null +++ b/vendor/github.com/klauspost/compress/LICENSE @@ -0,0 +1,304 @@ +Copyright (c) 2012 The Go Authors. All rights reserved. +Copyright (c) 2019 Klaus Post. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ + +Files: gzhttp/* + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2016-2017 The New York Times Company + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------ + +Files: s2/cmd/internal/readahead/* + +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +--------------------- +Files: snappy/* +Files: internal/snapref/* + +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +----------------- + +Files: s2/cmd/internal/filepathx/* + +Copyright 2016 The filepathx Authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/klauspost/compress/internal/race/norace.go b/vendor/github.com/klauspost/compress/internal/race/norace.go new file mode 100644 index 0000000..affbbbb --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/race/norace.go @@ -0,0 +1,13 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !race + +package race + +func ReadSlice[T any](s []T) { +} + +func WriteSlice[T any](s []T) { +} diff --git a/vendor/github.com/klauspost/compress/internal/race/race.go b/vendor/github.com/klauspost/compress/internal/race/race.go new file mode 100644 index 0000000..f5e240d --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/race/race.go @@ -0,0 +1,26 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build race + +package race + +import ( + "runtime" + "unsafe" +) + +func ReadSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func WriteSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore new file mode 100644 index 0000000..3a89c6e --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/.gitignore @@ -0,0 +1,15 @@ +testdata/bench + +# These explicitly listed benchmark data files are for an obsolete version of +# snappy_test.go. 
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE
new file mode 100644
index 0000000..1d2d645
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+  * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
new file mode 100644
index 0000000..8284bb0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -0,0 +1,1120 @@
+# S2 Compression
+
+S2 is an extension of [Snappy](https://github.com/google/snappy).
+
+S2 is aimed at high throughput, which is why it features concurrent compression for bigger payloads.
+
+Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
+This means that S2 can seamlessly replace Snappy without converting compressed content.
+
+S2 can produce Snappy compatible output, faster and better than Snappy.
+If you want the full benefit of the changes, you should use S2 without Snappy compatibility.
+
+S2 is designed to have high throughput on content that cannot be compressed.
+This is important, so you don't have to worry about spending CPU cycles on already compressed data.
+
+## Benefits over Snappy
+
+* Better compression
+* Adjustable compression (3 levels)
+* Concurrent stream compression
+* Faster decompression, even for Snappy compatible content
+* Concurrent Snappy/S2 stream decompression
+* Skip forward in compressed stream
+* Random seeking with indexes
+* Compatible with reading Snappy compressed content
+* Smaller block size overhead on incompressible blocks
+* Block concatenation
+* Block Dictionary support
+* Uncompressed stream mode
+* Automatic stream size padding
+* Snappy compatible block compression
+
+## Drawbacks over Snappy
+
+* Not optimized for 32 bit systems
+* Streams use slightly more memory due to larger blocks and concurrency (configurable)
+
+# Usage
+
+Installation: `go get -u github.com/klauspost/compress/s2`
+
+Full package documentation:
+
+[![godoc][1]][2]
+
+[1]: https://godoc.org/github.com/klauspost/compress?status.svg
+[2]: https://godoc.org/github.com/klauspost/compress/s2
+
+## Compression
+
+```Go
+func EncodeStream(src io.Reader, dst io.Writer) error {
+    enc := s2.NewWriter(dst)
+    _, err := io.Copy(enc, src)
+    if err != nil {
+        enc.Close()
+        return err
+    }
+    // Blocks until compression is done.
+    return enc.Close()
+}
+```
+
+You should always call `enc.Close()`; otherwise you will leak resources and your encode will be incomplete.
+
+For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
+
+The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
+It is possible to flush any buffered data using the `Flush()` method.
+This will block until all data sent to the encoder has been written to the output.
+
+S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
+
+As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
+a slightly more efficient approach is the `EncodeBuffer` method.
+This will take ownership of the buffer until the stream is closed.
+
+```Go
+func EncodeStream(src []byte, dst io.Writer) error {
+    enc := s2.NewWriter(dst)
+    // The encoder owns the buffer until Flush or Close is called.
+    err := enc.EncodeBuffer(src)
+    if err != nil {
+        enc.Close()
+        return err
+    }
+    // Blocks until compression is done.
+    return enc.Close()
+}
+```
+
+Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
+so it should only be used a single time per stream.
+If you need to write several blocks, you should use the regular io.Writer interface.
+
+
+## Decompression
+
+```Go
+func DecodeStream(src io.Reader, dst io.Writer) error {
+    dec := s2.NewReader(src)
+    _, err := io.Copy(dst, dec)
+    return err
+}
+```
+
+Similar to the Writer, a Reader can be reused using the `Reset` method.
+
+For the best possible throughput, there is an `EncodeBuffer(buf []byte)` function available.
+However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
+
+For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
+Do however note that these functions (similar to Snappy) do not provide validation of data,
+so data corruption may be undetected. Stream encoding provides CRC checks of data.
+
+It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
+For big skips the decompressor is able to skip blocks without decompressing them.
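+
+A minimal sketch of such a skip, assuming the stream was written by an S2 `Writer`;
+the function name and `offset` parameter are illustrative, not part of the API:
+
+```Go
+// DecodeFrom copies the decoded stream to dst, starting at the given
+// uncompressed offset. Whole blocks before the offset are skipped
+// without being decompressed.
+func DecodeFrom(src io.Reader, dst io.Writer, offset int64) error {
+    dec := s2.NewReader(src)
+    if err := dec.Skip(offset); err != nil {
+        return err
+    }
+    _, err := io.Copy(dst, dec)
+    return err
+}
+```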
+
+## Single Blocks
+
+Similar to Snappy, S2 offers single block compression.
+Blocks do not offer the same flexibility and safety as streams,
+but may be preferable for very small payloads, less than 100K.
+
+Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
+It is possible to provide a destination buffer.
+If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
+If not, a new one will be allocated.
+
+Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
+
+Similarly, to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
+Again an optional destination buffer can be supplied.
+The `s2.DecodedLen(src)` can be used to get the minimum capacity needed.
+If that is not satisfied a new buffer will be allocated.
+
+Block functions always operate on a single goroutine since they are intended for small payloads only.
+
+# Commandline tools
+
+Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression.
+
+Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
+
+Installing them requires Go to be installed. To install them, use:
+
+`go install github.com/klauspost/compress/s2/cmd/s2c@latest && go install github.com/klauspost/compress/s2/cmd/s2d@latest`
+
+To build binaries to the current folder use:
+
+`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
+
+
+## s2c
+
+```
+Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and compressed.
+Only http response code 200 is accepted.
+
+Options:
+  -bench int
+        Run benchmark n times. No output will be written
+  -blocksize string
+        Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+        Compress using this amount of threads (default 32)
+  -faster
+        Compress faster, but with a minor compression loss
+  -help
+        Display help
+  -index
+        Add seek index (default true)
+  -o string
+        Write output to another file. Single input file only
+  -pad string
+        Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
+  -q    Don't write any output to terminal, except errors
+  -rm
+        Delete source file(s) after successful compression
+  -safe
+        Do not overwrite output files
+  -slower
+        Compress more, but a lot slower
+  -snappy
+        Generate Snappy compatible output stream
+  -verify
+        Verify written files
+
+```
+
+## s2d
+
+```
+Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well.
+testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
+Extensions on downloaded files are ignored. Only http response code 200 is accepted.
+
+Options:
+  -bench int
+        Run benchmark n times. No output will be written
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -help
+        Display help
+  -o string
+        Write output to another file. Single input file only
+  -offset string
+        Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -q    Don't write any output to terminal, except errors
+  -rm
+        Delete source file(s) after successful decompression
+  -safe
+        Do not overwrite output files
+  -tail string
+        Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -verify
+        Verify files, but do not write output
+```
+
+## s2sx: self-extracting archives
+
+s2sx allows creating self-extracting archives with no dependencies.
+
+By default, executables are created for the same platforms as the host os,
+but this can be overridden with `-os` and `-arch` parameters.
+
+Extracted files have 0666 permissions, except when the untar option is used.
+
+```
+Usage: s2sx [options] file1 file2
+
+Compresses all files supplied as input separately.
+If files have '.s2' extension they are assumed to be compressed already.
+Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
+If output is big, an additional file with ".more" is written. This must be included as well.
+By default output files will be overwritten.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+  -arch string
+        Destination architecture (default "amd64")
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+        Compress using this amount of threads (default 32)
+  -help
+        Display help
+  -max string
+        Maximum executable size. Rest will be written to another file. (default "1G")
+  -os string
+        Destination operating system (default "windows")
+  -q    Don't write any output to terminal, except errors
+  -rm
+        Delete source file(s) after successful compression
+  -safe
+        Do not overwrite output files
+  -untar
+        Untar on destination
+```
+
+Available platforms are:
+
+ * darwin-amd64
+ * darwin-arm64
+ * linux-amd64
+ * linux-arm
+ * linux-arm64
+ * linux-mips64
+ * linux-ppc64le
+ * windows-386
+ * windows-amd64
+
+By default, there is a size limit of 1GB for the output executable.
+
+When this is exceeded the remaining file content is written to a file called
+output+`.more`. This file must be included and placed alongside the executable
+for a successful extraction.
+
+This file *must* have the same name as the executable, so if the executable is renamed,
+so must the `.more` file.
+
+This functionality is disabled with stdin/stdout.
+
+### Self-extracting TAR files
+
+If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
+
+Files are extracted to the current folder with the path specified in the tar file.
+
+Note that tar files are not validated before they are wrapped.
+
+For security reasons, files that would be placed outside the destination root folder are not allowed.
+
+# Performance
+
+This section will focus on comparisons to Snappy.
+This package is solely aimed at replacing Snappy as a high speed compression package.
+If you are mainly looking for the best possible compression, [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
+compresses better, but typically at speeds slightly below "better" mode in this package.
+
+Compression is increased compared to Snappy, mostly around 5-20%, and throughput is typically 25-40% higher (single threaded) than the Snappy Go implementation.
+
+Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
+
+A "better" compression mode is also available. This allows trading a bit of speed for a minor compression gain.
+The content compressed in this mode is fully compatible with the standard decoder.
+
+Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
+
+| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% |
+| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% |
+| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% |
+| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% |
+| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - |
+| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% |
+| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% |
+| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% |
+| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - |
+| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% |
+| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - |
+| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% |
+| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% |
+| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - |
+| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% |
+| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - |
+
+### Legend
+
+* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 Throughput`: Throughput of S2 in MB/s.
+* `S2 % smaller`: How much smaller the S2 output is than the Snappy output, in percent.
+* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
+* `"better" throughput`: Throughput of S2 "better" mode in MB/s.
+* `"better" % smaller`: How much smaller the S2 "better" output is than the Snappy output, in percent.
+
+There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
+
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% compared to Snappy.
+
+The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
+
+Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup.
+This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above).
+
+## Decompression
+
+S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
+
+S2 vs Snappy **decompression** speed. Both operating on single core:
+
+| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
+|------|---------------|------------|-------------------|------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x |
+| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x |
+| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x |
+| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x |
+| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x |
+
+### Legend
+
+* `S2 Throughput`: Decompression speed of S2 encoded content.
+* `Better Throughput`: Decompression speed of S2 "better" encoded content.
+* `vs. Snappy`: Decompression speed relative to Snappy, for the regular and "better" columns respectively.
+
+
+While the decompression code hasn't changed, there is a significant speedup in decompression speed.
+S2 prefers longer matches and will typically only find matches that are 6 bytes or longer.
+While this reduces compression a bit, it improves decompression speed.
+
+The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.
+
+Decompression is also very fast without assembly; the table below shows single goroutine decompression speed
+with no assembly:
+
+| File | Speedup vs. Snappy | S2 Throughput |
+|--------------------------------|--------------------|---------------|
+| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
+| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
+| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
+| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
+| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
+| enwik9.s2 | 1.67x | 681.53 MB/s |
+| adresser.json.s2 | 3.41x | 4230.53 MB/s |
+| silesia.tar.s2 | 1.52x | 811.58 MB/s |
+
+Even though S2 typically compresses better than Snappy, decompression speed is still better across the board.
+
+### Concurrent Stream Decompression
+
+For full stream decompression, S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)
+that will decode a full stream using multiple goroutines.
+
+Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3`, best of 3:
+
+| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` |
+|-------|----------|----------|----------|----------|-----------|
+| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s |
+| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s |
+| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s |
+| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s |
+| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s |
+
+Scaling can be expected to be pretty linear until memory bandwidth is saturated.
+
+For now, `DecodeConcurrent` can only be used for full streams without seeking or combining with regular reads.
+
+## Block compression
+
+When compressing blocks, no concurrent compression is performed, just as with Snappy.
+This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
+
+An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input.
+In rare worst-case scenarios, Snappy blocks could be significantly bigger than the input.
+
+### Mixed content blocks
+
+The most reliable benchmark is a wide dataset.
+For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
+
+| * | Input | Output | Reduction | MB/s |
+|---|-------|--------|-----------|------|
+| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** |
+| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 |
+| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 |
+| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 |
+| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 |
+| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 |
+
+S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
+"Better" mode provides the same compression speed as LZ4 with a better compression ratio.
+
+When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
+
+As can be seen from the other benchmarks, decompression should also be easier on the S2 generated output.
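+
+The three block modes in the table above are selected purely by which encode function is called.
+A minimal sketch (the function name is illustrative):
+
+```Go
+// compressedSizes returns the block-compressed size of src for each S2 mode.
+func compressedSizes(src []byte) (regular, better, best int) {
+    regular = len(s2.Encode(nil, src))      // fastest
+    better = len(s2.EncodeBetter(nil, src)) // more compression, slower
+    best = len(s2.EncodeBest(nil, src))     // most compression, slowest
+    return
+}
+```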
+
+Though they cannot be compared directly due to different decompression speeds, here are the speed/size comparisons for
+other Go compressors:
+
+| * | Input | Output | Reduction | MB/s |
+|---|-------|--------|-----------|------|
+| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 |
+| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 |
+| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 |
+| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 |
+
+### Standard block compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
+
+Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
+
+AMD64 assembly is used for both S2 and Snappy.
+
+| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
+|---------------|-------------|---------|--------------|----------|------------|--------|
+| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s |
+| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s |
+| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s |
+| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s |
+| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s |
+| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s |
+| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s |
+| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s |
+| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s |
+| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s |
+| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s |
+| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s |
+| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s |
+
+
+Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
+
+Decompression speed is better than Snappy, except in one case.
+
+Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
+
+Size is on average around Snappy's, but varies with content type.
+In cases where compression is worse, it usually is compensated by a speed boost.
+
+
+### Better compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
+|---------------|-------------|-------------|--------------|--------------|------------|------------|
+| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s |
+| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s |
+| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s |
+| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s |
+| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s |
+| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s |
+| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s |
+| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s |
+| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s |
+| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s |
+| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s |
+| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s |
+| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s |
+| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s |
+
+
+Except for the mostly incompressible JPEG image, compression is better and usually in the
+double digits in terms of percentage reduction over Snappy.
+
+The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
+to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
+
+This mode aims to provide better compression at the expense of speed, and achieves that
+without a huge performance penalty, except on very small blocks.
+
+Decompression speed suffers a little compared to the regular S2 mode,
+but still manages to be close to Snappy in spite of increased compression.
+
+# Best compression mode
+
+S2 offers a "best" compression mode.
+
+This will compress as much as possible with little regard to CPU usage.
+
+It is mainly intended for offline compression, where decompression speed should still
+be high and compatible with other S2 compressed data.
+
+Some examples compared on 16 core CPU, amd64 assembly used:
+
+```
+* enwik10
+Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
+Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
+Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
+
+* github-june-2days-2019.json
+Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
+Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
+Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
+
+* nyc-taxi-data-10M.csv
+Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
+Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
+Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
+
+* 10gb.tar
+Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
+Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
+Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/s
+
+* consensus.db.10gb
+Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
+Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
+Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
+```
+
+Decompression speed should be around the same as using the 'better' compression mode.
+
+## Dictionaries
+
+*Note: S2 dictionary compression is currently at an early implementation stage, with no assembly for
+either encoding or decoding. Performance improvements can be expected in the future.*
+
+Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
+
+The same dictionary *must* be used for both encoding and decoding.
+S2 does not keep track of whether the same dictionary is used,
+and using the wrong dictionary will most often not result in an error when decompressing.
+
+Blocks encoded *without* dictionaries can be decompressed seamlessly *with* a dictionary.
+This means it is possible to switch from an encoding without dictionaries to an encoding with dictionaries
+and treat the blocks similarly.
+
+Similar to [zStandard dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression),
+the same usage scenario applies to S2 dictionaries.
+
+> Training works if there is some correlation in a family of small data samples. The more data-specific a dictionary is, the more efficient it is (there is no universal dictionary). Hence, deploying one dictionary per type of data will provide the greatest benefits. Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm will gradually use previously decoded content to better compress the rest of the file.
+
+S2 further limits the dictionary to only be enabled on the first 64KB of a block.
+This will remove any negative (speed) impacts of the dictionaries on bigger blocks.
+
+### Compression
+
+Using the [github_users_sample_set](https://github.com/facebook/zstd/releases/download/v1.1.3/github_users_sample_set.tar.zst)
+and a 64KB dictionary trained with zStandard, the following sizes can be achieved.
+
+| | Default | Better | Best |
+|---|---------|--------|------|
+| Without Dictionary | 3362023 (44.92%) | 3083163 (41.19%) | 3057944 (40.86%) |
+| With Dictionary | 921524 (12.31%) | 873154 (11.67%) | 785503 bytes (10.49%) |
+
+So for highly repetitive content, this case provides an almost 3x reduction in size.
+
+For less uniform data we will use the Go source code tree.
+Compressing the first 64KB of all `.go` files in `go/src`, Go 1.19.5, 8912 files, 51253563 bytes input:
+
+| | Default | Better | Best |
+|---|---------|--------|------|
+| Without Dictionary | 22955767 (44.79%) | 20189613 (39.39%) | 19482828 (38.01%) |
+| With Dictionary | 19654568 (38.35%) | 16289357 (31.78%) | 15184589 (29.63%) |
+| Saving/file | 362 bytes | 428 bytes | 472 bytes |
+
+
+### Creating Dictionaries
+
+There are no tools to create dictionaries in S2.
+However, there are multiple ways to create a useful dictionary:
+
+#### Using a Sample File
+
+If your input is very uniform, you can just use a sample file as the dictionary.
+
+For example in the `github_users_sample_set` above, the average compression only goes up from
+10.49% to 11.48% by using the first file as dictionary compared to using a dedicated dictionary.
+
+```Go
+    // Read a sample
+    sample, err := os.ReadFile("sample.json")
+
+    // Create a dictionary.
+    dict := s2.MakeDict(sample, nil)
+
+    // b := dict.Bytes() will provide a dictionary that can be saved
+    // and reloaded with s2.NewDict(b).
+
+    // To encode:
+    encoded := dict.Encode(nil, file)
+
+    // To decode:
+    decoded, err := dict.Decode(nil, encoded)
+```
+
+#### Using Zstandard
+
+Zstandard dictionaries can easily be converted to S2 dictionaries.
+
+This can be helpful to generate dictionaries for files that don't have a fixed structure.
+
+
+Example, with training set files placed in `./training-set`:
+
+`λ zstd -r --train-fastcover training-set/* --maxdict=65536 -o name.dict`
+
+This will create a dictionary of 64KB, that can be converted to a dictionary like this:
+
+```Go
+    // Decode the Zstandard dictionary.
+    insp, err := zstd.InspectDictionary(zdict)
+    if err != nil {
+        panic(err)
+    }
+
+    // We are only interested in the contents.
+    // Assume that files start with "// Copyright (c) 2023".
+    // Search for the longest match for that.
+    // This may save a few bytes.
+    dict := s2.MakeDict(insp.Content(), []byte("// Copyright (c) 2023"))
+
+    // b := dict.Bytes() will provide a dictionary that can be saved
+    // and reloaded with s2.NewDict(b).
+
+    // We can now encode using this dictionary
+    encodedWithDict := dict.Encode(nil, payload)
+
+    // To decode content:
+    decoded, err := dict.Decode(nil, encodedWithDict)
+```
+
+It is recommended to save the dictionary returned by `b := dict.Bytes()`, since that will contain only the S2 dictionary.
+
+This dictionary can later be loaded using `s2.NewDict(b)`. The dictionary then no longer requires `zstd` to be initialized.
+
+Also note how `s2.MakeDict` allows you to search for a common starting sequence of your files.
+This can be omitted, at the expense of a few bytes.
+
+# Snappy Compatibility
+
+S2 now offers full compatibility with Snappy.
+
+This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
+
+There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
+simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
+This uses "better" mode for all operations.
+If you would like more control, you can use the s2 package as described below:
+
+## Blocks
+
+Snappy compatible blocks can be generated with the S2 encoder.
+Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller, reducing memory usage. Replace:
+
+| Snappy | S2 replacement |
+|---------------------------|-----------------------|
+| snappy.Encode(...) | s2.EncodeSnappy(...) |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
+
+`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
+
+`s2.ConcatBlocks` is compatible with snappy blocks.
+
+Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
+
+| Encoder | Size | MB/s | Reduction |
+|-----------------------|------------|------------|------------|
+| snappy.Encode | 1128706759 | 725.59 | 71.89% |
+| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
+| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** |
+
+## Streams
+
+For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
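+
+As a rough sketch of that replacement (the function name is illustrative):
+
+```Go
+// EncodeSnappyStream writes src as a Snappy compatible stream,
+// using the faster S2 encoder under the hood.
+func EncodeSnappyStream(src io.Reader, dst io.Writer) error {
+    enc := s2.NewWriter(dst, s2.WriterSnappyCompat())
+    if _, err := io.Copy(enc, src); err != nil {
+        enc.Close()
+        return err
+    }
+    return enc.Close()
+}
+```
+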
+All other options are available, but note that the block size limit is different for Snappy.
+
+Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
+
+| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best |
+|------|------------------|-----------|-------------------|-----------------|
+| nyc-taxi-data-10M.csv | 1316042016 - 539.47MB/s | 1307003093 - 10132.73MB/s | 1174534014 - 5002.44MB/s | 1115904679 - 177.97MB/s |
+| enwik10 (xml) | 5088294643 - 451.13MB/s | 5175840939 - 9440.69MB/s | 4560784526 - 4487.21MB/s | 4340299103 - 158.92MB/s |
+| 10gb.tar (mixed) | 6056946612 - 729.73MB/s | 6208571995 - 9978.05MB/s | 5741646126 - 4919.98MB/s | 5548973895 - 180.44MB/s |
+| github-june-2days-2019.json | 1525176492 - 933.00MB/s | 1476519054 - 13150.12MB/s | 1400547532 - 5803.40MB/s | 1321887137 - 204.29MB/s |
+| consensus.db.10gb (db) | 5412897703 - 1102.14MB/s | 5354073487 - 13562.91MB/s | 5335069899 - 5294.73MB/s | 5201000954 - 175.72MB/s |
+
+## Decompression
+
+All decompression functions map directly to equivalent s2 functions.
+
+| Snappy | S2 replacement |
+|------------------------|--------------------|
+| snappy.Decode(...) | s2.Decode(...) |
+| snappy.DecodedLen(...) | s2.DecodedLen(...) |
+| snappy.NewReader(...) | s2.NewReader(...) |
+
+Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
+are also available for Snappy streams.
+
+If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
+on your Reader will reduce memory consumption.
+
+# Concatenating blocks and streams
+
+Concatenating streams will concatenate the output of both without recompressing them.
+While this is inefficient in terms of compression, it might be usable in certain scenarios.
+The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
+
+Blocks can be concatenated using the `ConcatBlocks` function.
+
+Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
+Streams with indexes (see below) will currently not work on concatenated streams.
+
+# Stream Seek Index
+
+S2 and Snappy streams can have indexes. These indexes will allow random seeking within the compressed data.
+
+The index can either be appended to the stream as a skippable block or returned for separate storage.
+
+When the index is appended to a stream it will be skipped by regular decoders,
+so the output remains compatible with other decoders.
+
+## Creating an Index
+
+To automatically add an index to a stream, add the `WriterAddIndex()` option to your writer.
+The index will then be added to the stream when `Close()` is called.
+
+```
+    // Add Index to stream...
+    enc := s2.NewWriter(w, s2.WriterAddIndex())
+    io.Copy(enc, r)
+    enc.Close()
+```
+
+If you want to store the index separately, you can use `CloseIndex()` instead of the regular `Close()`.
+This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`.
+
+```
+    // Get index for separate storage...
+    enc := s2.NewWriter(w)
+    io.Copy(enc, r)
+    index, err := enc.CloseIndex()
+```
+
+The `index` can then be used when reading from the stream.
+This means the index can be used without needing to seek to the end of the stream
+or for manually forwarding streams. See below.
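+
+A minimal sketch of the separate-storage variant above, assuming `dst` and `idxDst`
+are writers for the stream and the index respectively (names are illustrative):
+
+```Go
+// compressWithIndex writes the compressed stream to dst and the
+// serialized seek index to idxDst for separate storage.
+func compressWithIndex(src io.Reader, dst, idxDst io.Writer) error {
+    enc := s2.NewWriter(dst)
+    if _, err := io.Copy(enc, src); err != nil {
+        enc.Close() // abort the stream on copy errors
+        return err
+    }
+    // CloseIndex finishes the stream and returns the serialized index;
+    // Close() must not be called afterwards.
+    index, err := enc.CloseIndex()
+    if err != nil {
+        return err
+    }
+    _, err = idxDst.Write(index)
+    return err
+}
+```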
+
+Finally, an existing S2/Snappy stream can be indexed using the `s2.IndexStream(r io.Reader)` function.
+
+## Using Indexes
+
+To use indexes, there is a `ReadSeeker(random bool, index []byte) (*ReadSeeker, error)` function available.
+
+Calling ReadSeeker will return an [io.ReadSeeker](https://pkg.go.dev/io#ReadSeeker) compatible version of the reader.
+
+If 'random' is specified, the returned io.Seeker can be used for random seeking, otherwise only forward seeking is supported.
+Enabling random seeking requires the original input to support the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
+
+```
+    dec := s2.NewReader(r)
+    rs, err := dec.ReadSeeker(false, nil)
+    rs.Seek(wantOffset, io.SeekStart)
+```
+
+This gets a seeker that can seek forward. Since no index is provided, the index is read from the stream.
+This requires that an index was added and that `r` supports the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
+
+A custom index can be specified, which will be used if supplied.
+When using a custom index, it will not be read from the input stream.
+
+```
+    dec := s2.NewReader(r)
+    rs, err := dec.ReadSeeker(false, index)
+    rs.Seek(wantOffset, io.SeekStart)
+```
+
+This will read the index from `index`. Since we specify non-random (forward only) seeking, `r` does not have to be an io.Seeker.
+
+```
+    dec := s2.NewReader(r)
+    rs, err := dec.ReadSeeker(true, index)
+    rs.Seek(wantOffset, io.SeekStart)
+```
+
+Finally, since we specify that we want to do random seeking, `r` must be an io.Seeker.
+
+The returned [ReadSeeker](https://pkg.go.dev/github.com/klauspost/compress/s2#ReadSeeker) contains a shallow reference to the existing Reader,
+meaning changes performed on one are reflected in the other.
+
+To check if a stream contains an index at the end, the `(*Index).LoadStream(rs io.ReadSeeker) error` method can be used.
+
+## Manually Forwarding Streams
+
+Indexes can also be read outside the decoder using the [Index](https://pkg.go.dev/github.com/klauspost/compress/s2#Index) type.
+This can be used for parsing indexes, either separate or in streams.
+
+In some cases it may not be possible to serve a seekable stream.
+This can for instance be an HTTP stream, where the Range request
+is sent at the start of the stream.
+
+With a little bit of extra code it is still possible to use indexes
+to forward to a specific offset with a single forward skip.
+
+It is possible to load the index manually like this:
+```
+    var index s2.Index
+    _, err = index.Load(idxBytes)
+```
+
+This can be used to figure out how much to offset the compressed stream:
+
+```
+    compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
+```
+
+The `compressedOffset` is the number of bytes that should be skipped
+from the beginning of the compressed file.
+
+The `uncompressedOffset` will then be the offset of the uncompressed bytes returned
+when decoding from that position. This will always be <= wantOffset.
+
+When creating a decoder it must be specified that it should *not* expect a stream identifier
+at the beginning of the stream. Assuming the io.Reader `r` has been forwarded to `compressedOffset`,
+we create the decoder like this:
+
+```
+    dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
+```
+
+We are not completely done. We still need to skip the uncompressed bytes we didn't want.
+This is done using the regular `Skip` function:
+
+```
+    err = dec.Skip(wantOffset - uncompressedOffset)
+```
+
+This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
+
+# Compact storage
+
+For compact storage, [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from
+a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
+
+This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders).
+This removes a layer of security, but is the most compact representation. `RestoreIndexHeaders` returns nil if the headers contain errors.
+
+## Index Format:
+
+Each block is structured as a snappy skippable block, with the chunk ID 0x99.
+
+The block can be read from the front, but contains information so it can be read from the back as well.
+
+Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
+with un-encoded value length of 64 bits, unless other limits are specified.
+
+| Content | Format |
+|---------|--------|
+| ID, `[1]byte` | Always 0x99. |
+| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
+| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
+| UncompressedSize, Varint | Total Uncompressed size. |
+| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
+| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
+| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
+| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
+| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
+| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
+| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
+
+For regular streams the uncompressed offsets are fully predictable,
+so `HasUncompressedOffsets` allows specifying that compressed blocks all have
+exactly `EstBlockSize` bytes of uncompressed content.
+
+Entries *must* be in order, starting with the lowest offset,
+and there *must* be no uncompressed offset duplicates.
+Entries *may* point to the start of a skippable block,
+but it is then not allowed to also have an entry for the next block since
+that would give an uncompressed offset duplicate.
+
+There is no requirement for all blocks to be represented in the index.
+In fact, there is a maximum of 65536 block entries in an index.
+
+The writer can use any method to reduce the number of entries.
+An implicit block start at 0,0 can be assumed.
+
+### Decoding entries:
+
+```
+// Read Uncompressed entries.
+// Each assumes EstBlockSize delta from previous.
+for each entry {
+    uOff = 0
+    if HasUncompressedOffsets == 1 {
+        uOff = ReadVarInt // Read value from stream
+    }
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].UncompressedOffset = uOff
+        continue
+    }
+
+    // Uncompressed uses previous offset and adds EstBlockSize
+    entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
+}
+
+
+// Guess that the first block will be 50% of uncompressed size.
+// Integer truncating division must be used.
+CompressGuess := EstBlockSize / 2
+
+// Read Compressed entries.
+// Each assumes CompressGuess delta from previous.
+// CompressGuess is adjusted for each value.
+for each entry {
+    cOff = ReadVarInt // Read value from stream
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].CompressedOffset = cOff
+        continue
+    }
+
+    // Compressed uses previous and our estimate.
+    entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
+
+    // Adjust compressed offset for next loop, integer truncating division must be used.
+    CompressGuess += cOff/2
+}
+```
+
+To decode from any given uncompressed offset `(wantOffset)`:
+
+* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
+* Start decoding from `entry[n-1].CompressedOffset`.
+* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream.
+
+See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
+
+
+# Format Extensions
+
+* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
+* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
+* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
+
+Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
+
+The length is specified by reading the 3-bit length specified in the tag and decoding it using this table:
+
+| Length | Actual Length        |
+|--------|----------------------|
+| 0      | 4                    |
+| 1      | 5                    |
+| 2      | 6                    |
+| 3      | 7                    |
+| 4      | 8                    |
+| 5      | 8 + read 1 byte      |
+| 6      | 260 + read 2 bytes   |
+| 7      | 65540 + read 3 bytes |
+
+This allows any repeat offset + length to be represented by 2 to 5 bytes.
+It also allows emitting matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies.
+
+Lengths are stored as little endian values.
+
+The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.
+
+Default streaming block size is 1MB.
+
+# Dictionary Encoding
+
+Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
+
+A dictionary provides an initial repeat value that can be used to point to a common header.
+
+Other than that the dictionary contains values that can be used as back-references.
+
+Often used data should be placed at the *end* of the dictionary since offsets < 2048 bytes will be smaller.
+
+## Format
+
+Dictionary *content* must be at least 16 bytes and less than or equal to 64KiB (65536 bytes).
+
+## Encoding
+
+From the decoder's point of view, the dictionary content precedes the decoded output:
+
+`[dictionary content][decoded output]`
+
+Back-references to the dictionary are encoded as ordinary back-references with an offset before the start of the decoded block.
+
+Matches copying from the dictionary are **not** allowed to cross from the dictionary into the decoded data.
+However, if a copy ends at the end of the dictionary, the next repeat will point to the start of the decoded buffer, which is allowed.
+
+The first match can be a repeat value, which will use the repeat offset stored in the dictionary.
+
+When 64KB (65536 bytes) has been encoded or decoded, it is no longer allowed to reference the dictionary,
+neither by copy nor by repeat operations.
+If the boundary is crossed while copying from the dictionary, the operation should complete,
+but the next instruction is not allowed to reference the dictionary.
+
+Valid blocks encoded *without* a dictionary can be decoded with any dictionary.
+There is no check that the supplied dictionary is the correct one for a block.
+Because of this, there is no overhead to using a dictionary.
+
+## Example
+
+This is the dictionary content. Elements are separated by `[]`.
+
+Dictionary: `[0x0a][Yesterday 25 bananas were added to Benjamins brown bag]`.
+
+The initial repeat offset is set to 10, which points at the letter `2`.
+
+Encoded: `[LIT "10"][REPEAT len=10][LIT "hich"][MATCH off=50 len=6][MATCH off=31 len=6][MATCH off=61 len=10]`
+
+Decoded: `[10][ bananas w][hich][ were ][brown ][were added]`
+
+Output: `10 bananas which were brown were added`
+
+
+## Streams
+
+For streams, each block can use the dictionary.
+
+The dictionary cannot currently be provided on the stream.
+
+
+# LICENSE
+
+This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
+
+Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
new file mode 100644
index 0000000..264ffd0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -0,0 +1,443 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"strconv"
+
+	"github.com/klauspost/compress/internal/race"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid.
+	ErrCorrupt = errors.New("s2: corrupt input")
+	// ErrCRC reports that the input failed CRC validation (streams only)
+	ErrCRC = errors.New("s2: corrupt input, crc mismatch")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("s2: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported.
+	ErrUnsupported = errors.New("s2: unsupported input")
+)
+
+// DecodedLen returns the length of the decoded block.
+func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n <= 0 || v > 0xffffffff { + return 0, 0, ErrCorrupt + } + + const wordSize = 32 << (^uint(0) >> 32 & 1) + if wordSize == 32 && v > 0x7fffffff { + return 0, 0, ErrTooLarge + } + return int(v), n, nil +} + +const ( + decodeErrCodeCorrupt = 1 +) + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + + race.WriteSlice(dst) + race.ReadSlice(src[s:]) + + if s2Decode(dst, src[s:]) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +// s2DecodeDict writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func s2DecodeDict(dst, src []byte, dict *Dict) int { + if dict == nil { + return s2Decode(dst, src) + } + const debug = false + const debugErrs = debug + + if debug { + fmt.Println("Starting decode, dst len:", len(dst)) + } + var d, s, length int + offset := len(dict.dict) - dict.repeat + + // As long as we can read at least 5 bytes... + for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + x = uint32(src[s-1]) + case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 + s += 3 + case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. 
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 + s += 4 + case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + s += 5 + } + length = int(x) + 1 + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debugErrs { + fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s) + } + return decodeErrCodeCorrupt + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + length = int(src[s]) + 4 + s += 1 + case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + s += 2 + case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) + s += 3 + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 + s += 3 + + case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 + s += 5 + } + + if offset <= 0 || length > len(dst)-d { + if debugErrs { + fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d) + } + return decodeErrCodeCorrupt + } + + // copy from dict + if d < offset { + if d > MaxDictSrcOffset { + if debugErrs { + fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length) + } + return decodeErrCodeCorrupt + } + startOff := len(dict.dict) - offset + d + if startOff < 0 || startOff+length > len(dict.dict) { + if debugErrs { + fmt.Printf("offset (%d) + length (%d) bigger than dict (%d)\n", offset, length, len(dict.dict)) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("dict copy, length:", length, "offset:", offset, "d-after:", d+length, "dict start offset:", startOff) + } + copy(dst[d:d+length], dict.dict[startOff:]) + d += length + continue + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debugErrs { + fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || length > len(dst)-d { + if debugErrs { + fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d) + } + return decodeErrCodeCorrupt + } + + // copy from dict + if d < offset { + if d > MaxDictSrcOffset { + if debugErrs { + fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length) + } + return decodeErrCodeCorrupt + } + rOff := len(dict.dict) - (offset - d) + if debug { + fmt.Println("starting dict entry from dict offset", len(dict.dict)-rOff) + } + if rOff+length > len(dict.dict) { + if debugErrs { + fmt.Println("err: END offset", rOff+length, "bigger than dict", len(dict.dict), "dict offset:", rOff, "length:", length) + } + return decodeErrCodeCorrupt + } + if rOff < 0 { + if debugErrs { + fmt.Println("err: START offset", rOff, "less than 0", len(dict.dict), "dict offset:", rOff, "length:", length) + } + return decodeErrCodeCorrupt + } + copy(dst[d:d+length], dict.dict[rOff:]) + d += length + continue + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + if d != len(dst) { + if debugErrs { + fmt.Println("wanted length", len(dst), "got", d) + } + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s new file mode 100644 index 0000000..9b105e0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s @@ -0,0 +1,568 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +#define R_TMP0 AX +#define R_TMP1 BX +#define R_LEN CX +#define R_OFF DX +#define R_SRC SI +#define R_DST DI +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. 
The register allocation: +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x (shared) +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy +// +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $48-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DST + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SRC + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + XORQ R_OFF, R_OFF + +loop: + // for s < len(src) + CMPQ R_SRC, R_SEND + JEQ end + + // R_LEN = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBLZX (R_SRC), R_LEN + MOVL R_LEN, R_TMP1 + ANDL $3, R_TMP1 + CMPL R_TMP1, $1 + JAE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. + + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + SHRL $2, R_LEN + CMPL R_LEN, $60 + JAE tagLit60Plus + + // case x < 60: + // s++ + INCQ R_SRC + +doLit: + // This is the end of the inner "switch", when we have a literal tag. + // + // We assume that R_LEN == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // R_LEN can hold 64 bits, so the increment cannot overflow. + INCQ R_LEN + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVQ R_DEND, R_TMP0 + SUBQ R_DST, R_TMP0 + MOVQ R_SEND, R_TMP1 + SUBQ R_SRC, R_TMP1 + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). + CMPQ R_LEN, $16 + JGT callMemmove + CMPQ R_TMP0, $16 + JLT callMemmove + CMPQ R_TMP1, $16 + JLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. 
+ MOVOU 0(R_SRC), X0 + MOVOU X0, 0(R_DST) + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMPQ R_LEN, R_TMP0 + JGT errCorrupt + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVQ R_DST, 0(SP) + MOVQ R_SRC, 8(SP) + MOVQ R_LEN, 16(SP) + MOVQ R_DST, 24(SP) + MOVQ R_SRC, 32(SP) + MOVQ R_LEN, 40(SP) + MOVQ R_OFF, 48(SP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R_DBASE-R_SEND. + MOVQ 24(SP), R_DST + MOVQ 32(SP), R_SRC + MOVQ 40(SP), R_LEN + MOVQ 48(SP), R_OFF + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. + ADDQ R_LEN, R_SRC + SUBQ $58, R_SRC + CMPQ R_SRC, R_SEND + JA errCorrupt + + // case x == 60: + CMPL R_LEN, $61 + JEQ tagLit61 + JA tagLit62Plus + + // x = uint32(src[s-1]) + MOVBLZX -1(R_SRC), R_LEN + JMP doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVWLZX -2(R_SRC), R_LEN + JMP doLit + +tagLit62Plus: + CMPL R_LEN, $62 + JA tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + // We read one byte, safe to read one back, since we are just reading tag. + // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8 + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + JMP doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVL -4(R_SRC), R_LEN + JMP doLit + +// The code above handles literal tags. +// ---------------------------------------- +// The code below handles copy tags. + +tagCopy4: + // case tagCopy4: + // s += 5 + ADDQ $5, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-5])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVLQZX -4(R_SRC), R_OFF + JMP doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-3])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVWQZX -2(R_SRC), R_OFF + JMP doCopy + +tagCopy: + // We have a copy tag. 
We assume that: + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMPQ R_TMP1, $2 + JEQ tagCopy2 + JA tagCopy4 + + // case tagCopy1: + // s += 2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + // length = 4 + int(src[s-2])>>2&0x7 + MOVBQZX -1(R_SRC), R_TMP1 + MOVQ R_LEN, R_TMP0 + SHRQ $2, R_LEN + ANDQ $0xe0, R_TMP0 + ANDQ $7, R_LEN + SHLQ $3, R_TMP0 + ADDQ $4, R_LEN + ORQ R_TMP1, R_TMP0 + + // check if repeat code, ZF set by ORQ. + JZ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (length) + MOVQ R_TMP0, R_OFF + JMP doCopy + +// This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMPQ R_LEN, $9 + JL doCopyRepeat + + // Read additional bytes for length. + JE repeatLen1 + + // Rare, so the extra branch shouldn't hurt too much. + CMPQ R_LEN, $10 + JE repeatLen2 + JMP repeatLen3 + +// Read repeat lengths. +repeatLen1: + // s ++ + ADDQ $1, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = src[s-1] + 8 + MOVBQZX -1(R_SRC), R_LEN + ADDL $8, R_LEN + JMP doCopyRepeat + +repeatLen2: + // s +=2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) + MOVWQZX -2(R_SRC), R_LEN + ADDL $260, R_LEN + JMP doCopyRepeat + +repeatLen3: + // s +=3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) + // Read one byte further back (just part of the tag, shifted out) + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + ADDL $65540, R_LEN + JMP doCopyRepeat + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. + // + // We assume that: + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset + + // if d < offset { etc } + MOVQ R_DST, R_TMP1 + SUBQ R_DBASE, R_TMP1 + CMPQ R_TMP1, R_OFF + JLT errCorrupt + + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + // if offset <= 0 { etc } + CMPQ R_OFF, $0 + JLE errCorrupt + + // if length > len(dst)-d { etc } + MOVQ R_DEND, R_TMP1 + SUBQ R_DST, R_TMP1 + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVQ R_DEND, R_TMP2 + SUBQ R_DST, R_TMP2 + MOVQ R_DST, R_TMP3 + SUBQ R_OFF, R_TMP3 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMPQ R_LEN, $16 + JGT slowForwardCopy + CMPQ R_OFF, $8 + JLT slowForwardCopy + CMPQ R_TMP2, $16 + JLT slowForwardCopy + MOVQ 0(R_TMP3), R_TMP0 + MOVQ R_TMP0, 0(R_DST) + MOVQ 8(R_TMP3), R_TMP1 + MOVQ R_TMP1, 8(R_DST) + ADDQ R_LEN, R_DST + JMP loop + +slowForwardCopy: + // !!! 
If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte and patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from to will repeat the pattern + // once, after which we can move two bytes without moving : + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. + // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. + // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUBQ $10, R_TMP2 + CMPQ R_LEN, R_TMP2 + JGT verySlowForwardCopy + + // We want to keep the offset, so we use R_TMP2 from here. + MOVQ R_OFF, R_TMP2 + +makeOffsetAtLeast8: + // !!! As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R_TMP3, is unchanged. + // } + CMPQ R_TMP2, $8 + JGE fixUpSlowForwardCopy + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_DST) + SUBQ R_TMP2, R_LEN + ADDQ R_TMP2, R_DST + ADDQ R_TMP2, R_TMP2 + JMP makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by R_DST being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVQ R_DST, R_TMP0 + ADDQ R_LEN, R_DST + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. + CMPQ R_LEN, $0 + JLE loop + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_TMP0) + ADDQ $8, R_TMP3 + ADDQ $8, R_TMP0 + SUBQ $8, R_LEN + JMP finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. 
In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + INCQ R_TMP3 + INCQ R_DST + DECQ R_LEN + JNZ verySlowForwardCopy + JMP loop + +// The code above handles copy tags. +// ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". + // + // if d != len(dst) { etc } + CMPQ R_DST, R_DEND + JNE errCorrupt + + // return 0 + MOVQ $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVQ $1, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s new file mode 100644 index 0000000..78e463f --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s @@ -0,0 +1,574 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +#define R_TMP0 R2 +#define R_TMP1 R3 +#define R_LEN R4 +#define R_OFF R5 +#define R_SRC R6 +#define R_DST R7 +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// TEST_SRC will check if R_SRC is <= SRC_END +#define TEST_SRC() \ + CMP R_SEND, R_SRC \ + BGT errCorrupt + +// MOVD R_SRC, R_TMP1 +// SUB R_SBASE, R_TMP1, R_TMP1 +// CMP R_SLEN, R_TMP1 +// BGT errCorrupt + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. The register allocation: +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy +// +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $56-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. + MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DST + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SRC + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND + MOVD $0, R_OFF + +loop: + // for s < len(src) + CMP R_SEND, R_SRC + BEQ end + + // R_LEN = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBU (R_SRC), R_LEN + MOVW R_LEN, R_TMP1 + ANDW $3, R_TMP1 + MOVW $1, R1 + CMPW R1, R_TMP1 + BGE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. + + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + MOVW $60, R1 + LSRW $2, R_LEN, R_LEN + CMPW R_LEN, R1 + BLS tagLit60Plus + + // case x < 60: + // s++ + ADD $1, R_SRC, R_SRC + +doLit: + // This is the end of the inner "switch", when we have a literal tag. 
+ // + // We assume that R_LEN == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // R_LEN can hold 64 bits, so the increment cannot overflow. + ADD $1, R_LEN, R_LEN + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVD R_DEND, R_TMP0 + SUB R_DST, R_TMP0, R_TMP0 + MOVD R_SEND, R_TMP1 + SUB R_SRC, R_TMP1, R_TMP1 + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). + CMP $16, R_LEN + BGT callMemmove + CMP $16, R_TMP0 + BLT callMemmove + CMP $16, R_TMP1 + BLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + LDP 0(R_SRC), (R_TMP2, R_TMP3) + STP (R_TMP2, R_TMP3), 0(R_DST) + + // d += length + // s += length + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC + B loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMP R_TMP0, R_LEN + BGT errCorrupt + CMP R_TMP1, R_LEN + BGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVD R_DST, 8(RSP) + MOVD R_SRC, 16(RSP) + MOVD R_LEN, 24(RSP) + MOVD R_DST, 32(RSP) + MOVD R_SRC, 40(RSP) + MOVD R_LEN, 48(RSP) + MOVD R_OFF, 56(RSP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R_DBASE-R_SEND. + MOVD 32(RSP), R_DST + MOVD 40(RSP), R_SRC + MOVD 48(RSP), R_LEN + MOVD 56(RSP), R_OFF + MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND + + // d += length + // s += length + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC + B loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. 
+ ADD R_LEN, R_SRC, R_SRC + SUB $58, R_SRC, R_SRC + TEST_SRC() + + // case x == 60: + MOVW $61, R1 + CMPW R1, R_LEN + BEQ tagLit61 + BGT tagLit62Plus + + // x = uint32(src[s-1]) + MOVBU -1(R_SRC), R_LEN + B doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVHU -2(R_SRC), R_LEN + B doLit + +tagLit62Plus: + CMPW $62, R_LEN + BHI tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + MOVHU -3(R_SRC), R_LEN + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP1<<16, R_LEN + B doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVWU -4(R_SRC), R_LEN + B doLit + + // The code above handles literal tags. + // ---------------------------------------- + // The code below handles copy tags. + +tagCopy4: + // case tagCopy4: + // s += 5 + ADD $5, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + MOVD R_SRC, R_TMP1 + SUB R_SBASE, R_TMP1, R_TMP1 + CMP R_SLEN, R_TMP1 + BGT errCorrupt + + // length = 1 + int(src[s-5])>>2 + MOVD $1, R1 + ADD R_LEN>>2, R1, R_LEN + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVWU -4(R_SRC), R_OFF + B doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADD $3, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = 1 + int(src[s-3])>>2 + MOVD $1, R1 + ADD R_LEN>>2, R1, R_LEN + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVHU -2(R_SRC), R_OFF + B doCopy + +tagCopy: + // We have a copy tag. We assume that: + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMP $2, R_TMP1 + BEQ tagCopy2 + BGT tagCopy4 + + // case tagCopy1: + // s += 2 + ADD $2, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + // Calculate offset in R_TMP0 in case it is a repeat. + MOVD R_LEN, R_TMP0 + AND $0xe0, R_TMP0 + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP0<<3, R_TMP1, R_TMP0 + + // length = 4 + int(src[s-2])>>2&0x7 + MOVD $7, R1 + AND R_LEN>>2, R1, R_LEN + ADD $4, R_LEN, R_LEN + + // check if repeat code with offset 0. + CMP $0, R_TMP0 + BEQ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (offset) + MOVD R_TMP0, R_OFF + B doCopy + + // This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMP $9, R_LEN + BLT doCopyRepeat + BEQ repeatLen1 + CMP $10, R_LEN + BEQ repeatLen2 + +repeatLen3: + // s +=3 + ADD $3, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 + MOVBU -1(R_SRC), R_TMP0 + MOVHU -3(R_SRC), R_LEN + ORR R_TMP0<<16, R_LEN, R_LEN + ADD $65540, R_LEN, R_LEN + B doCopyRepeat + +repeatLen2: + // s +=2 + ADD $2, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 + MOVHU -2(R_SRC), R_LEN + ADD $260, R_LEN, R_LEN + B doCopyRepeat + +repeatLen1: + // s +=1 + ADD $1, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = src[s-1] + 8 + MOVBU -1(R_SRC), R_LEN + ADD $8, R_LEN, R_LEN + B doCopyRepeat + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. 
+ // + // We assume that: + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset + + // if d < offset { etc } + MOVD R_DST, R_TMP1 + SUB R_DBASE, R_TMP1, R_TMP1 + CMP R_OFF, R_TMP1 + BLT errCorrupt + + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + + // if offset <= 0 { etc } + CMP $0, R_OFF + BLE errCorrupt + + // if length > len(dst)-d { etc } + MOVD R_DEND, R_TMP1 + SUB R_DST, R_TMP1, R_TMP1 + CMP R_TMP1, R_LEN + BGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVD R_DEND, R_TMP2 + SUB R_DST, R_TMP2, R_TMP2 + MOVD R_DST, R_TMP3 + SUB R_OFF, R_TMP3, R_TMP3 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMP $16, R_LEN + BGT slowForwardCopy + CMP $8, R_OFF + BLT slowForwardCopy + CMP $16, R_TMP2 + BLT slowForwardCopy + MOVD 0(R_TMP3), R_TMP0 + MOVD R_TMP0, 0(R_DST) + MOVD 8(R_TMP3), R_TMP1 + MOVD R_TMP1, 8(R_DST) + ADD R_LEN, R_DST, R_DST + B loop + +slowForwardCopy: + // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte and patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from to will repeat the pattern + // once, after which we can move two bytes without moving : + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. + // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. 
+ // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUB $10, R_TMP2, R_TMP2 + CMP R_TMP2, R_LEN + BGT verySlowForwardCopy + + // We want to keep the offset, so we use R_TMP2 from here. + MOVD R_OFF, R_TMP2 + +makeOffsetAtLeast8: + // !!! As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R_TMP3, is unchanged. + // } + CMP $8, R_TMP2 + BGE fixUpSlowForwardCopy + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_DST) + SUB R_TMP2, R_LEN, R_LEN + ADD R_TMP2, R_DST, R_DST + ADD R_TMP2, R_TMP2, R_TMP2 + B makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by R_DST being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVD R_DST, R_TMP0 + ADD R_LEN, R_DST, R_DST + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. + MOVD $0, R1 + CMP R1, R_LEN + BLE loop + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_TMP0) + ADD $8, R_TMP3, R_TMP3 + ADD $8, R_TMP0, R_TMP0 + SUB $8, R_LEN, R_LEN + B finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + ADD $1, R_TMP3, R_TMP3 + ADD $1, R_DST, R_DST + SUB $1, R_LEN, R_LEN + CBNZ R_LEN, verySlowForwardCopy + B loop + + // The code above handles copy tags. + // ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". + // + // if d != len(dst) { etc } + CMP R_DEND, R_DST + BNE errCorrupt + + // return 0 + MOVD $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVD $1, R_TMP0 + MOVD R_TMP0, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go new file mode 100644 index 0000000..cb3576e --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go @@ -0,0 +1,17 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (amd64 || arm64) && !appengine && gc && !noasm +// +build amd64 arm64 +// +build !appengine +// +build gc +// +build !noasm + +package s2 + +// decode has the same semantics as in decode_other.go. +// +//go:noescape +func s2Decode(dst, src []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go new file mode 100644 index 0000000..2cb55c2 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -0,0 +1,292 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. 
+// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !arm64) || appengine || !gc || noasm +// +build !amd64,!arm64 appengine !gc noasm + +package s2 + +import ( + "fmt" + "strconv" +) + +// decode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func s2Decode(dst, src []byte) int { + const debug = false + if debug { + fmt.Println("Starting decode, dst len:", len(dst)) + } + var d, s, length int + offset := 0 + + // As long as we can read at least 5 bytes... + for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + x = uint32(src[s-1]) + case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 + s += 3 + case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. + x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 + s += 4 + case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + s += 5 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debug { + fmt.Println("corrupt: lit size", length) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + length = int(src[s]) + 4 + s += 1 + case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + s += 2 + case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) + s += 3 + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 + s += 3 + + case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 + s += 5 + } + + if offset <= 0 || d < offset || length > len(dst)-d { + if debug { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. 
+ a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debug { + fmt.Println("corrupt: lit size", length) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + if debug { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. 
+ // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + if d != len(dst) { + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/klauspost/compress/s2/dict.go b/vendor/github.com/klauspost/compress/s2/dict.go new file mode 100644 index 0000000..f125ad0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/dict.go @@ -0,0 +1,350 @@ +// Copyright (c) 2022+ Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "encoding/binary" + "sync" +) + +const ( + // MinDictSize is the minimum dictionary size when repeat has been read. + MinDictSize = 16 + + // MaxDictSize is the maximum dictionary size when repeat has been read. + MaxDictSize = 65536 + + // MaxDictSrcOffset is the maximum offset where a dictionary entry can start. + MaxDictSrcOffset = 65535 +) + +// Dict contains a dictionary that can be used for encoding and decoding s2 +type Dict struct { + dict []byte + repeat int // Repeat as index of dict + + fast, better, best sync.Once + fastTable *[1 << 14]uint16 + + betterTableShort *[1 << 14]uint16 + betterTableLong *[1 << 17]uint16 + + bestTableShort *[1 << 16]uint32 + bestTableLong *[1 << 19]uint32 +} + +// NewDict will read a dictionary. +// It will return nil if the dictionary is invalid. +func NewDict(dict []byte) *Dict { + if len(dict) == 0 { + return nil + } + var d Dict + // Repeat is the first value of the dict + r, n := binary.Uvarint(dict) + if n <= 0 { + return nil + } + dict = dict[n:] + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + if len(dict) < MinDictSize || len(dict) > MaxDictSize { + return nil + } + d.repeat = int(r) + if d.repeat > len(dict) { + return nil + } + return &d +} + +// Bytes will return a serialized version of the dictionary. +// The output can be sent to NewDict. +func (d *Dict) Bytes() []byte { + dst := make([]byte, binary.MaxVarintLen16+len(d.dict)) + return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...) +} + +// MakeDict will create a dictionary. +// 'data' must be at least MinDictSize. +// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used. +// If searchStart is set the start repeat value will be set to the last +// match of this content. +// If no matches are found, it will attempt to find shorter matches. +// This content should match the typical start of a block. +// If at least 4 bytes cannot be matched, repeat is set to start of block. +func MakeDict(data []byte, searchStart []byte) *Dict { + if len(data) == 0 { + return nil + } + if len(data) > MaxDictSize { + data = data[len(data)-MaxDictSize:] + } + var d Dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) 
+ } + if len(dict) < MinDictSize { + return nil + } + + // Find the longest match possible, last entry if multiple. + for s := len(searchStart); s > 4; s-- { + if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 { + d.repeat = idx + break + } + } + + return &d +} + +// MakeDictManual will create a dictionary. +// 'data' must be at least MinDictSize and less than or equal to MaxDictSize. +// A manual first repeat index into data must be provided. +// It must be less than len(data)-8. +func MakeDictManual(data []byte, firstIdx uint16) *Dict { + if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize { + return nil + } + var d Dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + + d.repeat = int(firstIdx) + return &d +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockDictGo(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// EncodeBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) EncodeBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockBetterDict(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// EncodeBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBest compresses as good as reasonably possible but with a +// big speed decrease. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) EncodeBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockBest(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func (d *Dict) Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + if s2DecodeDict(dst, src[s:], d) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +func (d *Dict) initFast() { + d.fast.Do(func() { + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8-2; i += 3 { + x0 := load64(d.dict, i) + h0 := hash6(x0, tableBits) + h1 := hash6(x0>>8, tableBits) + h2 := hash6(x0>>16, tableBits) + table[h0] = uint16(i) + table[h1] = uint16(i + 1) + table[h2] = uint16(i + 2) + } + d.fastTable = &table + }) +} + +func (d *Dict) initBetter() { + d.better.Do(func() { + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + lTable[hash7(cv, lTableBits)] = uint16(i) + sTable[hash4(cv, sTableBits)] = uint16(i) + } + d.betterTableShort = &sTable + d.betterTableLong = &lTable + }) +} + +func (d *Dict) initBest() { + d.best.Do(func() { + const ( + // Long hash matches. 
+ lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + lTable[hashL] = uint32(i) | candidateL<<16 + sTable[hashS] = uint32(i) | candidateS<<16 + } + d.bestTableShort = &sTable + d.bestTableLong = &lTable + }) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go new file mode 100644 index 0000000..0c9088a --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -0,0 +1,393 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "math" + "math/bits" +) + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlock(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EstimateBlockSize will perform a very fast compression +// without outputting the result and return the compressed output size. +// The function returns -1 if no improvement could be achieved. +// Using actual compression will most often produce better compression than the estimate. +func EstimateBlockSize(src []byte) (d int) { + if len(src) <= inputMargin || int64(len(src)) > 0xffffffff { + return -1 + } + if len(src) <= 1024 { + d = calcBlockSizeSmall(src) + } else { + d = calcBlockSize(src) + } + + if d == 0 { + return -1 + } + // Size of the varint encoded block size. + d += (bits.Len64(uint64(len(src))) + 7) / 7 + + if d >= len(src) { + return -1 + } + return d +} + +// EncodeBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +// +// The dst and src must not overlap. It is valid to pass a nil dst. 
+// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBetter(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBest compresses as good as reasonably possible but with a +// big speed decrease. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBest(dst[d:], src, nil) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappy(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBetterSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBestSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. +// If the destination is nil or too small, a new will be allocated. +// The blocks are not validated, so garbage in = garbage out. +// dst may not overlap block data. +// Any data in dst is preserved as is, so it will not be considered a block. 
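+//
+// A minimal usage sketch (illustrative, not part of the original source),
+// assuming the inputs were produced by Encode from this package:
+//
+//	a := Encode(nil, []byte("hello "))
+//	b := Encode(nil, []byte("world"))
+//	joined, err := ConcatBlocks(nil, a, b)
+//	// err == nil; Decode(nil, joined) yields "hello world".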
+func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) { + totalSize := uint64(0) + compSize := 0 + for _, b := range blocks { + l, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + totalSize += uint64(l) + compSize += len(b) - hdr + } + if totalSize == 0 { + dst = append(dst, 0) + return dst, nil + } + if totalSize > math.MaxUint32 { + return nil, ErrTooLarge + } + var tmp [binary.MaxVarintLen32]byte + hdrSize := binary.PutUvarint(tmp[:], totalSize) + wantSize := hdrSize + compSize + + if cap(dst)-len(dst) < wantSize { + dst = append(make([]byte, 0, wantSize+len(dst)), dst...) + } + dst = append(dst, tmp[:hdrSize]...) + for _, b := range blocks { + _, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + dst = append(dst, b[hdr:]...) + } + return dst, nil +} + +// inputMargin is the minimum number of extra input bytes to keep, inside +// encodeBlock's inner loop. On some architectures, this margin lets us +// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) +// literals can be implemented as a single load to and store from a 16-byte +// register. That literal's actual length can be as short as 1 byte, so this +// can copy up to 15 bytes too much, but that's OK as subsequent iterations of +// the encoding loop will fix up the copy overrun, and this inputMargin ensures +// that we don't overrun the dst and src buffers. +const inputMargin = 8 + +// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that +// will be accepted by the encoder. +const minNonLiteralBlockSize = 32 + +const intReduction = 2 - (1 << (^uint(0) >> 63)) // 1 (32 bits) or 0 (64 bits) + +// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size. +// Blocks this big are highly discouraged, though. +// Half the size on 32 bit systems. +const MaxBlockSize = (1<<(32-intReduction) - 1) - binary.MaxVarintLen32 - 5 + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +// +// It will return a negative value if srcLen is too large to encode. +// 32 bit platforms will have lower thresholds for rejecting big content. +func MaxEncodedLen(srcLen int) int { + n := uint64(srcLen) + if intReduction == 1 { + // 32 bits + if n > math.MaxInt32 { + // Also includes negative. + return -1 + } + } else if n > 0xffffffff { + // 64 bits + // Also includes negative. + return -1 + } + // Size of the varint encoded block size. + n = n + uint64((bits.Len64(n)+7)/7) + + // Add maximum size of encoding block as literals. + n += uint64(literalExtraSize(int64(srcLen))) + if intReduction == 1 { + // 32 bits + if n > math.MaxInt32 { + return -1 + } + } else if n > 0xffffffff { + // 64 bits + // Also includes negative. + return -1 + } + return int(n) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go new file mode 100644 index 0000000..9977045 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -0,0 +1,1068 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package s2 + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/bits" +) + +func load32(b []byte, i int) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func load64(b []byte, i int) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + const prime6bytes = 227718039650203 + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +func encodeGo(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockGo(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + + debug = false + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. 
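+				// (Debug-only assertions: a valid copy must point strictly
+				// backwards, and the emitted range must equal the bytes one
+				// `repeat` offset earlier.)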
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. 
The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. 
+ a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockDictGo(dst, src []byte, dict *Dict) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + maxAhead = 8 // maximum bytes ahead without checking sLimit + + debug = false + ) + dict.initFast() + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if sLimit > MaxDictSrcOffset-maxAhead { + sLimit = MaxDictSrcOffset - maxAhead + } + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form can start with a dict entry (copy or repeat). + s := 0 + + // Convert dict repeat to offset + repeat := len(dict.dict) - dict.repeat + cv := load64(src, 0) + + // While in dict +searchDict: + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + if nextS > sLimit { + if debug { + fmt.Println("slimit reached", s, nextS) + } + break searchDict + } + candidateDict := int(dict.fastTable[hash0]) + candidateDict2 := int(dict.fastTable[hash1]) + candidate2 := int(table[hash1]) + candidate := int(table[hash0]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + + if repeat > s { + candidate := len(dict.dict) - repeat + s + if repeat-s >= 4 && uint32(cv) == load32(dict.dict, candidate) { + // Extend back + base := s + for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + s += 4 + candidate += 4 + for candidate < len(dict.dict)-8 && s <= len(src)-8 { + if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitRepeat(dst[d:], repeat, s-base) + if debug { + fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + cv = load64(src, s) + continue + } + } else if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + + nextEmit = s + if s >= sLimit { + break searchDict + } + if debug { + fmt.Println("emitted reg repeat", s-base, "s:", s) + } + cv = load64(src, s) + continue searchDict + } + if s == 0 { + cv = load64(src, nextS) + s = nextS + continue searchDict + } + // Start with table. These matches will always be closer. + if uint32(cv) == load32(src, candidate) { + goto emitMatch + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + goto emitMatch + } + + // Check dict. Dicts have longer offsets, so we want longer matches. + if cv == load64(dict.dict, candidateDict) { + table[hash2] = uint32(s + 2) + goto emitDict + } + + candidateDict = int(dict.fastTable[hash2]) + // Check if upper 7 bytes match + if candidateDict2 >= 1 { + if cv^load64(dict.dict, candidateDict2-1) < (1 << 8) { + table[hash2] = uint32(s + 2) + candidateDict = candidateDict2 + s++ + goto emitDict + } + } + + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + goto emitMatch + } + if candidateDict >= 2 { + // Check if upper 6 bytes match + if cv^load64(dict.dict, candidateDict-2) < (1 << 16) { + s += 2 + goto emitDict + } + } + + cv = load64(src, nextS) + s = nextS + continue searchDict + + emitDict: + { + if debug { + if load32(dict.dict, candidateDict) != load32(src, s) { + panic("dict emit mismatch") + } + } + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidateDict > 0 && s > nextEmit && dict.dict[candidateDict-1] == src[s-1] { + candidateDict-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. 
But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = s + (len(dict.dict)) - candidateDict + + // Extend the 4-byte match as long as possible. + s += 4 + candidateDict += 4 + for s <= len(src)-8 && len(dict.dict)-candidateDict >= 8 { + if diff := load64(src, s) ^ load64(dict.dict, candidateDict); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateDict += 8 + } + + // Matches longer than 64 are split. + if s <= sLimit || s-base < 8 { + d += emitCopy(dst[d:], repeat, s-base) + } else { + // Split to ensure we don't start a copy within next block + d += emitCopy(dst[d:], repeat, 4) + d += emitRepeat(dst[d:], repeat, s-base-4) + } + if false { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := dict.dict[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted dict copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index and continue loop to try new candidate. + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>8, tableBits) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s - 1) + cv = load64(src, s) + } + continue + } + emitMatch: + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. 
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + + // Search without dict: + if repeat > s { + repeat = 0 + } + + // No more dict + sLimit = len(src) - inputMargin + if s >= sLimit { + goto emitRemainder + } + if debug { + fmt.Println("non-dict matching at", s, "repeat:", repeat) + } + cv = load64(src, s) + if debug { + fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s) + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if repeat > 0 && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + if debug { + fmt.Println("emitted src repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. 
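+		// (Matches are anchored on 4-byte hashes, so bytes just before the
+		// anchor may also match; walking backwards lengthens the copy and
+		// shortens the literal that precedes it.)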
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + if debug && nextEmit != s { + fmt.Println("emitted ", len(src)-nextEmit, "literals") + } + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go new file mode 100644 index 0000000..4f45206 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -0,0 +1,162 @@ +//go:build !appengine && !noasm && gc +// +build !appengine,!noasm,gc + +package s2 + +import "github.com/klauspost/compress/internal/race" + +const hasAmd64Asm = true + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... 
+ limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) >= 4<<20 { + return encodeBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeBlockAsm4MB(dst, src) + } + if len(src) >= limit10B { + return encodeBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockAsm8B(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) > 4<<20 { + return encodeBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeBetterBlockAsm4MB(dst, src) + } + if len(src) >= limit10B { + return encodeBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBetterBlockAsm8B(dst, src) +} + +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockSnappy(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + if len(src) >= 64<<10 { + return encodeSnappyBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeSnappyBlockAsm64K(dst, src) + } + if len(src) >= limit10B { + return encodeSnappyBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeSnappyBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeSnappyBlockAsm8B(dst, src) +} + +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... 
+ limit8B = 512 + ) + if len(src) >= 64<<10 { + return encodeSnappyBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeSnappyBetterBlockAsm64K(dst, src) + } + if len(src) >= limit10B { + return encodeSnappyBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeSnappyBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeSnappyBetterBlockAsm8B(dst, src) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go new file mode 100644 index 0000000..47bac74 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -0,0 +1,796 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "fmt" + "math" + "math/bits" +) + +// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBest(dst, src []byte, dict *Dict) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + + debug = false + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + sLimitDict := len(src) - inputMargin + if sLimitDict > MaxDictSrcOffset-inputMargin { + sLimitDict = MaxDictSrcOffset - inputMargin + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + repeat := 1 + if dict != nil { + dict.initBest() + s = 0 + repeat = len(dict.dict) - dict.repeat + } + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + rep, dict bool + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + if dict != nil && s >= MaxDictSrcOffset { + dict = nil + if repeat > s { + repeat = math.MinInt32 + } + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. 
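+				// (Net-savings estimate: bytes covered by the match, minus the
+				// literals a later start would force, minus the encoded size of
+				// the copy/repeat token itself.)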
+ score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32, rep bool) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset, rep: rep} + s += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + return m + } + matchDict := func(candidate, s int, first uint32, rep bool) match { + if s >= MaxDictSrcOffset { + return match{offset: candidate, s: s} + } + // Calculate offset as if in continuous array with s + offset := -len(dict.dict) + candidate + if best.length != 0 && best.s-best.offset == s-offset && !rep { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + + if load32(dict.dict, candidate) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true} + s += 4 + if !rep { + for s < sLimitDict && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } else { + for s < len(src) && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } + m.length -= candidate + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. 
+ m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + if s > 0 { + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false)) + } + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false)) + } + { + if (dict == nil || repeat <= s) && repeat > 0 { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) + } else if s-repeat < -4 && dict != nil { + candidate := len(dict.dict) - (repeat - s) + best = bestOf(best, matchDict(candidate, s, uint32(cv), true)) + candidate++ + best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true)) + } + + if best.length > 0 { + hashS := hash4(cv>>8, sTableBits) + // s+1 + nextShort := sTable[hashS] + s := s + 1 + cv := load64(src, s) + hashL := hash8(cv, lTableBits) + nextLong := lTable[hashL] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) + + // Dict at + 1 + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + } + + // s+2 + if true { + hashS := hash4(cv>>8, sTableBits) + + nextShort = sTable[hashS] + s++ + cv = load64(src, s) + hashL := hash8(cv, lTableBits) + nextLong = lTable[hashL] + + if (dict == nil || repeat <= s) && repeat > 0 { + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat, s, uint32(cv), true)) + } else if repeat-s > 4 && dict != nil { + candidate := len(dict.dict) - (repeat - s) + best = bestOf(best, matchDict(candidate, s, uint32(cv), true)) + } + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) + + // Dict at +2 + // Very small gain + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + } + } + // Search for a match at best match end, see if that is better. + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 1-2 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 + const skipEnd = 1 + if sAt := best.s + best.length - skipEnd; sAt < sLimit { + + sBack := best.s + skipBeginning - skipEnd + backL := best.length - skipBeginning + // Load initial values + cv = load64(src, sBack) + + // Grab candidates... + next := lTable[hash8(load64(src, sAt), lTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + // Disabled: Extremely small gain + if false { + next = sTable[hash4(load64(src, sAt), sTableBits)] + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if !best.rep && !best.dict { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + s += best.length + + if offset > 65535 && s-base <= 5 && !best.rep { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + if debug && nextEmit != base { + fmt.Println("EMIT", base-nextEmit, "literals. base-after:", base) + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if best.rep { + if nextEmit > 0 || best.dict { + if debug { + fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], offset, best.length) + } else { + // First match without dict cannot be a repeat. + if debug { + fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + d += emitCopy(dst[d:], offset, best.length) + } + } else { + if debug { + fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + d += emitCopy(dst[d:], offset, best.length) + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + if debug && nextEmit != s { + fmt.Println("emitted ", len(src)-nextEmit, "literals") + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. 
It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBestSnappy(dst, src []byte) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + + return score - emitCopyNoRepeatSize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset} + s += 4 + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. 
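+				// score already folds in the encoded copy size, so score <= -m.s
+				// means the copy would cost at least as many bytes as it covers.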
+ m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) + + { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + if best.length > 0 { + // s+1 + nextShort := sTable[hash4(cv>>8, sTableBits)] + s := s + 1 + cv := load64(src, s) + nextLong := lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + + // s+2 + if true { + nextShort = sTable[hash4(cv>>8, sTableBits)] + s++ + cv = load64(src, s) + nextLong = lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + } + // Search for a match at best match end, see if that is better. + if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if true { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + + s += best.length + + if offset > 65535 && s-base <= 5 { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, best.length) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. 
+ if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeatSize(offset, length) + } + i = 5 + } + if length == 0 { + return i + } + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + if offset < 2048 { + // Emit 8 bytes, then rest as repeats... + return 2 + emitRepeatSize(offset, length-8) + } + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeatSize(offset, length-60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitCopyNoRepeatSize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeatSize(offset, length int) int { + if offset >= 65536 { + return 5 + 5*(length/64) + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + 3*(length/60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. +// Length must be at least 4 and < 1<<24 +func emitRepeatSize(offset, length int) int { + // Repeat offset, make length cheaper + if length <= 4+4 || (length < 8+4 && offset < 2048) { + return 2 + } + if length < (1<<8)+4+4 { + return 3 + } + if length < (1<<16)+(1<<8)+4 { + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= (1 << 16) - 4 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + } + if left > 0 { + return 5 + emitRepeatSize(offset, left) + } + return 5 +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go new file mode 100644 index 0000000..544cb1e --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -0,0 +1,1106 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "fmt" + "math/bits" +) + +// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4(u uint64, h uint8) uint32 { + const prime4bytes = 2654435761 + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash5(u uint64, h uint8) uint32 { + const prime5bytes = 889523592379 + return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. 
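+// hash7 is a multiplicative hash: u<<8 keeps only the low 7 bytes, the
+// multiply by a large odd prime diffuses those bits upward, and the final
+// shift keeps the top h bits of the product.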
+func hash7(u uint64, h uint8) uint32 { + const prime7bytes = 58295818150454627 + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + const prime8bytes = 0xcf1bbcdcb7a56463 + return uint32((u * prime8bytes) >> ((64 - h) & 63)) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. 
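+				// A repeat chunk re-uses the decoder's current offset, so only the
+				// length is encoded; see emitRepeat for the 2-5 byte layouts.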
+ d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
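+// Unlike encodeBlockBetterGo it never emits repeat chunks, so the output
+// stays decodable by standard Snappy.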
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = (s-nextEmit)>>7 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+ return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + + maxAhead = 8 // maximum bytes ahead without checking sLimit + + debug = false + ) + + sLimit := len(src) - inputMargin + if sLimit > MaxDictSrcOffset-maxAhead { + sLimit = MaxDictSrcOffset - maxAhead + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + + dict.initBetter() + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 0 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := len(dict.dict) - dict.repeat + + // While in dict +searchDict: + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + break searchDict + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + dictL := int(dict.betterTableLong[hashL]) + dictS := int(dict.betterTableShort[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if s != 0 { + if cv == valLong { + goto emitMatch + } + if cv == valShort { + candidateL = candidateS + goto emitMatch + } + } + + // Check dict repeat. 
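+			// repeat still reaches into the dictionary here (it was initialized
+			// to len(dict.dict)-dict.repeat), so source position s maps to the
+			// dictionary offset len(dict.dict)-repeat+s.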
+ if repeat >= s+4 { + candidate := len(dict.dict) - repeat + s + if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) { + // Extend back + base := s + for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + s += 4 + candidate += 4 + for candidate < len(dict.dict)-8 && s <= len(src)-8 { + if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitRepeat(dst[d:], repeat, s-base) + if debug { + fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + continue + } + } + // Don't try to find match at s==0 + if s == 0 { + cv = load64(src, nextS) + s = nextS + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + goto emitMatch + } + + // Long dict... + if uint32(cv) == load32(dict.dict, dictL) { + candidateL = dictL + goto emitDict + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + // Use our short candidate. + candidateL = candidateS + goto emitMatch + } + if uint32(cv) == load32(dict.dict, dictS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + candidateL = dictS + goto emitDict + } + cv = load64(src, nextS) + s = nextS + } + emitDict: + { + if debug { + if load32(dict.dict, candidateL) != load32(src, s) { + panic("dict emit mismatch") + } + } + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + offset := s + (len(dict.dict)) - candidateL + + // Extend the 4-byte match as long as possible. 
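+			// Compare eight bytes at a time with XOR; on a mismatch,
+			// bits.TrailingZeros64(diff)>>3 counts the leading bytes that still
+			// matched, since the loads are little-endian.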
+ s += 4 + candidateL += 4 + for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 { + if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if repeat == offset { + if debug { + fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + d += emitRepeat(dst[d:], offset, s-base) + } else { + if debug { + fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + // Matches longer than 64 are split. + if s <= sLimit || s-base < 8 { + d += emitCopy(dst[d:], offset, s-base) + } else { + // Split to ensure we don't start a copy within next block. + d += emitCopy(dst[d:], offset, 4) + d += emitRepeat(dst[d:], offset, s-base-4) + } + repeat = offset + } + if false { + // Validate match. + if s <= candidateL { + panic("s <= candidate") + } + a := src[base:s] + b := dict.dict[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } + } + continue + } + emitMatch: + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + if repeat == offset { + if debug { + fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s) + } + d += emitRepeat(dst[d:], offset, s-base) + } else { + if debug { + fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s) + } + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+ return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + + // Search without dict: + if repeat > s { + repeat = 0 + } + + // No more dict + sLimit = len(src) - inputMargin + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + if debug { + fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s) + } + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. 
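+			// The long table is keyed on 7 bytes, so a 4-byte hit against its
+			// candidate very likely extends to 7+ bytes and beats the short one.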
+ if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go new file mode 100644 index 0000000..6b393c3 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -0,0 +1,729 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package s2 + +import ( + "bytes" + "math/bits" +) + +const hasAmd64Asm = false + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlock(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
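+// Without the amd64 assembly this simply forwards to the pure Go
+// encodeBlockBetterGo.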
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + return encodeBlockBetterSnappyGo(dst, src) +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockSnappy(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockSnappyGo(dst, src) +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteral(dst, lit []byte) int { + if len(lit) == 0 { + return 0 + } + const num = 63<<2 | tagLiteral + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[1] = uint8(n) + dst[0] = 60<<2 | tagLiteral + i = 2 + case n < 1<<16: + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 61<<2 | tagLiteral + i = 3 + case n < 1<<24: + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 62<<2 | tagLiteral + i = 4 + default: + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 + } + return i + copy(dst[i:], lit) +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<24 +func emitRepeat(dst []byte, offset, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + return 2 + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + return 2 + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + return 3 + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + return 5 + emitRepeat(dst[5:], offset, left) + } + return 5 +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopy(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. 
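+			// Layout: one tag byte holding (64-1)<<2|tagCopy4, then the 32-bit
+			// offset little-endian; the length left over is emitted below.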
+ dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + return off + emitRepeat(dst[off:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeat(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. + dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitCopyNoRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitCopyNoRepeat(dst[3:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) +func matchLen(a []byte, b []byte) int { + b = b[:len(a)] + var checked int + if len(a) > 4 { + // Try 4 bytes first + if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { + return bits.TrailingZeros32(diff) >> 3 + } + // Switch to 8 byte matching. 
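+		// The b = b[:len(a)] re-slices are there to help the compiler elide
+		// bounds checks on the 8-byte loads (an inference from the pattern;
+		// behavior is the same either way).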
+ checked = 4 + a = a[4:] + b = b[4:] + for len(a) >= 8 { + b = b[:len(a)] + if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} + +// input must be > inputMargin +func calcBlockSize(src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 13 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteralSize(src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteralSize(src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. 
We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteralSize(src[nextEmit:]) + } + return d +} + +// length must be > inputMargin. +func calcBlockSizeSmall(src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 9 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. 
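+			// I.e. probe for a repeat one byte ahead of s: compare the 4 bytes
+			// at s+1 (cv>>8) against src[s-repeat+1:].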
+ const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteralSize(src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteralSize(src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteralSize(src[nextEmit:]) + } + return d +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
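+// Note: unlike emitLiteral, emitLiteralSize computes only the encoded size
+// (one tag byte, 0-4 extra length bytes, plus the literal itself); it does
+// not write anything.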
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteralSize(lit []byte) int { + if len(lit) == 0 { + return 0 + } + switch { + case len(lit) <= 60: + return len(lit) + 1 + case len(lit) <= 1<<8: + return len(lit) + 2 + case len(lit) <= 1<<16: + return len(lit) + 3 + case len(lit) <= 1<<24: + return len(lit) + 4 + default: + return len(lit) + 5 + } +} + +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4BlockAsm should be unreachable") +} + +func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4BlockSnappyAsm should be unreachable") +} + +func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockAsm should be unreachable") +} + +func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockSnappyAsm should be unreachable") +} diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go new file mode 100644 index 0000000..297e415 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -0,0 +1,228 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +package s2 + +func _dummy_() + +// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm(dst []byte, src []byte) int + +// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4MB(dst []byte, src []byte) int + +// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm12B(dst []byte, src []byte) int + +// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm10B(dst []byte, src []byte) int + +// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm8B(dst []byte, src []byte) int + +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte) int + +// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
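+// (The 4MB/12B/10B/8B suffixes appear to select internal hash-table sizes
+// tuned to each variant's stated maximum input; this is inferred from the
+// naming, as the bodies are generated assembly.)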
+// +//go:noescape +func encodeBetterBlockAsm4MB(dst []byte, src []byte) int + +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm8B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm(dst []byte, src []byte) int + +// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm64K(dst []byte, src []byte) int + +// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm12B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm10B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm8B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. 
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+
+// calcBlockSize returns the encoded size of a non-empty src without writing any output.
+// Maximum input 4294967295 bytes.
+//
+//go:noescape
+func calcBlockSize(src []byte) int
+
+// calcBlockSizeSmall returns the encoded size of a non-empty src without writing any output.
+// Maximum input 1024 bytes.
+//
+//go:noescape
+func calcBlockSizeSmall(src []byte) int
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// It assumes that:
+//
+//    dst is long enough to hold the encoded bytes with margin of 0 bytes
+//    0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+//go:noescape
+func emitLiteral(dst []byte, lit []byte) int
+
+// emitRepeat writes a repeat chunk and returns the number of bytes written.
+// Length must be at least 4 and < 1<<32
+//
+//go:noescape
+func emitRepeat(dst []byte, offset int, length int) int
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//
+//    dst is long enough to hold the encoded bytes
+//    1 <= offset && offset <= math.MaxUint32
+//    4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopy(dst []byte, offset int, length int) int
+
+// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//
+//    dst is long enough to hold the encoded bytes
+//    1 <= offset && offset <= math.MaxUint32
+//    4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopyNoRepeat(dst []byte, offset int, length int) int
+
+// matchLen returns how many bytes match in a and b.
+//
+// It assumes that:
+//
+//    len(a) <= len(b)
+//
+//go:noescape
+func matchLen(a []byte, b []byte) int
+
+// cvtLZ4BlockAsm converts an LZ4 block to S2.
+//
+//go:noescape
+func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+
+// cvtLZ4sBlockAsm converts an LZ4s block to S2.
+//
+//go:noescape
+func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+
+// cvtLZ4BlockSnappyAsm converts an LZ4 block to Snappy.
+//
+//go:noescape
+func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+
+// cvtLZ4sBlockSnappyAsm converts an LZ4s block to Snappy.
+//
+//go:noescape
+func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
new file mode 100644
index 0000000..2ff5b33
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -0,0 +1,21277 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
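The stubs above are backed by the 21,277-line generated listing that follows. Before wading in, it helps to pin down the byte format these routines promise. The sketch below is not the package's emitLiteral (that lives in the assembly); it is a hand-written pure-Go illustration of the Snappy-compatible literal framing the generated code emits, using the same 0xf0/0xf4/0xf8/0xfc tag constants that recur throughout the listing, and its header sizes agree with the emitLiteralSize switch in the fallback file. The name emitLiteralGo is ours, not the library's.

package main

import "fmt"

// emitLiteralGo writes a literal header (a tag byte whose low two bits are 00)
// followed by the raw bytes. The length is stored minus one; values up to 59
// fit in the tag byte itself, larger lengths use 1-4 extra little-endian
// bytes selected by tag values 60-63. dst must be large enough.
func emitLiteralGo(dst, lit []byte) int {
    if len(lit) == 0 {
        return 0
    }
    i, n := 0, len(lit)-1
    switch {
    case n < 60:
        dst[0] = byte(n) << 2 // length fits in the tag byte
        i = 1
    case n < 1<<8:
        dst[0] = 60 << 2 // 0xf0: one extra length byte
        dst[1] = byte(n)
        i = 2
    case n < 1<<16:
        dst[0] = 61 << 2 // 0xf4: two extra length bytes
        dst[1] = byte(n)
        dst[2] = byte(n >> 8)
        i = 3
    case n < 1<<24:
        dst[0] = 62 << 2 // 0xf8: three extra length bytes
        dst[1] = byte(n)
        dst[2] = byte(n >> 8)
        dst[3] = byte(n >> 16)
        i = 4
    default:
        dst[0] = 63 << 2 // 0xfc: four extra length bytes
        dst[1] = byte(n)
        dst[2] = byte(n >> 8)
        dst[3] = byte(n >> 16)
        dst[4] = byte(n >> 24)
        i = 5
    }
    return i + copy(dst[i:], lit)
}

func main() {
    dst := make([]byte, 16)
    n := emitLiteralGo(dst, []byte("hello"))
    fmt.Printf("% x\n", dst[:n]) // 10 68 65 6c 6c 6f; tag 0x10 = (5-1)<<2
}

Note how the one-byte case matches emitLiteralSize's len(lit)+1, the 0xf0 case its len(lit)+2, and so on.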
+ +//go:build !appengine && !noasm && gc && !noasm + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + +// func encodeBlockAsm(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BX + SUBL 16(SP), BX + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL SI, DI + JBE repeat_extend_back_end_encodeBlockAsm + MOVB -1(DX)(BX*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBlockAsm + LEAL -1(SI), SI + DECL BX + JNZ repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 5(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeBlockAsm: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_repeat_emit_encodeBlockAsm + CMPL BX, $0x01000000 + JB four_bytes_repeat_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +four_bytes_repeat_emit_encodeBlockAsm: + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +three_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +two_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeBlockAsm + JMP memmove_long_repeat_emit_encodeBlockAsm + +one_byte_repeat_emit_encodeBlockAsm: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +memmove_long_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeBlockAsm: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm + +matchlen_bsf_16repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match8_repeat_extend_encodeBlockAsm: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm + +matchlen_bsf_8_repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, 
R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match4_repeat_extend_encodeBlockAsm: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (BX)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm + JB repeat_extend_forward_end_encodeBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (BX)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_match1_repeat_extend_encodeBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (BX)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm: + ADDL R11, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBlockAsm: + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm + CMPL SI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBlockAsm: + CMPL BX, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm + CMPL BX, $0x00010100 + JB repeat_four_match_repeat_encodeBlockAsm + CMPL BX, $0x0100ffff + JB repeat_five_match_repeat_encodeBlockAsm + LEAL -16842747(BX), BX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBlockAsm + +repeat_five_match_repeat_encodeBlockAsm: + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_match_repeat_encodeBlockAsm: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_match_repeat_encodeBlockAsm: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_encodeBlockAsm: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_match_repeat_encodeBlockAsm: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_as_copy_encodeBlockAsm: + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeBlockAsm + CMPL BX, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BX), BX + ADDQ $0x05, AX + CMPL BX, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL BX, $0x0100ffff + JB 
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy + LEAL -16842747(BX), BX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBlockAsm: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm + XORL DI, DI + LEAL -1(DI)(BX*4), BX + MOVB BL, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_repeat_as_copy_encodeBlockAsm: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + CMPL SI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + MOVL SI, R8 + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, BX + + // emitRepeat + LEAL -4(BX), BX + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL BX, $0x0100ffff + JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(BX), BX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + 
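The block that just ended is one of many expansions of the generator's emitCopy template, recognizable by its constants: 0xee is 59<<2|2, a maximal 3-byte copy with a 16-bit offset, and 0xff is 63<<2|3, a maximal 5-byte copy with a 32-bit offset. As a rough map for reading those branches, here is a hand-written pure-Go sketch of the single-code cases (matches of 4 to 64 bytes). It is derived from the constants in the listing, not taken from the package, and emitCopyGo is our name; matches longer than 64 bytes are handled by emitting one maximal code and continuing with repeat codes, as the next sketch covers.

package main

import "fmt"

const (
    tagCopy1 = 0x01 // copy, 1 extra offset byte (offsets < 2048)
    tagCopy2 = 0x02 // copy, 2 extra offset bytes
    tagCopy4 = 0x03 // copy, 4 extra offset bytes (an S2 extension)
)

// emitCopyGo encodes a single copy code for 4 <= length <= 64. dst must be
// large enough.
func emitCopyGo(dst []byte, offset, length int) int {
    if offset >= 65536 {
        // 5-byte copy4: length-1 in the tag, 32-bit little-endian offset.
        dst[0] = byte(length-1)<<2 | tagCopy4
        dst[1] = byte(offset)
        dst[2] = byte(offset >> 8)
        dst[3] = byte(offset >> 16)
        dst[4] = byte(offset >> 24)
        return 5
    }
    if length >= 12 || offset >= 2048 {
        // 3-byte copy2: length-1 in the tag, 16-bit little-endian offset.
        dst[0] = byte(length-1)<<2 | tagCopy2
        dst[1] = byte(offset)
        dst[2] = byte(offset >> 8)
        return 3
    }
    // 2-byte copy1: lengths 4-11 and offsets < 2048; the offset's high
    // three bits ride in the top of the tag byte.
    dst[1] = byte(offset)
    dst[0] = byte(offset>>8)<<5 | byte(length-4)<<2 | tagCopy1
    return 2
}

func main() {
    dst := make([]byte, 8)
    n := emitCopyGo(dst, 1024, 10)
    fmt.Printf("% x\n", dst[:n]) // 99 00: the 2-byte copy1 form
}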
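The repeat_two through repeat_five labels are the matching emitRepeat expansions; their tag bytes 0x15, 0x19 and 0x1d are 5<<2|1, 6<<2|1 and 7<<2|1, that is, copy1 tags whose offset field is zero, which in S2 signals "reuse the previous copy's offset". Below is a sketch of that framing, assuming the standard S2 repeat encoding and modelled on the package's pure-Go fallback; emitRepeatGo is our name, and very long repeats (the 0x0100ffff comparison and the emit_repeat_again loop in the listing) chain multiple codes and are omitted here.

package main

import "fmt"

const tagCopy1 = 0x01

// emitRepeatGo writes one repeat code for a match of the given length at the
// previously used offset. The offset argument only matters for the short
// "encode with offset" form, where a real copy1 is no more expensive.
func emitRepeatGo(dst []byte, offset, length int) int {
    length -= 4 // lengths are biased by the 4-byte minimum match
    switch {
    case length <= 4:
        // 2 bytes: length in the tag, zero offset byte means repeat.
        dst[0] = byte(length)<<2 | tagCopy1
        dst[1] = 0
        return 2
    case length < 8 && offset < 2048:
        // 2 bytes, encoded with the offset as an ordinary copy1.
        dst[1] = byte(offset)
        dst[0] = byte(offset>>8)<<5 | byte(length)<<2 | tagCopy1
        return 2
    case length < (1<<8)+4:
        // 3 bytes, tag 0x15: one extra length byte.
        length -= 4
        dst[0] = 5<<2 | tagCopy1
        dst[1] = 0
        dst[2] = byte(length)
        return 3
    case length < (1<<16)+(1<<8):
        // 4 bytes, tag 0x19: two extra length bytes.
        length -= 1 << 8
        dst[0] = 6<<2 | tagCopy1
        dst[1] = 0
        dst[2] = byte(length)
        dst[3] = byte(length >> 8)
        return 4
    default:
        // 5 bytes, tag 0x1d: three extra length bytes.
        length -= 1 << 16
        dst[0] = 7<<2 | tagCopy1
        dst[1] = 0
        dst[2] = byte(length)
        dst[3] = byte(length >> 8)
        dst[4] = byte(length >> 16)
        return 5
    }
}

func main() {
    dst := make([]byte, 8)
    n := emitRepeatGo(dst, 1, 7) // repeat the previous offset for 7 bytes
    fmt.Printf("% x\n", dst[:n]) // 0d 00
}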
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +long_offset_short_repeat_as_copy_encodeBlockAsm: + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL BX, $0x0100ffff + JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short + LEAL -16842747(BX), BX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +emit_copy_three_repeat_as_copy_encodeBlockAsm: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm + +candidate2_match_encodeBlockAsm: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeBlockAsm: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL CX, SI + JBE match_extend_back_end_encodeBlockAsm + MOVB 
-1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeBlockAsm + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm + CMPL DI, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm + CMPL DI, $0x01000000 + JB four_bytes_match_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBlockAsm + +four_bytes_match_emit_encodeBlockAsm: + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeBlockAsm + JMP memmove_long_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm + +memmove_long_match_emit_encodeBlockAsm: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA 
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm: +match_nolit_loop_encodeBlockAsm: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm + +matchlen_bsf_16match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match8_match_nolit_encodeBlockAsm: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm + +matchlen_bsf_8_match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match4_match_nolit_encodeBlockAsm: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeBlockAsm + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeBlockAsm: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm + JB match_nolit_end_encodeBlockAsm + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeBlockAsm + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeBlockAsm + +matchlen_match1_match_nolit_encodeBlockAsm: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm + LEAL 1(R9), R9 + +match_nolit_end_encodeBlockAsm: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL BX, $0x00010000 + JB two_byte_offset_match_nolit_encodeBlockAsm + CMPL R9, $0x40 + JBE four_bytes_remain_match_nolit_encodeBlockAsm + MOVB $0xff, (AX) + MOVL BX, 1(AX) + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JB four_bytes_remain_match_nolit_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy + CMPL R9, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy + LEAL 
-16842747(R9), R9 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBlockAsm_emit_copy: + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +four_bytes_remain_match_nolit_encodeBlockAsm: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm + XORL SI, SI + LEAL -1(SI)(R9*4), R9 + MOVB R9, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm + CMPL BX, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB BL, 1(AX) + MOVL BX, DI + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R9 + + // emitRepeat + LEAL -4(R9), R9 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R9, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(R9), R9 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + XORQ SI, SI + LEAL 
1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +long_offset_short_match_nolit_encodeBlockAsm: + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R9, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short + LEAL -16842747(R9), R9 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_short_match_nolit_encodeBlockAsm: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy_three_match_nolit_encodeBlockAsm: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + SHLQ $0x10, BX + IMULQ R8, BX + SHRQ $0x32, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeBlockAsm + INCL CX + JMP search_loop_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ 
emit_literal_done_emit_remainder_encodeBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +four_bytes_emit_remainder_encodeBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm + JMP memmove_long_emit_remainder_encodeBlockAsm + +one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +memmove_long_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ 
-32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm4MB(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm4MB(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm4MB: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBlockAsm4MB + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm4MB + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BX + SUBL 16(SP), BX + JZ repeat_extend_back_end_encodeBlockAsm4MB + +repeat_extend_back_loop_encodeBlockAsm4MB: + CMPL SI, DI + JBE repeat_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(BX*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBlockAsm4MB + LEAL -1(SI), SI + DECL BX + JNZ repeat_extend_back_loop_encodeBlockAsm4MB + +repeat_extend_back_end_encodeBlockAsm4MB: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 4(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeBlockAsm4MB: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm4MB + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL BX, $0x00010000 + JB three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP 
memmove_long_repeat_emit_encodeBlockAsm4MB + +two_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +one_byte_repeat_emit_encodeBlockAsm4MB: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB + +memmove_long_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeBlockAsm4MB: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm4MB + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB + 
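A third template worth decoding once is matchlen_*, which reappears in every function here (it is the source of the TZCNT/BSF pairs switched by #ifdef GOAMD64_v3). The idea: XOR 8-byte words of the two inputs until they differ, then the trailing-zero count of the difference, divided by 8 (the SARQ $0x03), gives the number of matching leading bytes. Below is a pure-Go equivalent of the technique using math/bits; matchLenGo is our illustration, not the package's matchLen stub, and it assumes len(a) <= len(b) as the stub's contract states.

package main

import (
    "encoding/binary"
    "fmt"
    "math/bits"
)

// matchLenGo returns how many leading bytes of a and b are equal.
func matchLenGo(a, b []byte) (n int) {
    for len(a) >= 8 {
        diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
        if diff != 0 {
            // With a little-endian load, byte i occupies bits 8i..8i+7,
            // so the first differing bit / 8 is the matching byte count.
            return n + bits.TrailingZeros64(diff)>>3
        }
        n += 8
        a, b = a[8:], b[8:]
    }
    // Byte-at-a-time tail; the assembly does 4/2/1-byte steps instead.
    for i := range a {
        if a[i] != b[i] {
            break
        }
        n++
    }
    return n
}

func main() {
    fmt.Println(matchLenGo([]byte("abcdefgh12"), []byte("abcdefgh34"))) // 8
}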
+matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match8_repeat_extend_encodeBlockAsm4MB: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm4MB + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB + +matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match4_repeat_extend_encodeBlockAsm4MB: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm4MB + MOVL (R9)(R11*1), R10 + CMPL (BX)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm4MB: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm4MB + JB repeat_extend_forward_end_encodeBlockAsm4MB + MOVW (R9)(R11*1), R10 + CMPW (BX)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match1_repeat_extend_encodeBlockAsm4MB: + MOVB (R9)(R11*1), R10 + CMPB (BX)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm4MB + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm4MB: + ADDL R11, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm4MB + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: + CMPL BX, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm4MB + CMPL BX, $0x00010100 + JB repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_match_repeat_encodeBlockAsm4MB: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_match_repeat_encodeBlockAsm4MB: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBlockAsm4MB: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_as_copy_encodeBlockAsm4MB: + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + CMPL BX, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BX), BX + ADDQ $0x05, AX + CMPL BX, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JAE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm4MB + XORL DI, DI + LEAL -1(DI)(BX*4), BX + MOVB BL, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, BX + + // emitRepeat + LEAL -4(BX), BX + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP 
repeat_end_emit_encodeBlockAsm4MB + +long_offset_short_repeat_as_copy_encodeBlockAsm4MB: + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL BX, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(BX), BX + MOVL BX, SI + MOVW $0x001d, (AX) + MOVW BX, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm4MB: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm4MB + +no_repeat_found_encodeBlockAsm4MB: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBlockAsm4MB + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm4MB + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm4MB + +candidate3_match_encodeBlockAsm4MB: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm4MB + +candidate2_match_encodeBlockAsm4MB: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeBlockAsm4MB: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm4MB + +match_extend_back_loop_encodeBlockAsm4MB: + CMPL CX, SI + JBE match_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBlockAsm4MB + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBlockAsm4MB + JMP match_extend_back_loop_encodeBlockAsm4MB + +match_extend_back_end_encodeBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm4MB: + MOVL CX, SI + 
MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeBlockAsm4MB + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm4MB + CMPL DI, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +two_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB + +one_byte_match_emit_encodeBlockAsm4MB: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm4MB: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm4MB + +memmove_long_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: + MOVL CX, SI + 
SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm4MB + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB + +matchlen_bsf_16match_nolit_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_match8_match_nolit_encodeBlockAsm4MB: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm4MB + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm4MB + +matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_match4_match_nolit_encodeBlockAsm4MB: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm4MB + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeBlockAsm4MB + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeBlockAsm4MB: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm4MB + JB match_nolit_end_encodeBlockAsm4MB + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeBlockAsm4MB + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeBlockAsm4MB + +matchlen_match1_match_nolit_encodeBlockAsm4MB: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm4MB + LEAL 1(R9), R9 + +match_nolit_end_encodeBlockAsm4MB: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL BX, $0x00010000 + JB two_byte_offset_match_nolit_encodeBlockAsm4MB + CMPL R9, $0x40 + JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL BX, 1(AX) + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JB four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R9 + 
ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + XORL SI, SI + LEAL -1(SI)(R9*4), R9 + MOVB R9, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + CMPL BX, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm4MB + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R9 + + // emitRepeat + LEAL -4(R9), R9 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +long_offset_short_match_nolit_encodeBlockAsm4MB: + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R9, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R9), R9 + MOVL R9, BX + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BX + MOVB BL, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(R9), R9 + MOVW 
$0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm4MB + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm4MB + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +emit_copy_three_match_nolit_encodeBlockAsm4MB: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm4MB: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBlockAsm4MB + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + SHLQ $0x10, BX + IMULQ R8, BX + SHRQ $0x32, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeBlockAsm4MB + INCL CX + JMP search_loop_encodeBlockAsm4MB + +emit_remainder_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +three_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +two_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +one_byte_emit_remainder_encodeBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB + +memmove_long_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm12B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm12B(SB), $16408-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000080, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + 
MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm12B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBlockAsm12B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm12B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BX + SUBL 16(SP), BX + JZ repeat_extend_back_end_encodeBlockAsm12B + +repeat_extend_back_loop_encodeBlockAsm12B: + CMPL SI, DI + JBE repeat_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(BX*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBlockAsm12B + LEAL -1(SI), SI + DECL BX + JNZ repeat_extend_back_loop_encodeBlockAsm12B + +repeat_extend_back_end_encodeBlockAsm12B: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeBlockAsm12B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm12B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm12B + JB three_bytes_repeat_emit_encodeBlockAsm12B + +three_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +two_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + 
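+ // genMemMoveShort above copies a literal run of at most 64 bytes with
+ // overlapping unaligned loads/stores (1x8, 2x8, 2x16 or 4x16 bytes wide).
+ // genMemMoveLong below stages the first and last 32 bytes in X0-X3, copies
+ // the middle with 32-byte aligned MOVOA stores, and writes the staged edges
+ // last so any misaligned head/tail is still covered.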
+memmove_end_copy_repeat_emit_encodeBlockAsm12B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +memmove_long_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeBlockAsm12B: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm12B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match8_repeat_extend_encodeBlockAsm12B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm12B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm12B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match4_repeat_extend_encodeBlockAsm12B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm12B + MOVL (R9)(R11*1), R10 + CMPL (BX)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm12B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm12B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm12B + JB repeat_extend_forward_end_encodeBlockAsm12B + MOVW (R9)(R11*1), R10 + CMPW (BX)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm12B + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match1_repeat_extend_encodeBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (BX)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm12B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm12B: + ADDL R11, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + TESTL 
DI, DI + JZ repeat_as_copy_encodeBlockAsm12B + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm12B + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL SI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: + CMPL BX, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm12B + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_match_repeat_encodeBlockAsm12B: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_match_repeat_encodeBlockAsm12B: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_match_repeat_encodeBlockAsm12B: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_as_copy_encodeBlockAsm12B: + // emitCopy + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, BX + + // emitRepeat + LEAL -4(BX), BX + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +long_offset_short_repeat_as_copy_encodeBlockAsm12B: + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + 
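+ // emitRepeat picks the smallest repeat encoding that still fits the length:
+ // a two-byte form for short repeats (with the offset folded in when it is
+ // below 0x800), a three-byte 0x15 form, or a four-byte 0x19 form. The
+ // five-byte 0x1d form used by encodeBlockAsm4MB is absent in this variant,
+ // as matches in a block this small never reach lengths that would need it.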
+repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm12B + +no_repeat_found_encodeBlockAsm12B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBlockAsm12B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm12B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm12B + +candidate3_match_encodeBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm12B + +candidate2_match_encodeBlockAsm12B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeBlockAsm12B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm12B + +match_extend_back_loop_encodeBlockAsm12B: + CMPL CX, SI + JBE match_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBlockAsm12B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B + +match_extend_back_end_encodeBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm12B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeBlockAsm12B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm12B + JB three_bytes_match_emit_encodeBlockAsm12B + +three_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm12B + +two_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B + +one_byte_match_emit_encodeBlockAsm12B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm12B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +memmove_long_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm12B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm12B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_match8_match_nolit_encodeBlockAsm12B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm12B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL 
(R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_match4_match_nolit_encodeBlockAsm12B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm12B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeBlockAsm12B + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeBlockAsm12B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm12B + JB match_nolit_end_encodeBlockAsm12B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeBlockAsm12B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_match1_match_nolit_encodeBlockAsm12B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm12B + LEAL 1(R9), R9 + +match_nolit_end_encodeBlockAsm12B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B + CMPL BX, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm12B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R9 + + // emitRepeat + LEAL -4(R9), R9 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +long_offset_short_match_nolit_encodeBlockAsm12B: + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, 
(AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBlockAsm12B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm12B + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm12B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +emit_copy_three_match_nolit_encodeBlockAsm12B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm12B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBlockAsm12B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x18, DI + IMULQ R8, DI + SHRQ $0x34, DI + SHLQ $0x18, BX + IMULQ R8, BX + SHRQ $0x34, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeBlockAsm12B + INCL CX + JMP search_loop_encodeBlockAsm12B + +emit_remainder_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm12B + JB three_bytes_emit_remainder_encodeBlockAsm12B + +three_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +two_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +one_byte_emit_remainder_encodeBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP 
memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +memmove_long_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm10B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm10B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBlockAsm10B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + 
SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm10B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BX + SUBL 16(SP), BX + JZ repeat_extend_back_end_encodeBlockAsm10B + +repeat_extend_back_loop_encodeBlockAsm10B: + CMPL SI, DI + JBE repeat_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(BX*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBlockAsm10B + LEAL -1(SI), SI + DECL BX + JNZ repeat_extend_back_loop_encodeBlockAsm10B + +repeat_extend_back_end_encodeBlockAsm10B: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeBlockAsm10B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm10B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm10B + JB three_bytes_repeat_emit_encodeBlockAsm10B + +three_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +two_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +one_byte_repeat_emit_encodeBlockAsm10B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm10B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + +memmove_long_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + 
MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeBlockAsm10B: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm10B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match8_repeat_extend_encodeBlockAsm10B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm10B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm10B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match4_repeat_extend_encodeBlockAsm10B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm10B + MOVL (R9)(R11*1), R10 + CMPL (BX)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm10B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm10B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm10B + JB repeat_extend_forward_end_encodeBlockAsm10B + MOVW (R9)(R11*1), R10 + CMPW (BX)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm10B + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match1_repeat_extend_encodeBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (BX)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm10B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm10B: + ADDL R11, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm10B + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm10B + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL SI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: + CMPL BX, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm10B + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_match_repeat_encodeBlockAsm10B: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + 
MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_match_repeat_encodeBlockAsm10B: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_as_copy_encodeBlockAsm10B: + // emitCopy + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, BX + + // emitRepeat + LEAL -4(BX), BX + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +long_offset_short_repeat_as_copy_encodeBlockAsm10B: + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + + // emitRepeat + MOVL BX, DI + LEAL -4(BX), BX + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: + MOVL BX, DI + SHLL $0x02, DI + 
CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm10B + +no_repeat_found_encodeBlockAsm10B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBlockAsm10B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm10B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm10B + +candidate3_match_encodeBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm10B + +candidate2_match_encodeBlockAsm10B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeBlockAsm10B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm10B + +match_extend_back_loop_encodeBlockAsm10B: + CMPL CX, SI + JBE match_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBlockAsm10B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B + +match_extend_back_end_encodeBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm10B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeBlockAsm10B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm10B + JB three_bytes_match_emit_encodeBlockAsm10B + +three_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm10B + +two_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B + +one_byte_match_emit_encodeBlockAsm10B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP 
memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm10B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm10B + +memmove_long_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm10B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm10B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_match8_match_nolit_encodeBlockAsm10B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm10B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_match4_match_nolit_encodeBlockAsm10B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm10B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeBlockAsm10B + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeBlockAsm10B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm10B + JB match_nolit_end_encodeBlockAsm10B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeBlockAsm10B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeBlockAsm10B + 
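+ // matchLen tail: the 16-byte loop XORs 8-byte words and locates the first
+ // difference with TZCNT (GOAMD64_v3) or BSF; the 4- and 2-byte compares
+ // above and the final byte compare below finish the count, leaving the
+ // total match length in R9.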
+matchlen_match1_match_nolit_encodeBlockAsm10B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm10B + LEAL 1(R9), R9 + +match_nolit_end_encodeBlockAsm10B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B + CMPL BX, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm10B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R9 + + // emitRepeat + LEAL -4(R9), R9 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +long_offset_short_match_nolit_encodeBlockAsm10B: + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL BX, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm10B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI 
+ MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +emit_copy_three_match_nolit_encodeBlockAsm10B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm10B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBlockAsm10B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm10B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x36, DI + SHLQ $0x20, BX + IMULQ R8, BX + SHRQ $0x36, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL CX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm10B + JB three_bytes_emit_remainder_encodeBlockAsm10B + +three_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + 
JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm8B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x04, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBlockAsm8B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BX + SUBL 16(SP), BX + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL SI, DI + JBE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(BX*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(SI), SI + DECL BX + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL SI, BX + 
SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeBlockAsm8B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm8B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm8B + JB three_bytes_repeat_emit_encodeBlockAsm8B + +three_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BX + 
SUBL 16(SP), BX + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm8B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B + +matchlen_bsf_16repeat_extend_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match8_repeat_extend_encodeBlockAsm8B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm8B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm8B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match4_repeat_extend_encodeBlockAsm8B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm8B + MOVL (R9)(R11*1), R10 + CMPL (BX)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm8B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm8B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm8B + JB repeat_extend_forward_end_encodeBlockAsm8B + MOVW (R9)(R11*1), R10 + CMPW (BX)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm8B + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match1_repeat_extend_encodeBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (BX)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R11, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL BX, SI + LEAL -4(BX), BX + CMPL SI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm8B + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL BX, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + CMPL SI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, BX + + // emitRepeat + LEAL -4(BX), BX + JMP 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + MOVL BX, SI + LEAL -4(BX), BX + CMPL SI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +long_offset_short_repeat_as_copy_encodeBlockAsm8B: + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + + // emitRepeat + MOVL BX, SI + LEAL -4(BX), BX + CMPL SI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + CMPL BX, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + LEAL -256(BX), BX + MOVW $0x0019, (AX) + MOVW BX, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + LEAL -4(BX), BX + MOVW $0x0015, (AX) + MOVB BL, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BX*4), BX + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBlockAsm8B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm8B + +no_repeat_found_encodeBlockAsm8B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBlockAsm8B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm8B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm8B + +candidate3_match_encodeBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm8B + +candidate2_match_encodeBlockAsm8B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeBlockAsm8B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm8B + +match_extend_back_loop_encodeBlockAsm8B: + CMPL CX, SI + JBE match_extend_back_end_encodeBlockAsm8B + MOVB 
-1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBlockAsm8B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBlockAsm8B + JMP match_extend_back_loop_encodeBlockAsm8B + +match_extend_back_end_encodeBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm8B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeBlockAsm8B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm8B + JB three_bytes_match_emit_encodeBlockAsm8B + +three_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm8B + +two_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm8B + +one_byte_match_emit_encodeBlockAsm8B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm8B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm8B + +memmove_long_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE 
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm8B: +match_nolit_loop_encodeBlockAsm8B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm8B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm8B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B + +matchlen_bsf_16match_nolit_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_match8_match_nolit_encodeBlockAsm8B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm8B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_match4_match_nolit_encodeBlockAsm8B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm8B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeBlockAsm8B + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeBlockAsm8B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm8B + JB match_nolit_end_encodeBlockAsm8B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeBlockAsm8B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_match1_match_nolit_encodeBlockAsm8B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm8B + LEAL 1(R9), R9 + +match_nolit_end_encodeBlockAsm8B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B + CMPL BX, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm8B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R9 + + // emitRepeat + LEAL -4(R9), R9 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + MOVL R9, BX + LEAL -4(R9), R9 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + 
SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +long_offset_short_match_nolit_encodeBlockAsm8B: + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, BX + LEAL -4(R9), R9 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: + CMPL R9, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BL, 1(AX) + SARL $0x08, BX + SHLL $0x05, BX + ORL BX, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBlockAsm8B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm8B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +emit_copy_three_match_nolit_encodeBlockAsm8B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm8B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBlockAsm8B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm8B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x38, DI + SHLQ $0x20, BX + IMULQ R8, BX + SHRQ $0x38, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeBlockAsm8B + INCL CX + JMP search_loop_encodeBlockAsm8B + +emit_remainder_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm8B + JB three_bytes_emit_remainder_encodeBlockAsm8B + +three_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +two_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +one_byte_emit_remainder_encodeBlockAsm8B: + SHLB 
$0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + +memmove_long_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm(SB), $589848-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00001200, CX + LEAQ 24(SP), DX + PXOR X0, X0 
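+	
+	// The zero loop below clears this function's on-stack hash tables:
+	// 0x1200 iterations of 128 bytes = 589,824 bytes, i.e. a 2^17-entry
+	// long-match table at 24(SP) plus a 2^14-entry short-match table at
+	// 524312(SP), 4 bytes per entry. The search loop then indexes them with
+	// the two multiplicative hashes visible below; a rough Go rendering,
+	// with illustrative names:
+	//
+	//	func hashLong(v uint64) uint32 { // hashes the low 7 bytes -> 17-bit index
+	//		return uint32(((v << 8) * 0x00cf1bbcdcbfa563) >> (64 - 17))
+	//	}
+	//
+	//	func hashShort(v uint64) uint32 { // hashes the low 4 bytes -> 14-bit index
+	//		return uint32(((v << 32) * 0x9e3779b1) >> (64 - 14))
+	//	}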
+ +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x07, BX + CMPL BX, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(CX), BX + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(CX)(BX*1), BX + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL BX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 524312(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 524312(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeBetterBlockAsm + CMPQ R10, SI + JNE no_short_found_encodeBetterBlockAsm + MOVL DI, BX + JMP candidate_match_encodeBetterBlockAsm + +no_short_found_encodeBetterBlockAsm: + CMPL R9, SI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R10, SI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL DI, BX + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, SI + JBE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_match8_match_nolit_encodeBetterBlockAsm: + CMPL DI, $0x08 + JB 
matchlen_match4_match_nolit_encodeBetterBlockAsm + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_match4_match_nolit_encodeBetterBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm + JB match_nolit_end_encodeBetterBlockAsm + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_match1_match_nolit_encodeBetterBlockAsm: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL 16(SP), DI + JEQ match_is_repeat_encodeBetterBlockAsm + CMPL R11, $0x01 + JA match_length_ok_encodeBetterBlockAsm + CMPL DI, $0x0000ffff + JBE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_match_emit_encodeBetterBlockAsm + CMPL BX, $0x01000000 + JB four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + 
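+	// The memmove_move_* blocks in this genMemMoveShort expansion use the
+	// standard overlapping head/tail trick: an n-byte literal copy is done
+	// with two (or four) wide loads and stores that may overlap in the
+	// middle, avoiding a byte loop. A minimal Go sketch of the 8-to-16-byte
+	// class below (helper name is illustrative; assumes 8 <= n <= 16):
+	//
+	//	import "encoding/binary"
+	//
+	//	func copy8through16(dst, src []byte, n int) {
+	//		head := binary.LittleEndian.Uint64(src)       // first 8 bytes
+	//		tail := binary.LittleEndian.Uint64(src[n-8:]) // last 8 bytes, may overlap head
+	//		binary.LittleEndian.PutUint64(dst, head)
+	//		binary.LittleEndian.PutUint64(dst[n-8:], tail)
+	//	}
+	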
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_match_nolit_encodeBetterBlockAsm + CMPL R11, $0x40 + JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JB four_bytes_remain_match_nolit_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x0100ffff + JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R11), R11 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW 
$0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + XORL BX, BX + LEAL -1(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + CMPL DI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm + MOVL $0x00000001, BX + LEAL 16(BX), BX + MOVB DI, 1(AX) + MOVL DI, R8 + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, BX + MOVB BL, (AX) + ADDQ $0x02, AX + SUBL $0x08, R11 + + // emitRepeat + LEAL -4(R11), R11 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R11, $0x0100ffff + JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + LEAL -16842747(R11), R11 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + XORQ BX, BX + LEAL 
1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +long_offset_short_match_nolit_encodeBetterBlockAsm: + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x0100ffff + JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R11), R11 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +match_is_repeat_encodeBetterBlockAsm: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm + CMPL BX, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL BX, $0x01000000 + JB four_bytes_match_emit_repeat_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + 
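+	// The literal-header branches above follow the Snappy/S2 literal tag
+	// layout: for up to 60 literal bytes, (n-1)<<2 fits in the tag byte
+	// itself; longer runs store 60..63 in the tag (hence the 0xf0, 0xf4,
+	// 0xf8, 0xfc constants) followed by 1..4 little-endian length bytes.
+	// A hedged Go sketch (function name illustrative):
+	//
+	//	func emitLiteralHeader(dst []byte, litLen int) int {
+	//		v := uint32(litLen - 1)
+	//		switch {
+	//		case v < 60:
+	//			dst[0] = uint8(v) << 2
+	//			return 1
+	//		case v < 1<<8:
+	//			dst[0], dst[1] = 60<<2, uint8(v) // 0xf0
+	//			return 2
+	//		case v < 1<<16:
+	//			dst[0] = 61 << 2 // 0xf4
+	//			binary.LittleEndian.PutUint16(dst[1:], uint16(v))
+	//			return 3
+	//		case v < 1<<24:
+	//			dst[0] = 62 << 2 // 0xf8
+	//			binary.LittleEndian.PutUint16(dst[1:], uint16(v))
+	//			dst[3] = uint8(v >> 16)
+	//			return 4
+	//		default:
+	//			dst[0] = 63 << 2 // 0xfc
+	//			binary.LittleEndian.PutUint32(dst[1:], v)
+	//			return 5
+	//		}
+	//	}
+	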
+four_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +three_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +two_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +one_byte_match_emit_repeat_encodeBetterBlockAsm: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: + MOVQ BX, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + +memmove_long_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitRepeat +emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm + CMPL R11, $0x0100ffff + JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm + LEAL -16842747(R11), R11 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm + +repeat_five_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + SHLQ $0x08, R11 + IMULQ BX, R11 + SHRQ $0x2f, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x32, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 524312(SP)(R10*4) + MOVL R13, 524312(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeBetterBlockAsm: + CMPQ DI, R8 + JAE search_loop_encodeBetterBlockAsm + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x08, R10 + IMULQ BX, R10 + SHRQ $0x2f, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + 
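+	// Bounds check above: (SP) holds a precomputed destination limit, and
+	// the remaining literals plus the worst-case 5-byte literal header must
+	// fit below it, otherwise the function returns 0, which the caller
+	// treats as "did not compress into the dst budget". (The smaller
+	// Asm10B/8B variants reserve only 3 bytes, their maximum header size.)
+	// Roughly, with illustrative names:
+	//
+	//	if dPos+remainingLiterals+5 > dLimit {
+	//		return 0 // output would overflow the dst budget
+	//	}
+	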
+emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // 
genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00001200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm4MB: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x07, BX + CMPL BX, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm4MB + LEAL 100(CX), BX + JMP check_maxskip_cont_encodeBetterBlockAsm4MB + +check_maxskip_ok_encodeBetterBlockAsm4MB: + LEAL 1(CX)(BX*1), BX + +check_maxskip_cont_encodeBetterBlockAsm4MB: + CMPL BX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm4MB + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 524312(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 524312(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPQ R10, SI + JNE no_short_found_encodeBetterBlockAsm4MB + MOVL DI, BX + JMP candidate_match_encodeBetterBlockAsm4MB + +no_short_found_encodeBetterBlockAsm4MB: + CMPL R9, SI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL R10, SI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB + +candidateS_match_encodeBetterBlockAsm4MB: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBetterBlockAsm4MB + DECL CX + MOVL DI, BX + +candidate_match_encodeBetterBlockAsm4MB: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBetterBlockAsm4MB + +match_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL 
CX, SI + JBE match_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBetterBlockAsm4MB + JMP match_extend_back_loop_encodeBetterBlockAsm4MB + +match_extend_back_end_encodeBetterBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm4MB: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match8_match_nolit_encodeBetterBlockAsm4MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + JB match_nolit_end_encodeBetterBlockAsm4MB + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm4MB + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm4MB: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL 16(SP), DI + JEQ match_is_repeat_encodeBetterBlockAsm4MB + CMPL R11, $0x01 + JA match_length_ok_encodeBetterBlockAsm4MB + CMPL DI, $0x0000ffff + JBE match_length_ok_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm4MB + +match_length_ok_encodeBetterBlockAsm4MB: + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm4MB + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm4MB + CMPL BX, $0x00010000 + JB 
three_bytes_match_emit_encodeBetterBlockAsm4MB + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +three_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +two_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +one_byte_match_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB + +memmove_long_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 
16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm4MB: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + CMPL R11, $0x40 + JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + XORL BX, BX + LEAL -1(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + CMPL DI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVL $0x00000001, BX + LEAL 16(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + SUBL $0x08, R11 + + // emitRepeat + LEAL -4(R11), R11 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL 
$0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +long_offset_short_match_nolit_encodeBetterBlockAsm4MB: + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +match_is_repeat_encodeBetterBlockAsm4MB: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, 
R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL BX, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL BX, $0x00010000 + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVQ BX, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + +memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL R11, $0x00010100 + JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB + LEAL -65536(R11), R11 + MOVL R11, DI + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm4MB + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm4MB: + MOVQ $0x00cf1bbcdcbfa563, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + SHLQ $0x08, R11 + IMULQ BX, R11 + SHRQ $0x2f, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x32, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 524312(SP)(R10*4) + MOVL R13, 524312(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeBetterBlockAsm4MB: + CMPQ DI, R8 + JAE search_loop_encodeBetterBlockAsm4MB + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x08, R10 + IMULQ BX, R10 + SHRQ $0x2f, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeBetterBlockAsm4MB + +emit_remainder_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, 
ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +three_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +two_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +one_byte_emit_remainder_encodeBetterBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + +memmove_long_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI 
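+	// DI below becomes the count of 32-byte chunks. X0-X3 already hold the
+	// first and last 32 bytes of the source, so the loop can stream the
+	// middle with aligned MOVOA stores and patch the unaligned head and
+	// tail afterwards. Roughly, in Go (illustrative only, assuming n >= 64):
+	//
+	//	for off := headLen; off+32 <= n; off += 32 {
+	//		copy(dst[off:off+32], src[off:off+32]) // aligned 32B chunk
+	//	}
+	//	// then store the saved head and tail blocks over the edges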
+ SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm12B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm12B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BX + MOVL 65560(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 65560(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPQ R10, SI + JNE no_short_found_encodeBetterBlockAsm12B + MOVL DI, BX + JMP candidate_match_encodeBetterBlockAsm12B + +no_short_found_encodeBetterBlockAsm12B: + CMPL R9, SI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL R10, SI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL DI, BX + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, SI + JBE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + 
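+	// The loop above widens the match to the left: while the byte before
+	// the current position equals the byte before the candidate, both are
+	// stepped back (stopping at the last emitted literal and at offset 0).
+	// Equivalent Go, with illustrative names (s = current position, t =
+	// candidate, nextEmit = first unemitted byte; none are symbols here):
+	//
+	//	for s > nextEmit && t > 0 && src[s-1] == src[t-1] {
+	//		s--
+	//		t--
+	//	}
+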
+match_extend_back_end_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeBetterBlockAsm12B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeBetterBlockAsm12B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm12B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + JB match_nolit_end_encodeBetterBlockAsm12B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match1_match_nolit_encodeBetterBlockAsm12B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm12B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL 16(SP), DI + JEQ match_is_repeat_encodeBetterBlockAsm12B + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm12B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm12B + JB three_bytes_match_emit_encodeBetterBlockAsm12B + +three_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + 
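+	// Size-class dispatch for copying 1..64 literal bytes. Each class uses
+	// a pair of overlapping loads/stores (one anchored at the start, one at
+	// the end of the run) so no byte loop is needed. The 8..16-byte class,
+	// sketched in Go (binary.LittleEndian shown for illustration only):
+	//
+	//	a := binary.LittleEndian.Uint64(src)
+	//	b := binary.LittleEndian.Uint64(src[n-8:])
+	//	binary.LittleEndian.PutUint64(dst, a)
+	//	binary.LittleEndian.PutUint64(dst[n-8:], b) // overlaps a when n < 16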
CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + CMPL DI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVL $0x00000001, BX + LEAL 16(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + SUBL $0x08, R11 + + // emitRepeat + LEAL -4(R11), R11 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL BX, 
$0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +long_offset_short_match_nolit_encodeBetterBlockAsm12B: + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +match_is_repeat_encodeBetterBlockAsm12B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B + CMPL BX, $0x00000100 + JB 
two_bytes_match_emit_repeat_encodeBetterBlockAsm12B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +one_byte_match_emit_repeat_encodeBetterBlockAsm12B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm12B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x34, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 65560(SP)(R10*4) + MOVL R13, 65560(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeBetterBlockAsm12B: + CMPQ DI, R8 + JAE search_loop_encodeBetterBlockAsm12B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm12B + JB three_bytes_emit_remainder_encodeBetterBlockAsm12B + +three_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + 
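+	// Literal headers here follow the Snappy/S2 tag scheme on len-1: values
+	// below 60 are packed into the tag byte itself ((len-1)<<2); longer runs
+	// use tag 0xf0 plus one length byte, or 0xf4 plus a little-endian
+	// two-byte length (this 12B variant never needs the wider 0xf8/0xfc
+	// forms). A Go sketch with illustrative names only:
+	//
+	//	switch n := litLen - 1; {
+	//	case n < 60:
+	//		dst[0] = uint8(n) << 2
+	//	case n < 1<<8:
+	//		dst[0], dst[1] = 0xf0, uint8(n)
+	//	default:
+	//		dst[0] = 0xf4
+	//		binary.LittleEndian.PutUint16(dst[1:], uint16(n))
+	//	}
+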
+two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BX + MOVL 16408(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 16408(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPQ R10, SI + JNE no_short_found_encodeBetterBlockAsm10B + MOVL DI, BX + JMP candidate_match_encodeBetterBlockAsm10B + +no_short_found_encodeBetterBlockAsm10B: + CMPL R9, SI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL R10, SI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL CX + MOVL DI, BX + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, SI + JBE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B + LEAL -16(DI), DI + LEAL 
16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeBetterBlockAsm10B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeBetterBlockAsm10B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm10B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + JB match_nolit_end_encodeBetterBlockAsm10B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match1_match_nolit_encodeBetterBlockAsm10B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm10B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL 16(SP), DI + JEQ match_is_repeat_encodeBetterBlockAsm10B + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm10B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm10B + JB three_bytes_match_emit_encodeBetterBlockAsm10B + +three_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + 
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + CMPL DI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVL $0x00000001, BX + LEAL 16(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + SUBL $0x08, R11 + + // emitRepeat + LEAL -4(R11), R11 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R11 + 
ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +long_offset_short_match_nolit_encodeBetterBlockAsm10B: + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +match_is_repeat_encodeBetterBlockAsm10B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +one_byte_match_emit_repeat_encodeBetterBlockAsm10B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL DI, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + CMPL R11, $0x00000104 + JB 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm10B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x36, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 16408(SP)(R10*4) + MOVL R13, 16408(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeBetterBlockAsm10B: + CMPQ DI, R8 + JAE search_loop_encodeBetterBlockAsm10B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x34, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm10B + JB three_bytes_emit_remainder_encodeBetterBlockAsm10B + +three_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ 
BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ 
zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm8B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x04, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm8B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BX + MOVL 4120(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 4120(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPQ R10, SI + JNE no_short_found_encodeBetterBlockAsm8B + MOVL DI, BX + JMP candidate_match_encodeBetterBlockAsm8B + +no_short_found_encodeBetterBlockAsm8B: + CMPL R9, SI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL R10, SI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL DI, BX + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL CX, SI + JBE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeBetterBlockAsm8B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 
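+	// R11 now holds the total match length: TZCNT/BSF located the first
+	// differing bit in the XOR of the two 8-byte words, and SARQ $0x03
+	// converted that bit index into a count of equal leading bytes.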
+ JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeBetterBlockAsm8B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm8B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + JB match_nolit_end_encodeBetterBlockAsm8B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match1_match_nolit_encodeBetterBlockAsm8B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL 16(SP), DI + JEQ match_is_repeat_encodeBetterBlockAsm8B + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm8B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm8B + JB three_bytes_match_emit_encodeBetterBlockAsm8B + +three_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ BX, AX + JMP 
emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + CMPL DI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVL $0x00000001, BX + LEAL 16(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + SUBL $0x08, R11 + + // emitRepeat + LEAL -4(R11), R11 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +long_offset_short_match_nolit_encodeBetterBlockAsm8B: + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +match_is_repeat_encodeBetterBlockAsm8B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +one_byte_match_emit_repeat_encodeBetterBlockAsm8B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R8), R9 + MOVL R9, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R11, BX + LEAL -4(R11), R11 + CMPL BX, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B + CMPL BX, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: + CMPL R11, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ BX, BX + LEAL 1(BX)(R11*4), R11 + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm8B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x38, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x38, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 4120(SP)(R10*4) + MOVL R13, 
4120(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeBetterBlockAsm8B: + CMPQ DI, R8 + JAE search_loop_encodeBetterBlockAsm8B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x36, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x36, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeBetterBlockAsm8B + +emit_remainder_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm8B + JB three_bytes_emit_remainder_encodeBetterBlockAsm8B + +three_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +two_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +one_byte_emit_remainder_encodeBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU 
X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeSnappyBlockAsm + +repeat_extend_back_loop_encodeSnappyBlockAsm: + CMPL SI, BX + JBE repeat_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm + +repeat_extend_back_end_encodeSnappyBlockAsm: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 5(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, 
ret+48(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL BX, $0x01000000 + JB four_bytes_repeat_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +four_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVL BX, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +three_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +two_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +one_byte_repeat_emit_encodeSnappyBlockAsm: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm + +memmove_long_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU 
-32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + JB repeat_extend_forward_end_encodeSnappyBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeSnappyBlockAsm: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: + CMPL BX, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BX), BX + ADDQ $0x05, AX + CMPL BX, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: + TESTL BX, BX + JZ repeat_end_emit_encodeSnappyBlockAsm + XORL DI, DI + LEAL -1(DI)(BX*4), BX + MOVB BL, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + 
ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm + +no_repeat_found_encodeSnappyBlockAsm: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBlockAsm + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeSnappyBlockAsm + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm + +candidate3_match_encodeSnappyBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm + +candidate2_match_encodeSnappyBlockAsm: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeSnappyBlockAsm: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBlockAsm + +match_extend_back_loop_encodeSnappyBlockAsm: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBlockAsm + JMP match_extend_back_loop_encodeSnappyBlockAsm + +match_extend_back_end_encodeSnappyBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm + CMPL DI, $0x00010000 + JB three_bytes_match_emit_encodeSnappyBlockAsm + CMPL DI, $0x01000000 + JB four_bytes_match_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +four_bytes_match_emit_encodeSnappyBlockAsm: + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +three_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +two_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +one_byte_match_emit_encodeSnappyBlockAsm: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm + +memmove_long_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm: +match_nolit_loop_encodeSnappyBlockAsm: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBlockAsm: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: +#ifdef 
GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBlockAsm: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm + JB match_nolit_end_encodeSnappyBlockAsm + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_match1_match_nolit_encodeSnappyBlockAsm: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm + LEAL 1(R9), R9 + +match_nolit_end_encodeSnappyBlockAsm: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL BX, $0x00010000 + JB two_byte_offset_match_nolit_encodeSnappyBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: + CMPL R9, $0x40 + JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL BX, 1(AX) + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBlockAsm: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm + XORL SI, SI + LEAL -1(SI)(R9*4), R9 + MOVB R9, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBlockAsm: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBlockAsm: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + SHLQ $0x10, BX + IMULQ R8, BX + SHRQ $0x32, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm + INCL CX + JMP search_loop_encodeSnappyBlockAsm + +emit_remainder_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ 
(DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +four_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +three_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +two_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +one_byte_emit_remainder_encodeSnappyBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm + +memmove_long_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + 
SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm64K: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm64K + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm64K + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeSnappyBlockAsm64K + +repeat_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL SI, BX + JBE repeat_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K + +repeat_extend_back_end_encodeSnappyBlockAsm64K: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm64K: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm64K + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K + JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K + 
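For reference, the one/two/three-byte branches at this point implement the Snappy literal header: the low two bits of the tag byte are 00, and the upper six bits either hold the length directly or select how many extra length bytes follow (0xf0 is 60<<2 for one extra byte, 0xf4 is 61<<2 for two). A minimal Go sketch of that dispatch, with an illustrative name (emitLiteralHeader is not a symbol in this file):

package snappyref

// emitLiteralHeader mirrors the literal-size branch chain in the
// generated assembly. The 64K variant stops at the three-byte form,
// since its literals are always shorter than 1<<16.
func emitLiteralHeader(dst []byte, litLen int) int {
	n := litLen - 1
	switch {
	case n < 60:
		dst[0] = byte(n) << 2 // one_byte_*: length fits in the tag itself
		return 1
	case n < 1<<8:
		dst[0] = 60 << 2 // two_bytes_*: tag 0xf0, one length byte
		dst[1] = byte(n)
		return 2
	default:
		dst[0] = 61 << 2 // three_bytes_*: tag 0xf4, two length bytes
		dst[1] = byte(n)
		dst[2] = byte(n >> 8)
		return 3
	}
}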
+three_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +two_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm64K + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +one_byte_repeat_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + +memmove_long_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ 
matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + JB repeat_extend_forward_end_encodeSnappyBlockAsm64K + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeSnappyBlockAsm64K: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm64K + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm64K: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm64K + +no_repeat_found_encodeSnappyBlockAsm64K: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBlockAsm64K + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeSnappyBlockAsm64K + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm64K + +candidate3_match_encodeSnappyBlockAsm64K: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm64K + +candidate2_match_encodeSnappyBlockAsm64K: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + 
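The matchLen loop above compares eight bytes at a time: the first non-zero XOR marks a mismatch, and TZCNT (or BSF on pre-v3 targets) of that XOR, divided by eight, counts how many of those bytes still matched. A sketch of the same idea in Go, assuming len(a) <= len(b); the assembly additionally unrolls to sixteen bytes per iteration with two load/XOR pairs:

package snappyref

import (
	"encoding/binary"
	"math/bits"
)

// matchLen returns the length of the common prefix of a and b,
// using the XOR + trailing-zero-count trick from the assembly.
func matchLen(a, b []byte) int {
	n := 0
	for len(a)-n >= 8 {
		x := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if x != 0 {
			return n + bits.TrailingZeros64(x)/8 // first differing byte
		}
		n += 8
	}
	for n < len(a) && a[n] == b[n] {
		n++
	}
	return n
}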
+candidate_match_encodeSnappyBlockAsm64K: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBlockAsm64K + +match_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBlockAsm64K + +match_extend_back_end_encodeSnappyBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm64K: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm64K + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm64K + JB three_bytes_match_emit_encodeSnappyBlockAsm64K + +three_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +two_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +one_byte_match_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K + +memmove_long_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + 
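The genMemMoveLong sequence this sets up loads the first and last 32 bytes into registers (X0..X3) before the loop, copies the middle in 32-byte blocks aligned to the destination, and stores the saved head and tail last so the unaligned edges are covered. A rough Go sketch of the block structure, assuming non-overlapping buffers of more than 64 bytes (the only case the assembly routes here); the real code also tolerates forward overlap, which this sketch does not model:

package snappyref

// memmoveLong copies src to dst in the head/middle/tail order used by
// the generated long-move path.
func memmoveLong(dst, src []byte) {
	n := len(src) // assumed > 64
	var head, tail [32]byte
	copy(head[:], src[:32])
	copy(tail[:], src[n-32:])
	for i := 32; i < n-32; i += 32 { // 32-byte middle blocks
		copy(dst[i:i+32], src[i:i+32])
	}
	copy(dst[:32], head[:]) // edges stored last
	copy(dst[n-32:], tail[:])
}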
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm64K: +match_nolit_loop_encodeSnappyBlockAsm64K: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + JB match_nolit_end_encodeSnappyBlockAsm64K + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm64K + LEAL 1(R9), R9 + +match_nolit_end_encodeSnappyBlockAsm64K: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + CMPL BX, 
$0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm64K + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm64K: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + SHLQ $0x10, BX + IMULQ R8, BX + SHRQ $0x32, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm64K + INCL CX + JMP search_loop_encodeSnappyBlockAsm64K + +emit_remainder_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm64K + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K + JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K + +three_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + 
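The genMemMoveShort size classes that follow all use the same overlapping-load trick: one load anchored at the start and one anchored at the last byte cover any length in the class with exactly two stores (two MOVQs for 8..16 bytes, two MOVOUs for 17..32, four for 33..64). A Go sketch of the 8-through-16 case, with an illustrative name:

package snappyref

import "encoding/binary"

// move8Through16 copies n bytes (8 <= n <= 16) with two possibly
// overlapping 8-byte loads and stores, as in memmove_move_8through16.
func move8Through16(dst, src []byte) {
	n := len(src)
	lo := binary.LittleEndian.Uint64(src)
	hi := binary.LittleEndian.Uint64(src[n-8:])
	binary.LittleEndian.PutUint64(dst, lo)
	binary.LittleEndian.PutUint64(dst[n-8:], hi)
}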
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000080, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm12B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm12B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + 
CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm12B + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeSnappyBlockAsm12B + +repeat_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL SI, BX + JBE repeat_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B + +repeat_extend_back_end_encodeSnappyBlockAsm12B: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm12B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm12B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B + +three_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +two_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +one_byte_repeat_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + +memmove_long_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + 
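The search loop of this 12B variant hashes with the constant 0x000000cf1bbcdcbb: SHLQ $0x18 keeps the low five bytes of the eight-byte load in the top of the register, IMULQ mixes them, and SHRQ $0x34 keeps the top 12 bits, i.e. an index into the 4096-entry (16 KiB) table the $16408-56 frame allocates. The 64K variant differs only in its shifts (0x10/0x32, hashing six bytes into a 14-bit index). A sketch, with an illustrative name:

package snappyref

// hash12B reproduces the multiplicative hash of the 12B search loop.
func hash12B(u uint64) uint32 {
	const mul = 0x000000cf1bbcdcbb // constant loaded into R8
	return uint32(((u << 24) * mul) >> 52) // keep top 12 bits
}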
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + JB repeat_extend_forward_end_encodeSnappyBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeSnappyBlockAsm12B: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm12B + +no_repeat_found_encodeSnappyBlockAsm12B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBlockAsm12B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeSnappyBlockAsm12B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm12B + +candidate3_match_encodeSnappyBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm12B + +candidate2_match_encodeSnappyBlockAsm12B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeSnappyBlockAsm12B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBlockAsm12B + +match_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBlockAsm12B + +match_extend_back_end_encodeSnappyBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm12B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm12B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm12B + JB three_bytes_match_emit_encodeSnappyBlockAsm12B + +three_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +two_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +one_byte_match_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B + +memmove_long_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm12B: +match_nolit_loop_encodeSnappyBlockAsm12B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + LEAL -4(SI), 
SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + JB match_nolit_end_encodeSnappyBlockAsm12B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm12B + LEAL 1(R9), R9 + +match_nolit_end_encodeSnappyBlockAsm12B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm12B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x18, DI + IMULQ R8, DI + SHRQ $0x34, DI + SHLQ $0x18, BX + IMULQ R8, BX + SHRQ $0x34, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm12B + INCL CX + JMP search_loop_encodeSnappyBlockAsm12B + +emit_remainder_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B + +three_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm10B: + MOVOU X0, (DX) + 
MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm10B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm10B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm10B + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeSnappyBlockAsm10B + +repeat_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL SI, BX + JBE repeat_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B + +repeat_extend_back_end_encodeSnappyBlockAsm10B: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm10B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm10B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B + +three_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +two_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +one_byte_repeat_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + +memmove_long_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + LEAL 
-4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + JB repeat_extend_forward_end_encodeSnappyBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeSnappyBlockAsm10B: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm10B + +no_repeat_found_encodeSnappyBlockAsm10B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBlockAsm10B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeSnappyBlockAsm10B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm10B + +candidate3_match_encodeSnappyBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm10B + +candidate2_match_encodeSnappyBlockAsm10B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeSnappyBlockAsm10B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBlockAsm10B + +match_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBlockAsm10B + +match_extend_back_end_encodeSnappyBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm10B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm10B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm10B + JB three_bytes_match_emit_encodeSnappyBlockAsm10B + +three_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + 
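+ // Snappy literal tags (annotation): the low two bits of a literal tag
+ // byte are 00; lengths up to 60 are stored as (len-1)<<2 in the tag
+ // itself, 0xf0 (60<<2) signals one extra length byte, and 0xf4 (61<<2)
+ // signals two extra length bytes, as emitted by the branches around here.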
+two_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +one_byte_match_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B + +memmove_long_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm10B: +match_nolit_loop_encodeSnappyBlockAsm10B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + XORQ 8(BX)(R9*1), R10 + JNZ 
matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + JB match_nolit_end_encodeSnappyBlockAsm10B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm10B + LEAL 1(R9), R9 + +match_nolit_end_encodeSnappyBlockAsm10B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm10B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm10B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x36, DI + SHLQ $0x20, BX + IMULQ R8, BX + SHRQ $0x36, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm10B + INCL CX + JMP search_loop_encodeSnappyBlockAsm10B + +emit_remainder_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm10B: + MOVQ 
src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B + +three_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + 
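+ // genMemMoveLong (annotation): the first and last 32 source bytes were
+ // loaded into X0-X3 above; the loop below copies 32-byte blocks with
+ // aligned stores (MOVOA), and the saved head and tail are written last
+ // to cover the unaligned edges of the region.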
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm8B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x04, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm8B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeSnappyBlockAsm8B + +repeat_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL SI, BX + JBE repeat_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B + +repeat_extend_back_end_encodeSnappyBlockAsm8B: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm8B: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm8B + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B + +three_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +two_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB 
$0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +one_byte_repeat_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(DI*1), BX + + // genMemMoveShort + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: + MOVQ BX, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + +memmove_long_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(DI*1), BX + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BX, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B + 
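+ // matchLen (annotation): 16 bytes are compared per iteration via XORQ;
+ // on the first nonzero result, TZCNT (BSF on pre-GOAMD64_v3 targets)
+ // finds the lowest differing bit, and the arithmetic shift by 3 converts
+ // that bit index into a matched-byte count.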
+matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + JB repeat_extend_forward_end_encodeSnappyBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeSnappyBlockAsm8B: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B + LEAL -15(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm8B + +no_repeat_found_encodeSnappyBlockAsm8B: + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBlockAsm8B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeSnappyBlockAsm8B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm8B + +candidate3_match_encodeSnappyBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm8B + +candidate2_match_encodeSnappyBlockAsm8B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_encodeSnappyBlockAsm8B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBlockAsm8B + +match_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(CX), CX + DECL BX 
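+ // stop extending the match backwards once the candidate reaches the
+ // start of the buffer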
+ JZ match_extend_back_end_encodeSnappyBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBlockAsm8B + +match_extend_back_end_encodeSnappyBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm8B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm8B + CMPL DI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm8B + JB three_bytes_match_emit_encodeSnappyBlockAsm8B + +three_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +two_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +one_byte_match_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (SI), R9 + MOVQ R9, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B + +memmove_long_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + 
MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm8B: +match_nolit_loop_encodeSnappyBlockAsm8B: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + JB match_nolit_end_encodeSnappyBlockAsm8B + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm8B + LEAL 1(R9), R9 + +match_nolit_end_encodeSnappyBlockAsm8B: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW BX, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B + LEAL -15(SI), SI + MOVB BL, 1(AX) + SHRL $0x08, BX + SHLL $0x05, BX + ORL BX, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm8B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm8B + MOVQ 
$0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm8B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x38, DI + SHLQ $0x20, BX + IMULQ R8, BX + SHRQ $0x38, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm8B + INCL CX + JMP search_loop_encodeSnappyBlockAsm8B + +emit_remainder_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B + +three_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + 
MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00001200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x07, BX + CMPL BX, $0x63 + JBE check_maxskip_ok_encodeSnappyBetterBlockAsm + LEAL 100(CX), BX + JMP check_maxskip_cont_encodeSnappyBetterBlockAsm + +check_maxskip_ok_encodeSnappyBetterBlockAsm: + LEAL 1(CX)(BX*1), BX + +check_maxskip_cont_encodeSnappyBetterBlockAsm: + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 524312(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 524312(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPQ R10, SI + JNE no_short_found_encodeSnappyBetterBlockAsm + MOVL DI, BX + JMP candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: + CMPL R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL R10, SI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm + +candidateS_match_encodeSnappyBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 
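+ // hash the input shifted one byte ahead and probe the long-match table
+ // again for a better candidate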
+ SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBetterBlockAsm + DECL CX + MOVL DI, BX + +candidate_match_encodeSnappyBetterBlockAsm: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + +match_extend_back_loop_encodeSnappyBetterBlockAsm: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm + +match_extend_back_end_encodeSnappyBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + JB match_nolit_end_encodeSnappyBetterBlockAsm + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeSnappyBetterBlockAsm + LEAL 1(R11), R11 + +match_nolit_end_encodeSnappyBetterBlockAsm: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + CMPL R11, $0x01 + JA match_length_ok_encodeSnappyBetterBlockAsm + CMPL DI, $0x0000ffff + JBE match_length_ok_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeSnappyBetterBlockAsm + 
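+ // Reject very short matches (at most one byte beyond the 4-byte seed)
+ // whose offset exceeds 64 KiB: encoding them would take a 5-byte copy
+ // tag, which likely costs more than it saves.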
+match_length_ok_encodeSnappyBetterBlockAsm: + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL BX, $0x01000000 + JB four_bytes_match_emit_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +four_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVL BX, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BX, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +three_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +two_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +one_byte_match_emit_encodeSnappyBetterBlockAsm: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + +memmove_long_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA 
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R11, $0x40 + JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + XORL BX, BX + LEAL -1(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + SHLQ $0x08, R11 + IMULQ BX, R11 + SHRQ $0x2f, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x32, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 524312(SP)(R10*4) + MOVL R13, 524312(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeSnappyBetterBlockAsm: + CMPQ DI, R8 + JAE search_loop_encodeSnappyBetterBlockAsm + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x2f, R9 + SHLQ $0x08, R10 + IMULQ BX, R10 + SHRQ $0x2f, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeSnappyBetterBlockAsm + +emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 
5(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), 
X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm64K: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x07, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BX + MOVL 262168(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 262168(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPQ R10, SI + JNE no_short_found_encodeSnappyBetterBlockAsm64K + MOVL DI, BX + JMP candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: + CMPL R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL R10, SI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K + +candidateS_match_encodeSnappyBetterBlockAsm64K: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ 
candidate_match_encodeSnappyBetterBlockAsm64K + DECL CX + MOVL DI, BX + +candidate_match_encodeSnappyBetterBlockAsm64K: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + +match_extend_back_loop_encodeSnappyBetterBlockAsm64K: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K + +match_extend_back_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm64K: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + JB match_nolit_end_encodeSnappyBetterBlockAsm64K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeSnappyBetterBlockAsm64K + LEAL 1(R11), R11 + +match_nolit_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB 
one_byte_match_emit_encodeSnappyBetterBlockAsm64K + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K + +three_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +one_byte_match_emit_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + +memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 
12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm64K + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: + MOVQ $0x00cf1bbcdcbfa563, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + SHLQ $0x08, R11 + IMULQ BX, R11 + SHRQ $0x30, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x32, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 262168(SP)(R10*4) + MOVL R13, 262168(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeSnappyBetterBlockAsm64K: + CMPQ DI, R8 + JAE search_loop_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x08, R9 + IMULQ BX, R9 + SHRQ $0x30, R9 + SHLQ $0x08, R10 + IMULQ BX, R10 + SHRQ $0x30, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeSnappyBetterBlockAsm64K + +emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func 
encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm12B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x06, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BX + MOVL 65560(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 65560(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPQ R10, SI + JNE no_short_found_encodeSnappyBetterBlockAsm12B + MOVL DI, BX + JMP candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: + CMPL R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL R10, SI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B + +candidateS_match_encodeSnappyBetterBlockAsm12B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + DECL CX + MOVL DI, BX + +candidate_match_encodeSnappyBetterBlockAsm12B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + +match_extend_back_loop_encodeSnappyBetterBlockAsm12B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B + +match_extend_back_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm12B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: +#ifdef 
GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + JB match_nolit_end_encodeSnappyBetterBlockAsm12B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeSnappyBetterBlockAsm12B + LEAL 1(R11), R11 + +match_nolit_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +one_byte_match_emit_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + 
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm12B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x34, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 
24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 65560(SP)(R10*4) + MOVL R13, 65560(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeSnappyBetterBlockAsm12B: + CMPQ DI, R8 + JAE search_loop_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeSnappyBetterBlockAsm12B + +emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU 
-16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm10B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BX + MOVL 16408(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 16408(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPQ R10, SI + JNE no_short_found_encodeSnappyBetterBlockAsm10B + MOVL DI, BX + JMP candidate_match_encodeSnappyBetterBlockAsm10B + +no_short_found_encodeSnappyBetterBlockAsm10B: + CMPL R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL R10, SI + JEQ 
candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B + +candidateS_match_encodeSnappyBetterBlockAsm10B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + DECL CX + MOVL DI, BX + +candidate_match_encodeSnappyBetterBlockAsm10B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + +match_extend_back_loop_encodeSnappyBetterBlockAsm10B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B + +match_extend_back_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm10B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + JB match_nolit_end_encodeSnappyBetterBlockAsm10B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeSnappyBetterBlockAsm10B + LEAL 1(R11), R11 + 
+match_nolit_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +one_byte_match_emit_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, 
R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + CMPL DI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm10B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x36, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 16408(SP)(R10*4) + MOVL R13, 16408(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeSnappyBetterBlockAsm10B: + CMPQ DI, R8 + JAE search_loop_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x34, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeSnappyBetterBlockAsm10B + +emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B + 
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE 
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm8B: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x04, BX + LEAL 1(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BX + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ BX, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BX + MOVL 4120(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 4120(SP)(R10*4) + MOVQ (DX)(BX*1), R9 + MOVQ (DX)(DI*1), R10 + CMPQ R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPQ R10, SI + JNE no_short_found_encodeSnappyBetterBlockAsm8B + MOVL DI, BX + JMP candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: + CMPL R9, SI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL R10, SI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B + +candidateS_match_encodeSnappyBetterBlockAsm8B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL 24(SP)(R9*4), BX + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BX*1), SI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + DECL CX + MOVL DI, BX + +candidate_match_encodeSnappyBetterBlockAsm8B: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + +match_extend_back_loop_encodeSnappyBetterBlockAsm8B: + CMPL CX, SI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B + +match_extend_back_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm8B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), R9 + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ 
matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + JB match_nolit_end_encodeSnappyBetterBlockAsm8B + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeSnappyBetterBlockAsm8B + LEAL 1(R11), R11 + +match_nolit_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + SUBL BX, DI + + // Check if repeat + MOVL DI, 16(SP) + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R9 + SUBL BX, R8 + LEAL -1(R8), BX + CMPL BX, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B + CMPL BX, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +one_byte_match_emit_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, BL + MOVB BL, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP 
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: + MOVQ BX, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R8*1), BX + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BX, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: + ADDL R11, CX + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R11, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVL R11, BX + SHLL $0x02, BX + CMPL R11, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -15(BX), BX + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, BX + MOVB BL, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: + LEAL -2(BX), BX + MOVB BL, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: + CMPL CX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm8B + CMPQ AX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ $0x9e3779b1, DI + LEAQ 1(SI), SI + LEAQ -2(CX), R8 + MOVQ (DX)(SI*1), R9 + MOVQ 1(DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + MOVQ 1(DX)(R8*1), R12 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + 
IMULQ DI, R10 + SHRQ $0x38, R10 + SHLQ $0x10, R11 + IMULQ BX, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ DI, R12 + SHRQ $0x38, R12 + LEAQ 1(SI), DI + LEAQ 1(R8), R13 + MOVL SI, 24(SP)(R9*4) + MOVL R8, 24(SP)(R11*4) + MOVL DI, 4120(SP)(R10*4) + MOVL R13, 4120(SP)(R12*4) + LEAQ 1(R8)(SI*1), DI + SHRQ $0x01, DI + ADDQ $0x01, SI + SUBQ $0x01, R8 + +index_loop_encodeSnappyBetterBlockAsm8B: + CMPQ DI, R8 + JAE search_loop_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(SI*1), R9 + MOVQ (DX)(DI*1), R10 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x36, R9 + SHLQ $0x10, R10 + IMULQ BX, R10 + SHRQ $0x36, R10 + MOVL SI, 24(SP)(R9*4) + MOVL DI, 24(SP)(R10*4) + ADDQ $0x02, SI + ADDQ $0x02, DI + JMP index_loop_encodeSnappyBetterBlockAsm8B + +emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func calcBlockSize(src []byte) int +// Requires: BMI, SSE2 +TEXT ·calcBlockSize(SB), $32792-32 + XORQ AX, AX + MOVQ $0x00000100, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_calcBlockSize: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_calcBlockSize + MOVL $0x00000000, 12(SP) + MOVQ src_len+8(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+0(FP), DX + +search_loop_calcBlockSize: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x05, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_calcBlockSize + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x33, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x33, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x33, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_calcBlockSize + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_calcBlockSize + 
+repeat_extend_back_loop_calcBlockSize: + CMPL SI, BX + JBE repeat_extend_back_end_calcBlockSize + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_calcBlockSize + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_calcBlockSize + +repeat_extend_back_end_calcBlockSize: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 5(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_calcBlockSize + MOVQ $0x00000000, ret+24(FP) + RET + +repeat_dst_size_check_calcBlockSize: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_calcBlockSize + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_calcBlockSize + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_calcBlockSize + CMPL BX, $0x00010000 + JB three_bytes_repeat_emit_calcBlockSize + CMPL BX, $0x01000000 + JB four_bytes_repeat_emit_calcBlockSize + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_calcBlockSize + +four_bytes_repeat_emit_calcBlockSize: + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_calcBlockSize + +three_bytes_repeat_emit_calcBlockSize: + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_calcBlockSize + +two_bytes_repeat_emit_calcBlockSize: + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_calcBlockSize + JMP memmove_long_repeat_emit_calcBlockSize + +one_byte_repeat_emit_calcBlockSize: + ADDQ $0x01, AX + +memmove_repeat_emit_calcBlockSize: + LEAQ (AX)(DI*1), AX + JMP emit_literal_done_repeat_emit_calcBlockSize + +memmove_long_repeat_emit_calcBlockSize: + LEAQ (AX)(DI*1), AX + +emit_literal_done_repeat_emit_calcBlockSize: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+8(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_calcBlockSize: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSize + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_calcBlockSize + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_calcBlockSize + +matchlen_bsf_16repeat_extend_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_calcBlockSize + +matchlen_match8_repeat_extend_calcBlockSize: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_calcBlockSize + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_calcBlockSize + +matchlen_bsf_8_repeat_extend_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_calcBlockSize + +matchlen_match4_repeat_extend_calcBlockSize: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_calcBlockSize + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_calcBlockSize + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_calcBlockSize: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_calcBlockSize + JB repeat_extend_forward_end_calcBlockSize + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_calcBlockSize + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_calcBlockSize + +matchlen_match1_repeat_extend_calcBlockSize: + MOVB 
(R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_calcBlockSize + LEAL 1(R10), R10 + +repeat_extend_forward_end_calcBlockSize: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_repeat_as_copy_calcBlockSize + +four_bytes_loop_back_repeat_as_copy_calcBlockSize: + CMPL BX, $0x40 + JBE four_bytes_remain_repeat_as_copy_calcBlockSize + LEAL -64(BX), BX + ADDQ $0x05, AX + CMPL BX, $0x04 + JB four_bytes_remain_repeat_as_copy_calcBlockSize + JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize + +four_bytes_remain_repeat_as_copy_calcBlockSize: + TESTL BX, BX + JZ repeat_end_emit_calcBlockSize + XORL BX, BX + ADDQ $0x05, AX + JMP repeat_end_emit_calcBlockSize + +two_byte_offset_repeat_as_copy_calcBlockSize: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_calcBlockSize + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_calcBlockSize + +two_byte_offset_short_repeat_as_copy_calcBlockSize: + MOVL BX, DI + SHLL $0x02, DI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_calcBlockSize + CMPL SI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_calcBlockSize + ADDQ $0x02, AX + JMP repeat_end_emit_calcBlockSize + +emit_copy_three_repeat_as_copy_calcBlockSize: + ADDQ $0x03, AX + +repeat_end_emit_calcBlockSize: + MOVL CX, 12(SP) + JMP search_loop_calcBlockSize + +no_repeat_found_calcBlockSize: + CMPL (DX)(BX*1), SI + JEQ candidate_match_calcBlockSize + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_calcBlockSize + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_calcBlockSize + MOVL 20(SP), CX + JMP search_loop_calcBlockSize + +candidate3_match_calcBlockSize: + ADDL $0x02, CX + JMP candidate_match_calcBlockSize + +candidate2_match_calcBlockSize: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_calcBlockSize: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_calcBlockSize + +match_extend_back_loop_calcBlockSize: + CMPL CX, SI + JBE match_extend_back_end_calcBlockSize + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_calcBlockSize + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_calcBlockSize + JMP match_extend_back_loop_calcBlockSize + +match_extend_back_end_calcBlockSize: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_calcBlockSize + MOVQ $0x00000000, ret+24(FP) + RET + +match_dst_size_check_calcBlockSize: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_calcBlockSize + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_match_emit_calcBlockSize + CMPL SI, $0x00000100 + JB two_bytes_match_emit_calcBlockSize + CMPL SI, $0x00010000 + JB three_bytes_match_emit_calcBlockSize + CMPL SI, $0x01000000 + JB four_bytes_match_emit_calcBlockSize + ADDQ $0x05, AX + JMP memmove_long_match_emit_calcBlockSize + +four_bytes_match_emit_calcBlockSize: + ADDQ $0x04, AX + JMP memmove_long_match_emit_calcBlockSize + +three_bytes_match_emit_calcBlockSize: + ADDQ $0x03, AX + JMP memmove_long_match_emit_calcBlockSize + +two_bytes_match_emit_calcBlockSize: + ADDQ $0x02, AX + CMPL SI, $0x40 + JB memmove_match_emit_calcBlockSize + JMP memmove_long_match_emit_calcBlockSize + +one_byte_match_emit_calcBlockSize: + ADDQ $0x01, AX + +memmove_match_emit_calcBlockSize: + LEAQ (AX)(R8*1), AX + JMP 
emit_literal_done_match_emit_calcBlockSize + +memmove_long_match_emit_calcBlockSize: + LEAQ (AX)(R8*1), AX + +emit_literal_done_match_emit_calcBlockSize: +match_nolit_loop_calcBlockSize: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+8(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_calcBlockSize: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_calcBlockSize + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_calcBlockSize + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_calcBlockSize + +matchlen_bsf_16match_nolit_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_calcBlockSize + +matchlen_match8_match_nolit_calcBlockSize: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_calcBlockSize + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_calcBlockSize + +matchlen_bsf_8_match_nolit_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_calcBlockSize + +matchlen_match4_match_nolit_calcBlockSize: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_calcBlockSize + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_calcBlockSize + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_calcBlockSize: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_calcBlockSize + JB match_nolit_end_calcBlockSize + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_calcBlockSize + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_calcBlockSize + +matchlen_match1_match_nolit_calcBlockSize: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_calcBlockSize + LEAL 1(R9), R9 + +match_nolit_end_calcBlockSize: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy + CMPL BX, $0x00010000 + JB two_byte_offset_match_nolit_calcBlockSize + +four_bytes_loop_back_match_nolit_calcBlockSize: + CMPL R9, $0x40 + JBE four_bytes_remain_match_nolit_calcBlockSize + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JB four_bytes_remain_match_nolit_calcBlockSize + JMP four_bytes_loop_back_match_nolit_calcBlockSize + +four_bytes_remain_match_nolit_calcBlockSize: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_calcBlockSize + XORL BX, BX + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_calcBlockSize + +two_byte_offset_match_nolit_calcBlockSize: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_calcBlockSize + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_calcBlockSize + +two_byte_offset_short_match_nolit_calcBlockSize: + MOVL R9, SI + SHLL $0x02, SI + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_calcBlockSize + CMPL BX, $0x00000800 + JAE emit_copy_three_match_nolit_calcBlockSize + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_calcBlockSize + +emit_copy_three_match_nolit_calcBlockSize: + ADDQ $0x03, AX + +match_nolit_emitcopy_end_calcBlockSize: + CMPL CX, 8(SP) + JAE emit_remainder_calcBlockSize + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_calcBlockSize + MOVQ $0x00000000, ret+24(FP) + RET + 
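As with the literal paths, the emitCopy fragments above are size-only: every ADDQ to AX stands in for bytes the real emitter would write, and the memmove labels collapse to a single LEAQ. The copy accounting corresponds to this hedged Go mirror (Snappy-flavor copies, no repeat codes; the helper name is hypothetical):

	// snappyCopySize returns how many output bytes one match costs,
	// which is all calcBlockSize adds to its running total.
	func snappyCopySize(offset, length int) int {
		n := 0
		if offset >= 1<<16 {
			for length > 64 { // 5-byte copy-4 elements, 64 bytes each
				length -= 64
				n += 5
			}
			if length > 0 {
				n += 5
			}
			return n
		}
		for length > 64 { // 3-byte copy-2 covering 60 bytes at a time
			length -= 60
			n += 3
		}
		if length < 12 && offset < 2048 {
			return n + 2 // copy-1: tag byte + one offset byte
		}
		return n + 3 // copy-2: tag byte + two offset bytes
	}
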
+match_nolit_dst_ok_calcBlockSize: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x33, DI + SHLQ $0x10, BX + IMULQ R8, BX + SHRQ $0x33, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_calcBlockSize + INCL CX + JMP search_loop_calcBlockSize + +emit_remainder_calcBlockSize: + MOVQ src_len+8(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_calcBlockSize + MOVQ $0x00000000, ret+24(FP) + RET + +emit_remainder_ok_calcBlockSize: + MOVQ src_len+8(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_calcBlockSize + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), CX + CMPL CX, $0x3c + JB one_byte_emit_remainder_calcBlockSize + CMPL CX, $0x00000100 + JB two_bytes_emit_remainder_calcBlockSize + CMPL CX, $0x00010000 + JB three_bytes_emit_remainder_calcBlockSize + CMPL CX, $0x01000000 + JB four_bytes_emit_remainder_calcBlockSize + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_calcBlockSize + +four_bytes_emit_remainder_calcBlockSize: + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_calcBlockSize + +three_bytes_emit_remainder_calcBlockSize: + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_calcBlockSize + +two_bytes_emit_remainder_calcBlockSize: + ADDQ $0x02, AX + CMPL CX, $0x40 + JB memmove_emit_remainder_calcBlockSize + JMP memmove_long_emit_remainder_calcBlockSize + +one_byte_emit_remainder_calcBlockSize: + ADDQ $0x01, AX + +memmove_emit_remainder_calcBlockSize: + LEAQ (AX)(SI*1), AX + JMP emit_literal_done_emit_remainder_calcBlockSize + +memmove_long_emit_remainder_calcBlockSize: + LEAQ (AX)(SI*1), AX + +emit_literal_done_emit_remainder_calcBlockSize: + MOVQ AX, ret+24(FP) + RET + +// func calcBlockSizeSmall(src []byte) int +// Requires: BMI, SSE2 +TEXT ·calcBlockSizeSmall(SB), $2072-32 + XORQ AX, AX + MOVQ $0x00000010, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_calcBlockSizeSmall: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_calcBlockSizeSmall + MOVL $0x00000000, 12(SP) + MOVQ src_len+8(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), BX + MOVL BX, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+0(FP), DX + +search_loop_calcBlockSizeSmall: + MOVL CX, BX + SUBL 12(SP), BX + SHRL $0x04, BX + LEAL 4(CX)(BX*1), BX + CMPL BX, 8(SP) + JAE emit_remainder_calcBlockSizeSmall + MOVQ (DX)(CX*1), SI + MOVL BX, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x37, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x37, R10 + MOVL 24(SP)(R9*4), BX + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x37, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_calcBlockSizeSmall + LEAL 1(CX), SI + MOVL 12(SP), BX + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_calcBlockSizeSmall + +repeat_extend_back_loop_calcBlockSizeSmall: + CMPL SI, BX + JBE repeat_extend_back_end_calcBlockSizeSmall + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(SI*1), R9 + CMPB R8, R9 + JNE 
repeat_extend_back_end_calcBlockSizeSmall + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_calcBlockSizeSmall + +repeat_extend_back_end_calcBlockSizeSmall: + MOVL SI, BX + SUBL 12(SP), BX + LEAQ 3(AX)(BX*1), BX + CMPQ BX, (SP) + JB repeat_dst_size_check_calcBlockSizeSmall + MOVQ $0x00000000, ret+24(FP) + RET + +repeat_dst_size_check_calcBlockSizeSmall: + MOVL 12(SP), BX + CMPL BX, SI + JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BX*1), R8 + SUBL BX, DI + LEAL -1(DI), BX + CMPL BX, $0x3c + JB one_byte_repeat_emit_calcBlockSizeSmall + CMPL BX, $0x00000100 + JB two_bytes_repeat_emit_calcBlockSizeSmall + JB three_bytes_repeat_emit_calcBlockSizeSmall + +three_bytes_repeat_emit_calcBlockSizeSmall: + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_calcBlockSizeSmall + +two_bytes_repeat_emit_calcBlockSizeSmall: + ADDQ $0x02, AX + CMPL BX, $0x40 + JB memmove_repeat_emit_calcBlockSizeSmall + JMP memmove_long_repeat_emit_calcBlockSizeSmall + +one_byte_repeat_emit_calcBlockSizeSmall: + ADDQ $0x01, AX + +memmove_repeat_emit_calcBlockSizeSmall: + LEAQ (AX)(DI*1), AX + JMP emit_literal_done_repeat_emit_calcBlockSizeSmall + +memmove_long_repeat_emit_calcBlockSizeSmall: + LEAQ (AX)(DI*1), AX + +emit_literal_done_repeat_emit_calcBlockSizeSmall: + ADDL $0x05, CX + MOVL CX, BX + SUBL 16(SP), BX + MOVQ src_len+8(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall + +matchlen_bsf_16repeat_extend_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match8_repeat_extend_calcBlockSizeSmall: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_calcBlockSizeSmall + +matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match4_repeat_extend_calcBlockSizeSmall: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_calcBlockSizeSmall + MOVL (R8)(R10*1), R9 + CMPL (BX)(R10*1), R9 + JNE matchlen_match2_repeat_extend_calcBlockSizeSmall + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_calcBlockSizeSmall: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_calcBlockSizeSmall + JB repeat_extend_forward_end_calcBlockSizeSmall + MOVW (R8)(R10*1), R9 + CMPW (BX)(R10*1), R9 + JNE matchlen_match1_repeat_extend_calcBlockSizeSmall + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match1_repeat_extend_calcBlockSizeSmall: + MOVB (R8)(R10*1), R9 + CMPB (BX)(R10*1), R9 + JNE repeat_extend_forward_end_calcBlockSizeSmall + LEAL 1(R10), R10 + +repeat_extend_forward_end_calcBlockSizeSmall: + ADDL R10, CX + MOVL CX, BX + SUBL SI, BX + MOVL 16(SP), SI + 
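calcBlockSizeSmall is the same estimator specialized for short inputs: a 2 KiB (512-entry) table instead of 32 KiB, a 4-byte multiplicative hash, and no copy-4 path since offsets cannot reach 64 KiB. The hash matching the SHLQ $0x20 / IMULQ $0x9e3779b1 / SHRQ $0x37 sequence in its search loop, as a hedged Go model (names illustrative):

	// 0x9e3779b1 is the 32-bit golden-ratio multiplier; the top
	// 9 bits of the product index the 512-entry table.
	const prime4bytes = 0x9e3779b1

	func hash4(u uint64) uint32 {
		return uint32(((u << 32) * prime4bytes) >> 55)
	}
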
+ // emitCopy +two_byte_offset_repeat_as_copy_calcBlockSizeSmall: + CMPL BX, $0x40 + JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall + LEAL -60(BX), BX + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall + +two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: + MOVL BX, SI + SHLL $0x02, SI + CMPL BX, $0x0c + JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall + ADDQ $0x02, AX + JMP repeat_end_emit_calcBlockSizeSmall + +emit_copy_three_repeat_as_copy_calcBlockSizeSmall: + ADDQ $0x03, AX + +repeat_end_emit_calcBlockSizeSmall: + MOVL CX, 12(SP) + JMP search_loop_calcBlockSizeSmall + +no_repeat_found_calcBlockSizeSmall: + CMPL (DX)(BX*1), SI + JEQ candidate_match_calcBlockSizeSmall + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BX + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_calcBlockSizeSmall + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BX*1), SI + JEQ candidate3_match_calcBlockSizeSmall + MOVL 20(SP), CX + JMP search_loop_calcBlockSizeSmall + +candidate3_match_calcBlockSizeSmall: + ADDL $0x02, CX + JMP candidate_match_calcBlockSizeSmall + +candidate2_match_calcBlockSizeSmall: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BX + +candidate_match_calcBlockSizeSmall: + MOVL 12(SP), SI + TESTL BX, BX + JZ match_extend_back_end_calcBlockSizeSmall + +match_extend_back_loop_calcBlockSizeSmall: + CMPL CX, SI + JBE match_extend_back_end_calcBlockSizeSmall + MOVB -1(DX)(BX*1), DI + MOVB -1(DX)(CX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_calcBlockSizeSmall + LEAL -1(CX), CX + DECL BX + JZ match_extend_back_end_calcBlockSizeSmall + JMP match_extend_back_loop_calcBlockSizeSmall + +match_extend_back_end_calcBlockSizeSmall: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JB match_dst_size_check_calcBlockSizeSmall + MOVQ $0x00000000, ret+24(FP) + RET + +match_dst_size_check_calcBlockSizeSmall: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_calcBlockSizeSmall + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_match_emit_calcBlockSizeSmall + CMPL SI, $0x00000100 + JB two_bytes_match_emit_calcBlockSizeSmall + JB three_bytes_match_emit_calcBlockSizeSmall + +three_bytes_match_emit_calcBlockSizeSmall: + ADDQ $0x03, AX + JMP memmove_long_match_emit_calcBlockSizeSmall + +two_bytes_match_emit_calcBlockSizeSmall: + ADDQ $0x02, AX + CMPL SI, $0x40 + JB memmove_match_emit_calcBlockSizeSmall + JMP memmove_long_match_emit_calcBlockSizeSmall + +one_byte_match_emit_calcBlockSizeSmall: + ADDQ $0x01, AX + +memmove_match_emit_calcBlockSizeSmall: + LEAQ (AX)(R8*1), AX + JMP emit_literal_done_match_emit_calcBlockSizeSmall + +memmove_long_match_emit_calcBlockSizeSmall: + LEAQ (AX)(R8*1), AX + +emit_literal_done_match_emit_calcBlockSizeSmall: +match_nolit_loop_calcBlockSizeSmall: + MOVL CX, SI + SUBL BX, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BX + MOVQ src_len+8(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BX*1), BX + + // matchLen + XORL R9, R9 + +matchlen_loopback_16_match_nolit_calcBlockSizeSmall: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_calcBlockSizeSmall + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall + +matchlen_bsf_16match_nolit_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + 
TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_calcBlockSizeSmall + +matchlen_match8_match_nolit_calcBlockSizeSmall: + CMPL SI, $0x08 + JB matchlen_match4_match_nolit_calcBlockSizeSmall + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_calcBlockSizeSmall + +matchlen_bsf_8_match_nolit_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R8, R8 + +#else + BSFQ R8, R8 + +#endif + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_calcBlockSizeSmall + +matchlen_match4_match_nolit_calcBlockSizeSmall: + CMPL SI, $0x04 + JB matchlen_match2_match_nolit_calcBlockSizeSmall + MOVL (DI)(R9*1), R8 + CMPL (BX)(R9*1), R8 + JNE matchlen_match2_match_nolit_calcBlockSizeSmall + LEAL -4(SI), SI + LEAL 4(R9), R9 + +matchlen_match2_match_nolit_calcBlockSizeSmall: + CMPL SI, $0x01 + JE matchlen_match1_match_nolit_calcBlockSizeSmall + JB match_nolit_end_calcBlockSizeSmall + MOVW (DI)(R9*1), R8 + CMPW (BX)(R9*1), R8 + JNE matchlen_match1_match_nolit_calcBlockSizeSmall + LEAL 2(R9), R9 + SUBL $0x02, SI + JZ match_nolit_end_calcBlockSizeSmall + +matchlen_match1_match_nolit_calcBlockSizeSmall: + MOVB (DI)(R9*1), R8 + CMPB (BX)(R9*1), R8 + JNE match_nolit_end_calcBlockSizeSmall + LEAL 1(R9), R9 + +match_nolit_end_calcBlockSizeSmall: + ADDL R9, CX + MOVL 16(SP), BX + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_calcBlockSizeSmall: + CMPL R9, $0x40 + JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall + LEAL -60(R9), R9 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_calcBlockSizeSmall + +two_byte_offset_short_match_nolit_calcBlockSizeSmall: + MOVL R9, BX + SHLL $0x02, BX + CMPL R9, $0x0c + JAE emit_copy_three_match_nolit_calcBlockSizeSmall + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_calcBlockSizeSmall + +emit_copy_three_match_nolit_calcBlockSizeSmall: + ADDQ $0x03, AX + +match_nolit_emitcopy_end_calcBlockSizeSmall: + CMPL CX, 8(SP) + JAE emit_remainder_calcBlockSizeSmall + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JB match_nolit_dst_ok_calcBlockSizeSmall + MOVQ $0x00000000, ret+24(FP) + RET + +match_nolit_dst_ok_calcBlockSizeSmall: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BX + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x37, DI + SHLQ $0x20, BX + IMULQ R8, BX + SHRQ $0x37, BX + LEAL -2(CX), R8 + LEAQ 24(SP)(BX*4), R9 + MOVL (R9), BX + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BX*1), SI + JEQ match_nolit_loop_calcBlockSizeSmall + INCL CX + JMP search_loop_calcBlockSizeSmall + +emit_remainder_calcBlockSizeSmall: + MOVQ src_len+8(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JB emit_remainder_ok_calcBlockSizeSmall + MOVQ $0x00000000, ret+24(FP) + RET + +emit_remainder_ok_calcBlockSizeSmall: + MOVQ src_len+8(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), CX + CMPL CX, $0x3c + JB one_byte_emit_remainder_calcBlockSizeSmall + CMPL CX, $0x00000100 + JB two_bytes_emit_remainder_calcBlockSizeSmall + JB three_bytes_emit_remainder_calcBlockSizeSmall + +three_bytes_emit_remainder_calcBlockSizeSmall: + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_calcBlockSizeSmall + +two_bytes_emit_remainder_calcBlockSizeSmall: + ADDQ $0x02, AX + CMPL CX, $0x40 + JB memmove_emit_remainder_calcBlockSizeSmall + JMP 
memmove_long_emit_remainder_calcBlockSizeSmall + +one_byte_emit_remainder_calcBlockSizeSmall: + ADDQ $0x01, AX + +memmove_emit_remainder_calcBlockSizeSmall: + LEAQ (AX)(SI*1), AX + JMP emit_literal_done_emit_remainder_calcBlockSizeSmall + +memmove_long_emit_remainder_calcBlockSizeSmall: + LEAQ (AX)(SI*1), AX + +emit_literal_done_emit_remainder_calcBlockSizeSmall: + MOVQ AX, ret+24(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ lit_len+32(FP), DX + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + TESTQ DX, DX + JZ emit_literal_end_standalone_skip + MOVL DX, BX + LEAL -1(DX), SI + CMPL SI, $0x3c + JB one_byte_standalone + CMPL SI, $0x00000100 + JB two_bytes_standalone + CMPL SI, $0x00010000 + JB three_bytes_standalone + CMPL SI, $0x01000000 + JB four_bytes_standalone + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_long_standalone + +four_bytes_standalone: + MOVL SI, DI + SHRL $0x10, DI + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP memmove_long_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_long_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + CMPL SI, $0x40 + JB memmove_standalone + JMP memmove_long_standalone + +one_byte_standalone: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + // genMemMoveShort + CMPQ DX, $0x03 + JB emit_lit_memmove_standalone_memmove_move_1or2 + JE emit_lit_memmove_standalone_memmove_move_3 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_4through7 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + JMP emit_lit_memmove_standalone_memmove_move_33through64 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(DX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(DX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +memmove_long_standalone: + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_standalonelarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA 
X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_standalonelarge_big_loop_back + +emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 + JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +emit_literal_end_standalone_skip: + XORQ BX, BX + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, offset int, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitRepeat +emit_repeat_again_standalone: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone + +cant_repeat_two_offset_standalone: + CMPL DX, $0x00000104 + JB repeat_three_standalone + CMPL DX, $0x00010100 + JB repeat_four_standalone + CMPL DX, $0x0100ffff + JB repeat_five_standalone + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone + +repeat_five_standalone: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_repeat_end + +repeat_four_standalone: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_two_offset_standalone: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + +gen_emit_repeat_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JB two_byte_offset_standalone + CMPL DX, $0x40 + JBE four_bytes_remain_standalone + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JB four_bytes_remain_standalone + + // emitRepeat +emit_repeat_again_standalone_emit_copy: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy + +cant_repeat_two_offset_standalone_emit_copy: + CMPL DX, $0x00000104 + JB repeat_three_standalone_emit_copy + CMPL DX, $0x00010100 + JB repeat_four_standalone_emit_copy + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy + +repeat_five_standalone_emit_copy: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ 
$0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +four_bytes_remain_standalone: + TESTL DX, DX + JZ gen_emit_copy_end + XORL SI, SI + LEAL -1(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL DX, $0x40 + JBE two_byte_offset_short_standalone + CMPL CX, $0x00000800 + JAE long_offset_short_standalone + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB CL, 1(AX) + MOVL CX, DI + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + SUBL $0x08, DX + + // emitRepeat + LEAL -4(DX), DX + JMP cant_repeat_two_offset_standalone_emit_copy_short_2b + +emit_repeat_again_standalone_emit_copy_short_2b: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy_short_2b + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy_short_2b + +cant_repeat_two_offset_standalone_emit_copy_short_2b: + CMPL DX, $0x00000104 + JB repeat_three_standalone_emit_copy_short_2b + CMPL DX, $0x00010100 + JB repeat_four_standalone_emit_copy_short_2b + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy_short_2b + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short_2b + +repeat_five_standalone_emit_copy_short_2b: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short_2b: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short_2b: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short_2b: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +long_offset_short_standalone: + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + + // emitRepeat +emit_repeat_again_standalone_emit_copy_short: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy_short + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy_short + +cant_repeat_two_offset_standalone_emit_copy_short: + CMPL DX, $0x00000104 + JB repeat_three_standalone_emit_copy_short + CMPL DX, $0x00010100 
+ JB repeat_four_standalone_emit_copy_short + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy_short + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short + +repeat_five_standalone_emit_copy_short: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +two_byte_offset_short_standalone: + MOVL DX, SI + SHLL $0x02, SI + CMPL DX, $0x0c + JAE emit_copy_three_standalone + CMPL CX, $0x00000800 + JAE emit_copy_three_standalone + LEAL -15(SI), SI + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_copy_three_standalone: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopyNoRepeat(dst []byte, offset int, length int) int +TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JB two_byte_offset_standalone_snappy + +four_bytes_loop_back_standalone_snappy: + CMPL DX, $0x40 + JBE four_bytes_remain_standalone_snappy + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JB four_bytes_remain_standalone_snappy + JMP four_bytes_loop_back_standalone_snappy + +four_bytes_remain_standalone_snappy: + TESTL DX, DX + JZ gen_emit_copy_end_snappy + XORL SI, SI + LEAL -1(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end_snappy + +two_byte_offset_standalone_snappy: + CMPL DX, $0x40 + JBE two_byte_offset_short_standalone_snappy + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + JMP two_byte_offset_standalone_snappy + +two_byte_offset_short_standalone_snappy: + MOVL DX, SI + SHLL $0x02, SI + CMPL DX, $0x0c + JAE emit_copy_three_standalone_snappy + CMPL CX, $0x00000800 + JAE emit_copy_three_standalone_snappy + LEAL -15(SI), SI + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end_snappy + +emit_copy_three_standalone_snappy: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end_snappy: + MOVQ BX, ret+40(FP) + RET + +// func matchLen(a []byte, b []byte) int +// Requires: BMI +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + +matchlen_loopback_16_standalone: + CMPL DX, $0x10 + JB matchlen_match8_standalone + MOVQ (AX)(SI*1), BX + MOVQ 8(AX)(SI*1), 
DI + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + XORQ 8(CX)(SI*1), DI + JNZ matchlen_bsf_16standalone + LEAL -16(DX), DX + LEAL 16(SI), SI + JMP matchlen_loopback_16_standalone + +matchlen_bsf_16standalone: +#ifdef GOAMD64_v3 + TZCNTQ DI, DI + +#else + BSFQ DI, DI + +#endif + SARQ $0x03, DI + LEAL 8(SI)(DI*1), SI + JMP gen_match_len_end + +matchlen_match8_standalone: + CMPL DX, $0x08 + JB matchlen_match4_standalone + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + LEAL -8(DX), DX + LEAL 8(SI), SI + JMP matchlen_match4_standalone + +matchlen_bsf_8_standalone: +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x01 + JE matchlen_match1_standalone + JB gen_match_len_end + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL 2(SI), SI + SUBL $0x02, DX + JZ gen_match_len_end + +matchlen_match1_standalone: + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + LEAL 1(SI), SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET + +// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + XORQ DI, DI + +lz4_s2_loop: + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ AX, CX + JAE lz4_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4_s2_ll_end + +lz4_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4_s2_ll_loop + +lz4_s2_ll_end: + LEAQ (DX)(R9*1), R8 + ADDQ $0x04, R10 + CMPQ R8, BX + JAE lz4_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JB one_byte_lz4_s2 + CMPL R11, $0x00000100 + JB two_bytes_lz4_s2 + CMPL R11, $0x00010000 + JB three_bytes_lz4_s2 + CMPL R11, $0x01000000 + JB four_bytes_lz4_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_s2 + +four_bytes_lz4_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_s2 + +three_bytes_lz4_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_s2 + +two_bytes_lz4_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_lz4_s2 + JMP memmove_long_lz4_s2 + +one_byte_lz4_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_lz4_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 + +emit_lit_memmove_lz4_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP 
memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4_s2: + MOVQ R11, AX + JMP lz4_s2_lits_emit_done + +memmove_long_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4_s2large_big_loop_back + +emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4_s2_lits_emit_done: + MOVQ R8, DX + +lz4_s2_lits_done: + CMPQ DX, BX + JNE lz4_s2_match + CMPQ R10, $0x04 + JEQ lz4_s2_done + JMP lz4_s2_corrupt + +lz4_s2_match: + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4_s2_corrupt + CMPQ R9, SI + JA lz4_s2_corrupt + CMPQ R10, $0x13 + JNE lz4_s2_ml_done + +lz4_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ R8, $0xff + JEQ lz4_s2_ml_loop + +lz4_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4_s2_docopy + + // emitRepeat +emit_repeat_again_lz4_s2: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2 + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2 + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2 + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2 + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2 + +repeat_five_lz4_s2: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +lz4_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + MOVL R9, R11 + SHRL $0x08, R11 + SHLL $0x05, R11 + 
ORL R11, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + +emit_repeat_again_lz4_s2_emit_copy_short_2b: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short_2b + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short_2b + +repeat_five_lz4_s2_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_lz4_s2_emit_copy_short: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short + +repeat_five_lz4_s2_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW 
R9, 1(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +lz4_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + XORQ DI, DI + +lz4s_s2_loop: + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ AX, CX + JAE lz4s_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4s_s2_ll_end + +lz4s_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4s_s2_ll_loop + +lz4s_s2_ll_end: + LEAQ (DX)(R9*1), R8 + ADDQ $0x03, R10 + CMPQ R8, BX + JAE lz4s_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4s_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4s_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JB one_byte_lz4s_s2 + CMPL R11, $0x00000100 + JB two_bytes_lz4s_s2 + CMPL R11, $0x00010000 + JB three_bytes_lz4s_s2 + CMPL R11, $0x01000000 + JB four_bytes_lz4s_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_s2 + +four_bytes_lz4s_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_s2 + +three_bytes_lz4s_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_s2 + +two_bytes_lz4s_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_lz4s_s2 + JMP memmove_long_lz4s_s2 + +one_byte_lz4s_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4s_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_lz4s_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 + +emit_lit_memmove_lz4s_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4s_s2: + MOVQ R11, AX + JMP lz4s_s2_lits_emit_done + +memmove_long_lz4s_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4s_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 
16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back + +emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4s_s2_lits_emit_done: + MOVQ R8, DX + +lz4s_s2_lits_done: + CMPQ DX, BX + JNE lz4s_s2_match + CMPQ R10, $0x03 + JEQ lz4s_s2_done + JMP lz4s_s2_corrupt + +lz4s_s2_match: + CMPQ R10, $0x03 + JEQ lz4s_s2_loop + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4s_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4s_s2_corrupt + CMPQ R9, SI + JA lz4s_s2_corrupt + CMPQ R10, $0x12 + JNE lz4s_s2_ml_done + +lz4s_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ R8, $0xff + JEQ lz4s_s2_ml_loop + +lz4s_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4s_s2_docopy + + // emitRepeat +emit_repeat_again_lz4_s2: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2 + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2 + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2 + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2 + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2 + +repeat_five_lz4_s2: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +lz4s_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + MOVL R9, R11 + SHRL $0x08, R11 + SHLL $0x05, R11 + ORL R11, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + +emit_repeat_again_lz4_s2_emit_copy_short_2b: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short_2b + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short_2b + +repeat_five_lz4_s2_emit_copy_short_2b: + LEAL -65536(R10), R10 + 
MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_lz4_s2_emit_copy_short: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short + +repeat_five_lz4_s2_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +lz4s_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4s_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4s_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + +lz4_snappy_loop: + CMPQ DX, BX + JAE lz4_snappy_corrupt + CMPQ AX, CX + JAE lz4_snappy_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + SHRQ $0x04, R8 + ANDQ $0x0f, R9 + CMPQ DI, $0xf0 + JB lz4_snappy_ll_end + 
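All four cvtLZ4* converters start each iteration with the same token decode seen above: the high nibble of the token byte is the literal run length and the low nibble the match length (biased by +4 here, or +3 in the LZ4s variants), each extended with 0xff continuation bytes when the nibble saturates at 15. A minimal Go sketch of the literal side, omitting the bounds checks that send the assembly to the *_corrupt labels:

	// lz4Token is an illustrative decoder for the token handling
	// above; n reports how many src bytes were consumed.
	func lz4Token(src []byte) (litLen, matchNibble, n int) {
		tok := src[0]
		n = 1
		litLen = int(tok >> 4)       // SHRQ $0x04
		matchNibble = int(tok & 0x0f) // ANDQ $0x0f; caller adds +4 (LZ4) or +3 (LZ4s)
		if litLen == 15 {            // tok >= 0xf0: read continuation bytes
			for src[n] == 0xff {
				litLen += 255
				n++
			}
			litLen += int(src[n])
			n++
		}
		return
	}
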
+lz4_snappy_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_snappy_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4_snappy_ll_loop + +lz4_snappy_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x04, R9 + CMPQ DI, BX + JAE lz4_snappy_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4_snappy_lits_done + LEAQ (AX)(R8*1), R10 + CMPQ R10, CX + JAE lz4_snappy_dstfull + ADDQ R8, SI + LEAL -1(R8), R10 + CMPL R10, $0x3c + JB one_byte_lz4_snappy + CMPL R10, $0x00000100 + JB two_bytes_lz4_snappy + CMPL R10, $0x00010000 + JB three_bytes_lz4_snappy + CMPL R10, $0x01000000 + JB four_bytes_lz4_snappy + MOVB $0xfc, (AX) + MOVL R10, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_snappy + +four_bytes_lz4_snappy: + MOVL R10, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW R10, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_snappy + +three_bytes_lz4_snappy: + MOVB $0xf4, (AX) + MOVW R10, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_snappy + +two_bytes_lz4_snappy: + MOVB $0xf0, (AX) + MOVB R10, 1(AX) + ADDQ $0x02, AX + CMPL R10, $0x40 + JB memmove_lz4_snappy + JMP memmove_long_lz4_snappy + +one_byte_lz4_snappy: + SHLB $0x02, R10 + MOVB R10, (AX) + ADDQ $0x01, AX + +memmove_lz4_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_lz4_snappy_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32 + JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 + +emit_lit_memmove_lz4_snappy_memmove_move_8: + MOVQ (DX), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_8through16: + MOVQ (DX), R11 + MOVQ -8(DX)(R8*1), DX + MOVQ R11, (AX) + MOVQ DX, -8(AX)(R8*1) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_lz4_snappy: + MOVQ R10, AX + JMP lz4_snappy_lits_emit_done + +memmove_long_lz4_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 + LEAQ -32(DX)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_lz4_snappylarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back + +emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: + MOVOU -32(DX)(R13*1), X4 + MOVOU -16(DX)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ R10, AX + +lz4_snappy_lits_emit_done: + MOVQ DI, DX + +lz4_snappy_lits_done: + CMPQ DX, BX + JNE lz4_snappy_match + CMPQ R9, $0x04 + JEQ lz4_snappy_done + JMP lz4_snappy_corrupt + +lz4_snappy_match: + LEAQ 2(DX), DI + CMPQ DI, BX + 
JAE lz4_snappy_corrupt + MOVWQZX (DX), R8 + MOVQ DI, DX + TESTQ R8, R8 + JZ lz4_snappy_corrupt + CMPQ R8, SI + JA lz4_snappy_corrupt + CMPQ R9, $0x13 + JNE lz4_snappy_ml_done + +lz4_snappy_ml_loop: + MOVBQZX (DX), DI + INCQ DX + ADDQ DI, R9 + CMPQ DX, BX + JAE lz4_snappy_corrupt + CMPQ DI, $0xff + JEQ lz4_snappy_ml_loop + +lz4_snappy_ml_done: + ADDQ R9, SI + + // emitCopy +two_byte_offset_lz4_s2: + CMPL R9, $0x40 + JBE two_byte_offset_short_lz4_s2 + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + CMPQ AX, CX + JAE lz4_snappy_loop + JMP two_byte_offset_lz4_s2 + +two_byte_offset_short_lz4_s2: + MOVL R9, DI + SHLL $0x02, DI + CMPL R9, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R8, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(DI), DI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP lz4_snappy_loop + +emit_copy_three_lz4_s2: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP lz4_snappy_loop + +lz4_snappy_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_snappy_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_snappy_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + +lz4s_snappy_loop: + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ AX, CX + JAE lz4s_snappy_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + SHRQ $0x04, R8 + ANDQ $0x0f, R9 + CMPQ DI, $0xf0 + JB lz4s_snappy_ll_end + +lz4s_snappy_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_snappy_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4s_snappy_ll_loop + +lz4s_snappy_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x03, R9 + CMPQ DI, BX + JAE lz4s_snappy_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4s_snappy_lits_done + LEAQ (AX)(R8*1), R10 + CMPQ R10, CX + JAE lz4s_snappy_dstfull + ADDQ R8, SI + LEAL -1(R8), R10 + CMPL R10, $0x3c + JB one_byte_lz4s_snappy + CMPL R10, $0x00000100 + JB two_bytes_lz4s_snappy + CMPL R10, $0x00010000 + JB three_bytes_lz4s_snappy + CMPL R10, $0x01000000 + JB four_bytes_lz4s_snappy + MOVB $0xfc, (AX) + MOVL R10, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_snappy + +four_bytes_lz4s_snappy: + MOVL R10, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW R10, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_snappy + +three_bytes_lz4s_snappy: + MOVB $0xf4, (AX) + MOVW R10, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_snappy + +two_bytes_lz4s_snappy: + MOVB $0xf0, (AX) + MOVB R10, 1(AX) + ADDQ $0x02, AX + CMPL R10, $0x40 + JB memmove_lz4s_snappy + JMP memmove_long_lz4s_snappy + +one_byte_lz4s_snappy: + SHLB $0x02, R10 + MOVB R10, (AX) + ADDQ $0x01, AX + +memmove_lz4s_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 + +emit_lit_memmove_lz4s_snappy_memmove_move_8: + MOVQ (DX), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_lz4s_snappy + 
+emit_lit_memmove_lz4s_snappy_memmove_move_8through16: + MOVQ (DX), R11 + MOVQ -8(DX)(R8*1), DX + MOVQ R11, (AX) + MOVQ DX, -8(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_lz4s_snappy: + MOVQ R10, AX + JMP lz4s_snappy_lits_emit_done + +memmove_long_lz4s_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + LEAQ -32(DX)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back + +emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: + MOVOU -32(DX)(R13*1), X4 + MOVOU -16(DX)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ R10, AX + +lz4s_snappy_lits_emit_done: + MOVQ DI, DX + +lz4s_snappy_lits_done: + CMPQ DX, BX + JNE lz4s_snappy_match + CMPQ R9, $0x03 + JEQ lz4s_snappy_done + JMP lz4s_snappy_corrupt + +lz4s_snappy_match: + CMPQ R9, $0x03 + JEQ lz4s_snappy_loop + LEAQ 2(DX), DI + CMPQ DI, BX + JAE lz4s_snappy_corrupt + MOVWQZX (DX), R8 + MOVQ DI, DX + TESTQ R8, R8 + JZ lz4s_snappy_corrupt + CMPQ R8, SI + JA lz4s_snappy_corrupt + CMPQ R9, $0x12 + JNE lz4s_snappy_ml_done + +lz4s_snappy_ml_loop: + MOVBQZX (DX), DI + INCQ DX + ADDQ DI, R9 + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ DI, $0xff + JEQ lz4s_snappy_ml_loop + +lz4s_snappy_ml_done: + ADDQ R9, SI + + // emitCopy +two_byte_offset_lz4_s2: + CMPL R9, $0x40 + JBE two_byte_offset_short_lz4_s2 + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + CMPQ AX, CX + JAE lz4s_snappy_loop + JMP two_byte_offset_lz4_s2 + +two_byte_offset_short_lz4_s2: + MOVL R9, DI + SHLL $0x02, DI + CMPL R9, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R8, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(DI), DI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP lz4s_snappy_loop + +emit_copy_three_lz4_s2: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP lz4s_snappy_loop + +lz4s_snappy_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4s_snappy_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4s_snappy_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go new file mode 100644 index 0000000..4229957 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -0,0 +1,602 @@ +// Copyright 
(c) 2022+ Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"sort"
+)
+
+const (
+	S2IndexHeader   = "s2idx\x00"
+	S2IndexTrailer  = "\x00xdi2s"
+	maxIndexEntries = 1 << 16
+	// If distance is less than this, we do not add the entry.
+	minIndexDist = 1 << 20
+)
+
+// Index represents an S2/Snappy index.
+type Index struct {
+	TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
+	TotalCompressed   int64 // Total Compressed size if known. Will be -1 if unknown.
+	info              []struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}
+	estBlockUncomp int64
+}
+
+func (i *Index) reset(maxBlock int) {
+	i.estBlockUncomp = int64(maxBlock)
+	i.TotalCompressed = -1
+	i.TotalUncompressed = -1
+	if len(i.info) > 0 {
+		i.info = i.info[:0]
+	}
+}
+
+// allocInfos will allocate an empty slice of infos.
+func (i *Index) allocInfos(n int) {
+	if n > maxIndexEntries {
+		panic("n > maxIndexEntries")
+	}
+	i.info = make([]struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}, 0, n)
+}
+
+// add a compressed and uncompressed offset pair.
+// Entries must be sent in order.
+func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
+	if i == nil {
+		return nil
+	}
+	lastIdx := len(i.info) - 1
+	if lastIdx >= 0 {
+		latest := i.info[lastIdx]
+		if latest.uncompressedOffset == uncompressedOffset {
+			// Uncompressed didn't change, don't add entry,
+			// but update start index.
+			latest.compressedOffset = compressedOffset
+			i.info[lastIdx] = latest
+			return nil
+		}
+		if latest.uncompressedOffset > uncompressedOffset {
+			return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
+		}
+		if latest.compressedOffset > compressedOffset {
+			return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.compressedOffset, compressedOffset)
+		}
+		if latest.uncompressedOffset+minIndexDist > uncompressedOffset {
+			// Only add entry if distance is large enough.
+			return nil
+		}
+	}
+	i.info = append(i.info, struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
+	return nil
+}
+
+// Find the offset at or before the wanted (uncompressed) offset.
+// If offset is 0 or positive, it is the offset from the beginning of the file.
+// If the uncompressed size is known, the offset must be within the file.
+// If an offset outside the file is requested, io.ErrUnexpectedEOF is returned.
+// If the offset is negative, it is interpreted as the distance from the end of the file,
+// where -1 represents the last byte.
+// If an offset from the end of the file is requested, but the size is unknown,
+// ErrUnsupported will be returned.
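+//
+// A minimal lookup sketch (illustrative; rs and the target offset are
+// hypothetical):
+//
+//	var idx Index
+//	if err := idx.LoadStream(rs); err == nil {
+//		// Nearest indexed position at or before uncompressed offset 1<<30.
+//		compOff, uncompOff, err := idx.Find(1 << 30)
+//		// Seek to compOff, then decode and discard (1<<30)-uncompOff
+//		// uncompressed bytes to land on the exact position.
+//		_, _, _ = compOff, uncompOff, err
+//	}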
+func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) { + if i.TotalUncompressed < 0 { + return 0, 0, ErrCorrupt + } + if offset < 0 { + offset = i.TotalUncompressed + offset + if offset < 0 { + return 0, 0, io.ErrUnexpectedEOF + } + } + if offset > i.TotalUncompressed { + return 0, 0, io.ErrUnexpectedEOF + } + if len(i.info) > 200 { + n := sort.Search(len(i.info), func(n int) bool { + return i.info[n].uncompressedOffset > offset + }) + if n == 0 { + n = 1 + } + return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil + } + for _, info := range i.info { + if info.uncompressedOffset > offset { + break + } + compressedOff = info.compressedOffset + uncompressedOff = info.uncompressedOffset + } + return compressedOff, uncompressedOff, nil +} + +// reduce to stay below maxIndexEntries +func (i *Index) reduce() { + if len(i.info) < maxIndexEntries && i.estBlockUncomp >= minIndexDist { + return + } + + // Algorithm, keep 1, remove removeN entries... + removeN := (len(i.info) + 1) / maxIndexEntries + src := i.info + j := 0 + + // Each block should be at least 1MB, but don't reduce below 1000 entries. + for i.estBlockUncomp*(int64(removeN)+1) < minIndexDist && len(i.info)/(removeN+1) > 1000 { + removeN++ + } + for idx := 0; idx < len(src); idx++ { + i.info[j] = src[idx] + j++ + idx += removeN + } + i.info = i.info[:j] + // Update maxblock estimate. + i.estBlockUncomp += i.estBlockUncomp * int64(removeN) +} + +func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte { + i.reduce() + var tmp [binary.MaxVarintLen64]byte + + initSize := len(b) + // We make the start a skippable header+size. + b = append(b, ChunkTypeIndex, 0, 0, 0) + b = append(b, []byte(S2IndexHeader)...) + // Total Uncompressed size + n := binary.PutVarint(tmp[:], uncompTotal) + b = append(b, tmp[:n]...) + // Total Compressed size + n = binary.PutVarint(tmp[:], compTotal) + b = append(b, tmp[:n]...) + // Put EstBlockUncomp size + n = binary.PutVarint(tmp[:], i.estBlockUncomp) + b = append(b, tmp[:n]...) + // Put length + n = binary.PutVarint(tmp[:], int64(len(i.info))) + b = append(b, tmp[:n]...) + + // Check if we should add uncompressed offsets + var hasUncompressed byte + for idx, info := range i.info { + if idx == 0 { + if info.uncompressedOffset != 0 { + hasUncompressed = 1 + break + } + continue + } + if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp { + hasUncompressed = 1 + break + } + } + b = append(b, hasUncompressed) + + // Add each entry + if hasUncompressed == 1 { + for idx, info := range i.info { + uOff := info.uncompressedOffset + if idx > 0 { + prev := i.info[idx-1] + uOff -= prev.uncompressedOffset + (i.estBlockUncomp) + } + n = binary.PutVarint(tmp[:], uOff) + b = append(b, tmp[:n]...) + } + } + + // Initial compressed size estimate. + cPredict := i.estBlockUncomp / 2 + + for idx, info := range i.info { + cOff := info.compressedOffset + if idx > 0 { + prev := i.info[idx-1] + cOff -= prev.compressedOffset + cPredict + // Update compressed size prediction, with half the error. + cPredict += cOff / 2 + } + n = binary.PutVarint(tmp[:], cOff) + b = append(b, tmp[:n]...) + } + + // Add Total Size. + // Stored as fixed size for easier reading. + binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer))) + b = append(b, tmp[:4]...) + // Trailer + b = append(b, []byte(S2IndexTrailer)...) 
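+	// Final layout written above:
+	//   [ChunkTypeIndex][3-byte chunk length][S2IndexHeader]
+	//   [varint totals and counts][entry deltas]
+	//   [4-byte total size][S2IndexTrailer]
+	// The fixed-size tail lets LoadStream locate the index from the end.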
+ + // Update size + chunkLen := len(b) - initSize - skippableFrameHeader + b[initSize+1] = uint8(chunkLen >> 0) + b[initSize+2] = uint8(chunkLen >> 8) + b[initSize+3] = uint8(chunkLen >> 16) + //fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal) + return b +} + +// Load a binary index. +// A zero value Index can be used or a previous one can be reused. +func (i *Index) Load(b []byte) ([]byte, error) { + if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) { + return b, io.ErrUnexpectedEOF + } + if b[0] != ChunkTypeIndex { + return b, ErrCorrupt + } + chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 + b = b[4:] + + // Validate we have enough... + if len(b) < chunkLen { + return b, io.ErrUnexpectedEOF + } + if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) { + return b, ErrUnsupported + } + b = b[len(S2IndexHeader):] + + // Total Uncompressed + if v, n := binary.Varint(b); n <= 0 || v < 0 { + return b, ErrCorrupt + } else { + i.TotalUncompressed = v + b = b[n:] + } + + // Total Compressed + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + i.TotalCompressed = v + b = b[n:] + } + + // Read EstBlockUncomp + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + if v < 0 { + return b, ErrCorrupt + } + i.estBlockUncomp = v + b = b[n:] + } + + var entries int + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + if v < 0 || v > maxIndexEntries { + return b, ErrCorrupt + } + entries = int(v) + b = b[n:] + } + if cap(i.info) < entries { + i.allocInfos(entries) + } + i.info = i.info[:entries] + + if len(b) < 1 { + return b, io.ErrUnexpectedEOF + } + hasUncompressed := b[0] + b = b[1:] + if hasUncompressed&1 != hasUncompressed { + return b, ErrCorrupt + } + + // Add each uncompressed entry + for idx := range i.info { + var uOff int64 + if hasUncompressed != 0 { + // Load delta + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + uOff = v + b = b[n:] + } + } + + if idx > 0 { + prev := i.info[idx-1].uncompressedOffset + uOff += prev + (i.estBlockUncomp) + if uOff <= prev { + return b, ErrCorrupt + } + } + if uOff < 0 { + return b, ErrCorrupt + } + i.info[idx].uncompressedOffset = uOff + } + + // Initial compressed size estimate. + cPredict := i.estBlockUncomp / 2 + + // Add each compressed entry + for idx := range i.info { + var cOff int64 + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + cOff = v + b = b[n:] + } + + if idx > 0 { + // Update compressed size prediction, with half the error. + cPredictNew := cPredict + cOff/2 + + prev := i.info[idx-1].compressedOffset + cOff += prev + cPredict + if cOff <= prev { + return b, ErrCorrupt + } + cPredict = cPredictNew + } + if cOff < 0 { + return b, ErrCorrupt + } + i.info[idx].compressedOffset = cOff + } + if len(b) < 4+len(S2IndexTrailer) { + return b, io.ErrUnexpectedEOF + } + // Skip size... + b = b[4:] + + // Check trailer... + if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) { + return b, ErrCorrupt + } + return b[len(S2IndexTrailer):], nil +} + +// LoadStream will load an index from the end of the supplied stream. +// ErrUnsupported will be returned if the signature cannot be found. +// ErrCorrupt will be returned if unexpected values are found. +// io.ErrUnexpectedEOF is returned if there are too few bytes. +// IO errors are returned as-is. +func (i *Index) LoadStream(rs io.ReadSeeker) error { + // Go to end. 
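+	// The index tail is a 4-byte size followed by the 6-byte
+	// S2IndexTrailer, so the last 10 bytes are enough to identify an index.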
+ _, err := rs.Seek(-10, io.SeekEnd) + if err != nil { + return err + } + var tmp [10]byte + _, err = io.ReadFull(rs, tmp[:]) + if err != nil { + return err + } + // Check trailer... + if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) { + return ErrUnsupported + } + sz := binary.LittleEndian.Uint32(tmp[:4]) + if sz > maxChunkSize+skippableFrameHeader { + return ErrCorrupt + } + _, err = rs.Seek(-int64(sz), io.SeekEnd) + if err != nil { + return err + } + + // Read index. + buf := make([]byte, sz) + _, err = io.ReadFull(rs, buf) + if err != nil { + return err + } + _, err = i.Load(buf) + return err +} + +// IndexStream will return an index for a stream. +// The stream structure will be checked, but +// data within blocks is not verified. +// The returned index can either be appended to the end of the stream +// or stored separately. +func IndexStream(r io.Reader) ([]byte, error) { + var i Index + var buf [maxChunkSize]byte + var readHeader bool + for { + _, err := io.ReadFull(r, buf[:4]) + if err != nil { + if err == io.EOF { + return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil + } + return nil, err + } + // Start of this chunk. + startChunk := i.TotalCompressed + i.TotalCompressed += 4 + + chunkType := buf[0] + if !readHeader { + if chunkType != chunkTypeStreamIdentifier { + return nil, ErrCorrupt + } + readHeader = true + } + chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16 + if chunkLen < checksumSize { + return nil, ErrCorrupt + } + + i.TotalCompressed += int64(chunkLen) + _, err = io.ReadFull(r, buf[:chunkLen]) + if err != nil { + return nil, io.ErrUnexpectedEOF + } + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + // Section 4.2. Compressed data (chunk type 0x00). + // Skip checksum. + dLen, err := DecodedLen(buf[checksumSize:]) + if err != nil { + return nil, err + } + if dLen > maxBlockSize { + return nil, ErrCorrupt + } + if i.estBlockUncomp == 0 { + // Use first block for estimate... + i.estBlockUncomp = int64(dLen) + } + err = i.add(startChunk, i.TotalUncompressed) + if err != nil { + return nil, err + } + i.TotalUncompressed += int64(dLen) + continue + case chunkTypeUncompressedData: + n2 := chunkLen - checksumSize + if n2 > maxBlockSize { + return nil, ErrCorrupt + } + if i.estBlockUncomp == 0 { + // Use first block for estimate... + i.estBlockUncomp = int64(n2) + } + err = i.add(startChunk, i.TotalUncompressed) + if err != nil { + return nil, err + } + i.TotalUncompressed += int64(n2) + continue + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + return nil, ErrCorrupt + } + + if string(buf[:len(magicBody)]) != magicBody { + if string(buf[:len(magicBody)]) != magicBodySnappy { + return nil, ErrCorrupt + } + } + + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + return nil, ErrUnsupported + } + if chunkLen > maxChunkSize { + return nil, ErrUnsupported + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + } +} + +// JSON returns the index as JSON text. +func (i *Index) JSON() []byte { + type offset struct { + CompressedOffset int64 `json:"compressed"` + UncompressedOffset int64 `json:"uncompressed"` + } + x := struct { + TotalUncompressed int64 `json:"total_uncompressed"` // Total Uncompressed size if known. 
Will be -1 if unknown.
+		TotalCompressed   int64    `json:"total_compressed"`   // Total Compressed size if known. Will be -1 if unknown.
+		Offsets           []offset `json:"offsets"`
+		EstBlockUncomp    int64    `json:"est_block_uncompressed"`
+	}{
+		TotalUncompressed: i.TotalUncompressed,
+		TotalCompressed:   i.TotalCompressed,
+		EstBlockUncomp:    i.estBlockUncomp,
+	}
+	for _, v := range i.info {
+		x.Offsets = append(x.Offsets, offset{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
+	}
+	b, _ := json.MarshalIndent(x, "", "  ")
+	return b
+}
+
+// RemoveIndexHeaders will trim all headers and trailers from a given index.
+// This is expected to save 20 bytes.
+// These can be restored using RestoreIndexHeaders.
+// This removes a layer of security, but is the most compact representation.
+// Returns nil if the headers contain errors.
+// The returned slice references the provided slice.
+func RemoveIndexHeaders(b []byte) []byte {
+	const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
+	if len(b) <= save {
+		return nil
+	}
+	if b[0] != ChunkTypeIndex {
+		return nil
+	}
+	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
+	b = b[4:]
+
+	// Validate we have enough...
+	if len(b) < chunkLen {
+		return nil
+	}
+	b = b[:chunkLen]
+
+	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
+		return nil
+	}
+	b = b[len(S2IndexHeader):]
+	if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
+		return nil
+	}
+	b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
+
+	if len(b) < 4 {
+		return nil
+	}
+	return b[:len(b)-4]
+}
+
+// RestoreIndexHeaders will restore the index headers removed by RemoveIndexHeaders.
+// No error checking is performed on the input.
+// If a zero-length slice is sent, it is returned without modification.
+func RestoreIndexHeaders(in []byte) []byte {
+	if len(in) == 0 {
+		return in
+	}
+	b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
+	b = append(b, ChunkTypeIndex, 0, 0, 0)
+	b = append(b, []byte(S2IndexHeader)...)
+	b = append(b, in...)
+
+	var tmp [4]byte
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
+	b = append(b, tmp[:4]...)
+	// Trailer
+	b = append(b, []byte(S2IndexTrailer)...)
+
+	chunkLen := len(b) - skippableFrameHeader
+	b[1] = uint8(chunkLen >> 0)
+	b[2] = uint8(chunkLen >> 8)
+	b[3] = uint8(chunkLen >> 16)
+	return b
+}
diff --git a/vendor/github.com/klauspost/compress/s2/lz4convert.go b/vendor/github.com/klauspost/compress/s2/lz4convert.go
new file mode 100644
index 0000000..46ed908
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/lz4convert.go
@@ -0,0 +1,585 @@
+// Copyright (c) 2022 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+)
+
+// LZ4Converter provides conversion from LZ4 blocks as defined here:
+// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
+type LZ4Converter struct {
+}
+
+// ErrDstTooSmall is returned when the provided destination is too small.
+var ErrDstTooSmall = errors.New("s2: destination too small")
+
+// ConvertBlock will convert an LZ4 block and append it as an S2
+// block without block length to dst.
+// The uncompressed size is returned as well.
+// dst must have capacity to contain the entire compressed block.
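+//
+// Illustrative use (lz4Block is a hypothetical raw LZ4 block; sizing dst
+// is the caller's responsibility):
+//
+//	conv := LZ4Converter{}
+//	dst := make([]byte, 0, 4*len(lz4Block))
+//	s2Block, uncompressed, err := conv.ConvertBlock(dst, lz4Block)
+//	_, _, _ = s2Block, uncompressed, err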
+func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const inline = true + const lz4MinMatch = 4 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var lastOffset uint16 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if s == len(src) && ml == lz4MinMatch { + break + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if offset == lastOffset { + if debug { + fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitRepeat16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + d += 2 + break + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + d += 2 + break + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + d += 3 + break + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + d += 4 + break + } + const 
maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + d += 5 + emitRepeat16(dst[5:], offset, left) + break + } + d += 5 + break + } + } + } else { + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitCopy16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + d += off + emitRepeat16(dst[off:], offset, length) + break + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + } + lastOffset = offset + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// ConvertBlockSnappy will convert an LZ4 block and append it +// as a Snappy block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. 
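+//
+// Unlike ConvertBlock, no S2 repeat codes are emitted, since plain Snappy
+// decoders do not understand them; every match becomes a regular copy.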
+func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 4 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + // Use assembly when possible + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockSnappyAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return nil, 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if s == len(src) && ml == lz4MinMatch { + break + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + length := ml + // d += emitCopyNoRepeat(dst[d:], int(offset), ml) + for length > 0 { + if d >= dLimit { + return nil, 0, ErrDstTooSmall + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 64 copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = 63<<2 | tagCopy2 + length -= 64 + d += 3 + continue + } + if length >= 12 || offset >= 2048 || length < 4 { + // Emit the remaining copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. 
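+			// 2-byte copy layout: [offset>>8 (3 bits)][length-4 (3 bits)][tagCopy1],
+			// followed by the low byte of the offset.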
+ dst[d+1] = uint8(offset) + dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<24 +func emitRepeat16(dst []byte, offset uint16, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + return 2 + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + return 2 + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + return 3 + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + return 5 + emitRepeat16(dst[5:], offset, left) + } + return 5 +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint16 +// 4 <= length && length <= math.MaxUint32 +func emitCopy16(dst []byte, offset uint16, length int) int { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + return off + emitRepeat16(dst[off:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteralGo(dst, lit []byte) int { + if len(lit) == 0 { + return 0 + } + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[1] = uint8(n) + dst[0] = 60<<2 | tagLiteral + i = 2 + case n < 1<<16: + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 61<<2 | tagLiteral + i = 3 + case n < 1<<24: + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 62<<2 | tagLiteral + i = 4 + default: + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 + } + return i + copy(dst[i:], lit) +} diff --git a/vendor/github.com/klauspost/compress/s2/lz4sconvert.go b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go new file mode 100644 index 0000000..000f397 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go @@ -0,0 +1,467 @@ +// Copyright (c) 2022 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "fmt" +) + +// LZ4sConverter provides conversion from LZ4s. +// (Intel modified LZ4 Blocks) +// https://cdrdv2-public.intel.com/743912/743912-qat-programmers-guide-v2.0.pdf +// LZ4s is a variant of LZ4 block format. LZ4s should be considered as an intermediate compressed block format. +// The LZ4s format is selected when the application sets the compType to CPA_DC_LZ4S in CpaDcSessionSetupData. +// The LZ4s block returned by the Intel® QAT hardware can be used by an external +// software post-processing to generate other compressed data formats. +// The following table lists the differences between LZ4 and LZ4s block format. LZ4s block format uses +// the same high-level formatting as LZ4 block format with the following encoding changes: +// For Min Match of 4 bytes, Copy length value 1-15 means length 4-18 with 18 bytes adding an extra byte. +// ONLY "Min match of 4 bytes" is supported. +type LZ4sConverter struct { +} + +// ConvertBlock will convert an LZ4s block and append it as an S2 +// block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. 
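+//
+// Note the LZ4s difference handled below: a match length field of zero
+// means a literals-only token, while LZ4 always implies a minimum match
+// of 4 bytes.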
+func (l *LZ4sConverter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const inline = true + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var lastOffset uint16 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. 
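+			// LZ4s allows tokens without a match (match length field 0);
+			// there is no offset to read, so move on to the next token.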
+ continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if offset == lastOffset { + if debug { + fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitRepeat16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + d += 2 + break + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + d += 2 + break + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + d += 3 + break + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + d += 4 + break + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + d += 5 + emitRepeat16(dst[5:], offset, left) + break + } + d += 5 + break + } + } + } else { + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitCopy16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + d += off + emitRepeat16(dst[off:], offset, length) + break + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. 
+ dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + } + lastOffset = offset + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// ConvertBlockSnappy will convert an LZ4s block and append it +// as a Snappy block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. +func (l *LZ4sConverter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + // Use assembly when possible + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockSnappyAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return nil, 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. + continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + length := ml + // d += emitCopyNoRepeat(dst[d:], int(offset), ml) + for length > 0 { + if d >= dLimit { + return nil, 0, ErrDstTooSmall + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 64 copy, encoded as 3 bytes. 
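+				// 3-byte copy layout: [length-1 (6 bits)][tagCopy2], followed by
+				// the offset as two little-endian bytes.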
+ dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = 63<<2 | tagCopy2 + length -= 64 + d += 3 + continue + } + if length >= 12 || offset >= 2048 || length < 4 { + // Emit the remaining copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[d+1] = uint8(offset) + dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} diff --git a/vendor/github.com/klauspost/compress/s2/reader.go b/vendor/github.com/klauspost/compress/s2/reader.go new file mode 100644 index 0000000..8372d75 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/reader.go @@ -0,0 +1,1075 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019+ Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "math" + "runtime" + "sync" +) + +// ErrCantSeek is returned if the stream cannot be seeked. +type ErrCantSeek struct { + Reason string +} + +// Error returns the error as string. +func (e ErrCantSeek) Error() string { + return fmt.Sprintf("s2: Can't seek because %s", e.Reason) +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes. +func NewReader(r io.Reader, opts ...ReaderOption) *Reader { + nr := Reader{ + r: r, + maxBlock: maxBlockSize, + } + for _, opt := range opts { + if err := opt(&nr); err != nil { + nr.err = err + return &nr + } + } + nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize + if nr.lazyBuf > 0 { + nr.buf = make([]byte, MaxEncodedLen(nr.lazyBuf)+checksumSize) + } else { + nr.buf = make([]byte, MaxEncodedLen(defaultBlockSize)+checksumSize) + } + nr.readHeader = nr.ignoreStreamID + nr.paramsOK = true + return &nr +} + +// ReaderOption is an option for creating a decoder. +type ReaderOption func(*Reader) error + +// ReaderMaxBlockSize allows to control allocations if the stream +// has been compressed with a smaller WriterBlockSize, or with the default 1MB. +// Blocks must be this size or smaller to decompress, +// otherwise the decoder will return ErrUnsupported. +// +// For streams compressed with Snappy this can safely be set to 64KB (64 << 10). +// +// Default is the maximum limit of 4MB. +func ReaderMaxBlockSize(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize <= 0 { + return errors.New("s2: block size too large. Must be <= 4MB and > 0") + } + if r.lazyBuf == 0 && blockSize < defaultBlockSize { + r.lazyBuf = blockSize + } + r.maxBlock = blockSize + return nil + } +} + +// ReaderAllocBlock allows to control upfront stream allocations +// and not allocate for frames bigger than this initially. +// If frames bigger than this is seen a bigger buffer will be allocated. +// +// Default is 1MB, which is default output size. +func ReaderAllocBlock(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize < 1024 { + return errors.New("s2: invalid ReaderAllocBlock. 
Must be <= 4MB and >= 1024")
+		}
+		r.lazyBuf = blockSize
+		return nil
+	}
+}
+
+// ReaderIgnoreStreamIdentifier will make the reader skip the expected
+// stream identifier at the beginning of the stream.
+// This can be used when serving a stream that has been forwarded to a specific point.
+func ReaderIgnoreStreamIdentifier() ReaderOption {
+	return func(r *Reader) error {
+		r.ignoreStreamID = true
+		return nil
+	}
+}
+
+// ReaderSkippableCB will register a callback for chunks with the specified ID.
+// ID must be a reserved skippable chunk ID, 0x80-0xfd (inclusive).
+// For each chunk with the ID, the callback is called with the content.
+// Any returned non-nil error will abort decompression.
+// Only one callback per ID is supported; the latest one registered will be used.
+// You can peek the stream, triggering the callback, by doing a Read with a 0
+// byte buffer.
+func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption {
+	return func(r *Reader) error {
+		if id < 0x80 || id > 0xfd {
+			return fmt.Errorf("ReaderSkippableCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
+		}
+		r.skippableCB[id-0x80] = fn
+		return nil
+	}
+}
+
+// ReaderIgnoreCRC will make the reader skip CRC calculation and checks.
+func ReaderIgnoreCRC() ReaderOption {
+	return func(r *Reader) error {
+		r.ignoreCRC = true
+		return nil
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes.
+type Reader struct {
+	r           io.Reader
+	err         error
+	decoded     []byte
+	buf         []byte
+	skippableCB [0xff - 0x80]func(r io.Reader) error
+	blockStart  int64 // Uncompressed offset at start of current.
+	index       *Index
+
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j int
+	// maximum block size allowed.
+	maxBlock int
+	// maximum expected buffer size.
+	maxBufSize int
+	// alloc a buffer this size if > 0.
+	lazyBuf        int
+	readHeader     bool
+	paramsOK       bool
+	snappyFrame    bool
+	ignoreStreamID bool
+	ignoreCRC      bool
+}
+
+// GetBufferCapacity returns the capacity of the internal buffer.
+// This might be useful to know when reusing the same reader in combination
+// with the lazy buffer option.
+func (r *Reader) GetBufferCapacity() int {
+	return cap(r.buf)
+}
+
+// ensureBufferSize will ensure that the buffer can take at least n bytes.
+// If false is returned the buffer exceeds maximum allowed size.
+func (r *Reader) ensureBufferSize(n int) bool {
+	if n > r.maxBufSize {
+		r.err = ErrCorrupt
+		return false
+	}
+	if cap(r.buf) >= n {
+		return true
+	}
+	// Realloc buffer.
+	r.buf = make([]byte, n)
+	return true
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from r. This permits reusing a Reader rather than allocating
+// a new one.
+func (r *Reader) Reset(reader io.Reader) {
+	if !r.paramsOK {
+		return
+	}
+	r.index = nil
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.blockStart = 0
+	r.readHeader = r.ignoreStreamID
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+		}
+		return false
+	}
+	return true
+}
+
+// skippable will skip n bytes.
+// If the supplied reader supports seeking, that is used.
+// tmp is used as a temporary buffer for reading.
+// The supplied slice does not need to be the size of the read.
+func (r *Reader) skippable(tmp []byte, n int, allowEOF bool, id uint8) (ok bool) { + if id < 0x80 { + r.err = fmt.Errorf("internal error: skippable id < 0x80") + return false + } + if fn := r.skippableCB[id-0x80]; fn != nil { + rd := io.LimitReader(r.r, int64(n)) + r.err = fn(rd) + if r.err != nil { + return false + } + _, r.err = io.CopyBuffer(ioutil.Discard, rd, tmp) + return r.err == nil + } + if rs, ok := r.r.(io.ReadSeeker); ok { + _, err := rs.Seek(int64(n), io.SeekCurrent) + if err == nil { + return true + } + if err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + return false + } + } + for n > 0 { + if n < len(tmp) { + tmp = tmp[:n] + } + if _, r.err = io.ReadFull(r.r, tmp); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + n -= len(tmp) + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.buf[:4], true) { + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > len(r.decoded) { + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + r.decoded = make([]byte, n) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. 
+			n := chunkLen - checksumSize
+			if r.snappyFrame && n > maxSnappyBlockSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				if n > r.maxBlock {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+				r.decoded = make([]byte, n)
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
+				r.err = ErrCRC
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			if string(r.buf[:len(magicBody)]) != magicBody {
+				if string(r.buf[:len(magicBody)]) != magicBodySnappy {
+					r.err = ErrCorrupt
+					return 0, r.err
+				} else {
+					r.snappyFrame = true
+				}
+			} else {
+				r.snappyFrame = false
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			// fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if chunkLen > maxChunkSize {
+			// fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen)
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		// fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen)
+		if !r.skippable(r.buf, chunkLen, false, chunkType) {
+			return 0, r.err
+		}
+	}
+}
+
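The sequential Read loop above has a concurrent counterpart below. A minimal sketch of driving it (file names are illustrative):

```go
package main

import (
	"fmt"
	"os"

	"github.com/klauspost/compress/s2"
)

func main() {
	in, err := os.Open("big.s2") // hypothetical compressed input
	if err != nil {
		panic(err)
	}
	defer in.Close()
	out, err := os.Create("big.raw") // hypothetical output
	if err != nil {
		panic(err)
	}
	defer out.Close()

	r := s2.NewReader(in)
	// Passing 0 lets DecodeConcurrent use runtime.NumCPU() goroutines.
	n, err := r.DecodeConcurrent(out, 0)
	if err != nil {
		panic(err)
	}
	fmt.Printf("decompressed %d bytes\n", n)
}
```

+// DecodeConcurrent will decode the full stream to w.
+// This function should not be combined with reading, seeking or other operations.
+// Up to 'concurrent' goroutines will be used.
+// If <= 0, runtime.NumCPU will be used.
+// On success the number of bytes decompressed and a nil error are returned.
+// This is mainly intended for bigger streams.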
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 || r.blockStart > 0 { + return 0, errors.New("DecodeConcurrent called after ") + } + if concurrent <= 0 { + concurrent = runtime.NumCPU() + } + + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent) + writtenBlocks := make(chan []byte, concurrent) + queue := make(chan chan []byte, concurrent) + reUse := make(chan chan []byte, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- make([]byte, 0, r.maxBufSize) + writtenBlocks <- make([]byte, 0, r.maxBufSize) + reUse <- make(chan []byte, 1) + } + // Writer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for toWrite := range queue { + entry := <-toWrite + reUse <- toWrite + if hasErr() || entry == nil { + if entry != nil { + writtenBlocks <- entry + } + continue + } + if hasErr() { + writtenBlocks <- entry + continue + } + n, err := w.Write(entry) + want := len(entry) + writtenBlocks <- entry + if err != nil { + setErr(err) + continue + } + if n != want { + setErr(io.ErrShortWrite) + continue + } + aWritten += int64(n) + } + }() + + defer func() { + if r.err != nil { + setErr(r.err) + } else if err != nil { + setErr(err) + } + close(queue) + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + // Reader + for !hasErr() { + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + entry <- nil + return + } + if !r.ignoreCRC && crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + entry <- nil + return + } + entry <- decoded + }() + continue + + case chunkTypeUncompressedData: + + // Section 4.3. Uncompressed data (chunk type 0x01). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + // Grab write buffer + orgBuf := <-writtenBlocks + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. + n := chunkLen - checksumSize + + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + // Read uncompressed + buf = orgBuf[:n] + if !r.readFull(buf, false) { + return 0, r.err + } + + if !r.ignoreCRC && crc(buf) != checksum { + r.err = ErrCRC + return 0, r.err + } + entry := <-reUse + queue <- entry + entry <- buf + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + // fmt.Printf("ERR chunktype: 0x%x\n", chunkType) + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxChunkSize { + // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) + r.err = ErrUnsupported + return 0, r.err + } + + // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return 0, r.err + } + } + return 0, r.err +} + +// Skip will skip n bytes forward in the decompressed output. +// For larger skips this consumes less CPU and is faster than reading output and discarding it. +// CRC is not checked on skipped blocks. +// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped. +// If a decoding error is encountered subsequent calls to Read will also fail. +func (r *Reader) Skip(n int64) error { + if n < 0 { + return errors.New("attempted negative skip") + } + if r.err != nil { + return r.err + } + + for n > 0 { + if r.i < r.j { + // Skip in buffer. + // decoded[i:j] contains decoded bytes that have not yet been passed on. + left := int64(r.j - r.i) + if left >= n { + tmp := int64(r.i) + n + if tmp > math.MaxInt32 { + return errors.New("s2: internal overflow in skip") + } + r.i = int(tmp) + return nil + } + n -= int64(r.j - r.i) + r.i = r.j + } + + // Buffer empty; read blocks until we have content. + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = io.ErrUnexpectedEOF + } + return r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + dLen, err := DecodedLen(buf) + if err != nil { + r.err = err + return r.err + } + if dLen > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + // Check if destination is within this block + if int64(dLen) > n { + if len(r.decoded) < dLen { + r.decoded = make([]byte, dLen) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return r.err + } + if crc(r.decoded[:dLen]) != checksum { + r.err = ErrCorrupt + return r.err + } + } else { + // Skip block completely + n -= int64(dLen) + r.blockStart += int64(dLen) + dLen = 0 + } + r.i, r.j = 0, dLen + continue + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err != nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n2 := chunkLen - checksumSize + if n2 > len(r.decoded) { + if n2 > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + r.decoded = make([]byte, n2) + } + if !r.readFull(r.decoded[:n2], false) { + return r.err + } + if int64(n2) < n { + if crc(r.decoded[:n2]) != checksum { + r.err = ErrCorrupt + return r.err + } + } + r.i, r.j = 0, n2 + continue + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return r.err + } + } + + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return r.err + } + if chunkLen > maxChunkSize { + r.err = ErrUnsupported + return r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return r.err + } + } + return nil +} + +// ReadSeeker provides random or forward seeking in compressed content. +// See Reader.ReadSeeker +type ReadSeeker struct { + *Reader + readAtMu sync.Mutex +} + +// ReadSeeker will return an io.ReadSeeker and io.ReaderAt +// compatible version of the reader. +// If 'random' is specified the returned io.Seeker can be used for +// random seeking, otherwise only forward seeking is supported. +// Enabling random seeking requires the original input to support +// the io.Seeker interface. +// A custom index can be specified which will be used if supplied. +// When using a custom index, it will not be read from the input stream. +// The ReadAt position will affect regular reads and the current position of Seek. +// So using Read after ReadAt will continue from where the ReadAt stopped. +// No functions should be used concurrently. 
+// The returned ReadSeeker contains a shallow reference to the existing Reader, +// meaning changes performed to one is reflected in the other. +func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) { + // Read index if provided. + if len(index) != 0 { + if r.index == nil { + r.index = &Index{} + } + if _, err := r.index.Load(index); err != nil { + return nil, ErrCantSeek{Reason: "loading index returned: " + err.Error()} + } + } + + // Check if input is seekable + rs, ok := r.r.(io.ReadSeeker) + if !ok { + if !random { + return &ReadSeeker{Reader: r}, nil + } + return nil, ErrCantSeek{Reason: "input stream isn't seekable"} + } + + if r.index != nil { + // Seekable and index, ok... + return &ReadSeeker{Reader: r}, nil + } + + // Load from stream. + r.index = &Index{} + + // Read current position. + pos, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + err = r.index.LoadStream(rs) + if err != nil { + if err == ErrUnsupported { + // If we don't require random seeking, reset input and return. + if !random { + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "resetting stream returned: " + err.Error()} + } + r.index = nil + return &ReadSeeker{Reader: r}, nil + } + return nil, ErrCantSeek{Reason: "input stream does not contain an index"} + } + return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()} + } + + // reset position. + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + return &ReadSeeker{Reader: r}, nil +} + +// Seek allows seeking in compressed data. +func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { + if r.err != nil { + if !errors.Is(r.err, io.EOF) { + return 0, r.err + } + // Reset on EOF + r.err = nil + } + + // Calculate absolute offset. + absOffset := offset + + switch whence { + case io.SeekStart: + case io.SeekCurrent: + absOffset = r.blockStart + int64(r.i) + offset + case io.SeekEnd: + if r.index == nil { + return 0, ErrUnsupported + } + absOffset = r.index.TotalUncompressed + offset + default: + r.err = ErrUnsupported + return 0, r.err + } + + if absOffset < 0 { + return 0, errors.New("seek before start of file") + } + + if !r.readHeader { + // Make sure we read the header. + _, r.err = r.Read([]byte{}) + if r.err != nil { + return 0, r.err + } + } + + // If we are inside current block no need to seek. + // This includes no offset changes. + if absOffset >= r.blockStart && absOffset < r.blockStart+int64(r.j) { + r.i = int(absOffset - r.blockStart) + return r.blockStart + int64(r.i), nil + } + + rs, ok := r.r.(io.ReadSeeker) + if r.index == nil || !ok { + currOffset := r.blockStart + int64(r.i) + if absOffset >= currOffset { + err := r.Skip(absOffset - currOffset) + return r.blockStart + int64(r.i), err + } + return 0, ErrUnsupported + } + + // We can seek and we have an index. + c, u, err := r.index.Find(absOffset) + if err != nil { + return r.blockStart + int64(r.i), err + } + + // Seek to next block + _, err = rs.Seek(c, io.SeekStart) + if err != nil { + return 0, err + } + + r.i = r.j // Remove rest of current block. + r.blockStart = u - int64(r.j) // Adjust current block start for accounting. 
+ if u < absOffset { + // Forward inside block + return absOffset, r.Skip(absOffset - u) + } + if u > absOffset { + return 0, fmt.Errorf("s2 seek: (internal error) u (%d) > absOffset (%d)", u, absOffset) + } + return absOffset, nil +} + +// ReadAt reads len(p) bytes into p starting at offset off in the +// underlying input source. It returns the number of bytes +// read (0 <= n <= len(p)) and any error encountered. +// +// When ReadAt returns n < len(p), it returns a non-nil error +// explaining why more bytes were not returned. In this respect, +// ReadAt is stricter than Read. +// +// Even if ReadAt returns n < len(p), it may use all of p as scratch +// space during the call. If some data is available but not len(p) bytes, +// ReadAt blocks until either all the data is available or an error occurs. +// In this respect ReadAt is different from Read. +// +// If the n = len(p) bytes returned by ReadAt are at the end of the +// input source, ReadAt may return either err == EOF or err == nil. +// +// If ReadAt is reading from an input source with a seek offset, +// ReadAt should not affect nor be affected by the underlying +// seek offset. +// +// Clients of ReadAt can execute parallel ReadAt calls on the +// same input source. This is however not recommended. +func (r *ReadSeeker) ReadAt(p []byte, offset int64) (int, error) { + r.readAtMu.Lock() + defer r.readAtMu.Unlock() + _, err := r.Seek(offset, io.SeekStart) + if err != nil { + return 0, err + } + n := 0 + for n < len(p) { + n2, err := r.Read(p[n:]) + if err != nil { + // This will include io.EOF + return n + n2, err + } + n += n2 + } + return n, nil +} + +// ReadByte satisfies the io.ByteReader interface. +func (r *Reader) ReadByte() (byte, error) { + if r.err != nil { + return 0, r.err + } + if r.i < r.j { + c := r.decoded[r.i] + r.i++ + return c, nil + } + var tmp [1]byte + for i := 0; i < 10; i++ { + n, err := r.Read(tmp[:]) + if err != nil { + return 0, err + } + if n == 1 { + return tmp[0], nil + } + } + return 0, io.ErrNoProgress +} + +// SkippableCB will register a callback for chunks with the specified ID. +// ID must be a Reserved skippable chunks ID, 0x80-0xfd (inclusive). +// For each chunk with the ID, the callback is called with the content. +// Any returned non-nil error will abort decompression. +// Only one callback per ID is supported, latest sent will be used. +// Sending a nil function will disable previous callbacks. +// You can peek the stream, triggering the callback, by doing a Read with a 0 +// byte buffer. +func (r *Reader) SkippableCB(id uint8, fn func(r io.Reader) error) error { + if id < 0x80 || id >= chunkTypePadding { + return fmt.Errorf("ReaderSkippableCB: Invalid id provided, must be 0x80-0xfe (inclusive)") + } + r.skippableCB[id-0x80] = fn + return nil +} diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go new file mode 100644 index 0000000..cbd1ed6 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/s2.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package s2 implements the S2 compression format. +// +// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput, +// which is why it features concurrent compression for bigger payloads. 
+//
+// Decoding is compatible with Snappy compressed content,
+// but content compressed with S2 cannot be decompressed by Snappy.
+//
+// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
+//
+// There are actually two S2 formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a S2 stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// A "better" compression option is available. This will trade some compression
+// speed for an improved compression ratio.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// Blocks do not offer much data protection, so it is up to you to
+// add data validation of decompressed blocks.
+//
+// Streams perform CRC validation of the decompressed data.
+// Stream compression will also be performed on multiple CPU cores
+// concurrently, significantly improving throughput.
+package s2
+
+import (
+	"bytes"
+	"hash/crc32"
+
+	"github.com/klauspost/compress/internal/race"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const (
+	tagLiteral = 0x00
+	tagCopy1   = 0x01
+	tagCopy2   = 0x02
+	tagCopy4   = 0x03
+)
+
+const (
+	checksumSize     = 4
+	chunkHeaderSize  = 4
+	magicChunk       = "\xff\x06\x00\x00" + magicBody
+	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
+	magicBodySnappy  = "sNaPpY"
+	magicBody        = "S2sTwO"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock.
+	//
+	// For the framing format (Writer type instead of Encode function),
+	// this is the maximum uncompressed size of a block.
+	maxBlockSize = 4 << 20
+
+	// minBlockSize is the minimum size of block setting when creating a writer.
+	minBlockSize = 4 << 10
+
+	skippableFrameHeader = 4
+	maxChunkSize         = 1<<24 - 1 // 16777215
+
+	// Default block size.
+	defaultBlockSize = 1 << 20
+
+	// maxSnappyBlockSize is the maximum snappy block size.
+ maxSnappyBlockSize = 1 << 16 + + obufHeaderLen = checksumSize + chunkHeaderSize +) + +const ( + chunkTypeCompressedData = 0x00 + chunkTypeUncompressedData = 0x01 + ChunkTypeIndex = 0x99 + chunkTypePadding = 0xfe + chunkTypeStreamIdentifier = 0xff +) + +var ( + crcTable = crc32.MakeTable(crc32.Castagnoli) + magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes. + magicChunkBytes = []byte(magicChunk) // Can be passed to functions where it escapes. +) + +// crc implements the checksum specified in section 3 of +// https://github.com/google/snappy/blob/master/framing_format.txt +func crc(b []byte) uint32 { + race.ReadSlice(b) + + c := crc32.Update(0, crcTable, b) + return c>>15 | c<<17 + 0xa282ead8 +} + +// literalExtraSize returns the extra size of encoding n literals. +// n should be >= 0 and <= math.MaxUint32. +func literalExtraSize(n int64) int64 { + if n == 0 { + return 0 + } + switch { + case n < 60: + return 1 + case n < 1<<8: + return 2 + case n < 1<<16: + return 3 + case n < 1<<24: + return 4 + default: + return 5 + } +} + +type byter interface { + Bytes() []byte +} + +var _ byter = &bytes.Buffer{} diff --git a/vendor/github.com/klauspost/compress/s2/writer.go b/vendor/github.com/klauspost/compress/s2/writer.go new file mode 100644 index 0000000..0a46f2b --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/writer.go @@ -0,0 +1,1039 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019+ Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "crypto/rand" + "encoding/binary" + "errors" + "fmt" + "io" + "runtime" + "sync" + + "github.com/klauspost/compress/internal/race" +) + +const ( + levelUncompressed = iota + 1 + levelFast + levelBetter + levelBest +) + +// NewWriter returns a new Writer that compresses to w, using the +// framing format described at +// https://github.com/google/snappy/blob/master/framing_format.txt +// +// Users must call Close to guarantee all data has been forwarded to +// the underlying io.Writer and that resources are released. +// They may also call Flush zero or more times before calling Close. +func NewWriter(w io.Writer, opts ...WriterOption) *Writer { + w2 := Writer{ + blockSize: defaultBlockSize, + concurrency: runtime.GOMAXPROCS(0), + randSrc: rand.Reader, + level: levelFast, + } + for _, opt := range opts { + if err := opt(&w2); err != nil { + w2.errState = err + return &w2 + } + } + w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize) + w2.paramsOK = true + w2.ibuf = make([]byte, 0, w2.blockSize) + w2.buffers.New = func() interface{} { + return make([]byte, w2.obufLen) + } + w2.Reset(w) + return &w2 +} + +// Writer is an io.Writer that can write Snappy-compressed bytes. +type Writer struct { + errMu sync.Mutex + errState error + + // ibuf is a buffer for the incoming (uncompressed) bytes. + ibuf []byte + + blockSize int + obufLen int + concurrency int + written int64 + uncompWritten int64 // Bytes sent to compression + output chan chan result + buffers sync.Pool + pad int + + writer io.Writer + randSrc io.Reader + writerWg sync.WaitGroup + index Index + customEnc func(dst, src []byte) int + + // wroteStreamHeader is whether we have written the stream header. 
+ wroteStreamHeader bool + paramsOK bool + snappy bool + flushOnWrite bool + appendIndex bool + level uint8 +} + +type result struct { + b []byte + // Uncompressed start offset + startOffset int64 +} + +// err returns the previously set error. +// If no error has been set it is set to err if not nil. +func (w *Writer) err(err error) error { + w.errMu.Lock() + errSet := w.errState + if errSet == nil && err != nil { + w.errState = err + errSet = err + } + w.errMu.Unlock() + return errSet +} + +// Reset discards the writer's state and switches the Snappy writer to write to w. +// This permits reusing a Writer rather than allocating a new one. +func (w *Writer) Reset(writer io.Writer) { + if !w.paramsOK { + return + } + // Close previous writer, if any. + if w.output != nil { + close(w.output) + w.writerWg.Wait() + w.output = nil + } + w.errState = nil + w.ibuf = w.ibuf[:0] + w.wroteStreamHeader = false + w.written = 0 + w.writer = writer + w.uncompWritten = 0 + w.index.reset(w.blockSize) + + // If we didn't get a writer, stop here. + if writer == nil { + return + } + // If no concurrency requested, don't spin up writer goroutine. + if w.concurrency == 1 { + return + } + + toWrite := make(chan chan result, w.concurrency) + w.output = toWrite + w.writerWg.Add(1) + + // Start a writer goroutine that will write all output in order. + go func() { + defer w.writerWg.Done() + + // Get a queued write. + for write := range toWrite { + // Wait for the data to be available. + input := <-write + in := input.b + if len(in) > 0 { + if w.err(nil) == nil { + // Don't expose data from previous buffers. + toWrite := in[:len(in):len(in)] + // Write to output. + n, err := writer.Write(toWrite) + if err == nil && n != len(toWrite) { + err = io.ErrShortBuffer + } + _ = w.err(err) + w.err(w.index.add(w.written, input.startOffset)) + w.written += int64(n) + } + } + if cap(in) >= w.obufLen { + w.buffers.Put(in) + } + // close the incoming write request. + // This can be used for synchronizing flushes. + close(write) + } + }() +} + +// Write satisfies the io.Writer interface. +func (w *Writer) Write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.flushOnWrite { + return w.write(p) + } + // If we exceed the input buffer size, start writing + for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil { + var n int + if len(w.ibuf) == 0 { + // Large write, empty buffer. + // Write directly from p to avoid copy. + n, _ = w.write(p) + } else { + n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + } + nRet += n + p = p[n:] + } + if err := w.err(nil); err != nil { + return nRet, err + } + // p should always be able to fit into w.ibuf now. + n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + nRet += n + return nRet, nil +} + +// ReadFrom implements the io.ReaderFrom interface. +// Using this is typically more efficient since it avoids a memory copy. +// ReadFrom reads data from r until EOF or error. +// The return value n is the number of bytes read. +// Any error except io.EOF encountered during the read is also returned. 
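ReadFrom, documented above and defined next, is usually reached through io.Copy, which prefers a destination's io.ReaderFrom implementation. A minimal sketch (file names are illustrative):

```go
package main

import (
	"io"
	"os"

	"github.com/klauspost/compress/s2"
)

func main() {
	src, err := os.Open("input.raw") // hypothetical input
	if err != nil {
		panic(err)
	}
	defer src.Close()
	dst, err := os.Create("output.s2") // hypothetical output
	if err != nil {
		panic(err)
	}
	defer dst.Close()

	w := s2.NewWriter(dst)
	// io.Copy routes through w.ReadFrom, avoiding an extra memory copy.
	if _, err := io.Copy(w, src); err != nil {
		panic(err)
	}
	// Close flushes buffered data; it must be called to finish the stream.
	if err := w.Close(); err != nil {
		panic(err)
	}
}
```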
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { + if err := w.err(nil); err != nil { + return 0, err + } + if len(w.ibuf) > 0 { + err := w.AsyncFlush() + if err != nil { + return 0, err + } + } + if br, ok := r.(byter); ok { + buf := br.Bytes() + if err := w.EncodeBuffer(buf); err != nil { + return 0, err + } + return int64(len(buf)), w.AsyncFlush() + } + for { + inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen] + n2, err := io.ReadFull(r, inbuf[obufHeaderLen:]) + if err != nil { + if err == io.ErrUnexpectedEOF { + err = io.EOF + } + if err != io.EOF { + return n, w.err(err) + } + } + if n2 == 0 { + if cap(inbuf) >= w.obufLen { + w.buffers.Put(inbuf) + } + break + } + n += int64(n2) + err2 := w.writeFull(inbuf[:n2+obufHeaderLen]) + if w.err(err2) != nil { + break + } + + if err != nil { + // We got EOF and wrote everything + break + } + } + + return n, w.err(nil) +} + +// AddSkippableBlock will add a skippable block to the stream. +// The ID must be 0x80-0xfe (inclusive). +// Length of the skippable block must be <= 16777215 bytes. +func (w *Writer) AddSkippableBlock(id uint8, data []byte) (err error) { + if err := w.err(nil); err != nil { + return err + } + if len(data) == 0 { + return nil + } + if id < 0x80 || id > chunkTypePadding { + return fmt.Errorf("invalid skippable block id %x", id) + } + if len(data) > maxChunkSize { + return fmt.Errorf("skippable block excessed maximum size") + } + var header [4]byte + chunkLen := len(data) + header[0] = id + header[1] = uint8(chunkLen >> 0) + header[2] = uint8(chunkLen >> 8) + header[3] = uint8(chunkLen >> 16) + if w.concurrency == 1 { + write := func(b []byte) error { + n, err := w.writer.Write(b) + if err = w.err(err); err != nil { + return err + } + if n != len(b) { + return w.err(io.ErrShortWrite) + } + w.written += int64(n) + return w.err(nil) + } + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + if w.snappy { + if err := write([]byte(magicChunkSnappy)); err != nil { + return err + } + } else { + if err := write([]byte(magicChunk)); err != nil { + return err + } + } + } + if err := write(header[:]); err != nil { + return err + } + return write(data) + } + + // Create output... + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + // Copy input. + inbuf := w.buffers.Get().([]byte)[:4] + copy(inbuf, header[:]) + inbuf = append(inbuf, data...) + + output := make(chan result, 1) + // Queue output. + w.output <- output + output <- result{startOffset: w.uncompWritten, b: inbuf} + + return nil +} + +// EncodeBuffer will add a buffer to the stream. +// This is the fastest way to encode a stream, +// but the input buffer cannot be written to by the caller +// until Flush or Close has been called when concurrency != 1. +// +// If you cannot control that, use the regular Write function. +// +// Note that input is not buffered. +// This means that each write will result in discrete blocks being created. +// For buffered writes, use the regular Write function. +func (w *Writer) EncodeBuffer(buf []byte) (err error) { + if err := w.err(nil); err != nil { + return err + } + + if w.flushOnWrite { + _, err := w.write(buf) + return err + } + // Flush queued data first. 
+ if len(w.ibuf) > 0 { + err := w.AsyncFlush() + if err != nil { + return err + } + } + if w.concurrency == 1 { + _, err := w.writeSync(buf) + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + for len(buf) > 0 { + // Cut input. + uncompressed := buf + if len(uncompressed) > w.blockSize { + uncompressed = uncompressed[:w.blockSize] + } + buf = buf[len(uncompressed):] + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + race.WriteSlice(obuf) + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + go func() { + race.ReadSlice(uncompressed) + + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // copy uncompressed + copy(obuf[obufHeaderLen:], uncompressed) + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + }() + } + return nil +} + +func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { + if w.customEnc != nil { + if ret := w.customEnc(obuf, uncompressed); ret >= 0 { + return ret + } + } + if w.snappy { + switch w.level { + case levelFast: + return encodeBlockSnappy(obuf, uncompressed) + case levelBetter: + return encodeBlockBetterSnappy(obuf, uncompressed) + case levelBest: + return encodeBlockBestSnappy(obuf, uncompressed) + } + return 0 + } + switch w.level { + case levelFast: + return encodeBlock(obuf, uncompressed) + case levelBetter: + return encodeBlockBetter(obuf, uncompressed) + case levelBest: + return encodeBlockBest(obuf, uncompressed, nil) + } + return 0 +} + +func (w *Writer) write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.concurrency == 1 { + return w.writeSync(p) + } + + // Spawn goroutine and write block to output channel. + for len(p) > 0 { + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + // Copy input. + // If the block is incompressible, this is used for the result. 
+ inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + obuf := w.buffers.Get().([]byte)[:w.obufLen] + copy(inbuf[obufHeaderLen:], uncompressed) + uncompressed = inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + nRet += len(uncompressed) + } + return nRet, nil +} + +// writeFull is a special version of write that will always write the full buffer. +// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer. +// The data will be written as a single block. +// The caller is not allowed to use inbuf after this function has been called. +func (w *Writer) writeFull(inbuf []byte) (errRet error) { + if err := w.err(nil); err != nil { + return err + } + + if w.concurrency == 1 { + _, err := w.writeSync(inbuf[obufHeaderLen:]) + if cap(inbuf) >= w.obufLen { + w.buffers.Put(inbuf) + } + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:w.obufLen] + uncompressed := inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. 
+ obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + return nil +} + +func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + var n int + var err error + if w.snappy { + n, err = w.writer.Write(magicChunkSnappyBytes) + } else { + n, err = w.writer.Write(magicChunkBytes) + } + if err != nil { + return 0, w.err(err) + } + if n != len(magicChunk) { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + } + + for len(p) > 0 { + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + obuf := w.buffers.Get().([]byte)[:w.obufLen] + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + obuf = obuf[:8] + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + n, err := w.writer.Write(obuf) + if err != nil { + return 0, w.err(err) + } + if n != len(obuf) { + return 0, w.err(io.ErrShortWrite) + } + w.err(w.index.add(w.written, w.uncompWritten)) + w.written += int64(n) + w.uncompWritten += int64(len(uncompressed)) + + if chunkType == chunkTypeUncompressedData { + // Write uncompressed data. + n, err := w.writer.Write(uncompressed) + if err != nil { + return 0, w.err(err) + } + if n != len(uncompressed) { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + } + w.buffers.Put(obuf) + // Queue final output. + nRet += len(uncompressed) + } + return nRet, nil +} + +// AsyncFlush writes any buffered bytes to a block and starts compressing it. +// It does not wait for the output has been written as Flush() does. +func (w *Writer) AsyncFlush() error { + if err := w.err(nil); err != nil { + return err + } + + // Queue any data still in input buffer. + if len(w.ibuf) != 0 { + if !w.wroteStreamHeader { + _, err := w.writeSync(w.ibuf) + w.ibuf = w.ibuf[:0] + return w.err(err) + } else { + _, err := w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + err = w.err(err) + if err != nil { + return err + } + } + } + return w.err(nil) +} + +// Flush flushes the Writer to its underlying io.Writer. +// This does not apply padding. +func (w *Writer) Flush() error { + if err := w.AsyncFlush(); err != nil { + return err + } + if w.output == nil { + return w.err(nil) + } + + // Send empty buffer + res := make(chan result) + w.output <- res + // Block until this has been picked up. + res <- result{b: nil, startOffset: w.uncompWritten} + // When it is closed, we have flushed. 
+ <-res + return w.err(nil) +} + +// Close calls Flush and then closes the Writer. +// Calling Close multiple times is ok, +// but calling CloseIndex after this will make it not return the index. +func (w *Writer) Close() error { + _, err := w.closeIndex(w.appendIndex) + return err +} + +// CloseIndex calls Close and returns an index on first call. +// This is not required if you are only adding index to a stream. +func (w *Writer) CloseIndex() ([]byte, error) { + return w.closeIndex(true) +} + +func (w *Writer) closeIndex(idx bool) ([]byte, error) { + err := w.Flush() + if w.output != nil { + close(w.output) + w.writerWg.Wait() + w.output = nil + } + + var index []byte + if w.err(err) == nil && w.writer != nil { + // Create index. + if idx { + compSize := int64(-1) + if w.pad <= 1 { + compSize = w.written + } + index = w.index.appendTo(w.ibuf[:0], w.uncompWritten, compSize) + // Count as written for padding. + if w.appendIndex { + w.written += int64(len(index)) + } + } + + if w.pad > 1 { + tmp := w.ibuf[:0] + if len(index) > 0 { + // Allocate another buffer. + tmp = w.buffers.Get().([]byte)[:0] + defer w.buffers.Put(tmp) + } + add := calcSkippableFrame(w.written, int64(w.pad)) + frame, err := skippableFrame(tmp, add, w.randSrc) + if err = w.err(err); err != nil { + return nil, err + } + n, err2 := w.writer.Write(frame) + if err2 == nil && n != len(frame) { + err2 = io.ErrShortWrite + } + _ = w.err(err2) + } + if len(index) > 0 && w.appendIndex { + n, err2 := w.writer.Write(index) + if err2 == nil && n != len(index) { + err2 = io.ErrShortWrite + } + _ = w.err(err2) + } + } + err = w.err(errClosed) + if err == errClosed { + return index, nil + } + return nil, err +} + +// calcSkippableFrame will return a total size to be added for written +// to be divisible by multiple. +// The value will always be > skippableFrameHeader. +// The function will panic if written < 0 or wantMultiple <= 0. +func calcSkippableFrame(written, wantMultiple int64) int { + if wantMultiple <= 0 { + panic("wantMultiple <= 0") + } + if written < 0 { + panic("written < 0") + } + leftOver := written % wantMultiple + if leftOver == 0 { + return 0 + } + toAdd := wantMultiple - leftOver + for toAdd < skippableFrameHeader { + toAdd += wantMultiple + } + return int(toAdd) +} + +// skippableFrame will add a skippable frame with a total size of bytes. +// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader +func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) { + if total == 0 { + return dst, nil + } + if total < skippableFrameHeader { + return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total) + } + if int64(total) >= maxBlockSize+skippableFrameHeader { + return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total) + } + // Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)" + dst = append(dst, chunkTypePadding) + f := uint32(total - skippableFrameHeader) + // Add chunk length. + dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16)) + // Add data + start := len(dst) + dst = append(dst, make([]byte, f)...) + _, err := io.ReadFull(r, dst[start:]) + return dst, err +} + +var errClosed = errors.New("s2: Writer is closed") + +// WriterOption is an option for creating a encoder. +type WriterOption func(*Writer) error + +// WriterConcurrency will set the concurrency, +// meaning the maximum number of decoders to run concurrently. +// The value supplied must be at least 1. +// By default this will be set to GOMAXPROCS. 
+func WriterConcurrency(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return errors.New("concurrency must be at least 1")
+		}
+		w.concurrency = n
+		return nil
+	}
+}
+
+// WriterAddIndex will append an index to the end of a stream
+// when it is closed.
+func WriterAddIndex() WriterOption {
+	return func(w *Writer) error {
+		w.appendIndex = true
+		return nil
+	}
+}
+
+// WriterBetterCompression will enable better compression.
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+func WriterBetterCompression() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelBetter
+		return nil
+	}
+}
+
+// WriterBestCompression will enable best compression.
+// EncodeBest compresses better than Encode but typically with a
+// big speed decrease on compression.
+func WriterBestCompression() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelBest
+		return nil
+	}
+}
+
+// WriterUncompressed will bypass compression.
+// The stream will be written as uncompressed blocks only.
+// If concurrency is > 1, CRC and output will still be done asynchronously.
+func WriterUncompressed() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelUncompressed
+		return nil
+	}
+}
+
+// WriterBlockSize allows overriding the default block size.
+// Blocks will be this size or smaller.
+// Minimum size is 4KB and maximum size is 4MB.
+//
+// Bigger blocks may give bigger throughput on systems with many cores,
+// and will increase compression slightly, but it will limit the possible
+// concurrency for smaller payloads for both encoding and decoding.
+// Default block size is 1MB.
+//
+// When writing Snappy compatible output using WriterSnappyCompat,
+// the maximum block size is 64KB.
+func WriterBlockSize(n int) WriterOption {
+	return func(w *Writer) error {
+		if w.snappy && n > maxSnappyBlockSize || n < minBlockSize {
+			return errors.New("s2: invalid block size. Must be <= 64K and >= 4KB for snappy compatible output")
+		}
+		if n > maxBlockSize || n < minBlockSize {
+			return errors.New("s2: invalid block size. Must be <= 4MB and >= 4KB")
+		}
+		w.blockSize = n
+		return nil
+	}
+}
+
+// WriterPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible to the decoder.
+// n must be > 0 and <= 4MB.
+// The padded area will be filled with data from crypto/rand.Reader.
+// The padding will be applied whenever Close is called on the writer.
+func WriterPadding(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return fmt.Errorf("s2: padding must be at least 1")
+		}
+		// No need to waste our time.
+		if n == 1 {
+			w.pad = 0
+		}
+		if n > maxBlockSize {
+			return fmt.Errorf("s2: padding must be less than 4MB")
+		}
+		w.pad = n
+		return nil
+	}
+}
+
+// WriterPaddingSrc will get random data for padding from the supplied source.
+// By default crypto/rand is used.
+func WriterPaddingSrc(reader io.Reader) WriterOption {
+	return func(w *Writer) error {
+		w.randSrc = reader
+		return nil
+	}
+}
+
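A short sketch combining several of the options above (the output name and sizes are illustrative assumptions):

```go
package main

import (
	"os"

	"github.com/klauspost/compress/s2"
)

func main() {
	dst, err := os.Create("padded.s2") // hypothetical output
	if err != nil {
		panic(err)
	}
	defer dst.Close()

	// Better compression, 512KB blocks, output padded to a 4KB
	// multiple, and an index appended on Close for seeking.
	w := s2.NewWriter(dst,
		s2.WriterBetterCompression(),
		s2.WriterBlockSize(512<<10),
		s2.WriterPadding(4<<10),
		s2.WriterAddIndex(),
	)
	if _, err := w.Write([]byte("hello s2 stream")); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil {
		panic(err)
	}
}
```

+// WriterSnappyCompat will write snappy compatible output.
+// The output can be decompressed using either snappy or s2.
+// If the block size is more than 64KB it is reduced to fit the snappy limit.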
+func WriterSnappyCompat() WriterOption {
+	return func(w *Writer) error {
+		w.snappy = true
+		if w.blockSize > 64<<10 {
+			// We choose 8 bytes less than 64K, since that will make literal emits
+			// slightly more effective, and allows us to skip some size checks.
+			w.blockSize = (64 << 10) - 8
+		}
+		return nil
+	}
+}
+
+// WriterFlushOnWrite will compress blocks on each call to the Write function.
+//
+// This is quite inefficient as block sizes will depend on the write size.
+//
+// Use WriterConcurrency(1) to also make sure that output is flushed when Write
+// calls return; otherwise blocks will be written when compression is done.
+func WriterFlushOnWrite() WriterOption {
+	return func(w *Writer) error {
+		w.flushOnWrite = true
+		return nil
+	}
+}
+
+// WriterCustomEncoder allows overriding the encoder for blocks on the stream.
+// The function must compress 'src' into 'dst' and return the bytes used in dst as an integer.
+// Block size (initial varint) should not be added by the encoder.
+// Returning 0 indicates the block could not be compressed and it will be stored uncompressed.
+// Returning a negative value indicates that the built-in encoders should be attempted instead.
+// The function should expect to be called concurrently.
+func WriterCustomEncoder(fn func(dst, src []byte) int) WriterOption {
+	return func(w *Writer) error {
+		w.customEnc = fn
+		return nil
+	}
+}
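A minimal sketch of plugging in a custom encoder via WriterCustomEncoder, using the 0 / negative return convention documented above (the 1KB threshold and file name are arbitrary illustrations):

```go
package main

import (
	"os"

	"github.com/klauspost/compress/s2"
)

func main() {
	out, err := os.Create("custom.s2") // hypothetical output
	if err != nil {
		panic(err)
	}
	defer out.Close()

	// Returning 0 stores the block uncompressed; returning a negative
	// value falls back to the writer's configured s2 encoder.
	w := s2.NewWriter(out, s2.WriterCustomEncoder(func(dst, src []byte) int {
		if len(src) < 1024 {
			return 0 // tiny blocks: not worth compressing
		}
		return -1 // defer to the built-in encoder
	}))
	if _, err := w.Write(make([]byte, 4096)); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil {
		panic(err)
	}
}
```

diff --git a/vendor/gopkg.in/inf.v0/LICENSE b/vendor/gopkg.in/inf.v0/LICENSE
new file mode 100644
index 0000000..87a5ced
--- /dev/null
+++ b/vendor/gopkg.in/inf.v0/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2012 Péter Surányi. Portions Copyright (c) 2009 The Go
+Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/gopkg.in/inf.v0/dec.go b/vendor/gopkg.in/inf.v0/dec.go
new file mode 100644
index 0000000..26548b6
--- /dev/null
+++ b/vendor/gopkg.in/inf.v0/dec.go
@@ -0,0 +1,615 @@
+// Package inf (type inf.Dec) implements "infinite-precision" decimal
+// arithmetic.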
+// "Infinite precision" describes two characteristics: practically unlimited +// precision for decimal number representation and no support for calculating +// with any specific fixed precision. +// (Although there is no practical limit on precision, inf.Dec can only +// represent finite decimals.) +// +// This package is currently in experimental stage and the API may change. +// +// This package does NOT support: +// - rounding to specific precisions (as opposed to specific decimal positions) +// - the notion of context (each rounding must be explicit) +// - NaN and Inf values, and distinguishing between positive and negative zero +// - conversions to and from float32/64 types +// +// Features considered for possible addition: +// + formatting options +// + Exp method +// + combined operations such as AddRound/MulAdd etc +// + exchanging data in decimal32/64/128 formats +// +package inf // import "gopkg.in/inf.v0" + +// TODO: +// - avoid excessive deep copying (quo and rounders) + +import ( + "fmt" + "io" + "math/big" + "strings" +) + +// A Dec represents a signed arbitrary-precision decimal. +// It is a combination of a sign, an arbitrary-precision integer coefficient +// value, and a signed fixed-precision exponent value. +// The sign and the coefficient value are handled together as a signed value +// and referred to as the unscaled value. +// (Positive and negative zero values are not distinguished.) +// Since the exponent is most commonly non-positive, it is handled in negated +// form and referred to as scale. +// +// The mathematical value of a Dec equals: +// +// unscaled * 10**(-scale) +// +// Note that different Dec representations may have equal mathematical values. +// +// unscaled scale String() +// ------------------------- +// 0 0 "0" +// 0 2 "0.00" +// 0 -2 "0" +// 1 0 "1" +// 100 2 "1.00" +// 10 0 "10" +// 1 -1 "10" +// +// The zero value for a Dec represents the value 0 with scale 0. +// +// Operations are typically performed through the *Dec type. +// The semantics of the assignment operation "=" for "bare" Dec values is +// undefined and should not be relied on. +// +// Methods are typically of the form: +// +// func (z *Dec) Op(x, y *Dec) *Dec +// +// and implement operations z = x Op y with the result as receiver; if it +// is one of the operands it may be overwritten (and its memory reused). +// To enable chaining of operations, the result is also returned. Methods +// returning a result other than *Dec take one of the operands as the receiver. +// +// A "bare" Quo method (quotient / division operation) is not provided, as the +// result is not always a finite decimal and thus in general cannot be +// represented as a Dec. +// Instead, in the common case when rounding is (potentially) necessary, +// QuoRound should be used with a Scale and a Rounder. +// QuoExact or QuoRound with RoundExact can be used in the special cases when it +// is known that the result is always a finite decimal. +// +type Dec struct { + unscaled big.Int + scale Scale +} + +// Scale represents the type used for the scale of a Dec. +type Scale int32 + +const scaleSize = 4 // bytes in a Scale value + +// Scaler represents a method for obtaining the scale to use for the result of +// an operation on x and y. 
+type scaler interface { + Scale(x *Dec, y *Dec) Scale +} + +var bigInt = [...]*big.Int{ + big.NewInt(0), big.NewInt(1), big.NewInt(2), big.NewInt(3), big.NewInt(4), + big.NewInt(5), big.NewInt(6), big.NewInt(7), big.NewInt(8), big.NewInt(9), + big.NewInt(10), +} + +var exp10cache [64]big.Int = func() [64]big.Int { + e10, e10i := [64]big.Int{}, bigInt[1] + for i := range e10 { + e10[i].Set(e10i) + e10i = new(big.Int).Mul(e10i, bigInt[10]) + } + return e10 +}() + +// NewDec allocates and returns a new Dec set to the given int64 unscaled value +// and scale. +func NewDec(unscaled int64, scale Scale) *Dec { + return new(Dec).SetUnscaled(unscaled).SetScale(scale) +} + +// NewDecBig allocates and returns a new Dec set to the given *big.Int unscaled +// value and scale. +func NewDecBig(unscaled *big.Int, scale Scale) *Dec { + return new(Dec).SetUnscaledBig(unscaled).SetScale(scale) +} + +// Scale returns the scale of x. +func (x *Dec) Scale() Scale { + return x.scale +} + +// Unscaled returns the unscaled value of x for u and true for ok when the +// unscaled value can be represented as int64; otherwise it returns an undefined +// int64 value for u and false for ok. Use x.UnscaledBig().Int64() to avoid +// checking the validity of the value when the check is known to be redundant. +func (x *Dec) Unscaled() (u int64, ok bool) { + u = x.unscaled.Int64() + var i big.Int + ok = i.SetInt64(u).Cmp(&x.unscaled) == 0 + return +} + +// UnscaledBig returns the unscaled value of x as *big.Int. +func (x *Dec) UnscaledBig() *big.Int { + return &x.unscaled +} + +// SetScale sets the scale of z, with the unscaled value unchanged, and returns +// z. +// The mathematical value of the Dec changes as if it was multiplied by +// 10**(oldscale-scale). +func (z *Dec) SetScale(scale Scale) *Dec { + z.scale = scale + return z +} + +// SetUnscaled sets the unscaled value of z, with the scale unchanged, and +// returns z. +func (z *Dec) SetUnscaled(unscaled int64) *Dec { + z.unscaled.SetInt64(unscaled) + return z +} + +// SetUnscaledBig sets the unscaled value of z, with the scale unchanged, and +// returns z. +func (z *Dec) SetUnscaledBig(unscaled *big.Int) *Dec { + z.unscaled.Set(unscaled) + return z +} + +// Set sets z to the value of x and returns z. +// It does nothing if z == x. +func (z *Dec) Set(x *Dec) *Dec { + if z != x { + z.SetUnscaledBig(x.UnscaledBig()) + z.SetScale(x.Scale()) + } + return z +} + +// Sign returns: +// +// -1 if x < 0 +// 0 if x == 0 +// +1 if x > 0 +// +func (x *Dec) Sign() int { + return x.UnscaledBig().Sign() +} + +// Neg sets z to -x and returns z. +func (z *Dec) Neg(x *Dec) *Dec { + z.SetScale(x.Scale()) + z.UnscaledBig().Neg(x.UnscaledBig()) + return z +} + +// Cmp compares x and y and returns: +// +// -1 if x < y +// 0 if x == y +// +1 if x > y +// +func (x *Dec) Cmp(y *Dec) int { + xx, yy := upscale(x, y) + return xx.UnscaledBig().Cmp(yy.UnscaledBig()) +} + +// Abs sets z to |x| (the absolute value of x) and returns z. +func (z *Dec) Abs(x *Dec) *Dec { + z.SetScale(x.Scale()) + z.UnscaledBig().Abs(x.UnscaledBig()) + return z +} + +// Add sets z to the sum x+y and returns z. +// The scale of z is the greater of the scales of x and y. +func (z *Dec) Add(x, y *Dec) *Dec { + xx, yy := upscale(x, y) + z.SetScale(xx.Scale()) + z.UnscaledBig().Add(xx.UnscaledBig(), yy.UnscaledBig()) + return z +} + +// Sub sets z to the difference x-y and returns z. +// The scale of z is the greater of the scales of x and y. 
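Before the remaining arithmetic methods, a minimal sketch of the Dec API above in use (the values are arbitrary):

```go
package main

import (
	"fmt"

	inf "gopkg.in/inf.v0"
)

func main() {
	// 1.23 is unscaled 123 with scale 2; 4.5 is unscaled 45 with scale 1.
	x := inf.NewDec(123, 2)
	y := inf.NewDec(45, 1)

	sum := new(inf.Dec).Add(x, y)
	fmt.Println(sum) // 5.73; the scale is the greater of the two scales

	// Division needs an explicit scale and rounding mode, since the
	// exact quotient may not be a finite decimal.
	q := new(inf.Dec).QuoRound(x, y, 4, inf.RoundHalfUp)
	fmt.Println(q) // 0.2733
}
```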
+func (z *Dec) Sub(x, y *Dec) *Dec { + xx, yy := upscale(x, y) + z.SetScale(xx.Scale()) + z.UnscaledBig().Sub(xx.UnscaledBig(), yy.UnscaledBig()) + return z +} + +// Mul sets z to the product x*y and returns z. +// The scale of z is the sum of the scales of x and y. +func (z *Dec) Mul(x, y *Dec) *Dec { + z.SetScale(x.Scale() + y.Scale()) + z.UnscaledBig().Mul(x.UnscaledBig(), y.UnscaledBig()) + return z +} + +// Round sets z to the value of x rounded to Scale s using Rounder r, and +// returns z. +func (z *Dec) Round(x *Dec, s Scale, r Rounder) *Dec { + return z.QuoRound(x, NewDec(1, 0), s, r) +} + +// QuoRound sets z to the quotient x/y, rounded using the given Rounder to the +// specified scale. +// +// If the rounder is RoundExact but the result can not be expressed exactly at +// the specified scale, QuoRound returns nil, and the value of z is undefined. +// +// There is no corresponding Div method; the equivalent can be achieved through +// the choice of Rounder used. +// +func (z *Dec) QuoRound(x, y *Dec, s Scale, r Rounder) *Dec { + return z.quo(x, y, sclr{s}, r) +} + +func (z *Dec) quo(x, y *Dec, s scaler, r Rounder) *Dec { + scl := s.Scale(x, y) + var zzz *Dec + if r.UseRemainder() { + zz, rA, rB := new(Dec).quoRem(x, y, scl, true, new(big.Int), new(big.Int)) + zzz = r.Round(new(Dec), zz, rA, rB) + } else { + zz, _, _ := new(Dec).quoRem(x, y, scl, false, nil, nil) + zzz = r.Round(new(Dec), zz, nil, nil) + } + if zzz == nil { + return nil + } + return z.Set(zzz) +} + +// QuoExact sets z to the quotient x/y and returns z when x/y is a finite +// decimal. Otherwise it returns nil and the value of z is undefined. +// +// The scale of a non-nil result is "x.Scale() - y.Scale()" or greater; it is +// calculated so that the remainder will be zero whenever x/y is a finite +// decimal. +func (z *Dec) QuoExact(x, y *Dec) *Dec { + return z.quo(x, y, scaleQuoExact{}, RoundExact) +} + +// quoRem sets z to the quotient x/y with the scale s, and if useRem is true, +// it sets remNum and remDen to the numerator and denominator of the remainder. +// It returns z, remNum and remDen. +// +// The remainder is normalized to the range -1 < r < 1 to simplify rounding; +// that is, the results satisfy the following equation: +// +// x / y = z + (remNum/remDen) * 10**(-z.Scale()) +// +// See Rounder for more details about rounding. 
+//
+func (z *Dec) quoRem(x, y *Dec, s Scale, useRem bool,
+	remNum, remDen *big.Int) (*Dec, *big.Int, *big.Int) {
+	// difference (required adjustment) compared to "canonical" result scale
+	shift := s - (x.Scale() - y.Scale())
+	// pointers to adjusted unscaled dividend and divisor
+	var ix, iy *big.Int
+	switch {
+	case shift > 0:
+		// increased scale: decimal-shift dividend left
+		ix = new(big.Int).Mul(x.UnscaledBig(), exp10(shift))
+		iy = y.UnscaledBig()
+	case shift < 0:
+		// decreased scale: decimal-shift divisor left
+		ix = x.UnscaledBig()
+		iy = new(big.Int).Mul(y.UnscaledBig(), exp10(-shift))
+	default:
+		ix = x.UnscaledBig()
+		iy = y.UnscaledBig()
+	}
+	// save a copy of iy in case it is overwritten with the result
+	iy2 := iy
+	if iy == z.UnscaledBig() {
+		iy2 = new(big.Int).Set(iy)
+	}
+	// set scale
+	z.SetScale(s)
+	// set unscaled
+	if useRem {
+		// Int division
+		_, intr := z.UnscaledBig().QuoRem(ix, iy, new(big.Int))
+		// set remainder
+		remNum.Set(intr)
+		remDen.Set(iy2)
+	} else {
+		z.UnscaledBig().Quo(ix, iy)
+	}
+	return z, remNum, remDen
+}
+
+type sclr struct{ s Scale }
+
+func (s sclr) Scale(x, y *Dec) Scale {
+	return s.s
+}
+
+type scaleQuoExact struct{}
+
+func (sqe scaleQuoExact) Scale(x, y *Dec) Scale {
+	rem := new(big.Rat).SetFrac(x.UnscaledBig(), y.UnscaledBig())
+	f2, f5 := factor2(rem.Denom()), factor(rem.Denom(), bigInt[5])
+	var f10 Scale
+	if f2 > f5 {
+		f10 = Scale(f2)
+	} else {
+		f10 = Scale(f5)
+	}
+	return x.Scale() - y.Scale() + f10
+}
+
+func factor(n *big.Int, p *big.Int) int {
+	// could be improved for large factors
+	d, f := n, 0
+	for {
+		dd, dm := new(big.Int).DivMod(d, p, new(big.Int))
+		if dm.Sign() == 0 {
+			f++
+			d = dd
+		} else {
+			break
+		}
+	}
+	return f
+}
+
+func factor2(n *big.Int) int {
+	// could be improved for large factors
+	f := 0
+	for ; n.Bit(f) == 0; f++ {
+	}
+	return f
+}
+
+func upscale(a, b *Dec) (*Dec, *Dec) {
+	if a.Scale() == b.Scale() {
+		return a, b
+	}
+	if a.Scale() > b.Scale() {
+		bb := b.rescale(a.Scale())
+		return a, bb
+	}
+	aa := a.rescale(b.Scale())
+	return aa, b
+}
+
+func exp10(x Scale) *big.Int {
+	if int(x) < len(exp10cache) {
+		return &exp10cache[int(x)]
+	}
+	return new(big.Int).Exp(bigInt[10], big.NewInt(int64(x)), nil)
+}
+
+func (x *Dec) rescale(newScale Scale) *Dec {
+	shift := newScale - x.Scale()
+	switch {
+	case shift < 0:
+		e := exp10(-shift)
+		return NewDecBig(new(big.Int).Quo(x.UnscaledBig(), e), newScale)
+	case shift > 0:
+		e := exp10(shift)
+		return NewDecBig(new(big.Int).Mul(x.UnscaledBig(), e), newScale)
+	}
+	return x
+}
+
+var zeros = []byte("00000000000000000000000000000000" +
+	"00000000000000000000000000000000")
+var lzeros = Scale(len(zeros))
+
+func appendZeros(s []byte, n Scale) []byte {
+	for i := Scale(0); i < n; i += lzeros {
+		if n > i+lzeros {
+			s = append(s, zeros...)
+		} else {
+			s = append(s, zeros[0:n-i]...)
+		}
+	}
+	return s
+}
+
+func (x *Dec) String() string {
+	if x == nil {
+		return ""
+	}
+	scale := x.Scale()
+	s := []byte(x.UnscaledBig().String())
+	if scale <= 0 {
+		if scale != 0 && x.unscaled.Sign() != 0 {
+			s = appendZeros(s, -scale)
+		}
+		return string(s)
+	}
+	negbit := Scale(-((x.Sign() - 1) / 2))
+	// scale > 0
+	lens := Scale(len(s))
+	if lens-negbit <= scale {
+		ss := make([]byte, 0, scale+2)
+		if negbit == 1 {
+			ss = append(ss, '-')
+		}
+		ss = append(ss, '0', '.')
+		ss = appendZeros(ss, scale-lens+negbit)
+		ss = append(ss, s[negbit:]...)
+ return string(ss) + } + // lens > scale + ss := make([]byte, 0, lens+1) + ss = append(ss, s[:lens-scale]...) + ss = append(ss, '.') + ss = append(ss, s[lens-scale:]...) + return string(ss) +} + +// Format is a support routine for fmt.Formatter. It accepts the decimal +// formats 'd' and 'f', and handles both equivalently. +// Width, precision, flags and bases 2, 8, 16 are not supported. +func (x *Dec) Format(s fmt.State, ch rune) { + if ch != 'd' && ch != 'f' && ch != 'v' && ch != 's' { + fmt.Fprintf(s, "%%!%c(dec.Dec=%s)", ch, x.String()) + return + } + fmt.Fprintf(s, x.String()) +} + +func (z *Dec) scan(r io.RuneScanner) (*Dec, error) { + unscaled := make([]byte, 0, 256) // collects chars of unscaled as bytes + dp, dg := -1, -1 // indexes of decimal point, first digit +loop: + for { + ch, _, err := r.ReadRune() + if err == io.EOF { + break loop + } + if err != nil { + return nil, err + } + switch { + case ch == '+' || ch == '-': + if len(unscaled) > 0 || dp >= 0 { // must be first character + r.UnreadRune() + break loop + } + case ch == '.': + if dp >= 0 { + r.UnreadRune() + break loop + } + dp = len(unscaled) + continue // don't add to unscaled + case ch >= '0' && ch <= '9': + if dg == -1 { + dg = len(unscaled) + } + default: + r.UnreadRune() + break loop + } + unscaled = append(unscaled, byte(ch)) + } + if dg == -1 { + return nil, fmt.Errorf("no digits read") + } + if dp >= 0 { + z.SetScale(Scale(len(unscaled) - dp)) + } else { + z.SetScale(0) + } + _, ok := z.UnscaledBig().SetString(string(unscaled), 10) + if !ok { + return nil, fmt.Errorf("invalid decimal: %s", string(unscaled)) + } + return z, nil +} + +// SetString sets z to the value of s, interpreted as a decimal (base 10), +// and returns z and a boolean indicating success. The scale of z is the +// number of digits after the decimal point (including any trailing 0s), +// or 0 if there is no decimal point. If SetString fails, the value of z +// is undefined but the returned value is nil. +func (z *Dec) SetString(s string) (*Dec, bool) { + r := strings.NewReader(s) + _, err := z.scan(r) + if err != nil { + return nil, false + } + _, _, err = r.ReadRune() + if err != io.EOF { + return nil, false + } + // err == io.EOF => scan consumed all of s + return z, true +} + +// Scan is a support routine for fmt.Scanner; it sets z to the value of +// the scanned number. It accepts the decimal formats 'd' and 'f', and +// handles both equivalently. Bases 2, 8, 16 are not supported. +// The scale of z is the number of digits after the decimal point +// (including any trailing 0s), or 0 if there is no decimal point. +func (z *Dec) Scan(s fmt.ScanState, ch rune) error { + if ch != 'd' && ch != 'f' && ch != 's' && ch != 'v' { + return fmt.Errorf("Dec.Scan: invalid verb '%c'", ch) + } + s.SkipSpace() + _, err := z.scan(s) + return err +} + +// Gob encoding version +const decGobVersion byte = 1 + +func scaleBytes(s Scale) []byte { + buf := make([]byte, scaleSize) + i := scaleSize + for j := 0; j < scaleSize; j++ { + i-- + buf[i] = byte(s) + s >>= 8 + } + return buf +} + +func scale(b []byte) (s Scale) { + for j := 0; j < scaleSize; j++ { + s <<= 8 + s |= Scale(b[j]) + } + return +} + +// GobEncode implements the gob.GobEncoder interface. +func (x *Dec) GobEncode() ([]byte, error) { + buf, err := x.UnscaledBig().GobEncode() + if err != nil { + return nil, err + } + buf = append(append(buf, scaleBytes(x.Scale())...), decGobVersion) + return buf, nil +} + +// GobDecode implements the gob.GobDecoder interface. 
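The SetString doc comment above pins down the scale semantics: trailing zeros after the decimal point are significant and survive parsing. A minimal sketch, using only this file's API:

package main

import (
	"fmt"

	"gopkg.in/inf.v0"
)

func main() {
	d := new(inf.Dec)
	if _, ok := d.SetString("12.3450"); ok {
		// Scale is the digit count after the point, trailing zeros included.
		fmt.Println(d.Scale()) // 4
		fmt.Println(d)         // 12.3450
	}

	// Trailing garbage makes SetString fail as a whole.
	if _, ok := new(inf.Dec).SetString("1.5x"); !ok {
		fmt.Println("not a decimal")
	}
}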
+func (z *Dec) GobDecode(buf []byte) error { + if len(buf) == 0 { + return fmt.Errorf("Dec.GobDecode: no data") + } + b := buf[len(buf)-1] + if b != decGobVersion { + return fmt.Errorf("Dec.GobDecode: encoding version %d not supported", b) + } + l := len(buf) - scaleSize - 1 + err := z.UnscaledBig().GobDecode(buf[:l]) + if err != nil { + return err + } + z.SetScale(scale(buf[l : l+scaleSize])) + return nil +} + +// MarshalText implements the encoding.TextMarshaler interface. +func (x *Dec) MarshalText() ([]byte, error) { + return []byte(x.String()), nil +} + +// UnmarshalText implements the encoding.TextUnmarshaler interface. +func (z *Dec) UnmarshalText(data []byte) error { + _, ok := z.SetString(string(data)) + if !ok { + return fmt.Errorf("invalid inf.Dec") + } + return nil +} diff --git a/vendor/gopkg.in/inf.v0/rounder.go b/vendor/gopkg.in/inf.v0/rounder.go new file mode 100644 index 0000000..3a97ef5 --- /dev/null +++ b/vendor/gopkg.in/inf.v0/rounder.go @@ -0,0 +1,145 @@ +package inf + +import ( + "math/big" +) + +// Rounder represents a method for rounding the (possibly infinite decimal) +// result of a division to a finite Dec. It is used by Dec.Round() and +// Dec.Quo(). +// +// See the Example for results of using each Rounder with some sample values. +// +type Rounder rounder + +// See http://speleotrove.com/decimal/damodel.html#refround for more detailed +// definitions of these rounding modes. +var ( + RoundDown Rounder // towards 0 + RoundUp Rounder // away from 0 + RoundFloor Rounder // towards -infinity + RoundCeil Rounder // towards +infinity + RoundHalfDown Rounder // to nearest; towards 0 if same distance + RoundHalfUp Rounder // to nearest; away from 0 if same distance + RoundHalfEven Rounder // to nearest; even last digit if same distance +) + +// RoundExact is to be used in the case when rounding is not necessary. +// When used with Quo or Round, it returns the result verbatim when it can be +// expressed exactly with the given precision, and it returns nil otherwise. +// QuoExact is a shorthand for using Quo with RoundExact. +var RoundExact Rounder + +type rounder interface { + + // When UseRemainder() returns true, the Round() method is passed the + // remainder of the division, expressed as the numerator and denominator of + // a rational. + UseRemainder() bool + + // Round sets the rounded value of a quotient to z, and returns z. + // quo is rounded down (truncated towards zero) to the scale obtained from + // the Scaler in Quo(). + // + // When the remainder is not used, remNum and remDen are nil. + // When used, the remainder is normalized between -1 and 1; that is: + // + // -|remDen| < remNum < |remDen| + // + // remDen has the same sign as y, and remNum is zero or has the same sign + // as x. 
+ Round(z, quo *Dec, remNum, remDen *big.Int) *Dec +} + +type rndr struct { + useRem bool + round func(z, quo *Dec, remNum, remDen *big.Int) *Dec +} + +func (r rndr) UseRemainder() bool { + return r.useRem +} + +func (r rndr) Round(z, quo *Dec, remNum, remDen *big.Int) *Dec { + return r.round(z, quo, remNum, remDen) +} + +var intSign = []*big.Int{big.NewInt(-1), big.NewInt(0), big.NewInt(1)} + +func roundHalf(f func(c int, odd uint) (roundUp bool)) func(z, q *Dec, rA, rB *big.Int) *Dec { + return func(z, q *Dec, rA, rB *big.Int) *Dec { + z.Set(q) + brA, brB := rA.BitLen(), rB.BitLen() + if brA < brB-1 { + // brA < brB-1 => |rA| < |rB/2| + return z + } + roundUp := false + srA, srB := rA.Sign(), rB.Sign() + s := srA * srB + if brA == brB-1 { + rA2 := new(big.Int).Lsh(rA, 1) + if s < 0 { + rA2.Neg(rA2) + } + roundUp = f(rA2.Cmp(rB)*srB, z.UnscaledBig().Bit(0)) + } else { + // brA > brB-1 => |rA| > |rB/2| + roundUp = true + } + if roundUp { + z.UnscaledBig().Add(z.UnscaledBig(), intSign[s+1]) + } + return z + } +} + +func init() { + RoundExact = rndr{true, + func(z, q *Dec, rA, rB *big.Int) *Dec { + if rA.Sign() != 0 { + return nil + } + return z.Set(q) + }} + RoundDown = rndr{false, + func(z, q *Dec, rA, rB *big.Int) *Dec { + return z.Set(q) + }} + RoundUp = rndr{true, + func(z, q *Dec, rA, rB *big.Int) *Dec { + z.Set(q) + if rA.Sign() != 0 { + z.UnscaledBig().Add(z.UnscaledBig(), intSign[rA.Sign()*rB.Sign()+1]) + } + return z + }} + RoundFloor = rndr{true, + func(z, q *Dec, rA, rB *big.Int) *Dec { + z.Set(q) + if rA.Sign()*rB.Sign() < 0 { + z.UnscaledBig().Add(z.UnscaledBig(), intSign[0]) + } + return z + }} + RoundCeil = rndr{true, + func(z, q *Dec, rA, rB *big.Int) *Dec { + z.Set(q) + if rA.Sign()*rB.Sign() > 0 { + z.UnscaledBig().Add(z.UnscaledBig(), intSign[2]) + } + return z + }} + RoundHalfDown = rndr{true, roundHalf( + func(c int, odd uint) bool { + return c > 0 + })} + RoundHalfUp = rndr{true, roundHalf( + func(c int, odd uint) bool { + return c >= 0 + })} + RoundHalfEven = rndr{true, roundHalf( + func(c int, odd uint) bool { + return c > 0 || c == 0 && odd == 1 + })} +} diff --git a/vendor/modules.txt b/vendor/modules.txt index fcc3d46..0c08347 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -11,10 +11,49 @@ github.com/go-chi/docgen # github.com/go-chi/render v1.0.3 ## explicit; go 1.16 github.com/go-chi/render +# github.com/gocql/gocql v1.7.0 => github.com/scylladb/gocql v1.14.5 +## explicit; go 1.13 +github.com/gocql/gocql +github.com/gocql/gocql/debounce +github.com/gocql/gocql/internal/lru +github.com/gocql/gocql/internal/murmur +github.com/gocql/gocql/internal/streams +github.com/gocql/gocql/serialization/ascii +github.com/gocql/gocql/serialization/bigint +github.com/gocql/gocql/serialization/blob +github.com/gocql/gocql/serialization/boolean +github.com/gocql/gocql/serialization/counter +github.com/gocql/gocql/serialization/cqlint +github.com/gocql/gocql/serialization/cqltime +github.com/gocql/gocql/serialization/date +github.com/gocql/gocql/serialization/decimal +github.com/gocql/gocql/serialization/double +github.com/gocql/gocql/serialization/duration +github.com/gocql/gocql/serialization/float +github.com/gocql/gocql/serialization/inet +github.com/gocql/gocql/serialization/smallint +github.com/gocql/gocql/serialization/text +github.com/gocql/gocql/serialization/timestamp +github.com/gocql/gocql/serialization/timeuuid +github.com/gocql/gocql/serialization/tinyint +github.com/gocql/gocql/serialization/uuid +github.com/gocql/gocql/serialization/varchar 
+github.com/gocql/gocql/serialization/varint # github.com/google/uuid v1.6.0 ## explicit github.com/google/uuid +# github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed +## explicit +github.com/hailocab/go-hostpool +# github.com/klauspost/compress v1.17.9 +## explicit; go 1.20 +github.com/klauspost/compress/internal/race +github.com/klauspost/compress/s2 # golang.org/x/crypto v0.36.0 ## explicit; go 1.23.0 golang.org/x/crypto/bcrypt golang.org/x/crypto/blowfish +# gopkg.in/inf.v0 v0.9.1 +## explicit +gopkg.in/inf.v0 +# github.com/gocql/gocql => github.com/scylladb/gocql v1.14.5
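gopkg.in/inf.v0 is vendored here as a gocql dependency (gocql represents CQL decimal values as inf.Dec). Taken together, dec.go and rounder.go make every inexact division explicit: there is no bare Quo, so the caller always chooses a scale and a Rounder. A minimal sketch of that API, using only names declared in the vendored files above:

package main

import (
	"fmt"

	"gopkg.in/inf.v0"
)

func main() {
	x, y := inf.NewDec(2, 0), inf.NewDec(3, 0)

	// 2/3 is not a finite decimal, so an exact quotient does not exist.
	if new(inf.Dec).QuoExact(x, y) == nil {
		fmt.Println("2/3 has no exact Dec representation")
	}

	// QuoRound forces the caller to pick a scale and a rounding mode.
	fmt.Println(new(inf.Dec).QuoRound(x, y, 4, inf.RoundHalfEven)) // 0.6667
	fmt.Println(new(inf.Dec).QuoRound(x, y, 4, inf.RoundDown))     // 0.6666
}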