Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configure collation/charset for DB connection #100

Merged
merged 12 commits into from
Dec 18, 2022
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Also, going through the MySQL external library *blocks* the Crystal thread using

## Status

This driver is a work in progress.
This driver is a work in progress.
It implements mysql's binary protocol to create prepared statements.
Contributions are most welcome.

Expand Down Expand Up @@ -77,3 +77,17 @@ Then use the example above changing the `DB.open` line to
```crystal
DB.open "mysql://test:yourpassword@localhost/test" do |db|
```

### Connection URI

The connection string has the following syntax:

```
mysql://[user[:[password]]@]host[:port][/schema][?param1=value1&param2=value2]
```

Connection query params:

- encoding: The name of the collation to use for the connection. The character set is implied by the collation name (for example, `utf8mb4_unicode_520_ci` selects `utf8mb4`).
If empty or not defined, it will be set to `utf8_general_ci`.
The list of available collations is defined in [`MySql::Collations::COLLATIONS_IDS_BY_NAME`](src/mysql/collations.cr).
27 changes: 27 additions & 0 deletions spec/driver_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,33 @@ describe Driver do
end
end

# Verifies that a connection opened without an `encoding` query param reports
# utf8_general_ci / utf8 as its connection collation and character set.
it "should connect with default encoding & collation for the connection set to utf8" do
  with_db do |db|
    db.exec "DROP DATABASE IF EXISTS crystal_mysql_test"
    db.exec "CREATE DATABASE crystal_mysql_test"

    begin
      # By default, the encoding for the DB connection is set to utf8_general_ci.
      # The inner handle has its own name so it does not shadow the outer `db`.
      DB.open "mysql://crystal_test:secret@#{database_host}/crystal_mysql_test" do |conn|
        conn.scalar("SELECT @@collation_connection").should eq("utf8_general_ci")
        conn.scalar("SELECT @@character_set_connection").should eq("utf8")
      end
    ensure
      # Drop the scratch database even when an expectation above fails, so a
      # failure here does not leak state into later specs.
      db.exec "DROP DATABASE IF EXISTS crystal_mysql_test"
    end
  end
end

# Verifies that the `encoding` query param selects the connection collation
# and that the connection character set follows from it.
it "should connect with requested encoding" do
  with_db do |db|
    db.exec "DROP DATABASE IF EXISTS crystal_mysql_test"
    db.exec "CREATE DATABASE crystal_mysql_test"

    begin
      # The inner handle has its own name so it does not shadow the outer `db`.
      DB.open "mysql://crystal_test:secret@#{database_host}/crystal_mysql_test?encoding=utf8mb4_unicode_520_ci" do |conn|
        conn.scalar("SELECT @@collation_connection").should eq("utf8mb4_unicode_520_ci")
        conn.scalar("SELECT @@character_set_connection").should eq("utf8mb4")
      end
    ensure
      # Drop the scratch database even when an expectation above fails, so a
      # failure here does not leak state into later specs.
      db.exec "DROP DATABASE IF EXISTS crystal_mysql_test"
    end
  end
end

it "create and drop test database" do
sql = "SELECT count(*) FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = 'crystal_mysql_test'"

Expand Down
170 changes: 170 additions & 0 deletions src/mysql/collations.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@

module MySql::Collations


# Available collations mapped to the internal ID.
# The handshake packet has only 1 byte for collation_id,
# so only collations with ID <= 255 can be used during the handshake.
# The list of collations comes from this SQL query:
# SELECT ID, COLLATION_NAME FROM information_schema.COLLATIONS WHERE ID <= 255 ORDER BY ID;
#
# ucs2, utf16, and utf32 are excluded since they cannot be set as connection charset.
# https://dev.mysql.com/doc/refman/5.7/en/charset-connection.html#charset-connection-impermissible-client-charset
COLLATIONS_IDS_BY_NAME = {
"big5_chinese_ci": 1,
"latin2_czech_cs": 2,
"dec8_swedish_ci": 3,
"cp850_general_ci": 4,
"latin1_german1_ci": 5,
"hp8_english_ci": 6,
"koi8r_general_ci": 7,
"latin1_swedish_ci": 8,
"latin2_general_ci": 9,
"swe7_swedish_ci": 10,
"ascii_general_ci": 11,
"ujis_japanese_ci": 12,
"sjis_japanese_ci": 13,
"cp1251_bulgarian_ci": 14,
"latin1_danish_ci": 15,
"hebrew_general_ci": 16,
"tis620_thai_ci": 18,
"euckr_korean_ci": 19,
"latin7_estonian_cs": 20,
"latin2_hungarian_ci": 21,
"koi8u_general_ci": 22,
"cp1251_ukrainian_ci": 23,
"gb2312_chinese_ci": 24,
"greek_general_ci": 25,
"cp1250_general_ci": 26,
"latin2_croatian_ci": 27,
"gbk_chinese_ci": 28,
"cp1257_lithuanian_ci": 29,
"latin5_turkish_ci": 30,
"latin1_german2_ci": 31,
"armscii8_general_ci": 32,
"utf8_general_ci": 33,
"cp1250_czech_cs": 34,
"cp866_general_ci": 36,
"keybcs2_general_ci": 37,
"macce_general_ci": 38,
"macroman_general_ci": 39,
"cp852_general_ci": 40,
"latin7_general_ci": 41,
"latin7_general_cs": 42,
"macce_bin": 43,
"cp1250_croatian_ci": 44,
"utf8mb4_general_ci": 45,
"utf8mb4_bin": 46,
"latin1_bin": 47,
"latin1_general_ci": 48,
"latin1_general_cs": 49,
"cp1251_bin": 50,
"cp1251_general_ci": 51,
"cp1251_general_cs": 52,
"macroman_bin": 53,
"cp1256_general_ci": 57,
"cp1257_bin": 58,
"cp1257_general_ci": 59,
"binary": 63,
"armscii8_bin": 64,
"ascii_bin": 65,
"cp1250_bin": 66,
"cp1256_bin": 67,
"cp866_bin": 68,
"dec8_bin": 69,
"greek_bin": 70,
"hebrew_bin": 71,
"hp8_bin": 72,
"keybcs2_bin": 73,
"koi8r_bin": 74,
"koi8u_bin": 75,
"utf8_tolower_ci": 76,
"latin2_bin": 77,
"latin5_bin": 78,
"latin7_bin": 79,
"cp850_bin": 80,
"cp852_bin": 81,
"swe7_bin": 82,
"utf8_bin": 83,
"big5_bin": 84,
"euckr_bin": 85,
"gb2312_bin": 86,
"gbk_bin": 87,
"sjis_bin": 88,
"tis620_bin": 89,
"ujis_bin": 91,
"geostd8_general_ci": 92,
"geostd8_bin": 93,
"latin1_spanish_ci": 94,
"cp932_japanese_ci": 95,
"cp932_bin": 96,
"eucjpms_japanese_ci": 97,
"eucjpms_bin": 98,
"cp1250_polish_ci": 99,
"utf8_unicode_ci": 192,
"utf8_icelandic_ci": 193,
"utf8_latvian_ci": 194,
"utf8_romanian_ci": 195,
"utf8_slovenian_ci": 196,
"utf8_polish_ci": 197,
"utf8_estonian_ci": 198,
"utf8_spanish_ci": 199,
"utf8_swedish_ci": 200,
"utf8_turkish_ci": 201,
"utf8_czech_ci": 202,
"utf8_danish_ci": 203,
"utf8_lithuanian_ci": 204,
"utf8_slovak_ci": 205,
"utf8_spanish2_ci": 206,
"utf8_roman_ci": 207,
"utf8_persian_ci": 208,
"utf8_esperanto_ci": 209,
"utf8_hungarian_ci": 210,
"utf8_sinhala_ci": 211,
"utf8_german2_ci": 212,
"utf8_croatian_ci": 213,
"utf8_unicode_520_ci": 214,
"utf8_vietnamese_ci": 215,
"utf8_general_mysql500_ci": 223,
"utf8mb4_unicode_ci": 224,
"utf8mb4_icelandic_ci": 225,
"utf8mb4_latvian_ci": 226,
"utf8mb4_romanian_ci": 227,
"utf8mb4_slovenian_ci": 228,
"utf8mb4_polish_ci": 229,
"utf8mb4_estonian_ci": 230,
"utf8mb4_spanish_ci": 231,
"utf8mb4_swedish_ci": 232,
"utf8mb4_turkish_ci": 233,
"utf8mb4_czech_ci": 234,
"utf8mb4_danish_ci": 235,
"utf8mb4_lithuanian_ci": 236,
"utf8mb4_slovak_ci": 237,
"utf8mb4_spanish2_ci": 238,
"utf8mb4_roman_ci": 239,
"utf8mb4_persian_ci": 240,
"utf8mb4_esperanto_ci": 241,
"utf8mb4_hungarian_ci": 242,
"utf8mb4_sinhala_ci": 243,
"utf8mb4_german2_ci": 244,
"utf8mb4_croatian_ci": 245,
"utf8mb4_unicode_520_ci": 246,
"utf8mb4_vietnamese_ci": 247,
}

def self.default_collation
"utf8_general_ci"
Copy link
Contributor Author

@Dakad Dakad Sep 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reverted the default collation to utf8_general_ci as before.

end

# Numeric ID of `default_collation`, looked up through `id_for_collation`.
def self.default_collation_id
  id_for_collation(default_collation)
end

# Character set name of the default collation: the part of the collation
# name that precedes the first underscore.
def self.default_charset
  collation_name = default_collation
  collation_name.split("_").first
end

# Returns the numeric ID registered for `collation` in `COLLATIONS_IDS_BY_NAME`.
#
# An empty name resolves to `default_collation`, so that an `encoding` query
# param given without a value behaves as the README documents ("if empty or
# not defined"). A name that is not in the table still yields 0, as before.
def self.id_for_collation(collation : String)
  name = collation.empty? ? default_collation : collation
  COLLATIONS_IDS_BY_NAME.fetch name, 0
end
end
6 changes: 5 additions & 1 deletion src/mysql/connection.cr
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
require "socket"

class MySql::Connection < DB::Connection

def initialize(context : DB::ConnectionContext)
super(context)
@socket = uninitialized TCPSocket
Expand All @@ -11,6 +12,9 @@ class MySql::Connection < DB::Connection
username = context.uri.user
password = context.uri.password

charset = context.uri.query_params.fetch "encoding", Collations.default_collation
charset_id = Collations.id_for_collation(charset).to_u8

path = context.uri.path
if path && path.size > 1
initial_catalog = path[1..-1]
Expand All @@ -22,7 +26,7 @@ class MySql::Connection < DB::Connection
handshake = read_packet(Protocol::HandshakeV10)

write_packet(1) do |packet|
Protocol::HandshakeResponse41.new(username, password, initial_catalog, handshake.auth_plugin_data).write(packet)
Protocol::HandshakeResponse41.new(username, password, initial_catalog, handshake.auth_plugin_data, charset_id).write(packet)
end

read_ok_or_err do |packet, status|
Expand Down
9 changes: 5 additions & 4 deletions src/mysql/packets.cr
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ require "openssl/sha1"
module MySql::Protocol
struct HandshakeV10
getter auth_plugin_data : Bytes
getter charset : UInt8

def initialize(@auth_plugin_data)
def initialize(@auth_plugin_data, @charset)
end

def self.read(packet : MySql::ReadPacket)
Expand All @@ -28,7 +29,7 @@ module MySql::Protocol
packet.read_byte!
packet.read_string

HandshakeV10.new(auth_data)
HandshakeV10.new(auth_data, charset)
end
end

Expand Down Expand Up @@ -59,7 +60,7 @@ module MySql::Protocol
CLIENT_SESSION_TRACK = 0x00800000
CLIENT_DEPRECATE_EOF = 0x01000000

def initialize(@username : String?, @password : String?, @initial_catalog : String?, @auth_plugin_data : Bytes)
def initialize(@username : String?, @password : String?, @initial_catalog : String?, @auth_plugin_data : Bytes, @charset : UInt8)
end

def write(packet : MySql::WritePacket)
Expand All @@ -72,7 +73,7 @@ module MySql::Protocol
packet.write_bytes caps, IO::ByteFormat::LittleEndian

packet.write_bytes 0x00000000u32, IO::ByteFormat::LittleEndian
packet.write_byte 0x21u8 # utf8_general_ci
packet.write_byte @charset
23.times { packet.write_byte 0_u8 }

packet << @username
Expand Down