Skip to content

Commit

Permalink
add function to check alphabetic or numeric
Browse files Browse the repository at this point in the history
  • Loading branch information
tamaroning committed Jul 20, 2023
1 parent 370f1f8 commit 7a3bc21
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 19 deletions.
18 changes: 8 additions & 10 deletions gcc/rust/util/make-rust-unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ def write_decomposition():


def write_recomposition():
# non const.
print("std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{")
print(
"const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
)
print(" // clang-format off")
for cp in decomposition_map:
if binary_search_ranges(composition_exclusion_ranges, cp) != -1:
Expand All @@ -219,9 +220,7 @@ def write_ccc():

def write_alphabetic():
print(
"const std::array<std::pair<uint32_t, uint32_t>, {}> ALPHABETIC_RANGES = {{{{".format(
len(alphabetic_ranges)
)
"const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
)
print(" // clang-format off")
for r in alphabetic_ranges:
Expand All @@ -231,11 +230,7 @@ def write_alphabetic():


def write_numeric():
print(
"const std::array<uint32_t, {}> NUMERIC_CODEPOINTS = {{{{".format(
len(numeric_codepoints)
)
)
print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
print(" // clang-format off")
for i, cp in enumerate(numeric_codepoints):
if i % 16 == 0:
Expand Down Expand Up @@ -268,6 +263,9 @@ def main():
print()
print("namespace Rust {")
print()
print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
print()

write_decomposition()
print()
Expand Down
18 changes: 11 additions & 7 deletions gcc/rust/util/rust-unicode-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@

namespace Rust {

const uint32_t NUM_ALPHABETIC_RANGES = 1117;
const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;

const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
// clang-format off
{0x00c0, {0x0041, 0x0300, }},
Expand Down Expand Up @@ -2086,7 +2089,7 @@ const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
// clang-format on
};

std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{
const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{
// clang-format off
{{0x0041, 0x0300}, 0x00c0},
{{0x0041, 0x0301}, 0x00c1},
Expand Down Expand Up @@ -3959,8 +3962,9 @@ const std::map<uint32_t, int32_t> CCC_TABLE = {
// clang-format on
};

const std::array<std::pair<uint32_t, uint32_t>, 1117> ALPHABETIC_RANGES = {{
// clang-format off
const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
ALPHABETIC_RANGES = {{
// clang-format off
{0x0041, 0x005b},
{0x0061, 0x007b},
{0x00aa, 0x00ab},
Expand Down Expand Up @@ -5078,10 +5082,10 @@ const std::array<std::pair<uint32_t, uint32_t>, 1117> ALPHABETIC_RANGES = {{
{0x2ceb0, 0x2ebe1},
{0x2f800, 0x2fa1e},
{0x30000, 0x3134b},
// clang-format on
}};
// clang-format on
}};

const std::array<uint32_t, 1831> NUMERIC_CODEPOINTS = {{
const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
// clang-format off
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x00b2, 0x00b3, 0x00b9, 0x00bc, 0x00bd, 0x00be,
0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, 0x0668, 0x0669, 0x06f0, 0x06f1, 0x06f2, 0x06f3, 0x06f4, 0x06f5,
Expand Down Expand Up @@ -5197,7 +5201,7 @@ const std::array<uint32_t, 1831> NUMERIC_CODEPOINTS = {{
0x1ed1d, 0x1ed1e, 0x1ed1f, 0x1ed20, 0x1ed21, 0x1ed22, 0x1ed23, 0x1ed24, 0x1ed25, 0x1ed26, 0x1ed27, 0x1ed28, 0x1ed29, 0x1ed2a, 0x1ed2b, 0x1ed2c,
0x1ed2d, 0x1ed2f, 0x1ed30, 0x1ed31, 0x1ed32, 0x1ed33, 0x1ed34, 0x1ed35, 0x1ed36, 0x1ed37, 0x1ed38, 0x1ed39, 0x1ed3a, 0x1ed3b, 0x1ed3c, 0x1ed3d,
0x1f100, 0x1f101, 0x1f102, 0x1f103, 0x1f104, 0x1f105, 0x1f106, 0x1f107, 0x1f108, 0x1f109, 0x1f10a, 0x1f10b, 0x1f10c, 0x1fbf0, 0x1fbf1, 0x1fbf2,
0x1fbf3, 0x1fbf4, 0x1fbf5, 0x1fbf6, 0x1fbf7, 0x1fbf8, 0x1fbf9,
0x1fbf3, 0x1fbf4, 0x1fbf5, 0x1fbf6, 0x1fbf7, 0x1fbf8, 0x1fbf9,
// clang-format on
}};

Expand Down
106 changes: 106 additions & 0 deletions gcc/rust/util/rust-unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,51 @@ namespace Rust {
typedef uint32_t codepoint_t;
typedef std::vector<codepoint_t> string_t;

// Binary-search RANGES for the entry that contains TARGET_CP.
//
// Each entry is treated as the half-open interval [first, second).  The
// search assumes the entries are sorted in ascending order and do not
// overlap.
//
// Returns the index of the matching entry, or -1 when no entry contains
// TARGET_CP (including when RANGES is empty).
template <std::size_t SIZE>
int64_t
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // Search the half-open index window [low, high).  Keeping HIGH exclusive
  // means it never has to go below zero, so the unsigned indices cannot wrap
  // around and MID always stays strictly below SIZE.
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      const uint32_t start = ranges[mid].first;
      const uint32_t end = ranges[mid].second;
      if (target_cp < start)
        high = mid;
      else if (target_cp >= end)
        low = mid + 1;
      else
        return static_cast<int64_t> (mid);
    }
  return -1;
}

// Binary-search ARRAY for TARGET.
//
// The search assumes ARRAY is sorted in ascending order.
//
// Returns 1 when TARGET is present and 0 otherwise (including when ARRAY is
// empty).  The result is meant to be used as a boolean; the int64_t return
// type is kept for compatibility with existing callers.
template <std::size_t SIZE>
int64_t
binary_search_sorted_array (const std::array<std::uint32_t, SIZE> &array,
                            uint32_t target)
{
  // Search the half-open index window [low, high).  Keeping HIGH exclusive
  // avoids computing `mid - 1` on an unsigned index, which would wrap around
  // when TARGET is smaller than the first element.
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      if (array[mid] == target)
        return 1;
      if (array[mid] < target)
        low = mid + 1;
      else
        high = mid;
    }
  return 0;
}

int
lookup_cc (codepoint_t c)
{
Expand Down Expand Up @@ -155,6 +200,22 @@ nfc_normalize (string_t s)
return r;
}

// Report whether CODEPOINT is covered by one of the ALPHABETIC_RANGES
// entries.
bool
is_alphabetic (uint32_t codepoint)
{
  // binary_search_ranges yields a negative value when no range matches and
  // a non-negative index otherwise.
  return binary_search_ranges (ALPHABETIC_RANGES, codepoint) >= 0;
}

// Report whether CODEPOINT is listed in NUMERIC_CODEPOINTS.
bool
is_numeric (uint32_t codepoint)
{
  // The search helper returns a non-zero value when the code point is found.
  const int64_t found
    = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
  return found != 0;
}

} // namespace Rust

namespace selftest {
Expand Down Expand Up @@ -191,4 +252,49 @@ rust_utf8_normalize_test ()
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}

// Selftest for Rust::is_alphabetic and Rust::is_numeric: checks a handful of
// code points that must be accepted and a handful that must be rejected by
// each predicate.
void
rust_utf8_property_test ()
{
  // Code points that is_alphabetic must accept.
  ASSERT_TRUE (Rust::is_alphabetic ('A'));
  ASSERT_TRUE (Rust::is_alphabetic ('B'));
  ASSERT_TRUE (Rust::is_alphabetic ('x'));
  ASSERT_TRUE (Rust::is_alphabetic ('z'));
  ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
  ASSERT_TRUE (Rust::is_alphabetic (0x3093)); //
  ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); //
  ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

  // Code points that is_alphabetic must reject.  The control character
  // sorts below the first table entry, so it exercises the lower edge of
  // the range search.
  ASSERT_FALSE (Rust::is_alphabetic ('\v'));
  ASSERT_FALSE (Rust::is_alphabetic ('0'));
  ASSERT_FALSE (Rust::is_alphabetic ('9'));
  ASSERT_FALSE (Rust::is_alphabetic (0xa720)); //
  ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

  // `Nd`s
  ASSERT_TRUE (Rust::is_numeric ('0'));
  ASSERT_TRUE (Rust::is_numeric ('1'));
  ASSERT_TRUE (Rust::is_numeric ('7'));
  ASSERT_TRUE (Rust::is_numeric ('9'));
  ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
  ASSERT_TRUE (Rust::is_numeric (0x096d)); //
  // `Nl`s
  ASSERT_TRUE (Rust::is_numeric (0x16e6)); //
  ASSERT_TRUE (Rust::is_numeric (0xa6e6)); //
  ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
  ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺

  // `No`s
  ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
  ASSERT_TRUE (Rust::is_numeric (0x32b1)); //

  // Code points that is_numeric must reject.
  ASSERT_FALSE (Rust::is_numeric ('\n'));
  ASSERT_FALSE (Rust::is_numeric ('z'));
  ASSERT_FALSE (Rust::is_numeric (';'));
  ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
  ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
  ASSERT_FALSE (Rust::is_numeric (0x0975)); //
  ASSERT_FALSE (Rust::is_numeric (0x18f0)); //
  ASSERT_FALSE (Rust::is_numeric (0x2f30)); //
}

} // namespace selftest
12 changes: 10 additions & 2 deletions gcc/rust/util/rust-unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@

namespace Rust {

std::string
nfc_normalize (std::string s);
// TODO: add function nfc_normalize

// Return true if CODEPOINT falls inside one of the entries of the generated
// ALPHABETIC_RANGES table, false otherwise.
// NOTE(review): the table presumably mirrors the Unicode `Alphabetic`
// property -- confirm against make-rust-unicode.py.
bool
is_alphabetic (uint32_t codepoint);

// Return true if CODEPOINT is listed in the generated NUMERIC_CODEPOINTS
// table, false otherwise.
// NOTE(review): the selftest groups its cases as `Nd`, `Nl` and `No`, so the
// table presumably covers those general categories -- confirm against
// make-rust-unicode.py.
bool
is_numeric (uint32_t codepoint);

} // namespace Rust

Expand All @@ -32,6 +37,9 @@ namespace selftest {
void
rust_utf8_normalize_test ();

// Selftest for Rust::is_alphabetic and Rust::is_numeric.
void
rust_utf8_property_test ();

} // namespace selftest

#endif // CHECKING_P

0 comments on commit 7a3bc21

Please sign in to comment.