Skip to content

Commit

Permalink
Merge pull request #2406 from ruby/code-units
Browse files Browse the repository at this point in the history
Add code unit APIs to location
  • Loading branch information
kddnewton authored Feb 13, 2024
2 parents c90535b + 4757a2c commit 218b83b
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 0 deletions.
39 changes: 39 additions & 0 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,23 @@ def character_column(byte_offset)
character_offset(byte_offset) - character_offset(line_start(byte_offset))
end

# Returns the offset from the start of the file for the given byte offset,
# counted in code units of the given encoding.
#
# The first byte_offset bytes of the source are transcoded into the requested
# encoding and then measured:
#
# * for UTF-16LE and UTF-16BE the result is the transcoded byte size divided
#   by two, so a character that needs a surrogate pair counts as two units
# * for every other encoding the result is the number of characters in the
#   transcoded string
#
# Bytes that are invalid in the source encoding (including a byte_offset that
# falls inside a multibyte character) and characters that cannot be
# represented in the target encoding are substituted with a replacement
# character instead of raising, so an offset can always be computed.
#
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
def code_units_offset(byte_offset, encoding)
  prefix = source.byteslice(0, byte_offset).encode(encoding, invalid: :replace, undef: :replace)

  if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
    # Every UTF-16 code unit is exactly two bytes wide.
    prefix.bytesize / 2
  else
    prefix.length
  end
end

# Returns the column, in code units of the given encoding, of the given byte
# offset. This is the code unit offset of the byte offset itself minus the
# code unit offset of the start of the line that contains it.
def code_units_column(byte_offset, encoding)
  absolute_units = code_units_offset(byte_offset, encoding)
  line_start_units = code_units_offset(line_start(byte_offset), encoding)

  absolute_units - line_start_units
end

private

# Binary search through the offsets to find the line number for the given
Expand Down Expand Up @@ -138,6 +155,11 @@ def start_character_offset
source.character_offset(start_offset)
end

# The offset from the start of the file, in code units of the given encoding,
# where this location starts. The encoding defaults to UTF-16LE.
def start_code_units_offset(encoding = Encoding::UTF_16LE)
  origin = source
  origin.code_units_offset(start_offset, encoding)
end

# The byte offset from the beginning of the source where this location ends.
def end_offset
start_offset + length
Expand All @@ -149,6 +171,11 @@ def end_character_offset
source.character_offset(end_offset)
end

# The offset from the start of the file, in code units of the given encoding,
# where this location ends. The encoding defaults to UTF-16LE.
def end_code_units_offset(encoding = Encoding::UTF_16LE)
  origin = source
  origin.code_units_offset(end_offset, encoding)
end

# The line number where this location starts.
def start_line
source.line(start_offset)
Expand Down Expand Up @@ -177,6 +204,12 @@ def start_character_column
source.character_column(start_offset)
end

# The column, measured from the start of the line in code units of the given
# encoding, where this location starts. The encoding defaults to UTF-16LE.
def start_code_units_column(encoding = Encoding::UTF_16LE)
  origin = source
  origin.code_units_column(start_offset, encoding)
end

# The column number in bytes where this location ends from the start of the
# line.
def end_column
Expand All @@ -189,6 +222,12 @@ def end_character_column
source.character_column(end_offset)
end

# The column, measured from the start of the line in code units of the given
# encoding, where this location ends. The encoding defaults to UTF-16LE.
def end_code_units_column(encoding = Encoding::UTF_16LE)
  origin = source
  origin.code_units_column(end_offset, encoding)
end

# Implement the hash pattern matching interface for Location.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
Expand Down
80 changes: 80 additions & 0 deletions test/prism/ruby_api_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,86 @@ def test_location_character_offsets
assert_equal 7, location.end_character_column
end

# Checks the code unit offset and column readers on four locations that each
# cover a single emoji, under UTF-8, UTF-16LE, and UTF-32LE.
def test_location_code_units
  program = Prism.parse("😀 + 😀\n😍 ||= 😍").value
  first_statement = program.statements.body.first
  last_statement = program.statements.body.last

  # Expected values are listed in this encoding order.
  encodings = [Encoding::UTF_8, Encoding::UTF_16LE, Encoding::UTF_32LE]

  # Rows of expected values are listed in this reader order.
  readers = %i[
    start_code_units_offset
    end_code_units_offset
    start_code_units_column
    end_code_units_column
  ]

  cases = [
    # first 😀
    [first_statement.receiver.location, [[0, 0, 0], [1, 2, 1], [0, 0, 0], [1, 2, 1]]],
    # second 😀
    [first_statement.arguments.arguments.first.location, [[4, 5, 4], [5, 7, 5], [4, 5, 4], [5, 7, 5]]],
    # first 😍
    [last_statement.name_loc, [[6, 8, 6], [7, 10, 7], [0, 0, 0], [1, 2, 1]]],
    # second 😍
    [last_statement.value.location, [[12, 15, 12], [13, 17, 13], [6, 7, 6], [7, 9, 7]]]
  ]

  cases.each do |location, rows|
    readers.zip(rows) do |reader, row|
      encodings.zip(row) do |encoding, expected|
        assert_equal expected, location.public_send(reader, encoding)
      end
    end
  end
end

def test_heredoc?
refute parse_expression("\"foo\"").heredoc?
refute parse_expression("\"foo \#{1}\"").heredoc?
Expand Down

0 comments on commit 218b83b

Please sign in to comment.