Skip to content

Commit

Permalink
Correctly url encode emoji in path segments
Browse files Browse the repository at this point in the history
The previous implementation borked on emoji because invoking
char.toString on a single UTF-16 code unit (one half of a surrogate pair)
and then encoding that one-character string results in the encoding
presenting "?" as the value.

This implementation works primarily on Bytes and avoids having to invoke
char.toString and therefore is capable of correctly encoding emoji
characters into a UTF-8 url encoded path segment.

This did involve re-working some of the valid character detection for
path segments, so there is likely a delta to the overall performance,
but I think it should be negligible.
  • Loading branch information
farmdawgnation committed Dec 27, 2019
1 parent 3cbdbb3 commit fa3458e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 21 deletions.
39 changes: 21 additions & 18 deletions core/src/main/scala/uri.scala
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,33 @@ object UriEncode {
// Path characters: everything in `unreserved` plus the extra punctuation
// listed here.
def pchar = unreserved ++ List(':', '@', '&', '=', '+', '$', ',')
val segmentValid = (';' +: pchar).toSet
// Characters that may appear unescaped in a path segment: every `pchar`
// plus ';'.
val segmentValid: Set[Char] = (';' +: pchar).toSet

private val validMarkers = (0 to segmentValid.max.toInt).map(i => segmentValid(i.toChar)).toArray
private def isValidChar(ch: Char) = (ch < validMarkers.length) && validMarkers(ch.toInt)
// A byte may be emitted verbatim when the character it widens to is a member
// of `segmentValid`. A set lookup per byte is not the fastest possible check,
// but long path segments are unlikely to sit on a request hot path without
// being cacheable; revisit if that turns out to be wrong.
// NOTE(review): negative bytes sign-extend to chars at or above 0xFF80, which
// presumably are never in `segmentValid` - confirm against `unreserved`.
private def isValidChar(b: Byte): Boolean = segmentValid(b.toChar)

def path(pathSegment: String, encoding: String = "UTF-8") = {
if (pathSegment.forall(isValidChar)) {
val pathBytes = pathSegment.getBytes(encoding)

if (pathBytes.forall(isValidChar)) {
pathSegment
}
else {
} else {
val sb = new StringBuilder(pathSegment.length << 1)

pathSegment foreach { ch =>
if (isValidChar(ch)) {
sb.append(ch)
}
else {
ch.toString.getBytes(encoding) foreach { b =>
val hi = (b >>> 4) & 0xf
val lo = b & 0xf
sb.append('%')
.append((if (hi > 9) hi + '7' else hi + '0').toChar)
.append((if (lo > 9) lo + '7' else lo + '0').toChar)
}
pathBytes.foreach { b =>
if (isValidChar(b)) {
sb.append(b.toChar)
} else {
val hi = (b >>> 4) & 0xf
val lo = b & 0xf
sb.append('%')
.append((if (hi > 9) hi + '7' else hi + '0').toChar)
.append((if (lo > 9) lo + '7' else lo + '0').toChar)
}
}

Expand Down
11 changes: 8 additions & 3 deletions core/src/test/scala/uri.scala
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
package dispatch.spec

import org.scalacheck._
import org.scalacheck.Prop.BooleanOperators
import org.scalacheck.Prop._

object UriSpecification extends Properties("Uri") {
/** java.net.URLDecoder should *NOT* be used for testing URI segment decoding
* because it implements completely different functionality: query parameter decoding
*/
property("encode-decode") = Prop.forAll { (path: String) =>
property("Encodes and decodes basic strings") = Prop.forAll { (path: String) =>
!path.contains(":") ==> {
new java.net.URI(dispatch.UriEncode.path(path)).getPath == path
} // else Prop.throws(classOf[java.net.URISyntaxException])
}

/** if there is nothing to escape, encoder must return original reference */
property("noop") = Prop.forAll(Gen.choose(0,100)) { (n: Int) =>
property("Does nothing if there's nothing to encode") = Prop.forAll(Gen.choose(0,100)) { (n: Int) =>
  // A run of 'A' characters needs no escaping, so the encoder must hand back
  // the very same String instance (reference equality, not just equal content).
  val path = "A" * n
  dispatch.UriEncode.path(path) eq path
}

// Regression check for characters outside the Basic Multilingual Plane. The
// flag is two supplementary code points (U+1F1EE U+1F1F9), i.e. four UTF-16
// code units and eight UTF-8 bytes. The input is fixed, so a plain boolean
// property is used instead of a dummy generator with an unused parameter.
property("Encodes emoji correctly") = Prop.propBoolean {
  val path = "roma🇮🇹"
  val encoded = dispatch.UriEncode.path(path)
  // Each UTF-8 byte of the flag must be percent-encoded on its own, and the
  // result must still decode back to the original segment.
  encoded == "roma%F0%9F%87%AE%F0%9F%87%B9" &&
    new java.net.URI(encoded).getPath == path
}
}

0 comments on commit fa3458e

Please sign in to comment.