From fa3458e38a676d8c1244b37eb3c6ededfaa1330f Mon Sep 17 00:00:00 2001 From: Matt Farmer Date: Fri, 27 Dec 2019 10:44:29 -0500 Subject: [PATCH] Correctly url encode emoji in path segments The previous implementation broke on emoji because invoking char.toString on a single UTF-16 code unit (one half of a surrogate pair) and then encoding it results in the encoder presenting "?" as the value. This implementation works primarily on Bytes and avoids having to invoke char.toString, and is therefore capable of correctly encoding emoji characters into a UTF-8 URL-encoded path segment. This did involve re-working some of the valid character detection for path segments, so there is likely a delta to the overall performance, but I think it should be negligible. --- core/src/main/scala/uri.scala | 39 +++++++++++++++++++---------------- core/src/test/scala/uri.scala | 11 +++++++--- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/uri.scala b/core/src/main/scala/uri.scala index db4ead4d..438698b6 100644 --- a/core/src/main/scala/uri.scala +++ b/core/src/main/scala/uri.scala @@ -50,30 +50,33 @@ object UriEncode { def pchar = unreserved ++ ( ':' :: '@' :: '&' :: '=' :: '+' :: '$' :: ',' :: Nil ) - val segmentValid = (';' +: pchar).toSet + val segmentValid: Set[Char] = (';' +: pchar).toSet - private val validMarkers = (0 to segmentValid.max.toInt).map(i => segmentValid(i.toChar)).toArray - private def isValidChar(ch: Char) = (ch < validMarkers.length) && validMarkers(ch.toInt) + // There are likely more optimal ways of doing this calculation, however + // it seems unlikely that long path segments are often on the hot path + // of a request in such a way that they can't be cached. If that proves + // not to be true, then we can revisit. 
+ private def isValidChar(b: Byte) = { + segmentValid.contains(b.toChar) + } def path(pathSegment: String, encoding: String = "UTF-8") = { - if (pathSegment.forall(isValidChar)) { + val pathBytes = pathSegment.getBytes(encoding) + + if (pathBytes.forall(isValidChar)) { pathSegment - } - else { + } else { val sb = new StringBuilder(pathSegment.length << 1) - pathSegment foreach { ch => - if (isValidChar(ch)) { - sb.append(ch) - } - else { - ch.toString.getBytes(encoding) foreach { b => - val hi = (b >>> 4) & 0xf - val lo = b & 0xf - sb.append('%') - .append((if (hi > 9) hi + '7' else hi + '0').toChar) - .append((if (lo > 9) lo + '7' else lo + '0').toChar) - } + pathBytes.foreach { b => + if (isValidChar(b)) { + sb.append(b.toChar) + } else { + val hi = (b >>> 4) & 0xf + val lo = b & 0xf + sb.append('%') + .append((if (hi > 9) hi + '7' else hi + '0').toChar) + .append((if (lo > 9) lo + '7' else lo + '0').toChar) } } diff --git a/core/src/test/scala/uri.scala b/core/src/test/scala/uri.scala index efcb40da..5e419e62 100644 --- a/core/src/test/scala/uri.scala +++ b/core/src/test/scala/uri.scala @@ -1,21 +1,26 @@ package dispatch.spec import org.scalacheck._ -import org.scalacheck.Prop.BooleanOperators +import org.scalacheck.Prop._ object UriSpecification extends Properties("Uri") { /** java.net.URLDecoder should *NOT* be used for testing URI segment decoding * because it implements completely different functionality: query parameter decoding */ - property("encode-decode") = Prop.forAll { (path: String) => + property("Encodes and decodes basic strings") = Prop.forAll { (path: String) => !path.contains(":") ==> { new java.net.URI(dispatch.UriEncode.path(path)).getPath == path } // else Prop.throws(classOf[java.net.URISyntaxException]) } /** if there is nothing to escape, encoder must return original reference */ - property("noop") = Prop.forAll(Gen.choose(0,100)) { (n: Int) => + property("Does nothing if there's nothing eo encode") = Prop.forAll(Gen.choose(0,100)) { (n: 
Int) => val path = "A" * n dispatch.UriEncode.path(path) eq path } + + property("Encodes emoji correctly") = forAll(Gen.const("unused")) { (sample: String) => + val path = "roma🇮🇹" + new java.net.URI(dispatch.UriEncode.path(path)).getPath == (path) + } }