Skip to content

Commit

Permalink
Special-case parsing of <script> and <style> elements, similarly to HTML
Browse files Browse the repository at this point in the history
In HTML, the content of <script> and <style> tags should not be parsed as HTML;
the parser should simply look for the end tag.
This eliminates the need to HTML-encode all `<` operators (or even
HTML inlined in string literals).

To align dothtml and HTML, the patch implements this behavior in dothtml.
The change may easily break someone's code, if they already have
a script element with entities like &lt;, so it is possible to configure
which tags will be parsed as "raw text".
By default, it is script, style and also dot:InlineScript and
dot:HtmlLiteral (as suggested in #1428). This setting is up for debate.

resolves #1445
  • Loading branch information
exyi committed Dec 28, 2024
1 parent 1fff36f commit a981d4c
Show file tree
Hide file tree
Showing 6 changed files with 319 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using System;
using System.Collections.Generic;
using System.Linq;
using DotVVM.Framework.Configuration;

namespace DotVVM.Framework.Compilation.Parser.Dothtml;

/// <summary>
/// Syntax-level settings of the dothtml parser: the set of elements whose content
/// is read as raw text instead of being parsed as markup.
/// </summary>
public sealed class DotvvmSyntaxConfiguration
{
    // Full element names (including the tag prefix, e.g. "dot:InlineScript"), compared case-insensitively.
    private readonly HashSet<string> rawTextElements;

    /// <summary> Names of the elements whose content is treated as raw text. </summary>
    public IEnumerable<string> RawTextElements => rawTextElements;

    /// <summary>
    /// Returns true when <paramref name="elementName"/> is one of the raw text elements.
    /// The comparison ignores case.
    /// </summary>
    public bool IsRawTextElement(string elementName) =>
        rawTextElements.Contains(elementName);

    /// <summary> Creates a configuration with the given raw text element names. </summary>
    /// <param name="rawTextElements"> Element names; the sequence is copied into a case-insensitive set. </param>
    /// <exception cref="ArgumentNullException"> <paramref name="rawTextElements"/> is null. </exception>
    public DotvvmSyntaxConfiguration(IEnumerable<string> rawTextElements)
    {
        // Fail here with the right parameter name instead of inside ToHashSet ("source").
        if (rawTextElements is null)
        {
            throw new ArgumentNullException(nameof(rawTextElements));
        }

        this.rawTextElements = rawTextElements.ToHashSet(StringComparer.OrdinalIgnoreCase);
    }

    /// <summary> Creates the syntax configuration from the raw text elements listed in the markup configuration. </summary>
    /// <exception cref="ArgumentNullException"> <paramref name="markupConfiguration"/> is null. </exception>
    public static DotvvmSyntaxConfiguration FromMarkupConfig(DotvvmMarkupConfiguration markupConfiguration)
    {
        // Without the guard a null argument surfaces as a NullReferenceException on the property access.
        if (markupConfiguration is null)
        {
            throw new ArgumentNullException(nameof(markupConfiguration));
        }

        return new DotvvmSyntaxConfiguration(markupConfiguration.RawTextElements);
    }

    /// <summary> Default configuration: script, style, dot:InlineScript and dot:HtmlLiteral are raw text elements. </summary>
    public static DotvvmSyntaxConfiguration Default { get; } = new DotvvmSyntaxConfiguration(["script", "style", "dot:InlineScript", "dot:HtmlLiteral"]);
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@ namespace DotVVM.Framework.Compilation.Parser.Dothtml.Tokenizer
/// </summary>
public class DothtmlTokenizer : TokenizerBase<DothtmlToken, DothtmlTokenType>
{
public DothtmlTokenizer() : base(DothtmlTokenType.Text, DothtmlTokenType.WhiteSpace)
private readonly DotvvmSyntaxConfiguration config;

/// <summary>
/// Creates the tokenizer. When <paramref name="config"/> is not supplied,
/// <see cref="DotvvmSyntaxConfiguration.Default"/> is used.
/// </summary>
public DothtmlTokenizer(DotvvmSyntaxConfiguration? config = null)
    : base(DothtmlTokenType.Text, DothtmlTokenType.WhiteSpace)
{
    this.config = config is null ? DotvvmSyntaxConfiguration.Default : config;
}

private static bool IsAllowedAttributeFirstChar(char ch)
Expand Down Expand Up @@ -249,13 +252,15 @@ private ReadElementType ReadElement(bool wasOpenBraceRead = false)
}

// read tag name
if (!ReadTagOrAttributeName(isAttributeName: false))
if (!ReadTagOrAttributeName(isAttributeName: false, out var tagPrefix, out var tagName))
{
CreateToken(DothtmlTokenType.Text, errorProvider: t => CreateTokenError(t, DothtmlTokenType.OpenTag, DothtmlTokenizerErrors.TagNameExpected));
CreateToken(DothtmlTokenType.CloseTag, errorProvider: t => CreateTokenError());
return ReadElementType.Error;
}

var tagFullName = tagPrefix is null ? tagName ?? "" : tagPrefix + ":" + tagName;

// read tag attributes
SkipWhitespace();
if (!isClosingTag)
Expand Down Expand Up @@ -291,11 +296,14 @@ private ReadElementType ReadElement(bool wasOpenBraceRead = false)
}
}

bool isSelfClosing = false;

if (Peek() == '/' && !isClosingTag)
{
// self closing tag
Read();
CreateToken(DothtmlTokenType.Slash, "/");
isSelfClosing = true;
}
if (Peek() != '>')
{
Expand All @@ -306,20 +314,74 @@ private ReadElementType ReadElement(bool wasOpenBraceRead = false)

Read();
CreateToken(DothtmlTokenType.CloseTag, ">");

if (!isClosingTag && !isSelfClosing && config.IsRawTextElement(tagFullName))
{
// HTML <script>, <style> tags: read content until we find the closing the, i.e. the `</script` sequence
ReadRawTextTag(tagFullName);
return ReadElementType.RawTextTag;
}

return ReadElementType.ValidTag;
}

/// <summary> Result reported by the element-reading methods of the tokenizer. </summary>
public enum ReadElementType
{
    /// <summary> The element could not be read, e.g. the tag name was missing. </summary>
    Error = 0,

    /// <summary> A regular opening, closing or self-closing tag. </summary>
    ValidTag = 1,

    /// <summary> An opening tag of a raw text element; ReadRawTextTag has already been run on its content. </summary>
    RawTextTag = 2,

    // NOTE(review): the remaining kinds are presumably produced by ReadHtmlSpecial - confirm.
    CData = 3,
    Comment = 4,
    Doctype = 5,
    XmlProcessingInstruction = 6,
    ServerComment = 7
}

/// <summary>
/// Reads the content of a raw text element (script, style, dot:InlineScript, dot:HtmlLiteral, ...)
/// as a single text token, followed by the tokens of the matching end tag.
/// When the end tag is never found, the rest of the input becomes one text token carrying a "tag not closed" error.
/// </summary>
/// <param name="name"> Full name of the element (including the prefix) whose end tag terminates the raw text. </param>
public void ReadRawTextTag(string name)
{
    // Length of the "</" + name sequence which starts the end tag.
    var endTagStartLength = name.Length + 2;

    for (; Peek() != NullChar; Read())
    {
        // The end tag is "</" + name (case-insensitive), not followed by another letter or digit.
        var isAtEndTag =
            PeekIsString("</") &&
            PeekSpan(endTagStartLength).Slice(2).Equals(name.AsSpan(), StringComparison.OrdinalIgnoreCase) &&
            !char.IsLetterOrDigit(Peek(endTagStartLength));
        if (!isAtEndTag)
        {
            continue;
        }

        // Everything consumed so far is the raw content.
        CreateToken(DothtmlTokenType.Text);

        Debug.Assert(Peek() == '<');
        Read();
        CreateToken(DothtmlTokenType.OpenTag);

        Debug.Assert(Peek() == '/');
        Read();
        CreateToken(DothtmlTokenType.Slash);

        var tagNameWasRead = ReadTagOrAttributeName(isAttributeName: false, out _, out _);
        if (!tagNameWasRead)
        {
            CreateToken(DothtmlTokenType.Text, errorProvider: t => CreateTokenError(t, DothtmlTokenType.OpenTag, DothtmlTokenizerErrors.TagNameExpected));
        }

        SkipWhitespace();

        // The next character is consumed in both cases; only '>' yields an error-free close token.
        if (Read() == '>')
        {
            CreateToken(DothtmlTokenType.CloseTag);
        }
        else
        {
            CreateToken(DothtmlTokenType.CloseTag, errorProvider: t => CreateTokenError(t, DothtmlTokenType.OpenTag, DothtmlTokenizerErrors.TagNotClosed));
        }

        return;
    }

    // End of input reached without finding the end tag.
    CreateToken(DothtmlTokenType.Text, errorProvider: t => CreateTokenError(t, DothtmlTokenType.OpenTag, DothtmlTokenizerErrors.TagNotClosed));
}

public ReadElementType ReadHtmlSpecial(bool openBraceConsumed = false)
{
var s = ReadOneOf("![CDATA[", "!--", "!DOCTYPE", "?", "%--");
Expand Down Expand Up @@ -437,7 +499,7 @@ private void Assert(bool expression)
/// <summary>
/// Reads the name of the tag or attribute.
/// </summary>
private bool ReadTagOrAttributeName(bool isAttributeName)
private bool ReadTagOrAttributeName(bool isAttributeName, out string? prefix, out string? name)
{
var readIdentifierFunc = isAttributeName ? (Func<DothtmlTokenType, char, bool>)ReadAttributeName : (Func<DothtmlTokenType, char, bool>)ReadIdentifier;

Expand All @@ -446,6 +508,7 @@ private bool ReadTagOrAttributeName(bool isAttributeName)
// read the identifier
if (!readIdentifierFunc(DothtmlTokenType.Text, ':'))
{
prefix = name = null;
return false;
}
}
Expand All @@ -457,14 +520,23 @@ private bool ReadTagOrAttributeName(bool isAttributeName)

if (Peek() == ':')
{
prefix = Tokens[^1].Text;

Read();
CreateToken(DothtmlTokenType.Colon, ":");

if (!readIdentifierFunc(DothtmlTokenType.Text, '\0'))
{
CreateToken(DothtmlTokenType.Text, errorProvider: t => CreateTokenError(t, DothtmlTokenType.OpenTag, DothtmlTokenizerErrors.MissingTagName));
name = null;
return true;
}
name = Tokens[^1].Text;
}
else
{
prefix = null;
name = Tokens[^1].Text;
}

SkipWhitespace();
Expand All @@ -477,7 +549,7 @@ private bool ReadTagOrAttributeName(bool isAttributeName)
private bool ReadAttribute()
{
// attribute name
if (!ReadTagOrAttributeName(isAttributeName: true))
if (!ReadTagOrAttributeName(isAttributeName: true, out _, out _))
{
return false;
}
Expand Down
12 changes: 4 additions & 8 deletions src/Framework/Framework/Compilation/Parser/TokenizerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,7 @@ protected bool ReadTextUntil(TTokenType tokenType, string stopString, bool stopO
/// <summary>
/// Returns true when the unread input starts with <paramref name="str"/> (ordinal, case-sensitive).
/// A null <paramref name="str"/> never matches.
/// </summary>
protected bool PeekIsString(string? str)
{
    if (str is null)
    {
        return false;
    }

    // PeekSpan is truncated at the end of the input, so a shorter remainder can never be equal.
    return PeekSpan(str.Length).SequenceEqual(str.AsSpan());
}

protected string? ReadOneOf(params string[] strings)
Expand All @@ -212,8 +207,6 @@ protected bool PeekIsString(string? str)

protected abstract TToken NewToken(string text, TTokenType type, int lineNumber, int columnNumber, int length, int startPosition);

char[] tokenCharBuffer = new char[20];

protected string GetCurrentTokenText(int charsFromEndToSkip = 0)
{
var start = LastTokenPosition;
Expand Down Expand Up @@ -302,6 +295,9 @@ protected void OnTokenFound(TToken token)
TokenFound?.Invoke(token);
}

/// <summary>
/// Returns up to <paramref name="length"/> characters starting at the current position without changing the position.
/// The result is shorter when fewer characters remain in the source text.
/// </summary>
protected ReadOnlySpan<char> PeekSpan(int length)
{
    var remainingChars = sourceText.Length - position;
    var spanLength = Math.Min(length, remainingChars);
    return sourceText.AsSpan(position, spanLength);
}

/// <summary>
/// Peeks the current char.
/// </summary>
Expand Down
11 changes: 11 additions & 0 deletions src/Framework/Framework/Configuration/DotvvmMarkupConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using DotVVM.Framework.Compilation.ControlTree.Resolved;
using DotVVM.Framework.Compilation.Javascript;
using System.Text.Json.Serialization;
using DotVVM.Framework.Compilation.Parser.Dothtml;

namespace DotVVM.Framework.Configuration
{
Expand Down Expand Up @@ -71,6 +72,15 @@ public IList<BindingExtensionParameter> DefaultExtensionParameters

public ViewCompilationConfiguration ViewCompilation { get; private set; } = new ViewCompilationConfiguration();

private IList<string> _rawTextElements = new FreezableList<string>(DotvvmSyntaxConfiguration.Default.RawTextElements);

/// <summary>
/// List of HTML elements whose content is not parsed as [dot]html, but treated as raw text until the end tag.
/// By default, these are the <c>script</c> and <c>style</c> tags and the DotVVM <c>dot:InlineScript</c> and <c>dot:HtmlLiteral</c> controls.
/// The property is meant primarily as a compatibility option, as it may be ignored by tooling.
/// </summary>
[JsonPropertyName("rawTextElements")]
public IList<string> RawTextElements
{
    get
    {
        return _rawTextElements;
    }
    set
    {
        ThrowIfFrozen();
        // The assigned collection is copied, so later changes to it do not affect the configuration.
        _rawTextElements = [..value];
    }
}


public void AddServiceImport(string identifier, Type type)
{
Expand Down Expand Up @@ -197,6 +207,7 @@ public void Freeze()
FreezableList.Freeze(ref _importedNamespaces);
JavascriptTranslator.Freeze();
FreezableList.Freeze(ref _defaultExtensionParameters);
FreezableList.Freeze(ref _rawTextElements);
}
}
}
Loading

0 comments on commit a981d4c

Please sign in to comment.