Skip to content
This repository has been archived by the owner on Jul 3, 2020. It is now read-only.

Commit

Permalink
Normalize pixiv
Browse files Browse the repository at this point in the history
  • Loading branch information
OpportunityLiu committed Mar 2, 2020
1 parent 2f64807 commit 4852798
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 149 deletions.
150 changes: 1 addition & 149 deletions EhTagClient/MarkdigExt/Extension.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,165 +10,17 @@ namespace EhTagClient.MarkdigExt
{
internal static class Extension
{
// Upper-case hexadecimal digits as ASCII bytes, indexed by nibble value (0-15).
private static readonly byte[] _HexChars = "0123456789ABCDEF".Select(c => (byte)c).ToArray();

// Size of the scratch buffer used to UTF-8 encode one character before it is escaped.
// A single UTF-16 code unit never needs more than 3 bytes, so 6 leaves slack.
private const int UTF8_MAX_LEN = 6;

// UTF-8 without a BOM that substitutes invalid data instead of throwing.
private static readonly System.Text.Encoding _Encoding = new System.Text.UTF8Encoding(false, false);

/// <summary>
/// Normalizes a URL: trims it, percent-encodes parentheses, whitespace and control
/// characters, and decodes <c>%xx</c> escapes unless the escaped byte is an ASCII
/// reserved character, whitespace, a control character or <c>%</c> itself.
/// </summary>
/// <param name="url">The URL to normalize; may be null or blank.</param>
/// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or blank.</returns>
public static string NormalizeUri(string url)
{
    if (string.IsNullOrWhiteSpace(url))
        return "";

    // Worst case per UTF-16 code unit: 3 UTF-8 bytes, each expanded to "%XX" (3 bytes).
    // The previous budget of 6 bytes overflowed on runs of characters such as U+3000.
    const int MaxOutputBytesPerChar = 9;
    const int StackBufferLimit = 4096;

    // Escapes of these ASCII characters are kept as written. '%' is included because
    // decoding "%25" would turn e.g. "%2541" into "%41" and change the URL's meaning.
    const string KeepEscaped = "%\\\"!*'();:@&=+$,/?#[]";

    url = url.Trim();
    var uri = url.AsSpan();
    var capacity = uri.Length * MaxOutputBytesPerChar;
    Span<byte> bufroot = capacity <= StackBufferLimit
        ? stackalloc byte[capacity]
        : new byte[capacity];
    var bufrem = bufroot;

    for (var i = 0; i < uri.Length; i++)
    {
        var ch = uri[i];
        if (ch == '(' || ch == ')' || char.IsWhiteSpace(ch) || char.IsControl(ch))
        {
            // Percent-encode characters that must not appear raw in the output.
            var l = encodeChar(uri.Slice(i, 1), bufrem);
            bufrem = bufrem.Slice(l);
        }
        else if (ch == '%' && i + 2 < uri.Length && isHexChar(uri[i + 1]) && isHexChar(uri[i + 2]))
        {
            // Well-formed %xx escape.
            var bc = byte.Parse(uri.Slice(i + 1, 2), System.Globalization.NumberStyles.HexNumber);
            if (bc < 128 &&
                (KeepEscaped.IndexOf((char)bc) >= 0
                || char.IsControl((char)bc)
                || char.IsWhiteSpace((char)bc)))
            {
                // Do not decode: copy the three characters of the escape unchanged.
                var l = _Encoding.GetBytes(uri.Slice(i, 3), bufrem);
                bufrem = bufrem.Slice(l);
            }
            else
            {
                // Decode: emit the raw byte; multi-byte sequences are reassembled
                // when the buffer is converted back to a string.
                bufrem[0] = bc;
                bufrem = bufrem.Slice(1);
            }
            i += 2;
        }
        else
        {
            // Encode a surrogate pair as one unit; encoding each half separately
            // would replace both halves with U+FFFD.
            var count = char.IsHighSurrogate(ch) && i + 1 < uri.Length && char.IsLowSurrogate(uri[i + 1])
                ? 2
                : 1;
            var l = _Encoding.GetBytes(uri.Slice(i, count), bufrem);
            bufrem = bufrem.Slice(l);
            i += count - 1;
        }
    }

    // The bytes written so far are everything in bufroot that bufrem no longer covers.
    return _Encoding.GetString(bufroot.Slice(0, bufroot.Length - bufrem.Length));

    bool isHexChar(char c)
    {
        return ('0' <= c && c <= '9')
            || ('A' <= c && c <= 'F')
            || ('a' <= c && c <= 'f');
    }

    // Writes the UTF-8 bytes of `chars` to `bytes` as "%XX" triplets and returns the byte count written.
    int encodeChar(ReadOnlySpan<char> chars, Span<byte> bytes)
    {
        Span<byte> chbytes = stackalloc byte[UTF8_MAX_LEN];
        var chlen = _Encoding.GetBytes(chars, chbytes);
        for (var k = 0; k < chlen; k++)
        {
            var b = chbytes[k];
            bytes[3 * k] = (byte)'%';
            bytes[3 * k + 1] = _HexChars[b >> 4];
            bytes[3 * k + 2] = _HexChars[b & 0x0F];
        }
        return chlen * 3;
    }
}

/// <summary>
/// Normalizes every link inline found among the descendants of <paramref name="doc"/>.
/// The links are modified in place.
/// </summary>
/// <param name="doc">The parsed Markdown document to process.</param>
/// <returns>The same <paramref name="doc"/> instance that was passed in.</returns>
public static MarkdownDocument Normalize(MarkdownDocument doc)
{
    foreach (var link in doc.Descendants().OfType<LinkInline>())
    {
        // Normalize each link exactly once. Running the local _NormalizeLink pass first
        // repeated the same work and percent-decoded the URL a second time.
        Normailze.LinkNormailizer.Normalize(link);
    }

    return doc;
}

// Display names substituted for the text of auto-links that point at these sites.
private static readonly Dictionary<string, string> _KnownHosts = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
    ["moegirl.org"] = "萌娘百科",
    ["wikipedia.org"] = "维基百科",
    ["pixiv.net"] = "pixiv",
    ["instagram.com"] = "Instagram",
    ["facebook.com"] = "脸书",
    ["twitter.com"] = "Twitter",
    ["weibo.com"] = "微博",
};

/// <summary>
/// Rewrites the URL (and, for NSFW-marked images, the title) of <paramref name="link"/>
/// to its normalized form, and replaces the text of auto-links to known sites with
/// the site's display name.
/// </summary>
/// <param name="link">The link inline to normalize in place.</param>
private static void _NormalizeLink(LinkInline link)
{
    var url = link.GetDynamicUrl?.Invoke() ?? link.Url;
    var title = link.Title;
    var nsfwmark = default(string);

    if (url != null && url.StartsWith("#", StringComparison.Ordinal) && !string.IsNullOrWhiteSpace(title))
    {
        // NSFW link: the URL slot holds the "#..." marker and the title holds the real URL.
        nsfwmark = url;
        url = title;
    }

    // A link without a URL is formatted as an empty string instead of failing
    // inside the regex match performed by _FormatUrl.
    var (furl, nsfw) = _FormatUrl(url ?? "");
    if (nsfw && nsfwmark == null)
    {
        nsfwmark = "#";
    }

    if (link.IsImage)
    {
        if (nsfwmark == null)
        {
            link.Url = furl;
        }
        else
        {
            // Keep the marker in the URL slot and move the real URL to the title.
            link.Title = furl;
            link.Url = nsfwmark;
        }
        return;
    }

    if (link.IsAutoLink
        && link.FirstChild == link.LastChild && link.FirstChild is LiteralInline content
        && Uri.TryCreate(url, UriKind.Absolute, out var purl))
    {
        var host = purl.Host;
        foreach (var item in _KnownHosts)
        {
            // Match the registered domain itself or any of its sub-domains.
            // A bare suffix test would also accept e.g. "notpixiv.net".
            if (host.Equals(item.Key, StringComparison.OrdinalIgnoreCase)
                || host.EndsWith("." + item.Key, StringComparison.OrdinalIgnoreCase))
            {
                content.Content = new Markdig.Helpers.StringSlice(item.Value);
                break;
            }
        }
    }
    link.Url = furl;
}

// Matches thumbnail URLs on ehgt.org, ul.ehgt.org and exhentai.org/t.
// Group 2 is the host part (with an optional "/t"), group 5 is the remaining path.
private static readonly Regex _ThumbUriRegex = new Regex(@"^(http|https)://(ehgt\.org(/t|)|exhentai\.org/t|ul\.ehgt\.org(/t|))/(.+)$", RegexOptions.Compiled | RegexOptions.Singleline);

/// <summary>
/// Formats a link URL. Recognized thumbnail URLs are rewritten to
/// <c>https://ehgt.org/</c> plus their path; every other URL goes through
/// <c>NormalizeUri</c>.
/// </summary>
/// <param name="url">The URL to format; may be null.</param>
/// <returns>
/// The formatted URL and a flag that is true only when the URL was a thumbnail
/// served from the exhentai host.
/// </returns>
private static (string formatted, bool isNsfw) _FormatUrl(string url)
{
    // Regex.Match throws on null input; a missing URL formats to an empty string,
    // which is also what NormalizeUri returns for null.
    if (url == null)
        return ("", false);

    var thumbMatch = _ThumbUriRegex.Match(url);
    if (!thumbMatch.Success)
        return (NormalizeUri(url), false);

    var tail = thumbMatch.Groups[5].Value;
    var domain = thumbMatch.Groups[2].Value;

    // Ordinal comparison: this is a host name, not linguistic text.
    var isNsfw = domain.StartsWith("exhentai", StringComparison.Ordinal);
    return ("https://ehgt.org/" + tail, isNsfw);
}

public static (string url, string title, string isNsfw) GetData(this LinkInline link)
{
var url = (link.GetDynamicUrl?.Invoke() ?? link.Url).Trim();
Expand Down
172 changes: 172 additions & 0 deletions EhTagClient/MarkdigExt/Normailze/LinkNormailizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Markdig.Syntax;
using Markdig.Syntax.Inlines;

namespace EhTagClient.MarkdigExt.Normailze
{
/// <summary>
/// Normalizes the URLs of Markdown link inlines: percent-encoding clean-up,
/// rewriting of known thumbnail hosts, NSFW marker handling and display names
/// for auto-links to well-known sites.
/// </summary>
public static class LinkNormailizer
{
    // Upper-case hexadecimal digits as ASCII bytes, indexed by nibble value (0-15).
    private static readonly byte[] _HexChars = "0123456789ABCDEF".Select(c => (byte)c).ToArray();

    // Size of the scratch buffer used to UTF-8 encode one character before it is escaped.
    // A single UTF-16 code unit never needs more than 3 bytes, so 6 leaves slack.
    private const int UTF8_MAX_LEN = 6;

    // Worst-case output per UTF-16 code unit: 3 UTF-8 bytes, each expanded to "%XX".
    private const int ESCAPED_MAX_LEN = 9;

    // Output buffers up to this many bytes are allocated on the stack.
    private const int STACK_BUFFER_LIMIT = 4096;

    // Escapes of these ASCII characters are never decoded. '%' is included because
    // decoding "%25" would turn e.g. "%2541" into "%41" and change the URL's meaning.
    private const string KEEP_ESCAPED = "%\\\"!*'();:@&=+$,/?#[]";

    // UTF-8 without a BOM that substitutes invalid data instead of throwing.
    private static readonly System.Text.Encoding _Encoding = new System.Text.UTF8Encoding(false, false);

    /// <summary>
    /// Normalizes a URL: trims it, percent-encodes parentheses, whitespace and control
    /// characters, and decodes <c>%xx</c> escapes unless the escaped byte is an ASCII
    /// reserved character, whitespace, a control character or <c>%</c> itself.
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or blank.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or blank.</returns>
    private static string _NormalizeUri(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
            return "";

        url = url.Trim();
        var uri = url.AsSpan();

        // Size for the worst case; the previous budget of UTF8_MAX_LEN bytes per char
        // overflowed on runs of escaped 3-byte characters such as U+3000.
        var capacity = uri.Length * ESCAPED_MAX_LEN;
        Span<byte> bufroot = capacity <= STACK_BUFFER_LIMIT
            ? stackalloc byte[capacity]
            : new byte[capacity];
        var bufrem = bufroot;

        for (var i = 0; i < uri.Length; i++)
        {
            var ch = uri[i];
            if (ch == '(' || ch == ')' || char.IsWhiteSpace(ch) || char.IsControl(ch))
            {
                // Percent-encode characters that must not appear raw in the output.
                var l = encodeChar(uri.Slice(i, 1), bufrem);
                bufrem = bufrem.Slice(l);
            }
            else if (ch == '%' && i + 2 < uri.Length && isHexChar(uri[i + 1]) && isHexChar(uri[i + 2]))
            {
                // Well-formed %xx escape.
                var bc = byte.Parse(uri.Slice(i + 1, 2), System.Globalization.NumberStyles.HexNumber);
                if (bc < 128 &&
                    (KEEP_ESCAPED.IndexOf((char)bc) >= 0
                    || char.IsControl((char)bc)
                    || char.IsWhiteSpace((char)bc)))
                {
                    // Do not decode: copy the three characters of the escape unchanged.
                    var l = _Encoding.GetBytes(uri.Slice(i, 3), bufrem);
                    bufrem = bufrem.Slice(l);
                }
                else
                {
                    // Decode: emit the raw byte; multi-byte sequences are reassembled
                    // when the buffer is converted back to a string.
                    bufrem[0] = bc;
                    bufrem = bufrem.Slice(1);
                }
                i += 2;
            }
            else
            {
                // Encode a surrogate pair as one unit; encoding each half separately
                // would replace both halves with U+FFFD.
                var count = char.IsHighSurrogate(ch) && i + 1 < uri.Length && char.IsLowSurrogate(uri[i + 1])
                    ? 2
                    : 1;
                var l = _Encoding.GetBytes(uri.Slice(i, count), bufrem);
                bufrem = bufrem.Slice(l);
                i += count - 1;
            }
        }

        // The bytes written so far are everything in bufroot that bufrem no longer covers.
        return _Encoding.GetString(bufroot.Slice(0, bufroot.Length - bufrem.Length));

        bool isHexChar(char c)
        {
            return ('0' <= c && c <= '9')
                || ('A' <= c && c <= 'F')
                || ('a' <= c && c <= 'f');
        }

        // Writes the UTF-8 bytes of `chars` to `bytes` as "%XX" triplets and returns the byte count written.
        int encodeChar(ReadOnlySpan<char> chars, Span<byte> bytes)
        {
            Span<byte> chbytes = stackalloc byte[UTF8_MAX_LEN];
            var chlen = _Encoding.GetBytes(chars, chbytes);
            for (var k = 0; k < chlen; k++)
            {
                var b = chbytes[k];
                bytes[3 * k] = (byte)'%';
                bytes[3 * k + 1] = _HexChars[b >> 4];
                bytes[3 * k + 2] = _HexChars[b & 0x0F];
            }
            return chlen * 3;
        }
    }

    // Matches thumbnail URLs on ehgt.org, ul.ehgt.org and exhentai.org/t.
    // Group 2 is the host part (with an optional "/t"), group 5 is the remaining path.
    private static readonly Regex _EhentaiThumbUriRegex = new Regex(@"^(http|https)://(ehgt\.org(/t|)|exhentai\.org/t|ul\.ehgt\.org(/t|))/(.+)$", RegexOptions.Compiled | RegexOptions.Singleline);

    // Matches image URLs on i.pximg.net. Group 2 is the path after the host.
    private static readonly Regex _PixivThumbUriRegex = new Regex(@"^(http|https)://i\.pximg\.net/(.+)$", RegexOptions.Compiled | RegexOptions.Singleline);

    /// <summary>
    /// Formats a link URL. Recognized e-hentai thumbnail URLs are rewritten to
    /// <c>https://ehgt.org/</c>, i.pximg.net URLs to <c>https://i.pixiv.cat/</c>,
    /// and every other URL goes through <see cref="_NormalizeUri"/>.
    /// </summary>
    /// <param name="url">The URL to format; may be null.</param>
    /// <returns>
    /// The formatted URL and a flag that is true only when the URL was a thumbnail
    /// served from the exhentai host.
    /// </returns>
    private static (string formatted, bool isNsfw) _FormatUrl(string url)
    {
        // Regex.Match throws on null input; a missing URL formats to an empty string,
        // which is also what _NormalizeUri returns for null.
        if (url == null)
            return ("", false);

        var thumbMatch = _EhentaiThumbUriRegex.Match(url);
        if (thumbMatch.Success)
        {
            var tail = thumbMatch.Groups[5].Value;
            var domain = thumbMatch.Groups[2].Value;

            // Ordinal comparison: this is a host name, not linguistic text.
            var isNsfw = domain.StartsWith("exhentai", StringComparison.Ordinal);
            return ("https://ehgt.org/" + tail, isNsfw);
        }

        thumbMatch = _PixivThumbUriRegex.Match(url);
        if (thumbMatch.Success)
        {
            return ("https://i.pixiv.cat/" + thumbMatch.Groups[2].Value, false);
        }

        return (_NormalizeUri(url), false);
    }

    // Display names substituted for the text of auto-links that point at these sites.
    private static readonly Dictionary<string, string> _KnownHosts = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
    {
        ["moegirl.org"] = "萌娘百科",
        ["wikipedia.org"] = "维基百科",
        ["pixiv.net"] = "pixiv",
        ["instagram.com"] = "Instagram",
        ["facebook.com"] = "脸书",
        ["twitter.com"] = "Twitter",
        ["weibo.com"] = "微博",
    };

    /// <summary>
    /// Rewrites the URL (and, for NSFW-marked images, the title) of <paramref name="link"/>
    /// to its normalized form, and replaces the text of auto-links to known sites with
    /// the site's display name.
    /// </summary>
    /// <param name="link">The link inline to normalize in place.</param>
    public static void Normalize(LinkInline link)
    {
        var url = link.GetDynamicUrl?.Invoke() ?? link.Url;
        var title = link.Title;
        var nsfwmark = default(string);

        if (url != null && url.StartsWith("#", StringComparison.Ordinal) && !string.IsNullOrWhiteSpace(title))
        {
            // NSFW link: the URL slot holds the "#..." marker and the title holds the real URL.
            nsfwmark = url;
            url = title;
        }

        var (furl, nsfw) = _FormatUrl(url);
        if (nsfw && nsfwmark == null)
        {
            nsfwmark = "#";
        }

        if (link.IsImage)
        {
            if (nsfwmark == null)
            {
                link.Url = furl;
            }
            else
            {
                // Keep the marker in the URL slot and move the real URL to the title.
                link.Title = furl;
                link.Url = nsfwmark;
            }
            return;
        }

        if (link.IsAutoLink
            && link.FirstChild == link.LastChild && link.FirstChild is LiteralInline content
            && Uri.TryCreate(url, UriKind.Absolute, out var purl))
        {
            var host = purl.Host;
            foreach (var item in _KnownHosts)
            {
                // Match the registered domain itself or any of its sub-domains.
                // A bare suffix test would also accept e.g. "notpixiv.net".
                if (host.Equals(item.Key, StringComparison.OrdinalIgnoreCase)
                    || host.EndsWith("." + item.Key, StringComparison.OrdinalIgnoreCase))
                {
                    content.Content = new Markdig.Helpers.StringSlice(item.Value);
                    break;
                }
            }
        }
        link.Url = furl;
    }
}
}

0 comments on commit 4852798

Please sign in to comment.