From ac882d9fa20a188e18bb93f3a28aee9317315b15 Mon Sep 17 00:00:00 2001
From: poire-z <poire-z@users.noreply.github.com>
Date: Thu, 4 Jun 2020 16:45:15 +0200
Subject: [PATCH] CSS: content: open-quote support via TextLangMan

Get the right quote chars for each language,
and ensure nested quote levels (per lang_cfg).
---
 crengine/include/lvstsheet.h |   6 +-
 crengine/include/textlang.h  |  13 ++
 crengine/src/lvrend.cpp      |   9 ++
 crengine/src/lvstsheet.cpp   | 145 +++++++++++++++++----
 crengine/src/lvtinydom.cpp   |  54 ++++++--
 crengine/src/textlang.cpp    | 246 ++++++++++++++++++++++++++++++++++-
 6 files changed, 430 insertions(+), 43 deletions(-)
diff --git a/crengine/include/lvstsheet.h b/crengine/include/lvstsheet.h
index e5cba4725..53bbdf4e6 100644
--- a/crengine/include/lvstsheet.h
+++ b/crengine/include/lvstsheet.h
@@ -46,6 +46,7 @@
 
 #include "cssdef.h"
 #include "lvstyles.h"
+#include "textlang.h"
 
 class lxmlDocBase;
 class ldomNode;
@@ -330,7 +331,10 @@ class LVStyleSheet {
 /// parse color value like #334455, #345 or red
 bool parse_color_value( const char * & str, css_length_t & value );
 
-/// get computed value for a node from its parsed CSS "content:" value
+/// update (if needed) a style->content (parsed from the CSS declaration) before
+/// applying it to a node's style
+void update_style_content_property( css_style_rec_t * style, ldomNode * node );
+/// get the computed final text value for a node from its style->content
 lString16 get_applied_content_property( ldomNode * node );
 
 /// extract @import filename from beginning of CSS
diff --git a/crengine/include/textlang.h b/crengine/include/textlang.h
index 8644ded69..fd568e2ad 100644
--- a/crengine/include/textlang.h
+++ b/crengine/include/textlang.h
@@ -80,6 +80,8 @@ class TextLangMan
 
     static HyphMethod * getMainLangHyphMethod(); // For HyphMan::hyphenate()
 
+    static void resetCounters();
+
     // For frontend info about TextLangMan status and seen langs
     static LVPtrVector<TextLangCfg> * getLangCfgList() {
         return &_lang_cfg_list;
@@ -99,6 +101,12 @@ class TextLangCfg
     lString16 _lang_tag;
     HyphMethod * _hyph_method;
 
+    lString16 _open_quote1;
+    lString16 _close_quote1;
+    lString16 _open_quote2;
+    lString16 _close_quote2;
+    int _quote_nesting_level;
+
     #if USE_HARFBUZZ==1
     hb_language_t _hb_language;
     #endif
@@ -110,6 +118,8 @@ class TextLangCfg
 
     bool _duplicate_real_hyphen_on_next_line;
 
+    void resetCounters();
+
 public:
     lString16 getLangTag() const { return _lang_tag; }
 
@@ -129,6 +139,9 @@ class TextLangCfg
         return _hyph_method;
     }
 
+    lString16 & getOpeningQuote( bool update_level=true );
+    lString16 & getClosingQuote( bool update_level=true );
+
     #if USE_HARFBUZZ==1
     hb_language_t getHBLanguage() const { return _hb_language; }
     #endif
diff --git a/crengine/src/lvrend.cpp b/crengine/src/lvrend.cpp
index a8392ccc3..cce1ca781 100755
--- a/crengine/src/lvrend.cpp
+++ b/crengine/src/lvrend.cpp
@@ -9054,6 +9054,15 @@ void setNodeStyle( ldomNode * enode, css_style_ref_t parent_style, LVFontRef par
         delete pstyle->pseudo_elem_after_style;
         pstyle->pseudo_elem_after_style = NULL;
     }
+
+    if ( nodeElementId == el_pseudoElem ) {
+        // A pseudo element's ->content may need updating if it contains
+        // any of the open-quote-like tokens, to account for the quote
+        // nesting levels. setNodeStyle() is actually the right place
+        // to do that, as we're visiting all the nodes recursively.
+        update_style_content_property(pstyle, enode);
+    }
+
     pstyle->flags = 0; // cleanup, before setStyle() adds it to cache
 
     // set calculated style
diff --git a/crengine/src/lvstsheet.cpp b/crengine/src/lvstsheet.cpp
index 06179145c..0259a7b2f 100644
--- a/crengine/src/lvstsheet.cpp
+++ b/crengine/src/lvstsheet.cpp
@@ -899,6 +899,9 @@ bool parse_content_property( const char * & str, lString16 & parsed_content)
     //   'n' for 'no-close-quote'
     //   'u' for 'url()', that we don't support
     //   'z' for unsupported tokens, like gradient()...
+    //   '$' (at start) this content needs post processing before
+    //       being applied to a node's style (needed with quotes,
+    //       to get the correct char for the current nested level).
     // Note: this parsing might not be super robust with
     // convoluted declarations...
     parsed_content.clear();
@@ -906,6 +909,7 @@ bool parse_content_property( const char * & str, lString16 & parsed_content)
     // The presence of a single 'none' or 'normal' among multiple
     // values make the whole thing 'none'.
     bool has_none = false;
+    bool needs_processing_when_applying = false;
     while ( skip_spaces( str ) && *str!=';' && *str!='}' && *str!='!' ) {
         if ( substr_icompare("none", str) ) {
             has_none = true;
@@ -918,18 +922,22 @@ bool parse_content_property( const char * & str, lString16 & parsed_content)
         }
         else if ( substr_icompare("open-quote", str) ) {
             parsed_content << L'Q';
+            needs_processing_when_applying = true;
             continue;
         }
         else if ( substr_icompare("close-quote", str) ) {
             parsed_content << L'q';
+            needs_processing_when_applying = true;
             continue;
         }
         else if ( substr_icompare("no-open-quote", str) ) {
             parsed_content << L'N';
+            needs_processing_when_applying = true;
             continue;
         }
         else if ( substr_icompare("no-close-quote", str) ) {
             parsed_content << L'n';
+            needs_processing_when_applying = true;
             continue;
         }
         else if ( substr_icompare("attr", str) ) {
@@ -1052,6 +1060,9 @@ bool parse_content_property( const char * & str, lString16 & parsed_content)
         parsed_content.clear();
         parsed_content << L'X';
     }
+    else if ( needs_processing_when_applying ) {
+        parsed_content.insert(0, 1, L'$');
+    }
     if (*str) // something (;, } or !important) follows
         return true;
     // Restore original position if we reach end of CSS string,
@@ -1062,6 +1073,104 @@ bool parse_content_property( const char * & str, lString16 & parsed_content)
     return false;
 }
 
+/// Resolve the quote tokens of a parsed style->content for a given node.
+/// Only acts on content flagged with a leading '$' by parse_content_property():
+/// 'Q'/'q' become literal strings ('s' + length + chars) holding the quote for
+/// the current nesting level, 'N'/'n' only update that level; other tokens are
+/// copied unchanged, and the leading '$' is dropped from the result.
+void update_style_content_property( css_style_rec_t * style, ldomNode * node ) {
+    // We don't want to update too much: styles are hashed and shared by
+    // multiple nodes. We don't resolve "attr()" here as attributes are
+    // stable (and "attr(id)" would make all style->content different
+    // and prevent styles from being shared, increasing the number
+    // of styles to cache).
+    // But we need to resolve quotes, according to their nesting level,
+    // and transform them into a literal string 's'.
+
+    if ( style->content.empty() || style->content[0] != L'$' ) {
+        // No update needed
+        return;
+    }
+
+    // We need to know if this node is visible: if not, quotes nested
+    // level should not be updated. We might want to still include
+    // the computed quote (with quote char for level 1) for it to be
+    // displayed by writeNodeEx() when displaying the HTML, even if
+    // the node is invisible.
+    bool visible = style->display != css_d_none;
+    if ( visible ) {
+        ldomNode * n = node->getParentNode();
+        for ( ; !n->isRoot(); n = n->getParentNode() ) {
+            if ( n->getStyle()->display == css_d_none ) {
+                visible = false;
+                break;
+            }
+        }
+    }
+
+    // We do not support specifying quote chars to be used via CSS "quotes":
+    //     :root { quotes: '\201c' '\201d' '\2018' '\2019'; }
+    // We use the ones hardcoded for the node lang tag language (or default
+    // typography language) provided by TextLangCfg.
+    // HTML5 default CSS specifies them with:
+    //   :root:lang(af), :not(:lang(af)) > :lang(af) { quotes: '\201c' '\201d' '\2018' '\2019' }
+    // This might (or not) imply that nested levels are reset when entering
+    // text with another language, so this new language first level quote is used.
+    // We can actually get that same behaviour by having each TextLangCfg manage
+    // its own nesting level (which won't be reset when en>fr>en, though).
+    // But all this is quite rare, so don't bother about it much.
+    TextLangCfg * lang_cfg = TextLangMan::getTextLangCfg( node );
+
+    // Note: some quote char like (U+201C / U+201D) seem to not be mirrored
+    // (when using HarfBuzz) when added to some RTL arabic text. But it
+    // appears that way with Firefox too!
+    // But if we use another char (U+00AB / U+00BB), it gets mirrored correctly.
+    // Might be that HarfBuzz first substitute it with arabic quotes (which
+    // happen to look inverted), and then mirror that?
+
+    lString16 res;
+    lString16 parsed_content = style->content;
+    lString16 quote;
+    int i = 1; // skip initial '$'
+    int parsed_content_len = parsed_content.length();
+    while ( i < parsed_content_len ) {
+        lChar16 ctype = parsed_content[i];
+        if ( ctype == 's' || ctype == 'a' ) { // literal string or attr(): copy as-is
+            // Layout: type char at i, length char at i+1, then that many chars
+            lChar16 len = parsed_content[i+1];
+            res.append(parsed_content, i, len+2);
+            i += len+2;
+        }
+        else if ( ctype == 'Q' ) { // open-quote
+            quote = lang_cfg->getOpeningQuote(visible);
+            res << L's' << lChar16(quote.length()) << quote;
+            i += 1;
+        }
+        else if ( ctype == 'q' ) { // close-quote
+            quote = lang_cfg->getClosingQuote(visible);
+            res << L's' << lChar16(quote.length()) << quote;
+            i += 1;
+        }
+        else if ( ctype == 'N' ) { // no-open-quote
+            // This should just increment nested quote level and output nothing.
+            lang_cfg->getOpeningQuote(visible);
+            i += 1;
+        }
+        else if ( ctype == 'n' ) { // no-close-quote
+            // This should just decrement nested quote level and output nothing.
+            lang_cfg->getClosingQuote(visible);
+            i += 1;
+        }
+        else {
+            // All other stuff are single char (u, z, X) or unsupported/bogus char.
+            res.append(parsed_content, i, 1);
+            i += 1;
+        }
+    }
+    // Replace style->content with what we built
+    style->content = res;
+}
+
 /// Returns the computed value for a node from its parsed CSS "content:" value
 lString16 get_applied_content_property( ldomNode * node ) {
     lString16 res;
@@ -1100,38 +1209,24 @@ lString16 get_applied_content_property( ldomNode * node ) {
             // res << 0x25FD; // WHITE MEDIUM SMALL SQUARE
             res << 0x2B26; // WHITE MEDIUM DIAMOND
         }
+        else if ( ctype == 'X' ) { // 'none'
+            res.clear(); // should be standalone, but let's be sure
+            break;
+        }
+        else if ( ctype == 'z' ) { // unsupported token
+            // Just ignore it, don't show anything
+        }
         else if ( ctype == 'Q' ) { // open-quote
-            // Add default quoting opening char
-            // We do not support showing a different char for multiple nested <q>,
-            // and neither the way to specify this with CSS, ie:
-            //     q::before { content: open-quote; }
-            //     :root { quotes: '\201c' '\201d' '\2018' '\2019'; }
-            // todo: have the right quote char for a language provided by lang_cfg
-            res << 0x201C;
-            // Note: this specific char seem to not be mirrored (when using HarfBuzz) when
-            // added to some RTL arabic text. But it appears that way with Firefox too!
-            // But if we use another char (0x00AB / 0x00BB), it gets mirrored correctly.
-            // Might be that HarfBuzz first substitute it with arabic quotes (which happen
-            // to look inverted), and then mirror that?
+            // Shouldn't happen: replaced earlier by update_style_content_property()
         }
         else if ( ctype == 'q' ) { // close-quote
-            // Add default quoting closing char
-            res << 0x201D;
+            // Shouldn't happen: replaced earlier by update_style_content_property()
         }
         else if ( ctype == 'N' ) { // no-open-quote
-            // (This should just increment nested quote level if we supported that)
-            // Nothing to output
+            // Shouldn't happen: replaced earlier by update_style_content_property()
         }
         else if ( ctype == 'n' ) { // no-close-quote
-            // (This should just decrement nested quote level if we supported that)
-            // Nothing to output
-        }
-        else if ( ctype == 'X' ) { // 'none'
-            res.clear(); // should be standalone, but let's be sure
-            break;
-        }
-        else if ( ctype == 'z' ) { // unsupported token
-            // Just ignore it, don't show anything
+            // Shouldn't happen: replaced earlier by update_style_content_property()
         }
         else { // unexpected
             break;
diff --git a/crengine/src/lvtinydom.cpp b/crengine/src/lvtinydom.cpp
index f79c4343f..fe17aac44 100644
--- a/crengine/src/lvtinydom.cpp
+++ b/crengine/src/lvtinydom.cpp
@@ -4510,6 +4510,9 @@ bool ldomDocument::render( LVRendPageList * pages, LVDocViewCallback * callback,
         // create elements, but may be prevented from doing so by an existing cache file
         _boxingWishedButPreventedByCache = false;
 
+        // Reset counters (quotes nesting levels...)
+        TextLangMan::resetCounters();
+
         CRLog::trace("Save stylesheet...");
         _stylesheet.push();
         CRLog::trace("Init node styles...");
@@ -5018,16 +5021,24 @@ void ldomElementWriter::onBodyEnter()
             for ( int i=0; i<nb_children; i++ ) {
                 ldomNode * child = _element->getChildNode(i);
                 if ( child->getNodeId() == el_pseudoElem ) {
-                    // ->initNodeStyle() has been done when the element was created;
-                    // as pseudo elements have no children, let's ->initNodeRendMethod()
-                    // now (as done in onBodyExit()).
-                    child->initNodeRendMethod();
-                    // ldomNode::ensurePseudoElement() will always have inserted
-                    // "Before" first, and "After" second. But real children might
-                    // soon be added, and we'll have to move "After" last when done.
-                    // Which will be done in onBodyExit().
-                    if ( child->hasAttribute(attr_After) )
+                    if ( child->hasAttribute(attr_Before) ) {
+                        // The "Before" pseudo element (not part of the XML)
+                        // needs to have its style applied. As it has no
+                        // children, we can also init its rend method.
+                        child->initNodeStyle();
+                        child->initNodeRendMethod();
+                    }
+                    else if ( child->hasAttribute(attr_After) ) {
+                        // For the "After" pseudo element, we need to wait
+                        // for all real children to be added, to move it
+                        // to its right position (last), to init its style
+                        // (because of "content:close-quote", whose nesting
+                        // level needs to have seen all previous nodes to
+                        // be accurate) and its rendering method.
+                        // We'll do that in onBodyExit() when called for
+                        // this node.
                         _pseudoElementAfterChildIndex = i;
+                    }
                 }
             }
         }
@@ -5098,9 +5109,18 @@ void ldomNode::ensurePseudoElement( bool is_before ) {
             lUInt16 attribute_id = is_before ? attr_Before : attr_After;
             pseudo->setAttributeValue(LXML_NS_NONE, attribute_id, L"");
             // We are called by lvrend.cpp setNodeStyle(), after the parent
-            // style and font have been fully set up.
-            // We can set this pseudo element style as it can now properly inherit.
-            pseudo->initNodeStyle();
+            // style and font have been fully set up. We could set this pseudo
+            // element style with pseudo->initNodeStyle(), as it can inherit
+            // properly, but we should not:
+            // - when re-rendering, initNodeStyleRecursive()/updateStyleDataRecursive()
+            //   will iterate thru this node we just added as a child, and do it.
+            // - when XML loading, we could do it for the "Before" pseudo element,
+            //   but for the "After" one, we need to wait for all real children to be
+            //   added and have their style applied - just because they can change
+            //   open-quote/close-quote nesting levels - to be sure we get the
+            //   proper nesting level quote char for the After node.
+            // So, for the XML loading phase, we do that in onBodyEnter() and
+            // onBodyExit() when called on the parent node.
         }
     }
 
@@ -6726,10 +6746,16 @@ void ldomElementWriter::onBodyExit()
     if ( _pseudoElementAfterChildIndex >= 0 ) {
         if ( _pseudoElementAfterChildIndex != _element->getChildCount()-1 ) {
             // Not the last child: move it there
-            // printf("moving After from %d to %d\n", _pseudoElementAfterChildIndex, _element->getChildCount()-1);
-            // moveItemsTo() just works to remove it, and re-add it (so, adding it at the end)
+            // (moveItemsTo() works just fine when the source node is also the
+            // target node: remove it, and re-add it, so, adding it at the end)
             _element->moveItemsTo( _element, _pseudoElementAfterChildIndex, _pseudoElementAfterChildIndex);
         }
+        // Now that all the real children of this node have had their
+        // style set, we can init the style of the "After" pseudo
+        // element, and its rend method as it has no children.
+        ldomNode * child = _element->getChildNode(_element->getChildCount()-1);
+        child->initNodeStyle();
+        child->initNodeRendMethod();
     }
 //    if ( _element->getStyle().isNull() ) {
 //        lString16 path;
diff --git a/crengine/src/textlang.cpp b/crengine/src/textlang.cpp
index e3e17e12a..c5bc9edc6 100644
--- a/crengine/src/textlang.cpp
+++ b/crengine/src/textlang.cpp
@@ -232,9 +232,209 @@ HyphMethod * TextLangMan::getMainLangHyphMethod() {
     return getTextLangCfg()->getHyphMethod();
 }
 
+void TextLangMan::resetCounters() {
+    // Zero the per-language counters (quote nesting levels) of all known configs
+    for ( int idx=_lang_cfg_list.length()-1; idx>=0; idx-- )
+        _lang_cfg_list[idx]->resetCounters();
+}
 
 // TextLangCfg object: per language holder of language specificities
 
+// Quote chars for CSS "content: open-quote / close-quote", one entry per language
+struct quotes_spec {
+    const char * lang_tag;                // lang tag (prefix) the entry applies to
+    const lChar16 *  open_quote_level_1;  // opening quote, first nesting level
+    const lChar16 * close_quote_level_1;  // closing quote, first nesting level
+    const lChar16 *  open_quote_level_2;  // opening quote, second nesting level
+    const lChar16 * close_quote_level_2;  // closing quote, second nesting level
+};
+
+// List built 20200601 from https://html.spec.whatwg.org/multipage/rendering.html#quotes
+// The 2nd part of each lang_tag is lowercased for easier comparison. The first
+// entry that is a prefix of the lang tag wins, so put the longest tag first
+// when multiple tags share the same starting chars. Small issue: a 3-letter tag
+// not listed here might match a listed 2-letter one ("ito" will get "it" quotes).
+static quotes_spec _quotes_spec_table[] = {
+    { "af",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "agq",      L"\x201e", L"\x201d", L"\x201a", L"\x2019" }, /* „ ” ‚ ’ */
+    { "ak",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "am",       L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "ar",       L"\x201d", L"\x201c", L"\x2019", L"\x2018" }, /* ” “ ’ ‘ */
+    { "asa",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ast",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "az-cyrl",  L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "az",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "bas",      L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "bem",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "bez",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "be",       L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "bg",       L"\x201e", L"\x201c", L"\x201e", L"\x201c" }, /* „ “ „ “ */
+    { "bm",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "bn",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "brx",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "br",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "bs-cyrl",  L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "bs",       L"\x201e", L"\x201d", L"\x2018", L"\x2019" }, /* „ ” ‘ ’ */
+    { "ca",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "cgg",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "chr",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "cs",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "cy",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "dav",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "da",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "de",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "dje",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "dsb",      L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "dua",      L"\x00ab", L"\x00bb", L"\x2018", L"\x2019" }, /* « » ‘ ’ */
+    { "dyo",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "dz",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ebu",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ee",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "el",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "en",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "es",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "et",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "eu",       L"\x201c", L"\x201d", L"\x201c", L"\x201d" }, /* “ ” “ ” */
+    { "ewo",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "fa",       L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "ff",       L"\x201e", L"\x201d", L"\x201a", L"\x2019" }, /* „ ” ‚ ’ */
+    { "fil",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "fi",       L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "fo",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "fr-ch",    L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    // { "fr",    L"\x00ab", L"\x00bb", L"\x00ab", L"\x00bb" }, /* « » « » */  /* Same pair for both level, bit sad... */
+    { "fr",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */  /* Better to have "fr" just as "it" */
+    { "ga",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "gd",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "gl",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "gsw",      L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "guz",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "gu",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ha",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "he",       L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "hi",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "hr",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "hsb",      L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "hu",       L"\x201e", L"\x201d", L"\x00bb", L"\x00ab" }, /* „ ” » « */
+    { "hy",       L"\x00ab", L"\x00bb", L"\x00ab", L"\x00bb" }, /* « » « » */
+    { "id",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ig",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "is",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "it",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "ja",       L"\x300c", L"\x300d", L"\x300e", L"\x300f" }, /* 「 」 『 』 */
+    { "jgo",      L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "jmc",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "kab",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "kam",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ka",       L"\x201e", L"\x201c", L"\x00ab", L"\x00bb" }, /* „ “ « » */
+    { "kde",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "kea",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "khq",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ki",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "kkj",      L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "kk",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "kln",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "km",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "kn",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ko",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ksb",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ksf",      L"\x00ab", L"\x00bb", L"\x2018", L"\x2019" }, /* « » ‘ ’ */
+    { "ky",       L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "lag",      L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "lb",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "lg",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ln",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "lo",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "lrc",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "lt",       L"\x201e", L"\x201c", L"\x201e", L"\x201c" }, /* „ “ „ “ */
+    { "luo",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "luy",      L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "lu",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "lv",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mas",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mer",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mfe",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mgo",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mg",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "mk",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "ml",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mn",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mr",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ms",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mt",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mua",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "my",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "mzn",      L"\x00ab", L"\x00bb", L"\x2039", L"\x203a" }, /* « » ‹ › */
+    { "naq",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "nb",       L"\x00ab", L"\x00bb", L"\x2018", L"\x2019" }, /* « » ‘ ’ */
+    { "nd",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ne",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "nl",       L"\x2018", L"\x2019", L"\x201c", L"\x201d" }, /* ‘ ’ “ ” */
+    { "nmg",      L"\x201e", L"\x201d", L"\x00ab", L"\x00bb" }, /* „ ” « » */
+    { "nnh",      L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "nn",       L"\x00ab", L"\x00bb", L"\x2018", L"\x2019" }, /* « » ‘ ’ */
+    { "nus",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "nyn",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "pa",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "pl",       L"\x201e", L"\x201d", L"\x00ab", L"\x00bb" }, /* „ ” « » */
+    { "pt-pt",    L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "pt",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "rn",       L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "rof",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ro",       L"\x201e", L"\x201d", L"\x00ab", L"\x00bb" }, /* „ ” « » */
+    { "ru",       L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "rwk",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "rw",       L"\x00ab", L"\x00bb", L"\x2018", L"\x2019" }, /* « » ‘ ’ */
+    { "sah",      L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "saq",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "sbp",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "seh",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ses",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "sg",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "shi-latn", L"\x00ab", L"\x00bb", L"\x201e", L"\x201d" }, /* « » „ ” */
+    { "shi",      L"\x00ab", L"\x00bb", L"\x201e", L"\x201d" }, /* « » „ ” */
+    { "si",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "sk",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "sl",       L"\x201e", L"\x201c", L"\x201a", L"\x2018" }, /* „ “ ‚ ‘ */
+    { "sn",       L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "so",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "sq",       L"\x00ab", L"\x00bb", L"\x201c", L"\x201d" }, /* « » “ ” */
+    { "sr-latn",  L"\x201e", L"\x201c", L"\x2018", L"\x2018" }, /* „ “ ‘ ‘ */
+    { "sr",       L"\x201e", L"\x201c", L"\x2018", L"\x2018" }, /* „ “ ‘ ‘ */
+    { "sv",       L"\x201d", L"\x201d", L"\x2019", L"\x2019" }, /* ” ” ’ ’ */
+    { "sw",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ta",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "teo",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "te",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "th",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "ti-er",    L"\x2018", L"\x2019", L"\x201c", L"\x201d" }, /* ‘ ’ “ ” */
+    { "tk",       L"\x201c", L"\x201d", L"\x201c", L"\x201d" }, /* “ ” “ ” */
+    { "to",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "tr",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "twq",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "tzm",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "uk",       L"\x00ab", L"\x00bb", L"\x201e", L"\x201c" }, /* « » „ “ */
+    { "ur",       L"\x201d", L"\x201c", L"\x2019", L"\x2018" }, /* ” “ ’ ‘ */
+    { "uz-cyrl",  L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "uz",       L"\x201c", L"\x201d", L"\x2019", L"\x2018" }, /* “ ” ’ ‘ */
+    { "vai-latn", L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "vai",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "vi",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "vun",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "xog",      L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "yav",      L"\x00ab", L"\x00bb", L"\x00ab", L"\x00bb" }, /* « » « » */
+    { "yo",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "yue-hans", L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "yue",      L"\x300c", L"\x300d", L"\x300e", L"\x300f" }, /* 「 」 『 』 */
+    { "zgh",      L"\x00ab", L"\x00bb", L"\x201e", L"\x201d" }, /* « » „ ” */
+    { "zh-hant",  L"\x300c", L"\x300d", L"\x300e", L"\x300f" }, /* 「 」 『 』 */
+    { "zh",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { "zu",       L"\x201c", L"\x201d", L"\x2018", L"\x2019" }, /* “ ” ‘ ’ */
+    { NULL, NULL, NULL, NULL, NULL }
+};
+// Default to quotes for English
+static quotes_spec _quotes_spec_default = { "", L"\x201c", L"\x201d", L"\x2018", L"\x2019" };
+
 #if USE_LIBUNIBREAK==1
 lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable) {
     // https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
@@ -297,9 +497,6 @@ lChar16 lb_char_sub_func_czech_slovak(const lChar16 * text, int pos, int next_us
 }
 #endif
 
-TextLangCfg::~TextLangCfg() {
-}
-
 // Instantiate a new TextLangCfg with properties adequate to the provided lang_tag
 TextLangCfg::TextLangCfg( lString16 lang_tag ) {
     if ( TextLangMan::_no_hyph_method == NULL ) {
@@ -464,4 +661,47 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
         _duplicate_real_hyphen_on_next_line = true;
     }
 #endif
+
+    // Language default opening and closing quotes, for CSS
+    //   "q::before { content: open-quote }" and
+    //   "q::after  { content: close-quote }"
+    quotes_spec * quotes = &_quotes_spec_default;
+    for (int i=0; _quotes_spec_table[i].lang_tag!=NULL; i++) {
+        if ( lang_tag.startsWith( _quotes_spec_table[i].lang_tag ) ) {
+            quotes = &_quotes_spec_table[i];
+            break;
+        }
+    }
+    // Avoid a wrap after/before an opening/close quote.
+    const lChar16 * quote_joiner = L"\x2060";
+        // (U+2060 WORD JOINER: zero width, replaces the deprecated ZERO WIDTH NO-BREAK SPACE)
+        // With some languages, we might want to use a non-breaking thin space instead.
+
+    _open_quote1  << quotes->open_quote_level_1    << quote_joiner;
+    _close_quote1 << quote_joiner   << quotes->close_quote_level_1;
+    _open_quote2  << quotes->open_quote_level_2    << quote_joiner;
+    _close_quote2 << quote_joiner   << quotes->close_quote_level_2;
+
+    resetCounters();
+}
+
+TextLangCfg::~TextLangCfg() { // empty body: nothing is released explicitly here
+}
+
+void TextLangCfg::resetCounters() { // restart the quote nesting tracking of this lang cfg
+    _quote_nesting_level = 0; // depth 0: getOpeningQuote(true) will next return the level 1 quote
+}
+
+// Opening quote for this lang: with update_level, go one level deeper and alternate level 1/level 2 marks.
+lString16 & TextLangCfg::getOpeningQuote( bool update_level ) {
+    if ( update_level && (++_quote_nesting_level % 2) == 0 )
+        return _open_quote2; // even depth reached: nested (level 2) quote
+    return _open_quote1;     // odd depth reached, or depth tracking not requested
+}
+
+// Closing quote for this lang: with update_level, leave one level (mark matches the opening one at that depth).
+lString16 & TextLangCfg::getClosingQuote( bool update_level ) {
+    if ( !update_level || _quote_nesting_level <= 0 ) // unmatched close-quote: depth must stay at 0 (CSS 2.1 12.3.2)
+        return _close_quote1;
+    return (_quote_nesting_level-- % 2) ? _close_quote1 : _close_quote2; // parity of the depth being closed
 }