From e19f4ffd2b9ce16582d4129e2bb7ac1c8787cf35 Mon Sep 17 00:00:00 2001 From: poire-z Date: Sat, 18 Apr 2020 10:45:14 +0200 Subject: [PATCH] Adds TextLangMan for text typography by language Parse and store values from lang= attributes, so we can propagate a TextLangCfg object to all calls dealing with text, which will allow us to: - Use specific libunibreak rules for line breaking per lang (e.g. reversed quotation marks in German vs French). - Use the right hyphenation dictionary for each language - Add more specific line breaking tweaks for some languages (some single letter prepositions should not be at end of line in Polish and Czech, real hyphens should be duplicated at start of next line in Portuguese and Polish...) - Give the language tag to Harfbuzz so it can pick the right glyphs for the language (e.g. different glyphs for the same codepoint in zh-CN, zh-TW and ja, and for Bulgarian Cyrillic with some fonts). Update existing global HyphMan to use services from TextLangMan to preserve the legacy single global hyphenation behavior. TextLangMan still uses the hyphenation methods defined in hyphman.cpp. 
--- crengine/include/hyphman.h | 67 +++-- crengine/include/lvdocviewprops.h | 17 +- crengine/include/lvfntman.h | 16 +- crengine/include/lvrend.h | 6 +- crengine/include/lvstyles.h | 27 +- crengine/include/lvtextfm.h | 9 +- crengine/include/lvtinydom.h | 2 + crengine/include/textlang.h | 152 ++++++++++ crengine/src/hyphman.cpp | 136 ++++++++- crengine/src/lvdocview.cpp | 37 ++- crengine/src/lvfntman.cpp | 61 ++-- crengine/src/lvrend.cpp | 229 +++++++++++---- crengine/src/lvtextfm.cpp | 92 +++--- crengine/src/lvtinydom.cpp | 46 ++- crengine/src/textlang.cpp | 467 ++++++++++++++++++++++++++++++ 15 files changed, 1193 insertions(+), 171 deletions(-) create mode 100644 crengine/include/textlang.h create mode 100644 crengine/src/textlang.cpp diff --git a/crengine/include/hyphman.h b/crengine/include/hyphman.h index 294ea859e..318e46e15 100644 --- a/crengine/include/hyphman.h +++ b/crengine/include/hyphman.h @@ -17,27 +17,45 @@ #include "lvtypes.h" #include "lvstream.h" - -class HyphMethod -{ -public: - virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize=1 ) = 0; - virtual ~HyphMethod() { } -}; - +#include "lvhashtable.h" #define WORD_LENGTH 64 #define MAX_REAL_WORD 24 // min value supported by algorithms is 1 (max is arbitrary 10) -// value enforced by algorithm previously was 2, so it's the default -#define HYPH_DEFAULT_HYPHEN_MIN 2 -#define HYPH_MIN_HYPHEN_MIN 1 +// 0 means to use the defaults per HyphMethod +// if set to >= 1, the values apply to all HyphMethods +#define HYPH_MIN_HYPHEN_MIN 0 #define HYPH_MAX_HYPHEN_MIN 10 +// Default for global HyphMan values is 0: use per-HyphMethod defaults +#define HYPH_DEFAULT_HYPHEN_MIN 0 +// Default for per-HyphMethod values (value enforced by algorithms +// previously was 2, so let's keep that as the default) +#define HYPHMETHOD_DEFAULT_HYPHEN_MIN 2 // Don't trust soft-hyphens when using dict or algo methods #define 
HYPH_DEFAULT_TRUST_SOFT_HYPHENS 0 +class HyphMethod +{ +protected: + lString16 _id; + int _left_hyphen_min; + int _right_hyphen_min; +public: + HyphMethod(lString16 id, int leftHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN, int rightHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN) + : _id(id) + , _left_hyphen_min(leftHyphenMin) + , _right_hyphen_min(rightHyphenMin) + { } + lString16 getId() { return _id; } + virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize=1 ) = 0; + virtual ~HyphMethod() { } + virtual lUInt32 getCount() { return 0; } + virtual lUInt32 getSize() { return 0; } +}; + + enum HyphDictType { HDT_NONE, // disable hyphenation @@ -70,7 +88,6 @@ class HyphDictionary #define HYPH_DICT_ID_SOFTHYPHENS L"@softhyphens" #define HYPH_DICT_ID_DICTIONARY L"@dictionary" - class HyphDictionaryList { LVPtrVector _list; @@ -86,6 +103,11 @@ class HyphDictionaryList }; #define DEF_HYPHENATION_DICT "English_US.pattern" +// We'll be loading English_US.pattern even if non-english users +// may never use it, but it's a bit tedious not going with it. +// It might use around 1M of memory, but it will avoid re-rendering +// the document if the book does not contain any language tag, and +// we end up going with it anyway. 
class HyphDictionary; class HyphDictionaryList; @@ -100,19 +122,21 @@ class HyphMan friend class TexHyph; friend class AlgoHyph; friend class SoftHyphensHyph; - static HyphMethod * _method; - static HyphDictionary * _selectedDictionary; - static HyphDictionaryList * _dictList; + // Obsolete: now fetched from TextLangMan main lang TextLangCfg + // static HyphMethod * _method; + // static HyphDictionary * _selectedDictionary; + static HyphDictionaryList * _dictList; // available hyph dict files (+ none/algo/softhyphens) + static LVHashTable _loaded_hyph_methods; // methods with loaded dictionaries static int _LeftHyphenMin; static int _RightHyphenMin; static int _TrustSoftHyphens; public: static void uninit(); - static bool activateDictionaryFromStream( LVStreamRef stream ); + static bool initDictionaries(lString16 dir, bool clear = true); static HyphDictionaryList * getDictList() { return _dictList; } static bool activateDictionary( lString16 id ) { return _dictList->activate(id); } - static bool initDictionaries(lString16 dir, bool clear = true); - static HyphDictionary * getSelectedDictionary() { return _selectedDictionary; } + static bool activateDictionaryFromStream( LVStreamRef stream ); // used by CoolReader on Android + static HyphDictionary * getSelectedDictionary(); // was: { return _selectedDictionary; } static int getLeftHyphenMin() { return _LeftHyphenMin; } static int getRightHyphenMin() { return _RightHyphenMin; } static bool setLeftHyphenMin( int left_hyphen_min ); @@ -120,16 +144,19 @@ class HyphMan static int getTrustSoftHyphens() { return _TrustSoftHyphens; } static bool setTrustSoftHyphens( int trust_soft_hyphen ); static bool isEnabled(); + static HyphMethod * getHyphMethodForDictionary( lString16 id, int leftHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN, + int rightHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN ); HyphMan(); ~HyphMan(); + static bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 
maxWidth, size_t flagSize=1 ); + /* Obsolete: inline static bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize=1 ) { return _method->hyphenate( str, len, widths, flags, hyphCharWidth, maxWidth, flagSize ); } + */ }; - - #endif diff --git a/crengine/include/lvdocviewprops.h b/crengine/include/lvdocviewprops.h index 4d2fd8bdf..6c91d0f54 100644 --- a/crengine/include/lvdocviewprops.h +++ b/crengine/include/lvdocviewprops.h @@ -48,13 +48,22 @@ #define PROP_SHOW_BATTERY_PERCENT "window.status.battery.percent" //#define PROP_FONT_KERNING_ENABLED "font.kerning.enabled" #define PROP_LANDSCAPE_PAGES "window.landscape.pages" -#define PROP_HYPHENATION_LEFT_HYPHEN_MIN "crengine.hyphenation.left.hyphen.min" -#define PROP_HYPHENATION_RIGHT_HYPHEN_MIN "crengine.hyphenation.right.hyphen.min" -#define PROP_HYPHENATION_TRUST_SOFT_HYPHENS "crengine.hyphenation.trust.soft.hyphens" +#define PROP_AUTOSAVE_BOOKMARKS "crengine.autosave.bookmarks" + +// Obsolete hyph settings: #define PROP_HYPHENATION_DICT "crengine.hyphenation.directory" #define PROP_HYPHENATION_DICT_VALUE_NONE "@none" #define PROP_HYPHENATION_DICT_VALUE_ALGORITHM "@algorithm" -#define PROP_AUTOSAVE_BOOKMARKS "crengine.autosave.bookmarks" +// Still used hyph settings: +#define PROP_HYPHENATION_LEFT_HYPHEN_MIN "crengine.hyphenation.left.hyphen.min" +#define PROP_HYPHENATION_RIGHT_HYPHEN_MIN "crengine.hyphenation.right.hyphen.min" +#define PROP_HYPHENATION_TRUST_SOFT_HYPHENS "crengine.hyphenation.trust.soft.hyphens" +// New textlang typography settings: +#define PROP_TEXTLANG_MAIN_LANG "crengine.textlang.main.lang" +#define PROP_TEXTLANG_EMBEDDED_LANGS_ENABLED "crengine.textlang.embedded.langs.enabled" +#define PROP_TEXTLANG_HYPHENATION_ENABLED "crengine.textlang.hyphenation.enabled" +#define PROP_TEXTLANG_HYPH_SOFT_HYPHENS_ONLY "crengine.textlang.hyphenation.soft.hyphens.only" +#define PROP_TEXTLANG_HYPH_FORCE_ALGORITHMIC 
"crengine.textlang.hyphenation.force.algorithmic" #define PROP_FLOATING_PUNCTUATION "crengine.style.floating.punctuation.enabled" diff --git a/crengine/include/lvfntman.h b/crengine/include/lvfntman.h index 2f90e3e45..25348de4c 100644 --- a/crengine/include/lvfntman.h +++ b/crengine/include/lvfntman.h @@ -24,6 +24,7 @@ #include "lvptrvec.h" #include "hyphman.h" #include "lvdrawbuf.h" +#include "textlang.h" #if !defined(__SYMBIAN32__) && defined(_WIN32) extern "C" { @@ -340,6 +341,7 @@ class LVFont : public LVRefCounter lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg=NULL, int letter_spacing=0, bool allow_hyphenation=true, lUInt32 hints=0 @@ -350,7 +352,7 @@ class LVFont : public LVRefCounter \param len is number of characters to measure \return width of specified string */ - virtual lUInt32 getTextWidth( const lChar16 * text, int len ) = 0; + virtual lUInt32 getTextWidth( const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL ) = 0; // /** \brief get glyph image in 1 byte per pixel format // \param code is unicode character @@ -391,6 +393,7 @@ class LVFont : public LVRefCounter virtual int DrawTextString( LVDrawBuf * buf, int x, int y, const lChar16 * text, int len, lChar16 def_char, lUInt32 * palette = NULL, bool addHyphen = false, + TextLangCfg * lang_cfg=NULL, lUInt32 flags=0, int letter_spacing=0, int width=-1, int text_decoration_back_gap=0 ) = 0; /// constructor @@ -576,6 +579,7 @@ class LVBaseFont : public LVFont virtual int DrawTextString( LVDrawBuf * buf, int x, int y, const lChar16 * text, int len, lChar16 def_char, lUInt32 * palette, bool addHyphen, + TextLangCfg * lang_cfg=NULL, lUInt32 flags=0, int letter_spacing=0, int width=-1, int text_decoration_back_gap=0 ); }; @@ -595,6 +599,7 @@ class LBitmapFont : public LVBaseFont lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg=NULL, int letter_spacing=0, bool allow_hyphenation=true, lUInt32 hints=0 @@ -605,7 +610,7 @@ class LBitmapFont : public 
LVBaseFont \return width of specified string */ virtual lUInt32 getTextWidth( - const lChar16 * text, int len + const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL ); virtual LVFontGlyphCacheItem * getGlyph(lUInt32 ch, lChar16 def_char=0); /// returns font baseline offset @@ -757,6 +762,7 @@ class LVWin32DrawFont : public LVBaseWin32Font lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg=NULL, int letter_spacing=0, bool allow_hyphenation=true, lUInt32 hints=0 @@ -767,7 +773,7 @@ class LVWin32DrawFont : public LVBaseWin32Font \return width of specified string */ virtual lUInt32 getTextWidth( - const lChar16 * text, int len + const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL ); /// returns char width @@ -777,6 +783,7 @@ class LVWin32DrawFont : public LVBaseWin32Font virtual int DrawTextString( LVDrawBuf * buf, int x, int y, const lChar16 * text, int len, lChar16 def_char, lUInt32 * palette, bool addHyphen, + TextLangCfg * lang_cfg=NULL, lUInt32 flags=0, int letter_spacing=0, int width=-1, int text_decoration_back_gap=0 ); @@ -935,6 +942,7 @@ class LVWin32Font : public LVBaseWin32Font lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg=NULL, int letter_spacing=0, bool allow_hyphenation=true, lUInt32 hints=0 @@ -945,7 +953,7 @@ class LVWin32Font : public LVBaseWin32Font \return width of specified string */ virtual lUInt32 getTextWidth( - const lChar16 * text, int len + const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL ); /** \brief get glyph image in 1 byte per pixel format diff --git a/crengine/include/lvrend.h b/crengine/include/lvrend.h index b0bfc4843..f21e3c5fd 100644 --- a/crengine/include/lvrend.h +++ b/crengine/include/lvrend.h @@ -15,6 +15,7 @@ #define __LV_REND_H_INCLUDED__ #include "lvtinydom.h" +#include "textlang.h" // Current direction, from dir="ltr" or dir="rtl" element attribute // Should map directly to the RENDER_RECT_FLAG_DIRECTION_* below @@ -120,7 +121,7 @@ int initRendMethod( 
ldomNode * node, bool recurseChildren, bool allowAutoboxing int styleToTextFmtFlags( const css_style_ref_t & style, int oldflags, int direction=REND_DIRECTION_UNSET ); /// renders block as single text formatter object void renderFinalBlock( ldomNode * node, LFormattedText * txform, RenderRectAccessor * fmt, int & flags, - int indent, int line_h, int valign_dy=0, bool * is_link_start=NULL ); + int indent, int line_h, TextLangCfg * lang_cfg=NULL, int valign_dy=0, bool * is_link_start=NULL ); /// renders block which contains subblocks (with gRenderBlockRenderingFlags as flags) int renderBlockElement( LVRendPageContext & context, ldomNode * enode, int x, int y, int width, int direction=REND_DIRECTION_UNSET, int * baseline=NULL ); /// renders block which contains subblocks @@ -144,7 +145,8 @@ void DrawDocument( LVDrawBuf & drawbuf, ldomNode * node, int x0, int y0, int dx, // minWidth: width with a wrap on all spaces (no hyphenation), so width taken by the longest word // full function for recursive use: void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direction, bool ignorePadding, int rendFlags, - int &curMaxWidth, int &curWordWidth, bool &collapseNextSpace, int &lastSpaceWidth, int indent, bool isStartNode=false); + int &curMaxWidth, int &curWordWidth, bool &collapseNextSpace, int &lastSpaceWidth, + int indent, TextLangCfg * lang_cfg, bool isStartNode=false); // simpler function for first call: void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direction=REND_DIRECTION_UNSET, bool ignorePadding=false, int rendFlags=0); diff --git a/crengine/include/lvstyles.h b/crengine/include/lvstyles.h index 8d0731e38..b2e9e7837 100644 --- a/crengine/include/lvstyles.h +++ b/crengine/include/lvstyles.h @@ -315,8 +315,8 @@ class lvdomElementFormatRec { int _top_overflow; // Overflow (positive value) below _y int _bottom_overflow; // Overflow (positive value) after _y+_height - int _listprop_node_idx; // dataIndex of the UL/OL node this 
erm_final block - // should get its marker from + int _lang_node_idx; // dataIndex of the upper node this erm_final block + // should get its lang= language from // Flags & extras, to have additional info related to this rect cached. // - For erm_final nodes, these contain the footprint of outer floats @@ -332,18 +332,21 @@ class lvdomElementFormatRec { int _extra4; int _extra5; - // Added for padding from 14 to 16 32-bits ints + int _listprop_node_idx; // dataIndex of the UL/OL node this erm_final block + // should get its marker from + + // Added for padding from 15 to 16 32-bits ints int _available1; - int _available2; public: lvdomElementFormatRec() : _x(0), _width(0), _y(0), _height(0) , _inner_width(0), _inner_x(0), _inner_y(0), _baseline(0) - , _top_overflow(0), _bottom_overflow(0), _listprop_node_idx(0) + , _top_overflow(0), _bottom_overflow(0) + , _lang_node_idx(0) , _listprop_node_idx(0) , _flags(0), _extra0(0) , _extra1(0), _extra2(0), _extra3(0), _extra4(0), _extra5(0) - , _available1(0), _available2(0) + , _available1(0) { } ~lvdomElementFormatRec() @@ -354,10 +357,10 @@ class lvdomElementFormatRec { _x = _width = _y = _height = 0; _inner_width = _inner_x = _inner_y = _baseline = 0; _top_overflow = _bottom_overflow = 0; - _listprop_node_idx = 0; + _lang_node_idx = _listprop_node_idx = 0; _flags = _extra0 = 0; _extra1 = _extra2 = _extra3 = _extra4 = _extra5 = 0; - _available1 = 0; _available2 = 0; + _available1 = 0; } bool operator == ( lvdomElementFormatRec & v ) { @@ -365,11 +368,11 @@ class lvdomElementFormatRec { _inner_width==v._inner_width && _inner_x==v._inner_x && _inner_y==v._inner_y && _baseline==v._baseline && _top_overflow==v._top_overflow && _bottom_overflow==v._bottom_overflow && - _listprop_node_idx==v._listprop_node_idx && + _lang_node_idx==v._lang_node_idx && _listprop_node_idx==v._listprop_node_idx && _flags==v._flags && _extra0==v._extra0 && _extra1==v._extra1 && _extra2==v._extra2 && _extra3==v._extra3 && _extra4==v._extra4 && 
_extra5==v._extra5 && - _available1==v._available1 && _available2==v._available2 + _available1==v._available1 ); } bool operator != ( lvdomElementFormatRec & v ) @@ -378,11 +381,11 @@ class lvdomElementFormatRec { _inner_width!=v._inner_width || _inner_x!=v._inner_x || _inner_y!=v._inner_y || _baseline!=v._baseline || _top_overflow!=v._top_overflow || _bottom_overflow!=v._bottom_overflow || - _listprop_node_idx!=v._listprop_node_idx || + _lang_node_idx!=v._lang_node_idx || _listprop_node_idx!=v._listprop_node_idx || _flags!=v._flags || _extra0!=v._extra0 || _extra1!=v._extra1 || _extra2!=v._extra2 || _extra3!=v._extra3 || _extra4!=v._extra4 || _extra5!=v._extra5 || - _available1!=v._available1 || _available2!=v._available2 + _available1!=v._available1 ); } // Get/Set diff --git a/crengine/include/lvtextfm.h b/crengine/include/lvtextfm.h index a9454ec0b..7211dedcf 100755 --- a/crengine/include/lvtextfm.h +++ b/crengine/include/lvtextfm.h @@ -17,6 +17,7 @@ #include "lvfntman.h" #include "lvbmpbuf.h" +#include "textlang.h" // comment out following line to use old formatter #define USE_NEW_FORMATTER 1 @@ -82,6 +83,7 @@ extern "C" { typedef struct { void * object; /**< \brief pointer to object which represents source */ + TextLangCfg * lang_cfg; lInt16 indent; /**< \brief first line indent (or all but first, when negative) */ lInt16 valign_dy; /* drift y from baseline */ lInt16 interval; /**< \brief line height in screen pixels */ @@ -287,6 +289,7 @@ void lvtextFreeFormatter( formatted_text_fragment_t * pbuffer ); void lvtextAddSourceLine( formatted_text_fragment_t * pbuffer, lvfont_handle font, /* handle of font to draw string */ + TextLangCfg * lang_cfg, const lChar16 * text, /* pointer to unicode text string */ lUInt32 len, /* number of chars in text, 0 for auto(strlen) */ lUInt32 color, /* text color */ @@ -306,6 +309,7 @@ void lvtextAddSourceLine( */ void lvtextAddSourceObject( formatted_text_fragment_t * pbuffer, + TextLangCfg * lang_cfg, lInt16 width, lInt16 
height, lUInt32 flags, /* flags */ @@ -367,6 +371,7 @@ class LFormattedText lInt16 valign_dy, /* drift y from baseline */ lInt16 indent, /* first line indent (or all but first, when negative) */ void * object, /* pointer to custom object */ + TextLangCfg * lang_cfg, lInt16 letter_spacing=0 ); @@ -375,7 +380,8 @@ class LFormattedText lUInt32 len, /* number of chars in text, 0 for auto(strlen) */ lUInt32 color, /* text color */ lUInt32 bgcolor, /* background color */ - LVFont * font, /* font to draw string */ + LVFont * font, /* font to draw string */ + TextLangCfg * lang_cfg, lUInt32 flags, /* (had default =LTEXT_ALIGN_LEFT|LTEXT_FLAG_OWNTEXT) */ lInt16 interval, /* line height in screen pixels */ lInt16 valign_dy=0, /* drift y from baseline */ @@ -387,6 +393,7 @@ class LFormattedText { lvtextAddSourceLine(m_pbuffer, font, //font->GetHandle() + lang_cfg, text, len, color, bgcolor, flags, interval, valign_dy, indent, object, (lUInt16)offset, letter_spacing ); } diff --git a/crengine/include/lvtinydom.h b/crengine/include/lvtinydom.h index 03956899c..b9bf950ad 100755 --- a/crengine/include/lvtinydom.h +++ b/crengine/include/lvtinydom.h @@ -704,6 +704,8 @@ class RenderRectAccessor : public lvdomElementFormatRec void setBaseline( int baseline ); int getListPropNodeIndex(); void setListPropNodeIndex( int idx ); + int getLangNodeIndex(); + void setLangNodeIndex( int idx ); unsigned short getFlags(); void setFlags( unsigned short flags ); diff --git a/crengine/include/textlang.h b/crengine/include/textlang.h new file mode 100644 index 000000000..2a8b970f7 --- /dev/null +++ b/crengine/include/textlang.h @@ -0,0 +1,152 @@ +#ifndef __TEXTLANG_H_INCLUDED__ +#define __TEXTLANG_H_INCLUDED__ + +#if USE_HARFBUZZ==1 +#include +#include +#endif + +#if USE_LIBUNIBREAK==1 +#include + // linebreakdef.h is not wrapped by this, unlike linebreak.h + // (not wrapping results in "undefined symbol" with the original + // function name kinda obfuscated) + #ifdef __cplusplus + extern "C" { + 
#endif +#include + #ifdef __cplusplus + } + #endif +#endif + +// Be similar to HyphMan default state with "English_US.pattern" +#define TEXTLANG_DEFAULT_MAIN_LANG "en" // for LVDocView +#define TEXTLANG_DEFAULT_MAIN_LANG_16 L"en" // for textlang.cpp +#define TEXTLANG_DEFAULT_EMBEDDED_LANGS_ENABLED false +#define TEXTLANG_DEFAULT_HYPHENATION_ENABLED true +#define TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY false +#define TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC false +#define TEXTLANG_FALLBACK_HYPH_DICT_ID L"English_US.pattern" // For languages without specific hyph dicts + +class TextLangCfg; + +class TextLangMan +{ + friend TextLangCfg; + static lString16 _main_lang; + static bool _embedded_langs_enabled; + static LVPtrVector _lang_cfg_list; + + static bool _overridden_hyph_method; // (to avoid checking the 3 following bool) + static bool _hyphenation_enabled; + static bool _hyphenation_soft_hyphens_only; + static bool _hyphenation_force_algorithmic; + static HyphMethod * _no_hyph_method; // instance of hyphman NoHyph + static HyphMethod * _soft_hyphens_method; // instance of hyphman SoftHyphensHyph + static HyphMethod * _algo_hyph_method; // instance of hyphman AlgoHyph + + static HyphMethod * getHyphMethodForLang( lString16 lang_tag ); // Used by TextLangCfg +public: + static void uninit(); + static lUInt32 getHash(); + + static void setMainLang( lString16 lang_tag ) { _main_lang = lang_tag; } + static void setMainLangFromHyphDict( lString16 id ); // For HyphMan legacy methods + static lString16 getMainLang() { return _main_lang; } + + static void setEmbeddedLangsEnabled( bool enabled ) { _embedded_langs_enabled = enabled; } + static bool getEmbeddedLangsEnabled() { return _embedded_langs_enabled; } + + static bool getHyphenationEnabled() { return _hyphenation_enabled; } + static void setHyphenationEnabled( bool enabled ) { + _hyphenation_enabled = enabled; + _overridden_hyph_method = !_hyphenation_enabled || _hyphenation_soft_hyphens_only || 
_hyphenation_force_algorithmic; + } + + static bool getHyphenationSoftHyphensOnly() { return _hyphenation_soft_hyphens_only; } + static void setHyphenationSoftHyphensOnly( bool enabled ) { + _hyphenation_soft_hyphens_only = enabled; + _overridden_hyph_method = !_hyphenation_enabled || _hyphenation_soft_hyphens_only || _hyphenation_force_algorithmic; + } + + static bool getHyphenationForceAlgorithmic() { return _hyphenation_force_algorithmic; } + static void setHyphenationForceAlgorithmic( bool enabled ) { + _hyphenation_force_algorithmic = enabled; + _overridden_hyph_method = !_hyphenation_enabled || _hyphenation_soft_hyphens_only || _hyphenation_force_algorithmic; + } + + static TextLangCfg * getTextLangCfg(); // get LangCfg for _main_lang + static TextLangCfg * getTextLangCfg( lString16 lang_tag ); + static TextLangCfg * getTextLangCfg( ldomNode * node ); + static int getLangNodeIndex( ldomNode * node ); + + static HyphMethod * getMainLangHyphMethod(); // For HyphMan::hyphenate() + + // For frontend info about TextLangMan status and seen langs + static LVPtrVector * getLangCfgList() { + return &_lang_cfg_list; + } + + TextLangMan(); + ~TextLangMan(); +}; + +#define MAX_NB_LB_PROPS_ITEMS 10 // for our statically sized array (increase if needed) + +typedef lChar16 (*lb_char_sub_func_t)(const lChar16 * text, int pos, int next_usable); + +class TextLangCfg +{ + friend TextLangMan; + lString16 _lang_tag; + HyphMethod * _hyph_method; + + #if USE_HARFBUZZ==1 + hb_language_t _hb_language; + #endif + + #if USE_LIBUNIBREAK==1 + lb_char_sub_func_t _lb_char_sub_func; + struct LineBreakProperties _lb_props[MAX_NB_LB_PROPS_ITEMS]; + #endif + + bool _duplicate_real_hyphen_on_next_line; + +public: + lString16 getLangTag() const { return _lang_tag; } + + HyphMethod * getHyphMethod() const { + if ( !TextLangMan::_overridden_hyph_method ) + return _hyph_method; + if ( !TextLangMan::_hyphenation_enabled ) + return TextLangMan::_no_hyph_method; + if ( 
TextLangMan::_hyphenation_soft_hyphens_only ) + return TextLangMan::_soft_hyphens_method; + if ( TextLangMan::_hyphenation_force_algorithmic ) + return TextLangMan::_algo_hyph_method; + // Should not be reached + return _hyph_method; + } + HyphMethod * getDefaultHyphMethod() const { + return _hyph_method; + } + + #if USE_HARFBUZZ==1 + hb_language_t getHBLanguage() const { return _hb_language; } + #endif + + #if USE_LIBUNIBREAK==1 + bool hasLBCharSubFunc() const { return _lb_char_sub_func != NULL; } + lb_char_sub_func_t getLBCharSubFunc() const { return _lb_char_sub_func; } + struct LineBreakProperties * getLBProps() const { return (struct LineBreakProperties *)_lb_props; } + #endif + + bool duplicateRealHyphenOnNextLine() const { return _duplicate_real_hyphen_on_next_line; } + + TextLangCfg( lString16 lang_tag ); + ~TextLangCfg(); +}; + + +#endif diff --git a/crengine/src/hyphman.cpp b/crengine/src/hyphman.cpp index 25f90cecf..e456c81b0 100755 --- a/crengine/src/hyphman.cpp +++ b/crengine/src/hyphman.cpp @@ -38,6 +38,7 @@ #include "../include/hyphman.h" #include "../include/lvfnt.h" #include "../include/lvstring.h" +#include "../include/textlang.h" #ifdef ANDROID @@ -53,8 +54,10 @@ int HyphMan::_LeftHyphenMin = HYPH_DEFAULT_HYPHEN_MIN; int HyphMan::_RightHyphenMin = HYPH_DEFAULT_HYPHEN_MIN; int HyphMan::_TrustSoftHyphens = HYPH_DEFAULT_TRUST_SOFT_HYPHENS; +LVHashTable HyphMan::_loaded_hyph_methods(16); -HyphDictionary * HyphMan::_selectedDictionary = NULL; +// Obsolete: now fetched from TextLangMan main lang TextLangCfg +// HyphDictionary * HyphMan::_selectedDictionary = NULL; HyphDictionaryList * HyphMan::_dictList = NULL; @@ -68,21 +71,25 @@ class TexHyph : public HyphMethod { TexPattern * table[PATTERN_HASH_SIZE]; lUInt32 _hash; + lUInt32 _pattern_count; public: int largest_overflowed_word; bool match( const lChar16 * str, char * mask ); virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 
maxWidth, size_t flagSize ); void addPattern( TexPattern * pattern ); - TexHyph(); + TexHyph( lString16 id=HYPH_DICT_ID_DICTIONARY, int leftHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN, int rightHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN ); virtual ~TexHyph(); bool load( LVStreamRef stream ); bool load( lString16 fileName ); virtual lUInt32 getHash() { return _hash; } + virtual lUInt32 getCount() { return _pattern_count; } + virtual lUInt32 getSize(); }; class AlgoHyph : public HyphMethod { public: + AlgoHyph(): HyphMethod(HYPH_DICT_ID_ALGORITHM) {}; virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize ); virtual ~AlgoHyph(); }; @@ -90,6 +97,7 @@ class AlgoHyph : public HyphMethod class SoftHyphensHyph : public HyphMethod { public: + SoftHyphensHyph(): HyphMethod(HYPH_DICT_ID_SOFTHYPHENS) {}; virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize ); virtual ~SoftHyphensHyph(); }; @@ -97,6 +105,7 @@ class SoftHyphensHyph : public HyphMethod class NoHyph : public HyphMethod { public: + NoHyph(): HyphMethod(HYPH_DICT_ID_NONE) {}; virtual bool hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize ) { CR_UNUSED6(str, len, widths, flags, hyphCharWidth, maxWidth); @@ -109,7 +118,8 @@ static NoHyph NO_HYPH; static AlgoHyph ALGO_HYPH; static SoftHyphensHyph SOFTHYPHENS_HYPH; -HyphMethod * HyphMan::_method = &NO_HYPH; +// Obsolete: provided by TextLangMan main lang +// HyphMethod * HyphMan::_method = &NO_HYPH; #pragma pack(push, 1) typedef struct { @@ -132,26 +142,39 @@ typedef struct { void HyphMan::uninit() { - if ( _dictList ) - delete _dictList; + // Avoid existing frontend code to have to call it: + TextLangMan::uninit(); + // Clean up _loaded_hyph_methods + LVHashTable::iterator it = 
_loaded_hyph_methods.forwardIterator(); + LVHashTable::pair* pair; + while ((pair = it.next())) { + delete pair->value; + } + _loaded_hyph_methods.clear(); + if ( _dictList ) + delete _dictList; _dictList = NULL; + /* Obsolete: _selectedDictionary = NULL; if ( HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &NO_HYPH && HyphMan::_method != &SOFTHYPHENS_HYPH ) delete HyphMan::_method; _method = &NO_HYPH; + */ } bool HyphMan::activateDictionaryFromStream( LVStreamRef stream ) { if ( stream.isNull() ) return false; + /* Obsolete: CRLog::trace("remove old hyphenation method"); if ( HyphMan::_method != &NO_HYPH && HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &SOFTHYPHENS_HYPH && HyphMan::_method ) { delete HyphMan::_method; HyphMan::_method = &NO_HYPH; } + */ CRLog::trace("creating new TexHyph method"); - TexHyph * method = new TexHyph(); + TexHyph * method = new TexHyph(HYPH_DICT_ID_DICTIONARY); CRLog::trace("loading from file"); if ( !method->load( stream ) ) { CRLog::error("HyphMan::activateDictionaryFromStream: Cannot open hyphenation dictionary from stream" ); @@ -161,14 +184,28 @@ bool HyphMan::activateDictionaryFromStream( LVStreamRef stream ) if (method->largest_overflowed_word) printf("CRE WARNING: hyph dict from stream: some hyphenation patterns were too long and have been ignored: increase MAX_PATTERN_SIZE from %d to %d\n", MAX_PATTERN_SIZE, method->largest_overflowed_word); CRLog::debug("Dictionary is loaded successfully. 
Activating."); + + // Replace any dict previously loaded from stream + HyphMethod * prev_method; + if ( _loaded_hyph_methods.get(HYPH_DICT_ID_DICTIONARY, prev_method) ) { + delete prev_method; + _loaded_hyph_methods.remove(HYPH_DICT_ID_DICTIONARY); + } + _loaded_hyph_methods.set(HYPH_DICT_ID_DICTIONARY, method); + if (!_dictList) _dictList = new HyphDictionaryList(); + /* Obsolete: HyphMan::_method = method; + */ if ( HyphMan::_dictList->find(lString16(HYPH_DICT_ID_DICTIONARY))==NULL ) { HyphDictionary * dict = new HyphDictionary( HDT_DICT_ALAN, cs16("Dictionary"), lString16(HYPH_DICT_ID_DICTIONARY), lString16::empty_str ); HyphMan::_dictList->add(dict); + /* Obsolete: HyphMan::_selectedDictionary = dict; + */ } + TextLangMan::setMainLangFromHyphDict( HYPH_DICT_ID_DICTIONARY ); CRLog::trace("Activation is done"); return true; } @@ -211,11 +248,67 @@ bool HyphMan::setTrustSoftHyphens( int trust_soft_hyphens ) { } bool HyphMan::isEnabled() { + return TextLangMan::getHyphenationEnabled(); + /* Obsolete: return _selectedDictionary != NULL && _selectedDictionary->getId() != HYPH_DICT_ID_NONE; + */ +} + +bool HyphMan::hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize ) +{ + return TextLangMan::getMainLangHyphMethod()->hyphenate( str, len, widths, flags, hyphCharWidth, maxWidth, flagSize ); + /* Obsolete: + return _method->hyphenate( str, len, widths, flags, hyphCharWidth, maxWidth, flagSize ); + */ +} + +HyphDictionary * HyphMan::getSelectedDictionary() { + lString16 id = TextLangMan::getTextLangCfg()->getHyphMethod()->getId(); + HyphDictionary * dict = _dictList->find( id ); + return dict; +} + +HyphMethod * HyphMan::getHyphMethodForDictionary( lString16 id, int leftHyphenMin, int rightHyphenMin ) { + if ( id.empty() ) + return &NO_HYPH; + HyphDictionary * p = _dictList->find(id); + if ( !p || p->getType() == HDT_NONE ) + return &NO_HYPH; + if ( p->getType() == HDT_ALGORITHM ) + return 
&ALGO_HYPH; + if ( p->getType() == HDT_SOFTHYPHENS ) + return &SOFTHYPHENS_HYPH; + if ( p->getType() != HDT_DICT_ALAN && p->getType() != HDT_DICT_TEX ) + return &NO_HYPH; + HyphMethod * method; + if ( _loaded_hyph_methods.get(id, method) ) { + // printf("getHyphMethodForDictionary reusing cached %s\n", UnicodeToUtf8(p->getFilename()).c_str()); + return method; + } + lString16 filename = p->getFilename(); + LVStreamRef stream = LVOpenFileStream( filename.c_str(), LVOM_READ ); + if ( stream.isNull() ) { + CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(filename).c_str() ); + return &NO_HYPH; + } + TexHyph * newmethod = new TexHyph(id, leftHyphenMin, rightHyphenMin); + if ( !newmethod->load( stream ) ) { + CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(filename).c_str() ); + delete newmethod; + return &NO_HYPH; + } + // printf("CRE: loaded hyphenation dict %s\n", UnicodeToUtf8(id).c_str()); + if ( newmethod->largest_overflowed_word ) + printf("CRE WARNING: %s: some hyphenation patterns were too long and have been ignored: increase MAX_PATTERN_SIZE from %d to %d\n", UnicodeToUtf8(filename).c_str(), MAX_PATTERN_SIZE, newmethod->largest_overflowed_word); + _loaded_hyph_methods.set(id, newmethod); + return newmethod; } bool HyphDictionary::activate() { + TextLangMan::setMainLangFromHyphDict( getId() ); + return true; + /* Obsolete: if (HyphMan::_selectedDictionary == this) return true; // already active if ( getType() == HDT_ALGORITHM ) { @@ -262,6 +355,7 @@ bool HyphDictionary::activate() } HyphMan::_selectedDictionary = this; return true; + */ } bool HyphDictionaryList::activate( lString16 id ) @@ -604,10 +698,11 @@ class HyphPatternReader : public LVXMLParserCallback }; -TexHyph::TexHyph() +TexHyph::TexHyph(lString16 id, int leftHyphenMin, int rightHyphenMin) : HyphMethod(id, leftHyphenMin, rightHyphenMin) { memset( table, 0, sizeof(table) ); _hash = 123456; + _pattern_count = 0; largest_overflowed_word = 0; } @@ -631,6 +726,11 
@@ void TexHyph::addPattern( TexPattern * pattern ) p = &((*p)->next); pattern->next = *p; *p = pattern; + _pattern_count++; +} + +lUInt32 TexHyph::getSize() { + return _pattern_count * sizeof(TexPattern); } bool TexHyph::load( LVStreamRef stream ) @@ -684,6 +784,7 @@ bool TexHyph::load( LVStreamRef stream ) CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString16(pattern->word))); if (pattern->overflowed > largest_overflowed_word) largest_overflowed_word = pattern->overflowed; + delete pattern; } else { addPattern( pattern ); @@ -721,6 +822,7 @@ bool TexHyph::load( LVStreamRef stream ) CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString16(pattern->word))); if (pattern->overflowed > largest_overflowed_word) largest_overflowed_word = pattern->overflowed; + delete pattern; } else { addPattern( pattern ); @@ -753,6 +855,7 @@ bool TexHyph::load( LVStreamRef stream ) CRLog::warn("Pattern overflowed (%d > %d) and ignored: (%s) '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(data[i]), LCSTR(lString16(pattern->word))); if (pattern->overflowed > largest_overflowed_word) largest_overflowed_word = pattern->overflowed; + delete pattern; } else { addPattern( pattern ); @@ -885,6 +988,11 @@ bool TexHyph::hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 CRLog::trace("Hyphenate: %s %s", LCSTR(buf), LCSTR(buf2) ); #endif + // Use HyphMan global left/right hyphen min, unless set to 0 (the default) + // which means we should use the HyphMethod specific values. + int left_hyphen_min = HyphMan::_LeftHyphenMin ? HyphMan::_LeftHyphenMin : _left_hyphen_min; + int right_hyphen_min = HyphMan::_RightHyphenMin ? 
HyphMan::_RightHyphenMin : _right_hyphen_min; + // Moves allowed hyphenation positions from 'mask' to the provided 'flags', // taking soft-hyphen shifts into account int soft_hyphens_skipped = 0; @@ -895,9 +1003,9 @@ bool TexHyph::hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 soft_hyphens_skipped++; continue; } - if (p-soft_hyphens_skipped < HyphMan::_LeftHyphenMin - 1) + if (p-soft_hyphens_skipped < left_hyphen_min - 1) continue; - if (p > len - HyphMan::_RightHyphenMin - 1) + if (p > len - right_hyphen_min - 1) continue; // hyphenate //00010030100 @@ -926,6 +1034,12 @@ bool AlgoHyph::hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 if ( softhyphens_hyphenate(str, len, widths, flags, hyphCharWidth, maxWidth, flagSize) ) return true; } + + // Use HyphMan global left/right hyphen min, unless set to 0 (the default) + // which means we should use the HyphMethod specific values. + int left_hyphen_min = HyphMan::_LeftHyphenMin ? HyphMan::_LeftHyphenMin : _left_hyphen_min; + int right_hyphen_min = HyphMan::_RightHyphenMin ? 
HyphMan::_RightHyphenMin : _right_hyphen_min; + lUInt16 chprops[WORD_LENGTH]; if ( len > WORD_LENGTH-2 ) len = WORD_LENGTH - 2; @@ -942,9 +1056,9 @@ bool AlgoHyph::hyphenate( const lChar16 * str, int len, lUInt16 * widths, lUInt8 // now look over word, placing hyphens if ( end-start > MIN_WORD_LEN_TO_HYPHEN ) { // word must be long enough for (i=start;i maxWidth ) break; diff --git a/crengine/src/lvdocview.cpp b/crengine/src/lvdocview.cpp index e0df5e683..fb2f528b3 100755 --- a/crengine/src/lvdocview.cpp +++ b/crengine/src/lvdocview.cpp @@ -19,6 +19,7 @@ #include "../include/lvstyles.h" #include "../include/lvrend.h" #include "../include/lvstsheet.h" +#include "../include/textlang.h" #include "../include/wolutil.h" #include "../include/crtxtenc.h" @@ -1059,14 +1060,14 @@ void LVDocView::drawCoverTo(LVDrawBuf * drawBuf, lvRect & rc) { LFormattedText txform; if (!authors.empty()) txform.AddSourceLine(authors.c_str(), authors.length(), 0xFFFFFFFF, - 0xFFFFFFFF, author_fnt.get(), LTEXT_ALIGN_CENTER, + 0xFFFFFFFF, author_fnt.get(), NULL, LTEXT_ALIGN_CENTER, author_fnt->getHeight() * 18 / 16); txform.AddSourceLine(title.c_str(), title.length(), 0xFFFFFFFF, 0xFFFFFFFF, - title_fnt.get(), LTEXT_ALIGN_CENTER, + title_fnt.get(), NULL, LTEXT_ALIGN_CENTER, title_fnt->getHeight() * 18 / 16); if (!series.empty()) txform.AddSourceLine(series.c_str(), series.length(), 0xFFFFFFFF, - 0xFFFFFFFF, series_fnt.get(), LTEXT_ALIGN_CENTER, + 0xFFFFFFFF, series_fnt.get(), NULL, LTEXT_ALIGN_CENTER, series_fnt->getHeight() * 18 / 16); int title_w = rc.width() - rc.width() / 4; int h = txform.Format((lUInt16)title_w, (lUInt16)rc.height()); @@ -6380,6 +6381,36 @@ CRPropRef LVDocView::propsApply(CRPropRef props) { REQUEST_RENDER("propsApply hyphenation trust_soft_hyphens") } #endif + } else if (name == PROP_TEXTLANG_MAIN_LANG) { + lString16 lang = props->getStringDef(PROP_TEXTLANG_MAIN_LANG, TEXTLANG_DEFAULT_MAIN_LANG); + if ( lang != TextLangMan::getMainLang() ) { + TextLangMan::setMainLang( 
lang ); + REQUEST_RENDER("propsApply textlang main_lang") + } + } else if (name == PROP_TEXTLANG_EMBEDDED_LANGS_ENABLED) { + bool enabled = props->getIntDef(PROP_TEXTLANG_EMBEDDED_LANGS_ENABLED, TEXTLANG_DEFAULT_EMBEDDED_LANGS_ENABLED); + if ( enabled != TextLangMan::getEmbeddedLangsEnabled() ) { + TextLangMan::setEmbeddedLangsEnabled( enabled ); + REQUEST_RENDER("propsApply textlang embedded_langs_enabled") + } + } else if (name == PROP_TEXTLANG_HYPHENATION_ENABLED) { + bool enabled = props->getIntDef(PROP_TEXTLANG_HYPHENATION_ENABLED, TEXTLANG_DEFAULT_HYPHENATION_ENABLED); + if ( enabled != TextLangMan::getHyphenationEnabled() ) { + TextLangMan::setHyphenationEnabled( enabled ); + REQUEST_RENDER("propsApply textlang hyphenation_enabled") + } + } else if (name == PROP_TEXTLANG_HYPH_SOFT_HYPHENS_ONLY) { + bool enabled = props->getIntDef(PROP_TEXTLANG_HYPH_SOFT_HYPHENS_ONLY, TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY); + if ( enabled != TextLangMan::getHyphenationSoftHyphensOnly() ) { + TextLangMan::setHyphenationSoftHyphensOnly( enabled ); + REQUEST_RENDER("propsApply textlang hyphenation_soft_hyphens_only") + } + } else if (name == PROP_TEXTLANG_HYPH_FORCE_ALGORITHMIC) { + bool enabled = props->getIntDef(PROP_TEXTLANG_HYPH_FORCE_ALGORITHMIC, TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC); + if ( enabled != TextLangMan::getHyphenationForceAlgorithmic() ) { + TextLangMan::setHyphenationForceAlgorithmic( enabled ); + REQUEST_RENDER("propsApply textlang hyphenation_force_algorithmic") + } } else if (name == PROP_INTERLINE_SPACE) { int interlineSpace = props->getIntDef(PROP_INTERLINE_SPACE, cr_interline_spaces[0]); diff --git a/crengine/src/lvfntman.cpp b/crengine/src/lvfntman.cpp index 13ade230b..061cc5675 100644 --- a/crengine/src/lvfntman.cpp +++ b/crengine/src/lvfntman.cpp @@ -1797,6 +1797,7 @@ class LVFreeTypeFace : public LVFont lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg = NULL, int letter_spacing = 0, bool allow_hyphenation = true, lUInt32 
hints=0 @@ -1897,6 +1898,9 @@ class LVFreeTypeFace : public LVFont hb_flags |= HB_BUFFER_FLAG_EOT; hb_buffer_set_flags(_hb_buffer, (hb_buffer_flags_t)hb_flags); } + if ( lang_cfg ) { + hb_buffer_set_language(_hb_buffer, lang_cfg->getHBLanguage()); + } // Let HB guess what's not been set (script, direction, language) hb_buffer_guess_segment_properties(_hb_buffer); @@ -2020,7 +2024,7 @@ class LVFreeTypeFace : public LVFont fb_hints &= ~LFNT_HINT_ENDS_PARAGRAPH; fallback->measureText( text + t_notdef_start, t_notdef_end - t_notdef_start, widths + t_notdef_start, flags + t_notdef_start, - max_width, def_char, letter_spacing, allow_hyphenation, + max_width, def_char, lang_cfg, letter_spacing, allow_hyphenation, fb_hints ); // Fix previous bad measurements int last_good_width = t_notdef_start > 0 ? widths[t_notdef_start-1] : 0; @@ -2114,7 +2118,7 @@ class LVFreeTypeFace : public LVFont int chars_measured = fallback->measureText( text + t_notdef_start, // start t_notdef_end - t_notdef_start, // len widths + t_notdef_start, flags + t_notdef_start, - max_width, def_char, letter_spacing, allow_hyphenation, + max_width, def_char, lang_cfg, letter_spacing, allow_hyphenation, fb_hints ); lastFitChar = t_notdef_start + chars_measured; int last_good_width = t_notdef_start > 0 ? widths[t_notdef_start-1] : 0; @@ -2292,7 +2296,10 @@ class LVFreeTypeFace : public LVFont lStr_findWordBounds( text, len, lastFitChar-1, hwStart, hwEnd ); if ( hwStart < (int)(lastFitChar-1) && hwEnd > hwStart+3 ) { //int maxw = max_width - (hwStart>0 ? 
widths[hwStart-1] : 0); - HyphMan::hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, _hyphen_width, max_width); + if ( lang_cfg ) + lang_cfg->getHyphMethod()->hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, _hyphen_width, max_width); + else // Use global lang hyph method + HyphMan::hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, _hyphen_width, max_width); } } } @@ -2304,7 +2311,7 @@ class LVFreeTypeFace : public LVFont \param len is number of characters to measure \return width of specified string */ - virtual lUInt32 getTextWidth( const lChar16 * text, int len) { + virtual lUInt32 getTextWidth( const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL) { static lUInt16 widths[MAX_LINE_CHARS+1]; static lUInt8 flags[MAX_LINE_CHARS+1]; if ( len>MAX_LINE_CHARS ) @@ -2317,7 +2324,7 @@ class LVFreeTypeFace : public LVFont flags, MAX_LINE_WIDTH, L' ', // def_char - 0 + lang_cfg ); if ( res>0 && resgetHBLanguage()); + } // Let HB guess what's not been set (script, direction, language) hb_buffer_guess_segment_properties(_hb_buffer); @@ -2815,7 +2826,7 @@ class LVFreeTypeFace : public LVFont // text decoration, that we dropped: no update needed) int fb_advance = fallback->DrawTextString( buf, x, fb_y, fb_text, fb_len, - def_char, palette, fb_addHyphen, fb_flags, letter_spacing, + def_char, palette, fb_addHyphen, lang_cfg, fb_flags, letter_spacing, width, text_decoration_back_gap ); x += fb_advance; #ifdef DEBUG_DRAW_TEXT @@ -3142,6 +3153,7 @@ class LVFontBoldTransform : public LVFont lUInt8 * flags, int max_width, lChar16 def_char, + TextLangCfg * lang_cfg = NULL, int letter_spacing=0, bool allow_hyphenation=true, lUInt32 hints=0 @@ -3154,6 +3166,7 @@ class LVFontBoldTransform : public LVFont flags, max_width, def_char, + lang_cfg, letter_spacing, allow_hyphenation, hints @@ -3171,7 +3184,7 @@ class LVFontBoldTransform : public LVFont \param len is number of characters to measure \return width of specified 
string */ - virtual lUInt32 getTextWidth( const lChar16 * text, int len) { + virtual lUInt32 getTextWidth( const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL) { static lUInt16 widths[MAX_LINE_CHARS+1]; static lUInt8 flags[MAX_LINE_CHARS+1]; if ( len>MAX_LINE_CHARS ) @@ -3184,7 +3197,7 @@ class LVFontBoldTransform : public LVFont flags, MAX_LINE_WIDTH, L' ', // def_char - 0 + lang_cfg ); if ( res>0 && res0 && res0 && resgetHyphMethod()->hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, _hyphen_width, max_width); + else // Use global lang hyph method + HyphMan::hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, _hyphen_width, max_width); return nchars; } @@ -5718,6 +5739,7 @@ lUInt16 LVWin32DrawFont::measureText( int LVWin32DrawFont::DrawTextString( LVDrawBuf * buf, int x, int y, const lChar16 * text, int len, lChar16 def_char, lUInt32 * palette, bool addHyphen, + TextLangCfg * lang_cfg, lUInt32 flags, int letter_spacing, int width, int text_decoration_back_gap ) { @@ -5945,7 +5967,7 @@ bool LVWin32Font::getGlyphInfo( lUInt16 code, glyph_info_t * glyph, lChar16 def_ return true; } -lUInt32 LVWin32Font::getTextWidth( const lChar16 * text, int len ) +lUInt32 LVWin32Font::getTextWidth( const lChar16 * text, int len, TextLangCfg * lang_cfg=NULL ) { // static lUInt16 widths[MAX_LINE_CHARS+1]; @@ -5959,7 +5981,8 @@ lUInt32 LVWin32Font::getTextWidth( const lChar16 * text, int len ) widths, flags, MAX_LINE_WIDTH, - L' ' // def_char + L' ', // def_char + lang_cfg ); if ( res>0 && resgetHyphMethod()->hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, hyphwidth, max_width); + else // Use global lang hyph method + HyphMan::hyphenate(text+hwStart, hwEnd-hwStart, widths+hwStart, flags+hwStart, hyphwidth, max_width); return nchars; } diff --git a/crengine/src/lvrend.cpp b/crengine/src/lvrend.cpp index 8f00bf505..38c4c3994 100755 --- a/crengine/src/lvrend.cpp +++ b/crengine/src/lvrend.cpp @@ -1369,6 +1369,7 @@ class 
CCRTable { fmt.setInnerWidth( w - padding_left - padding_right ); RENDER_RECT_SET_FLAG(fmt, INNER_FIELDS_SET); RENDER_RECT_SET_DIRECTION(fmt, caption_direction); + fmt.setLangNodeIndex( TextLangMan::getLangNodeIndex(caption) ); } fmt.push(); caption_h = caption->renderFinalBlock( txform, &fmt, w - padding_left - padding_right ); @@ -1479,6 +1480,7 @@ class CCRTable { fmt.setInnerWidth( cell->width - padding_left - padding_right ); RENDER_RECT_SET_FLAG(fmt, INNER_FIELDS_SET); RENDER_RECT_SET_DIRECTION(fmt, cell->direction); + fmt.setLangNodeIndex( TextLangMan::getLangNodeIndex(cell->elem) ); } fmt.push(); int h = cell->elem->renderFinalBlock( txform, &fmt, cell->width - padding_left - padding_right); @@ -2368,7 +2370,8 @@ lString16 renderListItemMarker( ldomNode * enode, int & marker_width, LFormatted // (the "xviii" marker will be in its own LTR segment, and the followup text // in another LTR segment) if ( txform ) { - txform->AddSourceLine( marker.c_str(), marker.length(), cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, 0, 0); + TextLangCfg * lang_cfg = TextLangMan::getTextLangCfg( enode ); + txform->AddSourceLine( marker.c_str(), marker.length(), cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, 0, 0); } } return marker; @@ -2419,13 +2422,19 @@ bool renderAsListStylePositionInside( const css_style_rec_t * style, bool is_rtl // as is to the inline children elements: it is only used to get the width of // the container, which is only needed to compute indent (text-indent) values in %, // and to get paragraph direction (LTR/RTL/UNSET). 
-void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAccessor * fmt, int & baseflags, int indent, int line_h, int valign_dy, bool * is_link_start ) +void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAccessor * fmt, int & baseflags, int indent, int line_h, TextLangCfg * lang_cfg, int valign_dy, bool * is_link_start ) { if ( enode->isElement() ) { lvdom_element_render_method rm = enode->getRendMethod(); if ( rm == erm_invisible ) return; // don't draw invisible + if ( enode->hasAttribute( attr_lang ) ) { + lString16 lang_tag = enode->getAttributeValue( attr_lang ); + if ( !lang_tag.empty() ) + lang_cfg = TextLangMan::getTextLangCfg( lang_tag ); + } + if ( enode->isFloatingBox() && rm != erm_final ) { // (A floating floatBox can't be erm_final: it is always erm_block, // but let's just be sure of that.) @@ -2436,7 +2445,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce // be guessed and renderBlockElement() called to render it // and get is height, so LFormattedText knows how to render // this erm_final text around it. 
- txform->AddSourceObject(baseflags|LTEXT_SRC_IS_FLOAT, line_h, valign_dy, indent, enode ); + txform->AddSourceObject(baseflags|LTEXT_SRC_IS_FLOAT, line_h, valign_dy, indent, enode, lang_cfg ); baseflags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag return; } @@ -2825,7 +2834,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce if ( sp==css_lsp_outside ) margin = -marker_width; // will ensure negative/hanging indent-like rendering marker += "\t"; - txform->AddSourceLine( marker.c_str(), marker.length(), cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, + txform->AddSourceLine( marker.c_str(), marker.length(), cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, margin, NULL ); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; } @@ -2884,27 +2893,27 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce lString16Collection lines; lines.parse(title, cs16("\\n"), true); for ( int i=0; iAddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); + txform->AddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); } - txform->AddSourceObject(flags, line_h, valign_dy, indent, enode ); + txform->AddSourceObject(flags, line_h, valign_dy, indent, enode, lang_cfg ); title = enode->getAttributeValue(attr_subtitle); if ( !title.empty() ) { lString16Collection lines; lines.parse(title, cs16("\\n"), true); for ( int i=0; iAddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); + txform->AddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); } title = enode->getAttributeValue(attr_title); if ( !title.empty() ) { lString16Collection lines; lines.parse(title, cs16("\\n"), true); 
for ( int i=0; iAddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); + txform->AddSourceLine( lines[i].c_str(), lines[i].length(), cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); } } else { // inline image // We use the flags computed previously (and not baseflags) as they // carry vertical alignment - txform->AddSourceObject(flags, line_h, valign_dy, indent, enode ); + txform->AddSourceObject(flags, line_h, valign_dy, indent, enode, lang_cfg ); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } } @@ -2957,7 +2966,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce } // We use the flags computed previously (and not baseflags) as they // carry vertical alignment - txform->AddSourceObject(flags|LTEXT_SRC_IS_INLINE_BOX, line_h, valign_dy, indent, enode ); + txform->AddSourceObject(flags|LTEXT_SRC_IS_INLINE_BOX, line_h, valign_dy, indent, enode, lang_cfg ); if ( is_embedded_block ) { // Let flags unchanged, with their newline/alignment flag as if it // hadn't been consumed, so it is reported back into baseflags below @@ -2983,7 +2992,8 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce // Don't handle dir= for the erm_final (

hasAttribute( attr_dir ) && rm != erm_final; + bool hasDirAttribute = enode->hasAttribute( attr_dir ) && rm != erm_final + && rm != erm_table_caption && rm != erm_list_item; bool addGeneratedContent = hasDirAttribute || nodeElementId == el_bdi || nodeElementId == el_bdo || @@ -3009,7 +3019,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce // But if we use another char (0x00AB / 0x00BB), it gets mirrored correctly. // Might be that HarfBuzz first substitute it with arabic quotes (which happen // to look inverted), and then mirror that? - txform->AddSourceLine( L"\x201C", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x201C", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } // The following is needed for fribidi to do the right thing when the content creator @@ -3032,16 +3042,16 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce // leaving => PDF PDI // but it then doesn't have the intended effect (fribidi bug or limitation?) 
if ( dir.compare("rtl") == 0 ) { - // txform->AddSourceLine( L"\x2068\x202E", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + // txform->AddSourceLine( L"\x2068\x202E", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); // closeWithPDFPDI = true; - txform->AddSourceLine( L"\x202E", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x202E", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); closeWithPDF = true; flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } else if ( dir.compare("ltr") == 0 ) { - // txform->AddSourceLine( L"\x2068\x202D", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + // txform->AddSourceLine( L"\x2068\x202D", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); // closeWithPDFPDI = true; - txform->AddSourceLine( L"\x202D", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x202D", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); closeWithPDF = true; flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } @@ -3054,17 +3064,17 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce // dir=auto => FSI U+2068 FIRST STRONG ISOLATE // leaving => PDI U+2069 POP DIRECTIONAL ISOLATE if ( dir.compare("rtl") == 0 ) { - txform->AddSourceLine( L"\x2067", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x2067", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); closeWithPDI = true; flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } else if ( dir.compare("ltr") == 0 ) { - txform->AddSourceLine( L"\x2066", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x2066", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, 
valign_dy); closeWithPDI = true; flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } else if ( nodeElementId == el_bdi || dir.compare("auto") == 0 ) { - txform->AddSourceLine( L"\x2068", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x2068", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); closeWithPDI = true; flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } @@ -3097,7 +3107,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce for (int i=0; igetChildNode( i ); - renderFinalBlock( child, txform, fmt, flags, indent, line_h, valign_dy, is_link_start_p ); + renderFinalBlock( child, txform, fmt, flags, indent, line_h, lang_cfg, valign_dy, is_link_start_p ); } if ( addGeneratedContent ) { @@ -3106,20 +3116,20 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce lUInt32 bgcl = style->background_color.type!=css_val_color ? 
0xFFFFFFFF : style->background_color.value; if ( nodeElementId == el_q ) { // Add default quoting closing char - txform->AddSourceLine( L"\x201D", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x201D", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } // See comment above: these are the closing counterpart if ( closeWithPDI ) { - txform->AddSourceLine( L"\x2069", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x2069", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } else if ( closeWithPDFPDI ) { - txform->AddSourceLine( L"\x202C\x2069", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x202C\x2069", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } else if ( closeWithPDF ) { - txform->AddSourceLine( L"\x202C", 1, cl, bgcl, font, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L"\x202C", 1, cl, bgcl, font, lang_cfg, flags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); flags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag } } @@ -3134,7 +3144,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce lUInt32 cl = style->color.type!=css_val_color ? 0xFFFFFFFF : style->color.value; lUInt32 bgcl = style->background_color.type!=css_val_color ? 
0xFFFFFFFF : style->background_color.value; lChar16 delimiter[] = {UNICODE_NO_BREAK_SPACE, UNICODE_NO_BREAK_SPACE}; //160 - txform->AddSourceLine( delimiter, sizeof(delimiter)/sizeof(lChar16), cl, bgcl, font, LTEXT_FLAG_OWNTEXT | LTEXT_RUNIN_FLAG, line_h, valign_dy, 0, NULL ); + txform->AddSourceLine( delimiter, sizeof(delimiter)/sizeof(lChar16), cl, bgcl, font, lang_cfg, LTEXT_FLAG_OWNTEXT | LTEXT_RUNIN_FLAG, line_h, valign_dy, 0, NULL ); flags &= ~LTEXT_RUNIN_FLAG; } } @@ -3171,7 +3181,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce LVFont * font = enode->getFont().get(); lUInt32 cl = style->color.type!=css_val_color ? 0xFFFFFFFF : style->color.value; lUInt32 bgcl = style->background_color.type!=css_val_color ? 0xFFFFFFFF : style->background_color.value; - txform->AddSourceLine( L" ", 1, cl, bgcl, font, baseflags | LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L" ", 1, cl, bgcl, font, lang_cfg, baseflags | LTEXT_FLAG_OWNTEXT, line_h, valign_dy); // baseflags &= ~LTEXT_FLAG_NEWLINE; // clear newline flag // No need to clear the flag, as we set it just below // (any LTEXT_ALIGN_* set implies LTEXT_FLAG_NEWLINE) @@ -3230,7 +3240,7 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce LVFont * font = enode->getFont().get(); lUInt32 cl = style->color.type!=css_val_color ? 0xFFFFFFFF : style->color.value; lUInt32 bgcl = style->background_color.type!=css_val_color ? 
0xFFFFFFFF : style->background_color.value; - txform->AddSourceLine( L" ", 1, cl, bgcl, font, baseflags|LTEXT_SRC_IS_CLEAR_LAST|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); + txform->AddSourceLine( L" ", 1, cl, bgcl, font, lang_cfg, baseflags|LTEXT_SRC_IS_CLEAR_LAST|LTEXT_FLAG_OWNTEXT, line_h, valign_dy); } } else if ( enode->isText() ) { @@ -3325,9 +3335,13 @@ void renderFinalBlock( ldomNode * enode, LFormattedText * txform, RenderRectAcce } */ if ( txt.length()>0 ) { - txform->AddSourceLine( txt.c_str(), txt.length(), cl, bgcl, font, baseflags | tflags, + txform->AddSourceLine( txt.c_str(), txt.length(), cl, bgcl, font, lang_cfg, baseflags | tflags, line_h, valign_dy, indent, enode, 0, letter_spacing ); baseflags &= ~LTEXT_FLAG_NEWLINE & ~LTEXT_SRC_IS_CLEAR_BOTH; // clear newline flag + // To show the lang tag for the lang used for this text node AFTER it: + // lString16 lang_tag_txt = L"[" + (lang_cfg ? lang_cfg->getLangTag() : lString16("??")) + L"]"; + // txform->AddSourceLine( lang_tag_txt.c_str(), lang_tag_txt.length(), cl, bgcl, font, + // lang_cfg, baseflags|tflags|LTEXT_FLAG_OWNTEXT, line_h, valign_dy, 0, NULL ); } } } @@ -4095,6 +4109,7 @@ int renderBlockElementLegacy( LVRendPageContext & context, ldomNode * enode, int fmt.setWidth( width ); fmt.setX( fmt.getX() ); fmt.setY( fmt.getY() ); + fmt.setLangNodeIndex( 0 ); // No support for lang in legacy rendering fmt.push(); //if ( CRLog::isTraceEnabled() ) // CRLog::trace("rendering final node: %s %d %s", LCSTR(enode->getNodeName()), enode->getDataIndex(), LCSTR(ldomXPointer(enode,0).toString()) ); @@ -4323,16 +4338,16 @@ class FlowState { // an inner block (so, making a sub-level). 
class BlockShift { public: int direction; - ldomNode * lang_node; + lInt32 lang_node_idx; int x_min; int x_max; int l_y; int in_y_min; int in_y_max; bool avoid_pb_inside; - void reset(int dir, ldomNode * langnode, int xmin, int xmax, int ly, int iymin, int iymax, bool avoidpbinside) { + void reset(int dir, lInt32 langNodeIdx, int xmin, int xmax, int ly, int iymin, int iymax, bool avoidpbinside) { direction = dir; - lang_node = langnode; + lang_node_idx = langNodeIdx; x_min = xmin; x_max = xmax; l_y = ly; @@ -4340,9 +4355,9 @@ class FlowState { in_y_max = iymax; avoid_pb_inside = avoidpbinside; } - BlockShift(int dir, ldomNode * langnode, int xmin, int xmax, int ly, int iymin, int iymax, bool avoidpbinside) : + BlockShift(int dir, lInt32 langNodeIdx, int xmin, int xmax, int ly, int iymin, int iymax, bool avoidpbinside) : direction(dir), - lang_node(langnode), + lang_node_idx(langNodeIdx), x_min(xmin), x_max(xmax), l_y(ly), @@ -4367,12 +4382,10 @@ class FlowState { { } }; int direction; // flow inline direction (LTR/RTL) - ldomNode * lang_node; // nearest upper node with a lang="" attribute (NULL if none) + lInt32 lang_node_idx; // dataIndex of nearest upper node with a lang="" attribute (0 if none) // We don't need to know its value in here, the idx of this node // will be saved in the final block RenderRectAccessor so it can // be fetched from the node when needed, when laying out text). 
- // todo: currently not used, should be saved in RenderRectAccessor - // and used by lvtextfm.cpp for typography LVRendPageContext & context; LVPtrVector _shifts; LVPtrVector _floats; @@ -4409,9 +4422,9 @@ class FlowState { int vm_back_usable_as_margin; // previously moved vertical space where next margin could be accounted in public: - FlowState( LVRendPageContext & ctx, int width, int rendflags, int dir=REND_DIRECTION_UNSET, ldomNode * langnode=NULL ): + FlowState( LVRendPageContext & ctx, int width, int rendflags, int dir=REND_DIRECTION_UNSET, lInt32 langNodeIdx=0 ): direction(dir), - lang_node(langnode), + lang_node_idx(langNodeIdx), context(ctx), rend_flags(rendflags), level(0), @@ -4466,11 +4479,14 @@ class FlowState { } } + bool isMainFlow() { + return is_main_flow; + } int getDirection() { return direction; } - bool isMainFlow() { - return is_main_flow; + lInt32 getLangNodeIndex() { + return lang_node_idx; } int getOriginalContainerWidth() { return o_width; @@ -5224,18 +5240,18 @@ class FlowState { // Enter/leave a block level: backup/restore some of this FlowState // fields, and do some housekeeping. 
- void newBlockLevel( int width, int d_left, bool avoid_pb, int dir, ldomNode * langnode ) { + void newBlockLevel( int width, int d_left, bool avoid_pb, int dir, lInt32 langNodeIdx ) { // Don't new/delete to avoid too many malloc/free, keep and re-use/reset // the ones already created if ( _shifts.length() <= level ) { - _shifts.push( new BlockShift( direction, lang_node, x_min, x_max, l_y, in_y_min, in_y_max, avoid_pb_inside ) ); + _shifts.push( new BlockShift( direction, lang_node_idx, x_min, x_max, l_y, in_y_min, in_y_max, avoid_pb_inside ) ); } else { - _shifts[level]->reset( direction, lang_node, x_min, x_max, l_y, in_y_min, in_y_max, avoid_pb_inside ); + _shifts[level]->reset( direction, lang_node_idx, x_min, x_max, l_y, in_y_min, in_y_max, avoid_pb_inside ); } direction = dir; - if (langnode != NULL) - lang_node = langnode; + if (langNodeIdx != -1) + lang_node_idx = langNodeIdx; x_min += d_left; x_max = x_min + width; l_y = c_y; @@ -5256,7 +5272,7 @@ class FlowState { bottom_overflow = in_y_max > last_c_y ? in_y_max - last_c_y : 0; // positive value BlockShift * prev = _shifts[level-1]; direction = prev->direction; - lang_node = prev->lang_node; + lang_node_idx = prev->lang_node_idx; x_min = prev->x_min; x_max = prev->x_max; l_y = prev->l_y; @@ -5946,7 +5962,7 @@ void renderBlockElementEnhanced( FlowState * flow, ldomNode * enode, int x, int // See if lang= attribute bool has_lang_attribute = false; - if ( enode->hasAttribute( attr_lang ) ) { + if ( enode->hasAttribute( attr_lang ) && !enode->getAttributeValue( attr_lang ).empty() ) { // We'll probably have to check it is a valid lang specification // before overriding the upper one. // lString16 lang = enode->getAttributeValue( attr_lang ); @@ -6488,7 +6504,14 @@ void renderBlockElementEnhanced( FlowState * flow, ldomNode * enode, int x, int // Set direction for all blocks (needed for text in erm_final, but also for list item // markers in erm_block, so that DrawDocument can draw it on the right if rtl). 
RENDER_RECT_SET_DIRECTION(fmt, direction); - // todo: also set/store lang_node when we'll start implementing it + // Store lang node index if it's an erm_final like node (it's only needed for these, + // as the starting lang for renderFinalBlock()) + if ( m == erm_final || m == erm_table_caption || m == erm_list_item ) { + if ( has_lang_attribute ) + fmt.setLangNodeIndex( enode->getDataIndex() ); + else + fmt.setLangNodeIndex( flow->getLangNodeIndex() ); + } fmt.setX( x ); fmt.setY( flow->getCurrentRelativeY() ); fmt.setWidth( width ); @@ -6683,7 +6706,7 @@ void renderBlockElementEnhanced( FlowState * flow, ldomNode * enode, int x, int margin_left + (is_rtl ? 0 : list_marker_padding) + padding_left, // d_left break_inside==RN_SPLIT_AVOID, direction, - has_lang_attribute ? enode : NULL); + has_lang_attribute ? enode->getDataIndex() : -1); if (padding_top>0) { // This may push accumulated vertical margin @@ -7255,7 +7278,7 @@ int renderBlockElement( LVRendPageContext & context, ldomNode * enode, int x, in // (We are called when rendering the root node, and when rendering each float // met along walking the root node hierarchy - and when meeting a new float // in a float, etc...) 
- FlowState flow( context, width, rend_flags, direction ); + FlowState flow( context, width, rend_flags, direction, TextLangMan::getLangNodeIndex(enode) ); if (baseline != NULL) { flow.setRequestedBaselineType(*baseline); } @@ -8949,11 +8972,12 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct bool isStartNode = true; // we are starting measurement on that node // Start measurements and recursions: getRenderedWidths(node, maxWidth, minWidth, direction, ignoreMargin, rendFlags, - curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent, isStartNode); + curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent, NULL, isStartNode); } void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direction, bool ignoreMargin, int rendFlags, - int &curMaxWidth, int &curWordWidth, bool &collapseNextSpace, int &lastSpaceWidth, int indent, bool isStartNode) + int &curMaxWidth, int &curWordWidth, bool &collapseNextSpace, int &lastSpaceWidth, + int indent, TextLangCfg * lang_cfg, bool isStartNode) { // This does mostly what renderBlockElement, renderFinalBlock and lvtextfm.cpp // do, but only with widths and horizontal margin/border/padding and indent @@ -8970,6 +8994,15 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct if (m == erm_invisible) return; + if ( isStartNode ) { + lang_cfg = TextLangMan::getTextLangCfg( node ); // Fetch it from node or its parents + } + else if ( node->hasAttribute( attr_lang ) ) { + lString16 lang_tag = node->getAttributeValue( attr_lang ); + if ( !lang_tag.empty() ) + lang_cfg = TextLangMan::getTextLangCfg( lang_tag ); + } + if ( isStartNode && node->isBoxingInlineBox() ) { // The inlineBox is erm_inline, and we'll be measuring it below // as part of measuring other erm_inline in some erm_final. 
@@ -9100,7 +9133,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct // Nothing more to do with inline elements: they just carry some // styles that will be grabbed by children text nodes getRenderedWidths(child, maxWidth, minWidth, direction, false, rendFlags, - curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent); + curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent, lang_cfg); } return; } @@ -9216,7 +9249,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct for (int i = 0; i < node->getChildCount(); i++) { ldomNode * child = node->getChildNode(i); getRenderedWidths(child, _maxWidth, _minWidth, direction, false, rendFlags, - curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent); + curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent, lang_cfg); // A
can happen deep among our children, so we deal with that when erm_inline above } if (lastSpaceWidth) @@ -9245,7 +9278,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct int _minw = 0; ldomNode * child = node->getChildNode(i); getRenderedWidths(child, _maxw, _minw, direction, false, rendFlags, - curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent); + curMaxWidth, curWordWidth, collapseNextSpace, lastSpaceWidth, indent, lang_cfg); if (m == erm_table_row) { // For table rows, adding the min/max widths of each children // (the table cells), instead of taking the largest, gives @@ -9400,6 +9433,18 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct // getAdditionalCharWidthOnLeft(). // todo: use fribidi and split measurement at fribidi level change, // and beware left/right side bearing adjustments... + #if (USE_LIBUNIBREAK==1) + // If using libunibreak, we do similarly as in lvtextfm.cpp copyText(), + // except that we don't update previous char, but look ahead at next + // char to know about current break. + // Also, as we do all that only text node by text node, we may lose + // line breaking rules between contiguous text nodes (but it's a bit + // complicated to pass this lbCtx across calls...) + struct LineBreakContext lbCtx; + lb_init_break_context(&lbCtx, 0x0020, NULL); + lbCtx.lbpLang = lang_cfg->getLBProps(); + lb_process_next_char(&lbCtx, (utf32_t)(*txt)); + #endif while (true) { LVFont * font = node->getParentNode()->getFont().get(); int chars_measured = font->measureText( @@ -9408,9 +9453,82 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct widths, flags, 0x7FFF, // very wide width '?', // replacement char + lang_cfg, letter_spacing, false); // no hyphenation // todo: provide direction and hints + #if (USE_LIBUNIBREAK==1) + for (int i=0; i<chars_measured; i++) { + int w = widths[i] - (i>0 ? 
widths[i-1] : 0); + lChar16 c = *(txt + start + i); + lChar16 next_c = *(txt + start + i + 1); // might be 0 at end of string + if ( lang_cfg->hasLBCharSubFunc() ) { + next_c = lang_cfg->getLBCharSubFunc()(txt+start, i+1, len-1 - (i+1)); + } + int brk = lb_process_next_char(&lbCtx, (utf32_t)next_c); + // We don't need to bother with collapsing consecutive spaces, as + // we're dealing with a single text node, and the HTML parser has + // removed multiple consecutive spaces (except with PRE, that we + // already did not handle correctly when !USE_LIBUNIBREAK). + // printf("between <%c%c>: brk %d\n", c, next_c, brk); + if (brk == LINEBREAK_ALLOWBREAK) { + if (flags[i] & LCHAR_IS_SPACE) { // A space + if (collapseNextSpace) // ignore this space + continue; + collapseNextSpace = true; // ignore next spaces, even if in another node + lastSpaceWidth = w; + curMaxWidth += w; // add this space to non-wrap width + if (curWordWidth > 0) { // there was a word before this space + if (start+i > 0) { + // adjust for last word's last char overflow (italic, letter f...) 
+ lChar16 prevc = *(txt + start + i - 1); + int right_overflow = - font->getRightSideBearing(prevc, true, true); + curWordWidth += right_overflow; + } + } + if (curWordWidth > minWidth) // done with previous word + minWidth = curWordWidth; // longest word found + curWordWidth = 0; + } + else { // break after a non space: might be a CJK char (or other stuff) + collapseNextSpace = false; // next space should not be ignored + lastSpaceWidth = 0; // no width to take off if we stop with this char + curMaxWidth += w; + if (curWordWidth > 0) { // there was a word or CJK char before this CJK char + if (start+i > 0) { + // adjust for last word's last char or previous CJK char right overflow + lChar16 prevc = *(txt + start + i - 1); + int right_overflow = - font->getRightSideBearing(prevc, true, true); + curWordWidth += right_overflow; + } + } + if (curWordWidth > minWidth) // done with previous word + minWidth = curWordWidth; // longest word found + curWordWidth = w; + // adjust for leading overflow + int left_overflow = - font->getLeftSideBearing(c, false, true); + curWordWidth += left_overflow; + if (start + i == 0) // at start of text only? (not sure) + curMaxWidth += left_overflow; // also add it to max width + } + } + else { // break not allowed: this char is part of a word + collapseNextSpace = false; // next space should not be ignored + lastSpaceWidth = 0; // no width to take off if we stop with this char + if (curWordWidth == 0) { // first char of a word + // adjust for leading overflow on first char of a word + int left_overflow = - font->getLeftSideBearing(c, false, true); + curWordWidth += left_overflow; + if (start + i == 0) // at start of text only? (not sure) + curMaxWidth += left_overflow; // also add it to max width + } + curMaxWidth += w; + curWordWidth += w; + // libunibreak should handle properly '/' in urls (except may be + // if the url parts are made of numbers...) + } + } + #else // not USE_LIBUNIBREAK==1 for (int i=0; i<chars_measured; i++) { int w = widths[i] - (i>0 ? 
widths[i-1] : 0); lChar16 c = *(txt + start + i); @@ -9486,6 +9604,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct } } } + #endif // not USE_LIBUNIBREAK==1 if ( chars_measured == len ) { // done with this text node if (curWordWidth > 0) { // we end with a word if (start+len > 0) { diff --git a/crengine/src/lvtextfm.cpp b/crengine/src/lvtextfm.cpp index f1f8566ca..45e364250 100755 --- a/crengine/src/lvtextfm.cpp +++ b/crengine/src/lvtextfm.cpp @@ -24,6 +24,7 @@ #include "../include/lvimg.h" #include "../include/lvtinydom.h" #include "../include/lvrend.h" +#include "../include/textlang.h" #endif #if USE_HARFBUZZ==1 @@ -34,20 +35,6 @@ #include #endif -#if (USE_LIBUNIBREAK==1) -#include - // linebreakdef.h is not wrapped by this, unlike linebreak.h - // (not wrapping results in "undefined symbol" with the original - // function name kinda obfuscated) - #ifdef __cplusplus - extern "C" { - #endif -#include - #ifdef __cplusplus - } - #endif -#endif - #define SPACE_WIDTH_SCALE_PERCENT 100 #define MIN_SPACE_CONDENSING_PERCENT 50 @@ -212,6 +199,7 @@ void lvtextFreeFormatter( formatted_text_fragment_t * pbuffer ) void lvtextAddSourceLine( formatted_text_fragment_t * pbuffer, lvfont_handle font, /* handle of font to draw string */ + TextLangCfg * lang_cfg, const lChar16 * text, /* pointer to unicode text string */ lUInt32 len, /* number of chars in text, 0 for auto(strlen) */ lUInt32 color, /* color */ @@ -241,6 +229,9 @@ void lvtextAddSourceLine( formatted_text_fragment_t * pbuffer, // if (font == NULL && ((flags & LTEXT_WORD_IS_OBJECT) == 0)) { // CRLog::fatal("No font specified for text"); // } + if ( !lang_cfg ) + lang_cfg = TextLangMan::getTextLangCfg(); // use main_lang + pline->lang_cfg = lang_cfg; if (!len) for (len=0; text[len]; len++) ; if (flags & LTEXT_FLAG_OWNTEXT) { @@ -274,6 +265,7 @@ void lvtextAddSourceObject( lInt16 valign_dy, /* drift y from baseline */ lInt16 indent, /* first line indent (or all but first, when 
negative) */ void * object, /* pointer to custom object */ + TextLangCfg * lang_cfg, lInt16 letter_spacing ) { @@ -293,6 +285,9 @@ void lvtextAddSourceObject( pline->interval = interval; pline->valign_dy = valign_dy; pline->letter_spacing = letter_spacing; + if ( !lang_cfg ) + lang_cfg = TextLangMan::getTextLangCfg(); // use main_lang + pline->lang_cfg = lang_cfg; } @@ -312,6 +307,7 @@ void LFormattedText::AddSourceObject( lInt16 valign_dy, /* drift y from baseline */ lInt16 indent, /* first line indent (or all but first, when negative) */ void * object, /* pointer to custom object */ + TextLangCfg * lang_cfg, lInt16 letter_spacing ) { @@ -324,7 +320,7 @@ void LFormattedText::AddSourceObject( if (flags & LTEXT_SRC_IS_FLOAT) { // not an image but a float:'ing node // Nothing much to do with it at this point lvtextAddSourceObject(m_pbuffer, 0, 0, - flags, interval, valign_dy, indent, object, letter_spacing ); + flags, interval, valign_dy, indent, object, lang_cfg, letter_spacing ); // lvtextAddSourceObject will itself add to flags: | LTEXT_SRC_IS_OBJECT // (only flags & object parameter will be used, the others are not, // but they matter if this float is the first node in a paragraph, @@ -336,7 +332,7 @@ void LFormattedText::AddSourceObject( // get its width & neight, as they might be in % of our main width, that // we don't know yet (but only when ->Format() is called). 
lvtextAddSourceObject(m_pbuffer, 0, 0, - flags, interval, valign_dy, indent, object, letter_spacing ); + flags, interval, valign_dy, indent, object, lang_cfg, letter_spacing ); // lvtextAddSourceObject will itself add to flags: | LTEXT_SRC_IS_OBJECT return; } @@ -382,7 +378,7 @@ void LFormattedText::AddSourceObject( height = h; lvtextAddSourceObject(m_pbuffer, width, height, - flags, interval, valign_dy, indent, object, letter_spacing ); + flags, interval, valign_dy, indent, object, lang_cfg, letter_spacing ); } class LVFormatter { @@ -928,14 +924,13 @@ class LVFormatter { { #if (USE_LIBUNIBREAK==1) struct LineBreakContext lbCtx; - // libunibreak's lb_prop_French provides quite generic additional rules, - // similar to the ones hardcoded when not USE_LIBUNIBREAK. - // Let's init it before the first char, by adding a leading space which will - // be treated as WJ (non-breakable) and should not change behaviour with - // the real first char coming up. We then can just use lb_process_next_char() - // with the real text. - const char * lang = "fr"; - lb_init_break_context(&lbCtx, 0x0020, lang); + // Let's init it before the first char, by adding a leading space which + // will be treated as WJ (Word Joiner, non-breakable) and should not + // change the behaviour with the real first char coming up. We then + // can just use lb_process_next_char() with the real text. + // The lang lb_props will be plugged in from the TextLangCfg of the + // coming up text node. + lb_init_break_context(&lbCtx, 0x0020, NULL); #endif m_has_bidi = false; // will be set if fribidi detects it is bidirectionnal text @@ -1023,6 +1018,12 @@ class LVFormatter { pos++; } else { + #if (USE_LIBUNIBREAK==1) + // We hack into lbCtx private member and switch its lbpLang + // on-the-fly to the props for a possibly new language. 
+ lbCtx.lbpLang = src->lang_cfg->getLBProps(); + #endif + int len = src->t.len; lStr_ncpy( m_text+pos, src->t.text, len ); if ( i==0 || (src->flags & LTEXT_FLAG_NEWLINE) ) @@ -1164,6 +1165,11 @@ class LVFormatter { #if (USE_LIBUNIBREAK==1) lChar16 ch = m_text[pos]; + if ( src->lang_cfg->hasLBCharSubFunc() ) { + // Lang specific function may want to substitute char (for + // libunibreak only) to tweak line breaking around it + ch = src->lang_cfg->getLBCharSubFunc()(m_text, pos, len-1 - k); + } int brk = lb_process_next_char(&lbCtx, (utf32_t)ch); // printf("between <%c%c>: brk %d\n", m_text[pos-1], m_text[pos], brk); if (brk != LINEBREAK_ALLOWBREAK) { @@ -1194,9 +1200,9 @@ class LVFormatter { // Given the algorithm described in addLine(), we want the break // after the first space, so the following collapsed spaces can // be at start of next line where they will be ignored. - // (Not certain this is really needed, but let's do it as the - // code expecting that has been quite well tested and fixed other - // the months, so don't add uncertainty.) + // (Not certain this is really needed, but let's do it, as the + // code expecting that has been quite well tested and fixed over + // the months, so let's avoid adding uncertainty.) if ( m_flags[pos-1] & LCHAR_IS_COLLAPSED_SPACE ) { // We have spaces before, and if we are allowed to break, // the break is allowed on all preceeding spaces. @@ -1459,6 +1465,7 @@ class LVFormatter { widths, flags, 0x7FFF, '?', + srcline->lang_cfg, srcline->letter_spacing, false, hints ); @@ -1605,6 +1612,7 @@ class LVFormatter { widths, flags, 0x7FFF, //pbuffer->width, '?', + lastSrc->lang_cfg, lastLetterSpacing, false, hints @@ -1668,9 +1676,10 @@ class LVFormatter { } m_widths[start + k] = lastWidth + widths[k]; #if (USE_LIBUNIBREAK==1) - // Reset this flag if lastFont->measureText() has set it, as we trust - // only libunibreak. 
- flags[k] &= ~LCHAR_ALLOW_WRAP_AFTER; + // Reset these flags if lastFont->measureText() has set them, as we trust + // only libunibreak (which is more clever with hyphens, that our code flag + // with LCHAR_DEPRECATED_WRAP_AFTER). + flags[k] &= ~(LCHAR_ALLOW_WRAP_AFTER|LCHAR_DEPRECATED_WRAP_AFTER); #endif m_flags[start + k] |= flags[k]; // printf(" => w=%d\n", m_widths[start + k]); @@ -3246,7 +3255,7 @@ class LVFormatter { // but it should be a candidate for lastNormalWrap (otherwise, the // previous word will be hyphenated and we will get spaces widen for // text justification) - if ( (flags & LCHAR_ALLOW_WRAP_AFTER) && !(flags & LCHAR_IS_OBJECT) ) // don't break yet + if ( (flags & LCHAR_IS_SPACE) && (flags & LCHAR_ALLOW_WRAP_AFTER) ) // don't break yet grabbedExceedingSpace = true; else break; @@ -3258,6 +3267,7 @@ class LVFormatter { // but this does not look right, as any other unicode char would allow wrap. // #if (USE_LIBUNIBREAK==1) + // Note: with libunibreak, we can't assume anymore that LCHAR_ALLOW_WRAP_AFTER is synonym to IS_SPACE. if (flags & LCHAR_ALLOW_WRAP_AFTER) { lastNormalWrap = i; } @@ -3309,7 +3319,7 @@ class LVFormatter { else if ( flags & LCHAR_DEPRECATED_WRAP_AFTER ) // Hyphens make a less priority wrap lastDeprecatedWrap = i; else if ( flags & LCHAR_ALLOW_HYPH_WRAP_AFTER ) // can't happen at this point as we haven't - lastHyphWrap = i; // gone thru HyphMan::hyphenate() + lastHyphWrap = i; // gone thru hyphenate() if ( !grabbedExceedingSpace && m_pbuffer->min_space_condensing_percent != 100 && i < m_length-1 && @@ -3403,7 +3413,7 @@ class LVFormatter { // We have a valid word to look for hyphenation if ( len > MAX_WORD_SIZE ) // hyphenate() stops/truncates at 64 chars len = MAX_WORD_SIZE; - // HyphMan::hyphenate(), which is used by some other parts of the code, + // ->hyphenate(), which is used by some other parts of the code, // expects a lUInt8 array. 
We added flagSize=1|2 so it can set the correct // flags on our upgraded (from lUInt8 to lUInt16) m_flags. lUInt8 * flags = (lUInt8*) (m_flags + wstart); @@ -3426,7 +3436,8 @@ class LVFormatter { break; } } - if ( HyphMan::hyphenate(m_text+wstart, len, widths, flags, _hyphen_width, max_width, 2) ) { + // Use the hyph method of the source node that contains wordpos + if ( m_srcs[wordpos]->lang_cfg->getHyphMethod()->hyphenate(m_text+wstart, len, widths, flags, _hyphen_width, max_width, 2) ) { // We need to reset the flag for the multiple hyphenation // opportunities we will not be using (or they could cause // spurious spaces, as a word here may be multiple words @@ -3566,6 +3577,16 @@ class LVFormatter { endp = m_length; addLine(pos, endp, x + firstCharMargin, para, interval, pos==0, wrapPos>=m_length-1, preFormattedOnly, needReduceSpace, isLastPara); pos = wrapPos + 1; + #if (USE_LIBUNIBREAK==1) + // (Only when using libunibreak, which we trust decisions to wrap on hyphens.) + if ( m_srcs[wrapPos]->lang_cfg->duplicateRealHyphenOnNextLine() && pos > 0 && pos < m_length-1 ) { + if ( m_text[wrapPos] == '-' || m_text[wrapPos] == UNICODE_HYPHEN ) { + pos--; // Have that last hyphen also at the start of next line + // (small caveat: the duplicated hyphen at start of next + // line won't be part of the highlighted text) + } + } + #endif } } @@ -4249,6 +4270,7 @@ void LFormattedText::Draw( LVDrawBuf * buf, int x, int y, ldomMarkedRangeList * '?', NULL, flgHyphen, + srcline->lang_cfg, drawFlags, srcline->letter_spacing, word->width, diff --git a/crengine/src/lvtinydom.cpp b/crengine/src/lvtinydom.cpp index 08cdc553c..c1db25355 100644 --- a/crengine/src/lvtinydom.cpp +++ b/crengine/src/lvtinydom.cpp @@ -387,7 +387,7 @@ lUInt32 calcGlobalSettingsHash(int documentId) hash = hash * 75 + 2384761; if ( gFlgFloatingPunctuationEnabled ) hash = hash * 75 + 1761; - hash = hash * 31 + (HyphMan::getSelectedDictionary()!=NULL ? 
HyphMan::getSelectedDictionary()->getHash() : 123 ); + hash = hash * 31 + TextLangMan::getHash(); hash = hash * 31 + HyphMan::getLeftHyphenMin(); hash = hash * 31 + HyphMan::getRightHyphenMin(); hash = hash * 31 + HyphMan::getTrustSoftHyphens(); @@ -1700,6 +1700,31 @@ void RenderRectAccessor::setListPropNodeIndex( int idx ) _modified = true; } } +int RenderRectAccessor::getLangNodeIndex() +{ + if ( _dirty ) { + _dirty = false; + _node->getRenderData(*this); +#ifdef DEBUG_RENDER_RECT_ACCESS + rr_lock( _node ); +#endif + } + return _lang_node_idx; +} +void RenderRectAccessor::setLangNodeIndex( int idx ) +{ + if ( _dirty ) { + _dirty = false; + _node->getRenderData(*this); +#ifdef DEBUG_RENDER_RECT_ACCESS + rr_lock( _node ); +#endif + } + if ( _lang_node_idx != idx ) { + _lang_node_idx = idx; + _modified = true; + } +} unsigned short RenderRectAccessor::getFlags() { if ( _dirty ) { @@ -3797,8 +3822,10 @@ static void writeNodeEx( LVStream * stream, ldomNode * node, lString16Collection // We have a valid word to look for hyphenation if ( len > HYPH_MAX_WORD_SIZE ) // hyphenate() stops/truncates at 64 chars len = HYPH_MAX_WORD_SIZE; - // Have HyphMan set flags inside 'flags' - HyphMan::hyphenate(text16+start, len, widths, flags+start, 0, 0xFFFF, 1); + // Have hyphenate() set flags inside 'flags' + // (Fetching the lang_cfg for each text node is not really cheap, but + // it's easier than having to pass it to each writeNodeEx()) + TextLangMan::getTextLangCfg(node)->getHyphMethod()->hyphenate(text16+start, len, widths, flags+start, 0, 0xFFFF, 1); // Continue with previous word wordpos = start - 1; } @@ -7729,7 +7756,7 @@ ldomXPointer ldomDocument::createXPointer( lvPoint pt, int direction, bool stric lUInt32 hints = WORD_FLAGS_TO_FNT_FLAGS(word->flags); font->measureText( str.c_str()+word->t.start, word->t.len, width, flg, - word->width+50, '?', src->letter_spacing, false, hints); + word->width+50, '?', src->lang_cfg, src->letter_spacing, false, hints); bool word_is_rtl = 
word->flags & LTEXT_WORD_DIRECTION_IS_RTL; if ( word_is_rtl ) { @@ -8081,6 +8108,7 @@ bool ldomXPointer::getRect(lvRect & rect, bool extended, bool adjusted) const flg, word->width+50, '?', + txtform->GetSrcInfo(srcIndex)->lang_cfg, txtform->GetSrcInfo(srcIndex)->letter_spacing, false, hints); @@ -8249,6 +8277,7 @@ bool ldomXPointer::getRect(lvRect & rect, bool extended, bool adjusted) const flg, word->width+50, '?', + txtform->GetSrcInfo(srcIndex)->lang_cfg, txtform->GetSrcInfo(srcIndex)->letter_spacing, false, hints ); @@ -13010,7 +13039,6 @@ lUInt32 tinyNodeCollection::calcStyleHash() { CRLog::debug("calcStyleHash start"); // int maxlog = 20; - int count = ((_elemCount+TNC_PART_LEN-1) >> TNC_PART_SHIFT); lUInt32 res = 0; //_elemCount; lUInt32 globalHash = calcGlobalSettingsHash(getFontContextDocIndex()); lUInt32 docFlags = getDocFlags(); @@ -13030,6 +13058,7 @@ lUInt32 tinyNodeCollection::calcStyleHash() // we should invalidate the cache so a new correct DOM is build on load. _nodeDisplayStyleHash = 0; + int count = ((_elemCount+TNC_PART_LEN-1) >> TNC_PART_SHIFT); for ( int i=0; igetTextWidth((marker + " ").c_str(), marker.length()+2) + font->getSize()/8; + TextLangCfg * lang_cfg = TextLangMan::getTextLangCfg( this ); + markerWidth = font->getTextWidth((marker + " ").c_str(), marker.length()+2, lang_cfg) + font->getSize()/8; res = true; } else { marker.clear(); @@ -16371,7 +16401,9 @@ int ldomNode::renderFinalBlock( LFormattedTextRef & frmtext, RenderRectAccessor /// render whole node content as single formatted object int direction = RENDER_RECT_PTR_GET_DIRECTION(fmt); int flags = styleToTextFmtFlags( getStyle(), 0, direction ); - ::renderFinalBlock( this, f.get(), fmt, flags, 0, -1 ); + int lang_node_idx = fmt->getLangNodeIndex(); + TextLangCfg * lang_cfg = TextLangMan::getTextLangCfg(lang_node_idx>0 ? 
getDocument()->getTinyNode(lang_node_idx) : NULL); + ::renderFinalBlock( this, f.get(), fmt, flags, 0, -1, lang_cfg ); cache.set( this, f ); bool flg=gFlgFloatingPunctuationEnabled; if (this->getNodeName()=="th"||this->getNodeName()=="td"|| diff --git a/crengine/src/textlang.cpp b/crengine/src/textlang.cpp new file mode 100644 index 000000000..e3e17e12a --- /dev/null +++ b/crengine/src/textlang.cpp @@ -0,0 +1,467 @@ +// IMPORTANT : when making changes in language detection logic and per-language +// rules here, be sure to also bump FORMATTING_VERSION_ID in src/lvtinydom.cpp + +#include "../include/lvtypes.h" +#include "../include/lvstring.h" +#include "../include/lvtinydom.h" +#include "../include/fb2def.h" +#include "../include/textlang.h" +#include "../include/hyphman.h" + +// Uncomment to see which lang_tags are seen and lang_cfg created +// #define DEBUG_LANG_USAGE + +// Some macros to expand: LANG_STARTS_WITH(("fr") ("es")) (no comma!) +// to: lang_tag.startsWith("fr") || lang_tag.startsWith("es") || false +// (from https://stackoverflow.com/questions/19680962/translate-sequence-in-macro-parameters-to-separate-macros ) +#define PRIMITIVE_SEQ_ITERATE(...) __VA_ARGS__ ## _END +#define SEQ_ITERATE(...) PRIMITIVE_SEQ_ITERATE(__VA_ARGS__) +#define LANG_STARTS_WITH(seq) SEQ_ITERATE(LANG_STARTS_WITH_EACH_1 seq) +#define LANG_STARTS_WITH_EACH_1(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_2 +#define LANG_STARTS_WITH_EACH_2(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_1 +#define LANG_STARTS_WITH_EACH_1_END false +#define LANG_STARTS_WITH_EACH_2_END false + +// (hyph_filename_prefix added because CoolReader may still have both +// current "Italian.pattern" and old "Italian_hyphen_(Alan).pdb".) 
+static struct { + const char * lang_tag; + const char * hyph_filename_prefix; + const char * hyph_filename; + int left_hyphen_min; + int right_hyphen_min; +} _hyph_dict_table[] = { + { "bg", "Bulgarian", "Bulgarian.pattern", 2, 2 }, + { "ca", "Catalan", "Catalan.pattern", 2, 2 }, + { "cs", "Czech", "Czech.pattern", 2, 2 }, + { "da", "Danish", "Danish.pattern", 2, 2 }, + { "nl", "Dutch", "Dutch.pattern", 2, 2 }, + { "en-GB", "English_GB", "English_GB.pattern", 2, 2 }, + { "en", "English_US", "English_US.pattern", 2, 2 }, + { "fi", "Finnish", "Finnish.pattern", 2, 2 }, + { "fr", "French", "French.pattern", 2, 1 }, + { "gl", "Galician", "Galician.pattern", 2, 2 }, + { "de", "German", "German.pattern", 2, 2 }, + { "el", "Greek", "Greek.pattern", 2, 2 }, + { "hu", "Hungarian", "Hungarian.pattern", 2, 2 }, + { "is", "Icelandic", "Icelandic.pattern", 2, 2 }, + { "ga", "Irish", "Irish.pattern", 2, 2 }, + { "it", "Italian", "Italian.pattern", 2, 2 }, + { "no", "Norwegian", "Norwegian.pattern", 2, 2 }, + { "pl", "Polish", "Polish.pattern", 2, 2 }, + { "pt", "Portuguese", "Portuguese.pattern", 2, 2 }, + { "ro", "Roman", "Roman.pattern", 2, 2 }, + { "ru-GB", "Russian_EnGB", "Russian_EnGB.pattern", 2, 2 }, + { "ru-US", "Russian_EnUS", "Russian_EnUS.pattern", 2, 2 }, + { "ru", "Russian", "Russian.pattern", 2, 2 }, + { "sk", "Slovak", "Slovak.pattern", 2, 2 }, + { "sl", "Slovenian", "Slovenian.pattern", 2, 2 }, + { "es", "Spanish", "Spanish.pattern", 2, 2 }, + { "sv", "Swedish", "Swedish.pattern", 2, 2 }, + { "tr", "Turkish", "Turkish.pattern", 2, 2 }, + { "uk", "Ukrain", "Ukrain.pattern", 2, 2 }, + // No-lang hyph methods, for legacy HyphMan methods: other lang properties will be from English + { "en#@none", "@none", "@none", 2, 2 }, + { "en#@softhyphens", "@softhyphens", "@softhyphens", 2, 2 }, + { "en#@algorithm", "@algorithm", "@algorithm", 2, 2 }, + { "en#@dictionary", "@dictionary", "@dictionary", 2, 2 }, // single instance of a dict created from + // stream (by CoolReader 
on Android) + { NULL, NULL, NULL, 0, 0 } +}; + +// Init global TextLangMan members +lString16 TextLangMan::_main_lang = TEXTLANG_DEFAULT_MAIN_LANG_16; +bool TextLangMan::_embedded_langs_enabled = TEXTLANG_DEFAULT_EMBEDDED_LANGS_ENABLED; +LVPtrVector<TextLangCfg> TextLangMan::_lang_cfg_list; + +bool TextLangMan::_hyphenation_enabled = TEXTLANG_DEFAULT_HYPHENATION_ENABLED; +bool TextLangMan::_hyphenation_soft_hyphens_only = TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY; +bool TextLangMan::_hyphenation_force_algorithmic = TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC; +bool TextLangMan::_overridden_hyph_method = !TEXTLANG_DEFAULT_HYPHENATION_ENABLED + || TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY + || TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC ; +// These will be set when we can +HyphMethod * TextLangMan::_no_hyph_method = NULL; +HyphMethod * TextLangMan::_algo_hyph_method = NULL; +HyphMethod * TextLangMan::_soft_hyphens_method = NULL; + +TextLangMan::TextLangMan() { +} + +TextLangMan::~TextLangMan() { +} + +lUInt32 TextLangMan::getHash() { + lUInt32 hash = _main_lang.getHash(); + hash = hash << 4; + hash = hash + (_embedded_langs_enabled << 3); + hash = hash + (_hyphenation_soft_hyphens_only << 2); + hash = hash + (_hyphenation_force_algorithmic << 1); + hash = hash + _hyphenation_enabled; + // printf("TextLangMan::getHash %x\n", hash); + return hash; +} + +// No need to explicitly call this in frontend code. +// Calling HyphMan::uninit() will have this one called. +void TextLangMan::uninit() { + _lang_cfg_list.clear(); +} + +// For HyphMan legacy methods +void TextLangMan::setMainLangFromHyphDict( lString16 id ) { + // When setting up TextLangMan thru HyphMan legacy methods, + // disable embedded langs, for a consistent hyphenation. 
+ TextLangMan::setEmbeddedLangsEnabled( false ); + // Update flags if asked for @none, @softhyphens or @algorithm + TextLangMan::setHyphenationEnabled( id != HYPH_DICT_ID_NONE ); + TextLangMan::setHyphenationSoftHyphensOnly( id == HYPH_DICT_ID_SOFTHYPHENS ); + TextLangMan::setHyphenationForceAlgorithmic( id == HYPH_DICT_ID_ALGORITHM ); + + for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) { + if ( id.startsWith( _hyph_dict_table[i].hyph_filename_prefix ) ) { + TextLangMan::setMainLang( lString16(_hyph_dict_table[i].lang_tag) ); + #ifdef DEBUG_LANG_USAGE + printf("TextLangMan::setMainLangFromHyphDict %s => %s\n", + UnicodeToLocal(id).c_str(), UnicodeToLocal(TextLangMan::getMainLang()).c_str()); + #endif + return; + } + } + printf("CRE WARNING: lang not found for hyphenation dict: %s\n", UnicodeToLocal(id).c_str()); +} + +// Used only by TextLangCfg +HyphMethod * TextLangMan::getHyphMethodForLang( lString16 lang_tag ) { + // Look for full lang_tag + for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) { + if ( lang_tag == lString16(_hyph_dict_table[i].lang_tag).lowercase() ) { + return HyphMan::getHyphMethodForDictionary( lString16(_hyph_dict_table[i].hyph_filename), + _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min); + } + } + // Look for lang_tag initial subpart + int m_pos = lang_tag.pos("-"); + if ( m_pos > 0 ) { + lString16 lang_tag2 = lang_tag.substr(0, m_pos); + for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) { + if ( lang_tag2 == lString16(_hyph_dict_table[i].lang_tag).lowercase() ) { + return HyphMan::getHyphMethodForDictionary( lString16(_hyph_dict_table[i].hyph_filename), + _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min); + } + } + } + // Fallback to English_US, as other languages are more likely to get mixed + // with english text (it feels better than using @algorithm) + return HyphMan::getHyphMethodForDictionary(TEXTLANG_FALLBACK_HYPH_DICT_ID); + +} + +// Return the (single and cached) 
TextLangCfg for the provided lang_tag +TextLangCfg * TextLangMan::getTextLangCfg( lString16 lang_tag ) { + if ( !_embedded_langs_enabled ) { + // Drop provided lang_tag: always return main lang TextLangCfg + lang_tag = _main_lang; + } + // Not sure if we can lowercase lang_tag and avoid duplicate (Harfbuzz might + // need the proper lang tag with some parts starting with some uppercase letter) + for ( int i=0; i<_lang_cfg_list.length(); i++ ) { + if ( _lang_cfg_list[i]->_lang_tag == lang_tag ) { + // printf("TextLangCfg %s reused\n", UnicodeToLocal(lang_tag).c_str()); + // There should rarely be more than 3 lang in a document, so move + // any requested far down in the list at top to shorten next loops. + if ( i > 2 ) { + _lang_cfg_list.move(0, i); + return _lang_cfg_list[0]; + } + return _lang_cfg_list[i]; + } + } + // Not found in cache: create it + TextLangCfg * lang_cfg = new TextLangCfg( lang_tag ); + _lang_cfg_list.add( lang_cfg ); // and cache it + return lang_cfg; +} + +TextLangCfg * TextLangMan::getTextLangCfg() { + // No lang_tag specified: return main lang one + return TextLangMan::getTextLangCfg( _main_lang ); +} + +TextLangCfg * TextLangMan::getTextLangCfg( ldomNode * node ) { + if ( !_embedded_langs_enabled || !node ) { + // No need to look at nodes: return main lang one + return TextLangMan::getTextLangCfg( _main_lang ); + } + if ( node->isText() ) + node = node->getParentNode(); + // We are usually called from renderFinalBlock() with a node that + // we know has a lang= attribute. + // But we may be called in other contexts (e.g. writeNodeEx) with + // any node: so, look at this node parents for that lang= attribute. 
+ for ( ; !node->isRoot(); node = node->getParentNode() ) { + if ( node->hasAttribute( attr_lang ) ) { + lString16 lang_tag = node->getAttributeValue( attr_lang ); + if ( !lang_tag.empty() ) + return TextLangMan::getTextLangCfg( lang_tag ); + } + } + // No parent with lang= attribute: return main lang one + return TextLangMan::getTextLangCfg( _main_lang ); +} + +int TextLangMan::getLangNodeIndex( ldomNode * node ) { + if ( !_embedded_langs_enabled || !node ) { + // No need to look up if !_embedded_langs_enabled + return 0; + } + if ( node->isText() ) + node = node->getParentNode(); + for ( ; !node->isRoot(); node = node->getParentNode() ) { + if ( node->hasAttribute( attr_lang ) ) { + if ( !node->getAttributeValue( attr_lang ).empty() ) { + return node->getDataIndex(); + } + } + } + return 0; +} + +// For HyphMan::hyphenate() +HyphMethod * TextLangMan::getMainLangHyphMethod() { + return getTextLangCfg()->getHyphMethod(); +} + + +// TextLangCfg object: per language holder of language specificities + +#if USE_LIBUNIBREAK==1 +lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable) { + // https://github.com/koreader/koreader/issues/5645#issuecomment-559193057 + // Letters aiouwzAIOUWZ are prepositions that should not be left at the + // end of a line. + // Make them behave (for libunibreak) just like an opening paren (which + // being LBC_OP, will prevent a line break after it, even if followed + // by a space). 
+ if ( pos >= 1 && text[pos-1] == ' ' ) { + switch ( text[pos] ) { + case 'A': + case 'I': + case 'O': + case 'U': + case 'W': + case 'Z': // Meaning in English: + case 'a': // and + case 'i': // and + case 'o': // about + case 'u': // at + case 'w': // in + case 'z': // with + return '('; + break; + default: + break; + } + } + return text[pos]; +} + +lChar16 lb_char_sub_func_czech_slovak(const lChar16 * text, int pos, int next_usable) { + // Same for Czech and Slovak : AIiVvOoUuSsZzKk + // https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line + // https://github.com/michal-h21/luavlna + if ( pos >= 1 && text[pos-1] == ' ' ) { + switch ( text[pos] ) { + case 'A': + case 'I': + case 'K': + case 'O': + case 'S': + case 'U': + case 'V': + case 'Z': + case 'i': + case 'k': + case 'o': + case 's': + case 'u': + case 'v': + case 'z': + return '('; + break; + default: + break; + } + } + return text[pos]; +} +#endif + +TextLangCfg::~TextLangCfg() { +} + +// Instantiate a new TextLangCfg with properties adequate to the provided lang_tag +TextLangCfg::TextLangCfg( lString16 lang_tag ) { + if ( TextLangMan::_no_hyph_method == NULL ) { + // We need to init static TextLangMan::_no_hyph_method and friends after + // HyphMan is set up. Do that here, even if unrelated, as TextLangCfg + // creation is called less often than any other method around here. + TextLangMan::_no_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_NONE); + TextLangMan::_soft_hyphens_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_SOFTHYPHENS); + TextLangMan::_algo_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_ALGORITHM); + } + + // Keep as our id the provided and non-lowercase'd lang_tag (with possibly bogus #@algorithm) + _lang_tag = lang_tag; + // Harfbuzz may know more than us about exotic/complex lang tags, + // so let it deal with the provided one as-is. 
+ lString16 hb_lang_tag = lang_tag; + // Lowercase it for our tests + lang_tag.lowercase(); // (used by LANG_STARTS_WITH() macros) + + // Get hyph method/dictionary from _hyph_dict_table + _hyph_method = TextLangMan::getHyphMethodForLang(lang_tag); + + // Cleanup if we got "en#@something" from legacy HyphMan methods + int h_pos = lang_tag.pos("#"); + if ( h_pos > 0 ) { + lang_tag = lang_tag.substr(0, h_pos); + hb_lang_tag = hb_lang_tag.substr(0, h_pos); // Also clean the one for HB + } + #ifdef DEBUG_LANG_USAGE + printf("TextLangCfg %s created (%s %s)\n", UnicodeToLocal(_lang_tag).c_str(), + UnicodeToLocal(lang_tag).c_str(), UnicodeToLocal(_hyph_method->getId()).c_str()); + #endif + + // https://drafts.csswg.org/css-text-3/#script-tagging + // We might need to check for the script subpart (optional 2nd + // subpart) Latn, Hant, Hrkt... and make some non-Latin language + // with a Latn script behave more like Latin languages... + + // Note that Harfbuzz seems to do the same right thing with + // both "zh-TW" and "zh-Hant". + + // See for more clever/complex handling of lang tags: + // https://android.googlesource.com/platform/frameworks/minikin/+/refs/heads/master/libs/minikin/Locale.cpp + + // We thought about adding a 2nd fallback font per-language, but it feels + // a bit wrong to limit this feature to documents with lang tags. + // Better to implement a generic font fallback chain independent of language. + + // https://unicode.org/reports/tr14/#Hyphen : in Polish and Portuguese, + // a real hyphen at end of line must be duplicated at start of next line. + _duplicate_real_hyphen_on_next_line = false; + +#if USE_HARFBUZZ==1 + _hb_language = hb_language_from_string(UnicodeToLocal(hb_lang_tag).c_str(), -1); +#endif + +#if USE_LIBUNIBREAK==1 + // libunibreak per-language LineBreakProperties extensions + // + // Rules extracted from libunibreak/src/linebreakdef.c, so we can adapt + // them and build LineBreakProperties adequately for more languages.
+ // See https://en.wikipedia.org/wiki/Quotation_mark + // These are mostly needed only for languages that may add a space between + // the quote and its content - otherwise, the quote will be part of the + // word it sticks to, and break will be allowed on the other side which + // probably is a space. + // When a language allows the use of unpaired quotes (same quote on both + // sides), it seems best to not specify anything. + bool has_left_single_quotation_mark_opening = false; // U+2018 ‘ + bool has_left_single_quotation_mark_closing = false; + bool has_right_single_quotation_mark_opening = false; // U+2019 ’ + bool has_right_single_quotation_mark_closing = false; + bool has_right_single_quotation_mark_glue = false; + bool has_left_double_quotation_mark_opening = false; // U+201C “ + bool has_left_double_quotation_mark_closing = false; + bool has_right_double_quotation_mark_opening = false; // U+201D ” + bool has_right_double_quotation_mark_closing = false; + bool has_left_single_angle_quotation_mark_opening = false; // U+2039 ‹ + bool has_left_single_angle_quotation_mark_closing = false; + bool has_right_single_angle_quotation_mark_opening = false; // U+203A › + bool has_right_single_angle_quotation_mark_closing = false; + bool has_left_double_angle_quotation_mark_opening = false; // U+00AB « + bool has_left_double_angle_quotation_mark_closing = false; + bool has_right_double_angle_quotation_mark_opening = false; // U+00BB » + bool has_right_double_angle_quotation_mark_closing = false; + + // Note: these macros use 'lang_tag'.
+ if ( LANG_STARTS_WITH(("en")) ) { // English + has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c + has_left_double_quotation_mark_opening = true; + has_right_double_quotation_mark_closing = true; + } + else if ( LANG_STARTS_WITH(("fr") ("es")) ) { // French, Spanish + has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c + has_left_double_quotation_mark_opening = true; + has_right_double_quotation_mark_closing = true; + has_left_single_angle_quotation_mark_opening = true; + has_right_single_angle_quotation_mark_closing = true; + has_left_double_angle_quotation_mark_opening = true; + has_right_double_angle_quotation_mark_closing = true; + } + else if ( LANG_STARTS_WITH(("de")) ) { // German + has_left_single_quotation_mark_closing = true; + has_right_single_quotation_mark_glue = true; + has_left_double_quotation_mark_closing = true; + has_left_single_angle_quotation_mark_closing = true; + has_right_single_angle_quotation_mark_opening = true; + has_left_double_angle_quotation_mark_closing = true; + has_right_double_angle_quotation_mark_opening = true; + } + else if ( LANG_STARTS_WITH(("ru")) ) { // Russian + has_left_double_quotation_mark_closing = true; + has_left_double_angle_quotation_mark_opening = true; + has_right_double_angle_quotation_mark_closing = true; + } + else if ( LANG_STARTS_WITH(("zh")) ) { // Chinese + has_left_single_quotation_mark_opening = true; + has_right_single_quotation_mark_closing = true; + has_left_double_quotation_mark_opening = true; + has_right_double_quotation_mark_closing = true; + } + // Add language rules here, or reuse previous one with other languages if needed. + + // Set up _lb_props.
+ // Important: the unicode indices must be in strict ascending order (or libunibreak + // might abort checking them all) + int n = 0; + if ( has_left_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00AB, 0x00AB, LBP_OP }; + if ( has_left_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00AB, 0x00AB, LBP_CL }; + // Soft-Hyphens are handled by Hyphman hyphenate(), have them handled as Zero-Width-Joiner by + // libunibreak so they don't allow any break and don't prevent hyphenate() from handling them correctly. + _lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ }; + if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP }; + if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL }; + if ( has_left_single_quotation_mark_opening ) _lb_props[n++] = { 0x2018, 0x2018, LBP_OP }; + if ( has_left_single_quotation_mark_closing ) _lb_props[n++] = { 0x2018, 0x2018, LBP_CL }; + if ( has_right_single_quotation_mark_opening ) _lb_props[n++] = { 0x2019, 0x2019, LBP_OP }; + if ( has_right_single_quotation_mark_closing ) _lb_props[n++] = { 0x2019, 0x2019, LBP_CL }; + if ( has_right_single_quotation_mark_glue ) _lb_props[n++] = { 0x2019, 0x2019, LBP_GL }; + if ( has_left_double_quotation_mark_opening ) _lb_props[n++] = { 0x201C, 0x201C, LBP_OP }; + if ( has_left_double_quotation_mark_closing ) _lb_props[n++] = { 0x201C, 0x201C, LBP_CL }; + if ( has_right_double_quotation_mark_opening ) _lb_props[n++] = { 0x201D, 0x201D, LBP_OP }; + if ( has_right_double_quotation_mark_closing ) _lb_props[n++] = { 0x201D, 0x201D, LBP_CL }; + if ( has_left_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x2039, 0x2039, LBP_OP }; + if ( has_left_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x2039, 0x2039, LBP_CL }; + if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP }; + if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A,
0x203A, LBP_CL }; + // End of list + _lb_props[n++] = { 0, 0, LBP_Undefined }; + // Done with libunibreak per-language LineBreakProperties extensions + + // Other line breaking and text layout tweaks + _lb_char_sub_func = NULL; + if ( LANG_STARTS_WITH(("pl")) ) { // Polish + _lb_char_sub_func = &lb_char_sub_func_polish; + _duplicate_real_hyphen_on_next_line = true; + } + if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak + _lb_char_sub_func = &lb_char_sub_func_czech_slovak; + } + if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese + _duplicate_real_hyphen_on_next_line = true; + } +#endif +}