From a0473230f9da69a138be8b81d4f7d168bd6118c7 Mon Sep 17 00:00:00 2001 From: Pat Nakajima Date: Fri, 3 May 2024 11:31:12 -0700 Subject: [PATCH 1/9] add failing test case for stripped
 contents

---
 .../cases/access-cases/case-1006@0.html          | 15 +++++++++++++++
 .../cases/access-expects/case-1006.html          | 16 ++++++++++++++++
 .../cases/access-expects/case-1006.txt           |  4 ++++
 3 files changed, 35 insertions(+)
 create mode 100644 regression_testing/cases/access-cases/case-1006@0.html
 create mode 100644 regression_testing/cases/access-expects/case-1006.html
 create mode 100644 regression_testing/cases/access-expects/case-1006.txt

diff --git a/regression_testing/cases/access-cases/case-1006@0.html b/regression_testing/cases/access-cases/case-1006@0.html
new file mode 100644
index 000000000..025866c69
--- /dev/null
+++ b/regression_testing/cases/access-cases/case-1006@0.html
@@ -0,0 +1,15 @@
+
+
+
+aert1.0/13.10.1
+
+
+
+
+func sup() {
+  return
+}
+
+
+ + diff --git a/regression_testing/cases/access-expects/case-1006.html b/regression_testing/cases/access-expects/case-1006.html new file mode 100644 index 000000000..eb59f2c7a --- /dev/null +++ b/regression_testing/cases/access-expects/case-1006.html @@ -0,0 +1,16 @@ + + + +aert1.0/13.10.1 + + +
+
+func sup() {
+  return
+}
+
+
+ + diff --git a/regression_testing/cases/access-expects/case-1006.txt b/regression_testing/cases/access-expects/case-1006.txt new file mode 100644 index 000000000..beb27b61e --- /dev/null +++ b/regression_testing/cases/access-expects/case-1006.txt @@ -0,0 +1,4 @@ +line 1 column 1 - Access: [3.3.1.1]: use style sheets to control presentation. +line 3 column 1 - Access: [13.2.1.1]: Metadata missing. +No warnings or errors were found. + From 54bd920f3f7226ca8b2e89d2c2fb2c36d6344a95 Mon Sep 17 00:00:00 2001 From: Pat Nakajima Date: Fri, 3 May 2024 17:38:41 -0700 Subject: [PATCH 2/9] update tests --- regression_testing/cases/access-cases/case-1006.conf | 3 +++ regression_testing/cases/access-cases/case-1006@0.html | 2 +- regression_testing/cases/access-expects/case-1006.html | 3 +-- regression_testing/cases/access-expects/case-1006.txt | 9 ++++++--- 4 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 regression_testing/cases/access-cases/case-1006.conf diff --git a/regression_testing/cases/access-cases/case-1006.conf b/regression_testing/cases/access-cases/case-1006.conf new file mode 100644 index 000000000..e28fa9485 --- /dev/null +++ b/regression_testing/cases/access-cases/case-1006.conf @@ -0,0 +1,3 @@ +output-xhtml: yes +show-info: no +force-output: yes diff --git a/regression_testing/cases/access-cases/case-1006@0.html b/regression_testing/cases/access-cases/case-1006@0.html index 025866c69..aa46d2d3c 100644 --- a/regression_testing/cases/access-cases/case-1006@0.html +++ b/regression_testing/cases/access-cases/case-1006@0.html @@ -1,4 +1,3 @@ - aert1.0/13.10.1 @@ -7,6 +6,7 @@
 
 func sup() {
+  print("hello")
   return
 }
 
diff --git a/regression_testing/cases/access-expects/case-1006.html b/regression_testing/cases/access-expects/case-1006.html
index eb59f2c7a..aa46d2d3c 100644
--- a/regression_testing/cases/access-expects/case-1006.html
+++ b/regression_testing/cases/access-expects/case-1006.html
@@ -1,5 +1,3 @@
-
 
 
 aert1.0/13.10.1
@@ -8,6 +6,7 @@
 
 
 func sup() {
+  print("hello")
   return
 }
 
diff --git a/regression_testing/cases/access-expects/case-1006.txt b/regression_testing/cases/access-expects/case-1006.txt
index beb27b61e..742dad556 100644
--- a/regression_testing/cases/access-expects/case-1006.txt
+++ b/regression_testing/cases/access-expects/case-1006.txt
@@ -1,4 +1,7 @@
-line 1 column 1 - Access: [3.3.1.1]: use style sheets to control presentation.
-line 3 column 1 - Access: [13.2.1.1]: Metadata missing.
-No warnings or errors were found.
+line 1 column 1 - Warning: missing  declaration
+line 3 column 31 - Warning: plain text isn't allowed in  elements
+line 3 column 31 - Warning: inserting implicit 
+line 4 column 1 - Warning: discarding unexpected 
+line 5 column 1 - Warning: discarding unexpected 
+Tidy found 5 warnings and 0 errors!
 

From 57d9dac5247cba3750d250889aaf3e2fdf543e2f Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Fri, 3 May 2024 18:13:09 -0700
Subject: [PATCH 3/9] keep newlines in lexed text, but many regression tests
 now fail

---
 .../cases/access-expects/case-1006.html            |  3 ++-
 .../cases/access-expects/case-1006.txt             |  6 ------
 src/lexer.c                                        | 14 ++++++++------
 src/pprint.c                                       |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/regression_testing/cases/access-expects/case-1006.html b/regression_testing/cases/access-expects/case-1006.html
index aa46d2d3c..74bf827cc 100644
--- a/regression_testing/cases/access-expects/case-1006.html
+++ b/regression_testing/cases/access-expects/case-1006.html
@@ -1,4 +1,5 @@
-
+
+
 
 aert1.0/13.10.1
 
diff --git a/regression_testing/cases/access-expects/case-1006.txt b/regression_testing/cases/access-expects/case-1006.txt
index 742dad556..8b1378917 100644
--- a/regression_testing/cases/access-expects/case-1006.txt
+++ b/regression_testing/cases/access-expects/case-1006.txt
@@ -1,7 +1 @@
-line 1 column 1 - Warning: missing  declaration
-line 3 column 31 - Warning: plain text isn't allowed in  elements
-line 3 column 31 - Warning: inserting implicit 
-line 4 column 1 - Warning: discarding unexpected 
-line 5 column 1 - Warning: discarding unexpected 
-Tidy found 5 warnings and 0 errors!
 
diff --git a/src/lexer.c b/src/lexer.c
index b0afccdab..af96c078f 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2516,7 +2516,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     AttVal *attributes = NULL;
     Node *node;
     Bool fixComments;
-    
+
     switch ( cfgAutoBool(doc, TidyFixComments) )
     {
         case TidyYesState:
@@ -2557,13 +2557,12 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
         switch (lexer->state)
         {
             case LEX_CONTENT:  /* element content */
-
                 /*
                  Discard white space if appropriate. Its cheaper
                  to do this here rather than in parser methods
                  for elements that don't have mixed content.
                 */
-                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 
+                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) && (c != '\n' && mode != Preformatted)
                       && lexer->lexsize == lexer->txtstart + 1)
                 {
                     --(lexer->lexsize);
@@ -2591,10 +2590,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                     }
                     else /* prev character wasn't white */
                     {
-                        lexer->waswhite = yes;
-
-                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
+                        // THIS CHANGE ADDS NEWLINES BUT WE'RE STILL MISSING SPACES
+                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ' && c != '\n') {
+                        // printf("(mode != Preformatted && mode != IgnoreMarkup && c != ' ') == true %d %c\n", c, c);
                             ChangeChar(lexer, ' ');
+                        }
                     }
 
                     continue;
@@ -2915,6 +2915,8 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                     mode = Preformatted;
                 }
 
+                
+
                 if ((mode != Preformatted && ExpectsContent(lexer->token))
                     || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
                 {
diff --git a/src/pprint.c b/src/pprint.c
index 53fcbc968..c0b5c0ef5 100644
--- a/src/pprint.c
+++ b/src/pprint.c
@@ -2291,7 +2291,7 @@ void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node )
 
     if (node->type == TextNode)
     {
-        PPrintText( doc, mode, indent, node );
+      PPrintText( doc, mode, indent, node );
     }
     else if ( node->type == CommentTag )
     {

From 2c95714c0465834c59866f3198c5e604613f7f68 Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:00:26 -0700
Subject: [PATCH 4/9] only 4 tests fail now; pre > code appears to work

---
 .../cases/access-cases/case-1006@0.html       |  1 +
 .../cases/access-expects/case-1006.html       |  3 ++-
 .../cases/access-expects/case-1006.txt        |  1 +
 src/lexer.c                                   | 24 +++++++++++++++----
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/regression_testing/cases/access-cases/case-1006@0.html b/regression_testing/cases/access-cases/case-1006@0.html
index aa46d2d3c..092a85432 100644
--- a/regression_testing/cases/access-cases/case-1006@0.html
+++ b/regression_testing/cases/access-cases/case-1006@0.html
@@ -1,3 +1,4 @@
+
 
 
 aert1.0/13.10.1
diff --git a/regression_testing/cases/access-expects/case-1006.html b/regression_testing/cases/access-expects/case-1006.html
index 74bf827cc..fcca0306f 100644
--- a/regression_testing/cases/access-expects/case-1006.html
+++ b/regression_testing/cases/access-expects/case-1006.html
@@ -1,4 +1,5 @@
-
+
 
 
 aert1.0/13.10.1
diff --git a/regression_testing/cases/access-expects/case-1006.txt b/regression_testing/cases/access-expects/case-1006.txt
index 8b1378917..9fbbf4b27 100644
--- a/regression_testing/cases/access-expects/case-1006.txt
+++ b/regression_testing/cases/access-expects/case-1006.txt
@@ -1 +1,2 @@
+No warnings or errors were found.
 
diff --git a/src/lexer.c b/src/lexer.c
index af96c078f..b5730d8ba 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2516,6 +2516,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     AttVal *attributes = NULL;
     Node *node;
     Bool fixComments;
+    Node *parent = lexer->parent;
 
     switch ( cfgAutoBool(doc, TidyFixComments) )
     {
@@ -2562,7 +2563,17 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                  to do this here rather than in parser methods
                  for elements that don't have mixed content.
                 */
-                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) && (c != '\n' && mode != Preformatted)
+                
+                // Check to see if we're in a pre, if  so, don't worry about whitespace
+                while (parent) {
+                  if (nodeIsPRE(parent)) {
+                    mode = Preformatted;
+                  }
+                  
+                  parent = parent->parent;
+                }
+
+                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
                       && lexer->lexsize == lexer->txtstart + 1)
                 {
                     --(lexer->lexsize);
@@ -2590,11 +2601,14 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                     }
                     else /* prev character wasn't white */
                     {
+                        lexer->waswhite = yes;
+
+                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
+                          ChangeChar(lexer, ' ');
                         // THIS CHANGE ADDS NEWLINES BUT WE'RE STILL MISSING SPACES
-                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ' && c != '\n') {
-                        // printf("(mode != Preformatted && mode != IgnoreMarkup && c != ' ') == true %d %c\n", c, c);
-                            ChangeChar(lexer, ' ');
-                        }
+                        // if (mode != Preformatted && mode != IgnoreMarkup && c != ' ' && c != '\n') {
+                            
+                        // }
                     }
 
                     continue;

From b2f2b18beec635926668749d7164ce9daa1f1952 Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:24:38 -0700
Subject: [PATCH 5/9] make sure tabs still work

---
 .../cases/access-cases/case-1006.conf         |  1 +
 .../cases/access-cases/case-1006@0.html       |  2 +-
 .../cases/access-expects/case-1006.html       |  2 +-
 src/lexer.c                                   | 25 +++++++++----------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/regression_testing/cases/access-cases/case-1006.conf b/regression_testing/cases/access-cases/case-1006.conf
index e28fa9485..8911097d5 100644
--- a/regression_testing/cases/access-cases/case-1006.conf
+++ b/regression_testing/cases/access-cases/case-1006.conf
@@ -1,3 +1,4 @@
+keep-tabs: yes
 output-xhtml: yes
 show-info: no
 force-output: yes
diff --git a/regression_testing/cases/access-cases/case-1006@0.html b/regression_testing/cases/access-cases/case-1006@0.html
index 092a85432..99ef3fbbd 100644
--- a/regression_testing/cases/access-cases/case-1006@0.html
+++ b/regression_testing/cases/access-cases/case-1006@0.html
@@ -7,7 +7,7 @@
 
 
 func sup() {
-  print("hello")
+	print("hello")
   return
 }
 
diff --git a/regression_testing/cases/access-expects/case-1006.html b/regression_testing/cases/access-expects/case-1006.html
index fcca0306f..1493e322b 100644
--- a/regression_testing/cases/access-expects/case-1006.html
+++ b/regression_testing/cases/access-expects/case-1006.html
@@ -8,7 +8,7 @@
 
 
 func sup() {
-  print("hello")
+	print("hello")
   return
 }
 
diff --git a/src/lexer.c b/src/lexer.c
index b5730d8ba..9857a2d6f 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2517,6 +2517,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     Node *node;
     Bool fixComments;
     Node *parent = lexer->parent;
+    Bool hasPreAncestor = no;
 
     switch ( cfgAutoBool(doc, TidyFixComments) )
     {
@@ -2543,6 +2544,16 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
 
     while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
     {
+        // Check to see if we're in a pre, if  so, don't worry about whitespace
+        while (parent) {
+          if (nodeIsPRE(parent)) {
+            mode = Preformatted;
+            hasPreAncestor = yes;
+          }
+
+          parent = parent->parent;
+        }
+
         if (lexer->insertspace)
         {
             TY_(AddCharToLexer)(lexer, ' ');
@@ -2550,6 +2561,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
             lexer->insertspace = no;
         }
 
+        // If non breaking space, change to a space
         if (c == 160 && (mode == Preformatted))
             c = ' ';
 
@@ -2564,15 +2576,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                  for elements that don't have mixed content.
                 */
                 
-                // Check to see if we're in a pre, if  so, don't worry about whitespace
-                while (parent) {
-                  if (nodeIsPRE(parent)) {
-                    mode = Preformatted;
-                  }
-                  
-                  parent = parent->parent;
-                }
-
                 if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
                       && lexer->lexsize == lexer->txtstart + 1)
                 {
@@ -2605,10 +2608,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
 
                         if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
                           ChangeChar(lexer, ' ');
-                        // THIS CHANGE ADDS NEWLINES BUT WE'RE STILL MISSING SPACES
-                        // if (mode != Preformatted && mode != IgnoreMarkup && c != ' ' && c != '\n') {
-                            
-                        // }
                     }
 
                     continue;

From bafc38f9e6ed47070d05a7e3bf0babf7adbe23ba Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:46:07 -0700
Subject: [PATCH 6/9] cleanup: remove unused hasPreAncestor flag

---
 src/lexer.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c
index 9857a2d6f..6e263ef84 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2517,7 +2517,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     Node *node;
     Bool fixComments;
     Node *parent = lexer->parent;
-    Bool hasPreAncestor = no;
 
     switch ( cfgAutoBool(doc, TidyFixComments) )
     {
@@ -2548,7 +2547,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
         while (parent) {
           if (nodeIsPRE(parent)) {
             mode = Preformatted;
-            hasPreAncestor = yes;
           }
 
           parent = parent->parent;

From 760977bc5a61e8a3388ed1d0263fd7f30f96e8c2 Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:46:47 -0700
Subject: [PATCH 7/9] move parent declaration into the read loop

---
 src/lexer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lexer.c b/src/lexer.c
index 6e263ef84..a43eb6803 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2516,7 +2516,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     AttVal *attributes = NULL;
     Node *node;
     Bool fixComments;
-    Node *parent = lexer->parent;
 
     switch ( cfgAutoBool(doc, TidyFixComments) )
     {
@@ -2544,6 +2543,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
     while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
     {
         // Check to see if we're in a pre, if  so, don't worry about whitespace
+        Node *parent = lexer->parent;
         while (parent) {
           if (nodeIsPRE(parent)) {
             mode = Preformatted;

From 91adc0ad54580cb44511ae62bd4d6b0a95aa7ddd Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:47:51 -0700
Subject: [PATCH 8/9] format

---
 src/lexer.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c
index a43eb6803..f87a3a360 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2559,7 +2559,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
             lexer->insertspace = no;
         }
 
-        // If non breaking space, change to a space
         if (c == 160 && (mode == Preformatted))
             c = ' ';
 
@@ -2573,7 +2572,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                  to do this here rather than in parser methods
                  for elements that don't have mixed content.
                 */
-                
                 if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
                       && lexer->lexsize == lexer->txtstart + 1)
                 {
@@ -2605,7 +2603,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                         lexer->waswhite = yes;
 
                         if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
-                          ChangeChar(lexer, ' ');
+                            ChangeChar(lexer, ' ');
                     }
 
                     continue;

From f4d698fc47c10717ad579ee0fb794f7a84832d2d Mon Sep 17 00:00:00 2001
From: Pat Nakajima 
Date: Sat, 4 May 2024 12:49:28 -0700
Subject: [PATCH 9/9] more format

---
 src/lexer.c  | 5 ++---
 src/pprint.c | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c
index f87a3a360..35b783178 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -2567,12 +2567,13 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
         switch (lexer->state)
         {
             case LEX_CONTENT:  /* element content */
+
                 /*
                  Discard white space if appropriate. Its cheaper
                  to do this here rather than in parser methods
                  for elements that don't have mixed content.
                 */
-                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
+                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 
                       && lexer->lexsize == lexer->txtstart + 1)
                 {
                     --(lexer->lexsize);
@@ -2924,8 +2925,6 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
                     mode = Preformatted;
                 }
 
-                
-
                 if ((mode != Preformatted && ExpectsContent(lexer->token))
                     || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
                 {
diff --git a/src/pprint.c b/src/pprint.c
index c0b5c0ef5..53fcbc968 100644
--- a/src/pprint.c
+++ b/src/pprint.c
@@ -2291,7 +2291,7 @@ void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node )
 
     if (node->type == TextNode)
     {
-      PPrintText( doc, mode, indent, node );
+        PPrintText( doc, mode, indent, node );
     }
     else if ( node->type == CommentTag )
     {