-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtml.hpp
91 lines (64 loc) · 2.48 KB
/
html.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
///
/// Created by Ramsay on 2019/9/9.
///
#ifndef DOUBANCRAWLER_HTML_H
#define DOUBANCRAWLER_HTML_H
#include "dom.hpp"
#include <cstddef>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
namespace crawler {
/// Parser over an input string, tracked by a cursor position (`pos`).
///
/// The public interface parses nodes and elements into `crawler::Node`
/// values; the private helpers consume the input piece by piece (tag names,
/// attributes, text, whitespace, comments, doctype).
class Parser {
public:
  /// Construct a parser from a cursor position and the text to parse.
  /// NOTE(review): presumably `pos` is the starting index into `input`
  /// (usually 0) -- confirm against the definition.
  Parser(size_t pos, std::string input);

  /// Parse a sequence of sibling nodes.
  /// @param parent node handed in by the caller; presumably recorded as the
  ///        parent of every parsed sibling -- confirm against the definition.
  std::vector<crawler::Node>
  parseNodes(const std::shared_ptr<crawler::Node> &parent);

  /// Parse a sequence of sibling nodes (overload without a parent).
  std::vector<crawler::Node> parseNodes();

  /// Parse a single node (overload without a parent).
  crawler::Node parseNode();

  /// Parse a single node.
  /// @param parent node handed in by the caller; presumably the parent of the
  ///        parsed node -- confirm against the definition.
  crawler::Node parseNode(const std::shared_ptr<crawler::Node> &parent);

  /// Parse a single element, including its open tag, content, and closing
  /// tag (overload without a parent).
  crawler::Node parseElement();

  /// Parse a single element, including its open tag, content, and closing
  /// tag.
  /// @param parent taken by value, unlike the other `parent` overloads which
  ///        take a const reference; the signature is kept as-is so it keeps
  ///        matching the out-of-line definition.
  crawler::Node parseElement(std::shared_ptr<crawler::Node> parent);

private:
  /// Cursor into `input`.
  size_t pos;
  /// The text being parsed.
  std::string input;

  /// Tag names treated as self-closing.
  /// NOTE(review): presumably the set consulted by `isSelfClosingTag` --
  /// confirm against the definition.
  inline static const std::set<std::string> SELF_CLOSING_TAGS = {
      "area",   "base", "br",       "col",  "embed", "hr",     "img",   "input",
      "keygen", "link", "menuitem", "meta", "param", "source", "track", "wbr"};

  /// Parse a tag or attribute name.
  std::string parseTagName();

  /// Parse a list of name="value" pairs, separated by whitespace.
  crawler::AttrMap parseAttributes();

  /// Parse a single name="value" pair, returned as (name, value).
  std::pair<std::string, std::string> parseAttribute();

  /// Parse a quoted value.
  std::string parseAttributeValue();

  /// Parse a text node.
  /// @param parent node handed in by the caller; presumably the parent of the
  ///        text node -- confirm against the definition.
  crawler::Node parseText(const std::shared_ptr<crawler::Node> &parent);

  /// Consume and discard zero or more whitespace characters.
  void consumeWhitespace();

  /// Consume and discard zero or more comments.
  void consumeComment();

  /// Consume and discard the doctype.
  void consumeDoctype();

  /// Consume characters until `predicate` returns false.
  /// NOTE(review): presumably returns the consumed characters -- confirm
  /// against the definition. The template is only declared here, so it can
  /// be instantiated only where its definition is visible.
  template <class Predicate> std::string consumeWhile(Predicate predicate);

  /// Return the current character and advance `pos` to the next character.
  char consumeChar();

  /// Read the current character without consuming it.
  char nextChar();

  /// Does the current input start with the given string?
  bool startsWith(const std::string &prefix);

  /// Return true if all input is consumed.
  bool eof();

  /// Check whether `currentTagName` is a self-closing tag.
  [[nodiscard]] bool isSelfClosingTag(const std::string &currentTagName) const;
};
/// Parse `source` and return the resulting node.
/// NOTE(review): presumably the top-level entry point that runs a `Parser`
/// over `source` and returns the document root -- confirm against the
/// definition, which is not in this header.
crawler::Node parse(const std::string &source);
} // namespace crawler
#endif // DOUBANCRAWLER_HTML_H