From 7dbb6c0afad04cf9dedeb8d018a2c7744c645d77 Mon Sep 17 00:00:00 2001 From: Oscar Date: Thu, 12 Aug 2021 16:33:12 -0700 Subject: [PATCH] Update creating_parsers.md (#605) fixup markdown conversion issues --- docs/creating_parsers.md | 231 ++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 112 deletions(-) diff --git a/docs/creating_parsers.md b/docs/creating_parsers.md index ccf3e2b5..4fdf8b17 100644 --- a/docs/creating_parsers.md +++ b/docs/creating_parsers.md @@ -7,39 +7,40 @@ contributor/maintainer of dpkt. Let's look at the IPv4 parser, defined in `dpkt/ip.py`, as an example. - class IP(dpkt.Packet): - """Internet Protocol.""" - - __hdr__ = ( - ('_v_hl', 'B', (4 << 4) | (20 >> 2)), - ('tos', 'B', 0), - ('len', 'H', 20), - ('id', 'H', 0), - ('_flags_offset', 'H', 0), - ('ttl', 'B', 64), - ('p', 'B', 0), - ('sum', 'H', 0), - ('src', '4s', b'\x00' * 4), - ('dst', '4s', b'\x00' * 4) - ) - __bit_fields__ = { - '_v_hl': ( - ('v', 4), # version, 4 bits - ('hl', 4), # header len, 4 bits - ), - '_flags_offset': ( - ('rf', 1), # reserved bit - ('df', 1), # don't fragment - ('mf', 1), # more fragments - ('offset', 13), # fragment offset, 13 bits - ) - } - __pprint_funcs__ = { - 'dst': inet_to_str, - 'src': inet_to_str, - 'p': get_ip_proto_name - } +```python +class IP(dpkt.Packet): + """Internet Protocol.""" + __hdr__ = ( + ('_v_hl', 'B', (4 << 4) | (20 >> 2)), + ('tos', 'B', 0), + ('len', 'H', 20), + ('id', 'H', 0), + ('_flags_offset', 'H', 0), + ('ttl', 'B', 64), + ('p', 'B', 0), + ('sum', 'H', 0), + ('src', '4s', b'\x00' * 4), + ('dst', '4s', b'\x00' * 4) + ) + __bit_fields__ = { + '_v_hl': ( + ('v', 4), # version, 4 bits + ('hl', 4), # header len, 4 bits + ), + '_flags_offset': ( + ('rf', 1), # reserved bit + ('df', 1), # don't fragment + ('mf', 1), # more fragments + ('offset', 13), # fragment offset, 13 bits + ) + } + __pprint_funcs__ = { + 'dst': inet_to_str, + 'src': inet_to_str, + 'p': get_ip_proto_name + } +``` A lot is going on in the 
header, before we even got to `__init__`\! Here is the breakdown: @@ -53,37 +54,40 @@ is the breakdown: but there are some rules to naming the fields that affect `dpkt` processing: - - a name that doesn't start with an underscore represents a + * a name that doesn't start with an underscore represents a regular public protocol field. *Examples:* `tos`, `len`, `id` - - a name that starts with an underscore and contains NO more + + * a name that starts with an underscore and contains NO more underscores is considered private and gets hidden in `__repr__` and `pprint()` outputs; this is useful for hiding fields reserved for future use, or fields that should be decoded according to some custom rules. *Example:* `_reserved` - - a name that starts with an underscore and DOES contain more + + * a name that starts with an underscore and DOES contain more underscores is similarly considered private and hidden, but gets processed as a collection of multiple protocol fields, separated by underscore. Each field name may contain up to 1 underscore as well. These fields are only created when the class definition contains matching property definitions, which could be defined explicitly or created automagically via `__bit_fields__` (more - on this later). - - *Examples:* - - - `_foo_bar_m_flag` will map to fields named `foo`, `bar`, - `m_flag`, when the class contains properties with these - names (note `foo_bar_m` will be ignored since it - contains two underscores). - - In the IP class the `_v_hl` field itself is hidden in - the output of `__repr__` and `pprint()`, and is decoded - into `v` and `hl` fields that are displayed instead. + on this later). *Examples:* + + * `_foo_bar_m_flag` will map to fields named `foo`, `bar`, + `m_flag`, when the class contains properties with these + names (note `foo_bar_m` will be ignored since it + contains two underscores). 
+ + * in the IP class the `_v_hl` field itself is hidden in + the output of `__repr__` and `pprint()`, and is decoded + into `v` and `hl` fields that are displayed instead. The second component of the tuple specifies the format of the protocol field, as it corresponds to Python's native `struct` - module. `'B'` means the field will decode to an unsigned byte, `'H'` - - to an unsigned word, etc. The default byte order is big endian - (network order). The endianness can be changed to little endian by - specifying `__byte_order__ = '<'` in the class definition. + module. `'B'` means the field will decode to an unsigned byte, + `'H'` to an unsigned 16-bit short, etc. The default byte order is big + endian (network order). Endianness can be changed to little + endian by specifying `__byte_order__ = '<'` in the class + definition. 3. Next, `__bit_fields__` is an optional dict that helps decode compound protocol fields, such as `_v_hl` or `_flags_offset` in the @@ -127,34 +131,35 @@ Let's look at the standard methods of the `Packet` class and how they contribute to parsing (aka unpacking or deserializing) and constructing (aka packing or serializing) the packet. - class IP(dpkt.Packet): +```python +class IP(dpkt.Packet): + ... + def __init__(self, *args, **kwargs): + super(IP, self).__init__(*args, **kwargs) ... - def __init__(self, *args, **kwargs): - super(IP, self).__init__(*args, **kwargs) - ... - - def __len__(self): - return self.__hdr_len__ + len(self.opts) + len(self.data) - - def __bytes__(self): - # calculate IP checksum - if self.sum == 0: - self.sum = dpkt.in_cksum(self.pack_hdr() + bytes(self.opts)) - ... - return self.pack_hdr() + bytes(self.opts) + bytes(self.data) - - def unpack(self, buf): - dpkt.Packet.unpack(self, buf) - ... - self.opts = ... # add IP options - ... - self.data = ... # bytes that remain after unpacking - - def pack_hdr(self): - buf = dpkt.Packet.pack_hdr(self) - ... 
- return buf + def __len__(self): + return self.__hdr_len__ + len(self.opts) + len(self.data) + + def __bytes__(self): + # calculate IP checksum + if self.sum == 0: + self.sum = dpkt.in_cksum(self.pack_hdr() + bytes(self.opts)) + ... + return self.pack_hdr() + bytes(self.opts) + bytes(self.data) + + def unpack(self, buf): + dpkt.Packet.unpack(self, buf) + ... + self.opts = ... # add IP options + ... + self.data = ... # bytes that remain after unpacking + + def pack_hdr(self): + buf = dpkt.Packet.pack_hdr(self) + ... + return buf +``` Instantiating the class with a bytes buffer (`ip = dpkt.ip.IP(buf)`) will trigger the unpacking sequence as follows: @@ -172,11 +177,13 @@ parsed packet, packing will return serialized packet as a `bytes` object (`bytes(ip) => buf`). It goes as follows: 1. Calling `bytes(obj)` invokes `self.__bytes__(obj)` + 2. `Packet.__bytes()__` calls `self.pack_hdr()` and returns its result with appended `bytes(self.data)`. The latter recursively triggers serialization of `self.data`, which could be another packet class, - e.g. `Ethernet(.., data=IP(.., data=TCP(...)))`\`, so everything + e.g. `Ethernet(.., data=IP(.., data=TCP(...)))`, so everything gets serialized. + 3. `Packet.pack_hdr()` iterates over the protocol fields given in `__hdr__`, calls `struct.pack()` on them and returns the resulting bytes. @@ -188,7 +195,7 @@ the IP parser overrides `__bytes__` to calculate the IP checksum prior to packing, and insert `bytes(self.opts)` between the packed header and data. ------ +### \_\_len\_\_ `__len__()` returns the size of the serialized packet and is typically invoked when calling `len(obj)`. Note how in the IP class, this method @@ -199,7 +206,7 @@ the size of the resulting buffer (`return len(bytes(self))`). While this works and is acceptable in some cases, dpkt views this as an anti-pattern that should be avoided. 
------ +### \_\_repr\_\_ and pprint() These methods are provided by `dpkt.Packet` and are typically not overridden in the child class. However they are important to understand @@ -220,38 +227,38 @@ but there are some differences: values. See below how `src` and `dst` IP addresses get human readable interpretation with `pprint()`, but not with `__repr__`. - +```python +# repr() +>>> ip +IP(len=34, p=17, sum=29376, src=b'\x01\x02\x03\x04', dst=b'\x01\x02\x03\x04', opts=b'', data=UDP(sport=111, dport=222, ulen=14, sum=48949, data=b'foobar')) - # repr() - >>> ip - IP(len=34, p=17, sum=29376, src=b'\x01\x02\x03\x04', dst=b'\x01\x02\x03\x04', opts=b'', data=UDP(sport=111, dport=222, ulen=14, sum=48949, data=b'foobar')) - - # IP version field is default and is not returned by repr() - >>> ip.v - 4 - - >>> ip.pprint() - IP( - v=4, - hl=5, - tos=0, - len=34, - id=0, - rf=0, - df=0, - mf=0, - offset=0, - ttl=64, - p=17, # UDP - sum=29376, - src=b'\x01\x02\x03\x04', # 1.2.3.4 - dst=b'\x01\x02\x03\x04', # 1.2.3.4 - opts=b'', - data=UDP( - sport=111, - dport=222, - ulen=14, - sum=48949, - data=b'foobar' - ) # UDP - ) # IP +# the IP version field holds its default value and is not shown by repr() +>>> ip.v +4 + +>>> ip.pprint() +IP( + v=4, + hl=5, + tos=0, + len=34, + id=0, + rf=0, + df=0, + mf=0, + offset=0, + ttl=64, + p=17, # UDP + sum=29376, + src=b'\x01\x02\x03\x04', # 1.2.3.4 + dst=b'\x01\x02\x03\x04', # 1.2.3.4 + opts=b'', + data=UDP( + sport=111, + dport=222, + ulen=14, + sum=48949, + data=b'foobar' + ) # UDP +) # IP +```