Skip to content
This repository has been archived by the owner on Feb 23, 2020. It is now read-only.

Commit

Permalink
#741 Parse categories (#758)
Browse files Browse the repository at this point in the history
* #741  Parse roots

* #741  Cleanup categories parsing code

* #741  Parse category text

* #741  Review#1 fixes. Subtask arch improvements
  • Loading branch information
duker33 authored Jul 16, 2019
1 parent 3ca7da2 commit 36a90c3
Showing 1 changed file with 89 additions and 51 deletions.
140 changes: 89 additions & 51 deletions stroyprombeton/management/commands/parse_stalbeton.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Takes catalog data from stalbeton.pro site."""

import typing
from functools import lru_cache
from itertools import chain
from urllib.parse import urljoin

Expand All @@ -9,88 +10,125 @@
from django.core.management.base import BaseCommand


class Page:
    """Base page of the site addressed by ``SITE_URL``.

    ``path`` is the site-relative location of the page; subclasses may
    override it.
    """

    SITE_URL = 'https://stalbeton.pro'
    # Site-relative path of the page. The site root by default.
    path = '/'

    def url(self) -> str:
        """Return the absolute URL built from ``SITE_URL`` and ``path``."""
        # urljoin() expects the base and the relative part as two separate
        # arguments. Passing them packed into one list raises TypeError.
        return urljoin(self.SITE_URL.strip('/'), self.path.strip('/'))

    def get_page(self) -> requests.Response:
        """Request the page with HTTP GET and return the response."""
        return requests.get(self.url())

    def get_soup(self) -> bs4.BeautifulSoup:
        """Fetch the page and parse its UTF-8 decoded body with ``html.parser``."""
        return bs4.BeautifulSoup(
            self.get_page().content.decode('utf-8'),
            'html.parser'
        )
class ThroughElements:
"""Such elements are presented on every page. Header and footer for example."""

def title(self) -> str:
    """Placeholder: always raises because the method is not implemented.

    Raises:
        NotImplementedError: on every call.
    """
    # NotImplemented is a sentinel for binary operators, not an exception;
    # calling it fails with TypeError. NotImplementedError is the exception.
    raise NotImplementedError()
def __init__(self, page: 'Page'):
    """Remember the page whose markup the other methods read.

    Args:
        page: stored as ``self.page``.
    """
    self.page = page

def h1(self) -> str:
    """Placeholder: always raises because the method is not implemented.

    Raises:
        NotImplementedError: on every call.
    """
    # NotImplemented is a sentinel for binary operators, not an exception;
    # calling it fails with TypeError. NotImplementedError is the exception.
    raise NotImplementedError()
def roots(self) -> typing.List['RootCategoryPage']:
    """Build a RootCategoryPage for every catalog tab link of the wrapped page.

    Raises:
        AssertionError: if the selector matches nothing.
    """
    links = self.page.soup.select(
        '.catalog-tabs-content__list .catalog-list__link'
    )
    # An empty selection means the expected markup was not found.
    assert links
    pages = []
    for link in links:
        pages.append(RootCategoryPage(path=link['href']))
    return pages

def keywords(self) -> str:
def work_doc(self) -> typing.List['CategoryPage']:
    """Placeholder: always raises because work doc parsing is not implemented.

    Raises:
        NotImplementedError: on every call.
    """
    # @todo #741:30m Parse work docs from stalbeton.
    # Don't create series entity. It's for another task.
    # NotImplemented is a sentinel for binary operators, not an exception;
    # calling it fails with TypeError. NotImplementedError is the exception.
    raise NotImplementedError()

def description(self) -> str:
    """Placeholder: always raises because the method is not implemented.

    Raises:
        NotImplementedError: on every call.
    """
    # NotImplemented is a sentinel for binary operators, not an exception;
    # calling it fails with TypeError. NotImplementedError is the exception.
    raise NotImplementedError()

class Page:
SITE_URL = 'https://stalbeton.pro'

class CategoryPage(Page):
def __init__(self, path: str):
    """Store the site-relative path of the page.

    Args:
        path: for example '/catalog/dorozhnoe-stroitelstvo'.
    """
    self.path = path

@property
def url(self) -> str:
    """Absolute page URL joined from ``SITE_URL`` and ``path``.

    Leading and trailing slashes are stripped from both parts before joining.
    """
    base = self.SITE_URL.strip('/')
    relative = self.path.strip('/')
    return urljoin(base, relative)

@property
def page(self) -> requests.Response:
    """HTTP response for ``self.url``, requested once per instance.

    Raises:
        AssertionError: if the response status code is not 200.
    """
    # Memoize on the instance. A method-level lru_cache(maxsize=1) is a single
    # slot shared by all instances: reading any other page evicts this one
    # and forces a repeated request, and the cache keeps the instance alive.
    cached = getattr(self, '_page', None)
    if cached is None:
        response = requests.get(self.url)
        assert response.status_code == 200, self
        cached = self._page = response
    return cached

@property
def soup(self) -> bs4.BeautifulSoup:
    """Parsed markup of the page, built once per instance.

    The response body is decoded as UTF-8 and parsed with ``html.parser``.
    """
    # Memoize on the instance. A method-level lru_cache(maxsize=1) is a single
    # slot shared by all instances: reading any other page evicts this one
    # and forces a repeated parse, and the cache keeps the instance alive.
    parsed = getattr(self, '_soup', None)
    if parsed is None:
        parsed = self._soup = bs4.BeautifulSoup(
            self.page.content.decode('utf-8'),
            'html.parser'
        )
    return parsed

class ThroughElements(Page):
"""Such elements are presented on every page. Header and footer for example."""
def __str__(self):
    """Represent the object by its ``path``."""
    return self.path

def work_doc(self) -> typing.List[CategoryPage]:
    """Stub for parsing work doc categories; currently returns None."""
    return None
@property
def title(self) -> str:
    """Text of the first ``title`` element found in the page markup."""
    title_tag = self.soup.find('title')
    return title_tag.text

@property
def h1(self) -> str:
    """Text of the first ``h1`` element found in the page markup."""
    heading = self.soup.find('h1')
    return heading.text

class CatalogPage(Page):
path = 'catalog'
@property
def description(self) -> str:
    """Value of the ``content`` attribute of the ``Description`` meta tag."""
    meta = self.soup.select_one('meta[name="Description"]')
    return meta['content']

def roots(self) -> typing.List['RootCategoryPage']:
    """Stub for parsing root categories; currently returns None."""
    return None

def second_level(self) -> typing.List['SecondLevelCategoryPage']:
    """Stub for parsing second level categories; currently returns None."""
    return None
class CategoryPage(Page):
    """Page that exposes the category description text."""

    @property
    def text(self) -> str:
        """Text of the ``#js-category-description`` element.

        Only a category page has unique text.
        Text on every other page is autogenerated.
        """
        description_block = self.soup.select_one('#js-category-description')
        return description_block.text


class RootCategoryPage(CategoryPage):
    """Category page that lists second level category pages."""

    # @todo #741:30m Implement parse_stalbeton.Category.children() method.
    # And reuse it as polymorphic method in subclasses.
    # The task has pros and cons, so, we'll discuss it for the first.
    def second_level(self) -> typing.List['SecondLevelCategoryPage']:
        """Build a SecondLevelCategoryPage for every matched heading link."""
        links = self.soup.select('h2 > a.catalog-list__link')
        children = []
        for link in links:
            children.append(SecondLevelCategoryPage(link['href']))
        return children


class SecondLevelCategoryPage(CategoryPage):
    """Category page that lists third level category pages."""

    def third_level(self) -> typing.List['ThirdLevelCategoryPage']:
        """Build a ThirdLevelCategoryPage for every matched heading link."""
        links = self.soup.select('h2 > a.catalog-list__link')
        children = []
        for link in links:
            children.append(ThirdLevelCategoryPage(link['href']))
        return children


class ThirdLevelCategoryPage(CategoryPage):
def options(self) -> list:
    """Placeholder: always raises because options parsing is not implemented.

    Raises:
        NotImplementedError: on every call.
    """
    # @todo #741:60m Parse stalbeton's options.
    # NotImplemented is a sentinel for binary operators, not an exception;
    # calling it fails with TypeError. NotImplementedError is the exception.
    raise NotImplementedError()

# @todo #741:60m Parse series from stalbeton.
# Series are already parsed as text strings.
# Parse them as separated pages to get options-series relation.
def series(self) -> typing.List[str]:
    """Return the text of every link found in the documentation block items."""
    links = self.soup.select(
        '.documentation-block span.documentation-block__item > a'
    )
    names = []
    for link in links:
        names.append(link.text)
    return names


# @todo #736:120m Parse stalbeton pages.
# See "parse it" comments inside the classes.
def parse():
    """Draft traversal of the catalog from root categories down to options.

    Root categories are read from the main page. The deeper levels are
    composed as lazy chains, so they are evaluated only when iterated.
    """
    # @todo #736:60m Try to use stalbeton sitemap.xml.
    # To get all category links. If you'll be succeed, use it
    # instead of nested category parsing, drafted with the code below.
    # The former CatalogPage() based statements are dropped: its stub methods
    # return None, and building a generator over None raises TypeError.
    main = Page(path='/')
    through = ThroughElements(page=main)
    roots = through.roots()
    # @todo #741:30m Create parse_stalbeton.Categories class.
    # And hide children list assembling there.
    # See PR #758 discussion for example.
    seconds = chain.from_iterable((r.second_level() for r in roots))
    thirds = chain.from_iterable((s.third_level() for s in seconds))
    options = chain.from_iterable((t.options() for t in thirds))  # Ignore PyFlakesBear
    # @todo #741:60m Save parsed stalbeton to a DB.
    # DB isn't required to be high performance.
    # It can be sqlite or postgres or pickle lib or whatever else.
    # DB is required to analyze data without loading stalbeton site every time.


class Command(BaseCommand):
Expand Down

8 comments on commit 36a90c3

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 736-bb965292 disappeared from stroyprombeton/management/commands/parse_stalbeton.py, that's why I closed #741. Please, remember that the puzzle was not necessarily removed in this particular commit. Maybe it happened earlier, but we discovered this fact only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 736-8fdc8f25 disappeared from stroyprombeton/management/commands/parse_stalbeton.py, that's why I closed #742. Please, remember that the puzzle was not necessarily removed in this particular commit. Maybe it happened earlier, but we discovered this fact only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-521eea6b discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #759. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-0ee87234 discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #760. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-8977a8e5 discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #761. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-29682897 discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #762. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-d9d9a206 discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #763. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 36a90c3 Jul 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 741-c3a9fcaf discovered in stroyprombeton/management/commands/parse_stalbeton.py and submitted as #764. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

Please sign in to comment.