
Commit

Merge branch 'main' of https://github.com/codewithtalos/DockerAgent into main
ejmejm committed Jun 4, 2024
2 parents 99c95b9 + a9c3ab5 commit 0b383e0
Showing 3 changed files with 24 additions and 13 deletions.
35 changes: 23 additions & 12 deletions src/crawler/docs_scraper.py
@@ -22,7 +22,8 @@
from src.crawler.utils.html_processors import extract_main_text_from_html

html2text.config.MARK_CODE = False
LATEST_DOCS_KEYWORDS = ['latest', 'stable', 'current', 'master', 'release', 'main', 'default', ]
# Not using the word "latest" because some projects have a link that says "to the latest version"
LATEST_DOCS_KEYWORDS = ['stable', 'current', 'master', 'release', 'main', 'default', ]
EXCLUDE_DOCS_KEYWORDS = ['/version']
DOC_EXTENSIONS_ON_GITHUB = ('.md', '.rst', '.txt', '.html')
MAX_OVERVIEW_TOKENS = 10000
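
For context, a minimal sketch of how these keyword lists might be consumed when deciding whether a URL points to the latest documentation; the helper name is hypothetical and not part of this diff:

def looks_like_latest_docs(url: str) -> bool:
    # Hypothetical helper: reject explicitly versioned paths first, then
    # accept URLs that mention one of the "latest docs" keywords above.
    url = url.lower()
    if any(keyword in url for keyword in EXCLUDE_DOCS_KEYWORDS):
        return False
    return any(keyword in url for keyword in LATEST_DOCS_KEYWORDS)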
@@ -120,19 +121,22 @@ def parse(self, response: scrapy.http.Response) -> Generator[dict, None, None]:
'content': content,
'code_blocks': code_blocks,
}
else:
return

except Exception as e:
logging.info(f"Failed to extract content from {response.url}: {str(e)}")

# Follow links to next pages and continue crawling, but only within the latest documentation
try:
try:
links = response.css('a::attr(href)').getall()
except Exception as e:
logging.info(f"Failed to extract links from {response.url}: {str(e)}")
links = []

for link in links:
# check if LATEST_DOCS_KEYWORDS are in the link
if link and not link.startswith('http'):
if link and not link.startswith('http') and not link.startswith('mailto:'):
if not self.allowed_paths or any(link.startswith(path) for path in self.allowed_paths):
link = response.urljoin(link)
yield scrapy.Request(link, callback=self.parse)
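
The link filter above can be read as a small predicate; a standalone sketch with a hypothetical helper name, using the same conditions as this hunk:

def should_follow(link: str, allowed_paths: list[str]) -> bool:
    # Relative links only, skip mail links, and honour the optional path whitelist.
    if not link or link.startswith('http') or link.startswith('mailto:'):
        return False
    return not allowed_paths or any(link.startswith(path) for path in allowed_paths)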
@@ -246,19 +250,17 @@ def _format_properly(self, spider_output: list[dict]) -> dict:
result_dict['code'].extend(item['code_blocks'])
return result_dict

def run_crawler(self, queue: multiprocessing.Queue, start_urls: list, file_path: Path):
def run_crawler(self, start_urls: list, file_path: Path):
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'FEED_FORMAT': 'json',
'FEED_URI': file_path,
'LOG_LEVEL': 'ERROR',
'COMPRESSION_ENABLED': 'False',
'CLOSESPIDER_PAGECOUNT': '10',
})
process.crawl(DocsSpider, start_urls=start_urls)
process.start(install_signal_handlers=False)
with open(file_path, 'r') as f:
data = json.load(f)
queue.put(data)

def scrape(self):
"""extracts the content and code blocks from the documentation
@@ -282,7 +284,6 @@ def scrape(self):
target=self.run_crawler, args=(queue, self._start_urls, file_path))
process.start()
process.join()
data = queue.get()

with open(file_path, 'r') as f:
data = json.load(f)
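
With the multiprocessing.Queue removed, the crawl result now travels through the JSON feed file on disk; a minimal sketch of the assumed handoff (simplified signatures, not the exact class):

import json
import multiprocessing
from pathlib import Path

def crawl_and_load(scraper, file_path: Path) -> list[dict]:
    # Run the Scrapy crawl in a child process; it writes items to the JSON feed.
    process = multiprocessing.Process(
        target=scraper.run_crawler, args=(scraper._start_urls, file_path))
    process.start()
    process.join()  # wait until the feed file has been flushed
    # The parent process simply reads the feed file afterwards.
    with open(file_path, 'r') as f:
        return json.load(f)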
@@ -308,10 +309,12 @@ def scrape(self, limit_tokens: bool = False):
visited = set()
content = []
char_count = 0
page_count = 0

allowed_domains = [urlparse(start_url).netloc]

while queue:
while queue and page_count < 30:

current_url = queue.pop(0)
if current_url in visited:
continue
@@ -333,6 +336,8 @@
response = requests.get(current_url)
if response.status_code != 200:
continue
else:
page_count += 1

page_content = extract_main_text_from_html(response.content.decode('utf-8'))
char_count += len(page_content)
@@ -341,11 +346,14 @@
content.append(page_content)
if char_count >= MAX_OVERVIEW_TOKENS and limit_tokens:
return '\n\n'.join(content)

code_blocks = soup.find_all('pre')
print('code_blocks:', code_blocks)

# Find all links and add them to the queue if not visited
for link in soup.find_all('a', href=True):
absolute_link = urljoin(current_url, link['href'])
if absolute_link not in visited:
if absolute_link not in visited and not link['href'].startswith('mailto:'):
queue.append(absolute_link)

except Exception as e:
@@ -357,7 +365,10 @@ def scrape(self, limit_tokens: bool = False):

if limit_tokens:
return '\n\n'.join(content)
return content
return {
'content': content,
'code': []
}
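
Two stopping conditions are introduced in this loop: a hard cap of 30 pages and, when limit_tokens is set, the MAX_OVERVIEW_TOKENS character budget. A condensed sketch of just those checks (the 30 is the literal used above, pulled into a constant here purely for illustration):

MAX_PAGES = 30  # mirrors the hard-coded limit in the while condition above

def should_stop(page_count: int, char_count: int, limit_tokens: bool) -> bool:
    # Stop after MAX_PAGES successful fetches, or once the character budget
    # is exhausted when the caller asked for a token-limited overview.
    if page_count >= MAX_PAGES:
        return True
    return limit_tokens and char_count >= MAX_OVERVIEW_TOKENS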


def get_doc_data(library: str, language: Optional[str]) -> dict:
@@ -378,7 +389,7 @@ def get_doc_data(library: str, language: Optional[str]) -> dict:

if '//github.com/' in url:
return GithubScraper([url]).scrape()
return AsyncDocsScraper([url]).scrape()
return SyncDocsScraper([url]).scrape()
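
Both branches are expected to produce the same shape, which is why the synchronous scraper now returns a dict with 'content' and 'code' keys; a sketch of the assumed contract:

from typing import TypedDict

class DocData(TypedDict):
    # Assumed return shape shared by GithubScraper.scrape() and
    # SyncDocsScraper.scrape(); 'code' stays empty for the sync scraper.
    content: list[str]
    code: list[str]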


def get_docs_overview(library: str, language: Optional[str]) -> str:
Empty file modified src/training/interactive/run_py_script.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion src/training/interactive/train_multi_step_sft.py
@@ -106,7 +106,7 @@ def revise_batch(
correct = gather_object(process_correct)

if accelerator.is_main_process:
log.info(f"Finished! {sum(correct).item()}/{len(correct)} correct responses.")
log.info(f"Finished! {sum(correct)}/{len(correct)} correct responses.")

### Select incorrect responses for revision ###

