diff --git a/.github/workflows/pipy-publish.yml b/.github/workflows/pipy-publish.yml
new file mode 100644
index 0000000..e4a8e14
--- /dev/null
+++ b/.github/workflows/pipy-publish.yml
@@ -0,0 +1,35 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  #push:
+  #  branches: [main]
+
+  release:
+    types: [published]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Build and publish
+      # PYPI_USERNAME and PYPI_PASSWORD must be defined as repository secrets.
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: |
+        python setup.py sdist bdist_wheel
+        twine upload dist/*
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6e4761
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..21a84ad --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# cms-detector + +## What's in cms-detector? + +A Python Package to detect the Content Management System of a Website. + +We believe in monolithic software development and created this tiny package that does its job without any bloat. + +## How to Use cms-detector? + +You can find a detailed tutorial on the [cms-detector tutorial website](https://serpwings.com/software/python-cms-detector/). 
+ +## Contribute + +Pull requests, feature suggestions, and collaborations are welcome. + +## About Us + +This work is a collaborative effort of [seowings](https://seowings.org/) and [serpwings](https://serpwings.com/). diff --git a/cms_detector/__init__.py b/cms_detector/__init__.py new file mode 100644 index 0000000..7c3d6cf --- /dev/null +++ b/cms_detector/__init__.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- # + +""" +cms-detector: A Python Package to detect the Content Management System of a Website. + +MIT License +Copyright (c) 2023 SERP Wings www.serpwings.com +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+"""
+
+
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# IMPORTS Standard Library
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+import re
+from unittest.mock import Mock
+
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# IMPORTS 3rd Party Libraries
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.models import Response
+from bs4 import BeautifulSoup
+
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# DATABASE/CONSTANTS LIST
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+HEADER = {
+    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
+}
+
+# Loose http(s) link matcher. NOTE: "\+-=" inside the class is a range ("+" to "=").
+_LINK_REGEX = re.compile(
+    r"((https?):((/)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)",
+    re.DOTALL,
+)
+
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# Utility Functions
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+
+def mock_requests_object(url):
+    """Build a placeholder response for a request that could not be completed.
+
+    The placeholder has an empty ``text``, the given ``url`` and status code
+    9999, so callers that test ``status_code < 400`` treat it as a failure.
+    """
+    response = Mock(spec=Response)
+    response.text = ""
+    response.status_code = 9999
+    response.url = url
+    return response
+
+
+def get_remote_content(url, max_retires=2, timeout=10):
+    """Fetch ``url`` and return the response, or a placeholder on failure.
+
+    ``max_retires`` (spelling kept for compatibility) is the adapter retry
+    count; ``timeout`` is the number of seconds to wait for the server.
+    """
+    try:
+        # requests never times out by default; the session is closed on exit.
+        with requests.Session() as session:
+            session.mount(url, HTTPAdapter(max_retries=max_retires))
+            return session.get(url, headers=HEADER, timeout=timeout)
+    except requests.RequestException:
+        return mock_requests_object(url)
+
+
+def get_corrected_url(url, fix_slash="/"):
+    """Add ``http://`` to a scheme-less url and make it end with ``fix_slash``."""
+    if not url.startswith(("http://", "https://")):
+        url = f"http://{url}"
+
+    if not url.endswith(fix_slash):
+        url = f"{url}{fix_slash}"
+
+    return url
+
+
+def _collect_slugs(links, folder):
+    """Return the sorted, unique directory names that follow ``/<folder>/``."""
+    pattern = re.compile(f"/{folder}/([^/]+)/")
+    matches = (pattern.search(link) for link in links)
+    return sorted({match.group(1) for match in matches if match})
+
+
+def wp_details(target_url):
+    """Check whether WordPress is installed on a given website.
+
+    Returns a dict with ``is_wp_installed``, ``wp_version`` (content of the
+    ``generator`` meta tag, if any), ``themes`` and ``plugins``. The same
+    keys, with empty values, are returned when the page cannot be fetched.
+    """
+    details = {
+        "is_wp_installed": False,
+        "wp_version": "",
+        "themes": [],
+        "plugins": [],
+    }
+
+    target_url = get_corrected_url(target_url, fix_slash="/")
+    response = get_remote_content(target_url)
+    if response.status_code >= 400:
+        # Covers HTTP errors and the 9999 placeholder used for failed requests.
+        return details
+
+    all_link = {link[0] for link in _LINK_REGEX.findall(response.text)}
+    details["themes"] = _collect_slugs(all_link, "themes")
+    details["plugins"] = _collect_slugs(all_link, "plugins")
+
+    # Any link into one of these WordPress paths counts as an installation.
+    wp_markers = ("wp-content", "wp-includes", "wp-json")
+    if any(marker in link for link in all_link for marker in wp_markers):
+        details["is_wp_installed"] = True
+        soup_xml = BeautifulSoup(response.content, "lxml")
+        wp_version_tag = soup_xml.find("meta", attrs={"name": "generator"})
+        if wp_version_tag:
+            details["wp_version"] = wp_version_tag.get("content", "")
+
+    return details
diff --git a/examples/tutorial.py b/examples/tutorial.py
new file mode 100644
index 0000000..6fc4003
--- /dev/null
+++ b/examples/tutorial.py
@@ -0,0 +1,5 @@
+from cms_detector import wp_details
+
+result = wp_details(target_url="https://wordpress.org")
+
+print(result)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..377ba82
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*- #
+
+"""
+cms-detector: A Python Package to detect the Content Management System of a Website.
+ +MIT License +Copyright (c) 2023 SERP Wings www.serpwings.com +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+"""
+
+from pathlib import Path
+
+from setuptools import setup
+
+# Resolve README.md next to this file so the build works from any working directory.
+long_description = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
+
+version = "0.0.1"
+
+setup(
+    name="cms-detector",
+    version=version,
+    author="Faisal Shahzad",
+    author_email="seowingsorg@gmail.com",
+    description="Python Package to detect Content Management System",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/serpwings/cms-detector/",
+    project_urls={
+        "Bug Tracker": "https://github.com/serpwings/cms-detector/issues",
+        "Documentation": "https://serpwings.com/software/python-cms-detector/",
+    },
+    # Trove classifiers: keep these limited to what the package actually does.
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: System Administrators",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3 :: Only",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: WWW/HTTP :: Site Management",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Utilities",
+    ],
+    packages=["cms_detector"],
+    python_requires=">=3.9",
+    install_requires=["requests", "beautifulsoup4", "lxml"],
+    extras_require={
+        "dev": [
+            "setuptools",
+            "pytest",
+            "pytest-cov",
+            "twine",
+            "wheel",
+        ]
+    },
+)