import requests
import argparse
import os
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
internal_links = set()
external_links = set()
urls = []
total_links_visited = 0
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
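# Quick sanity check (illustrative values only):
#   is_valid("https://example.com/page")   -> True  (has scheme and netloc)
#   is_valid("/about")                      -> False (relative path, no scheme/netloc)
#   is_valid("mailto:someone@example.com")  -> False (no netloc)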
def get_all_website_links(url):
    global urls
    domain_name = urlparse(url).netloc
    # Skip pages that fail to load instead of crashing the crawler (10-second timeout).
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        return
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href_tag = a_tag.attrs.get("href")
        if not href_tag:
            continue
        # Resolve relative links and strip query strings/fragments.
        href_tag = urljoin(url, href_tag)
        parsed_href = urlparse(href_tag)
        href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href_tag):
            continue
        # Compare domain_name with the netloc of href_tag, not the complete URL.
        if domain_name not in urlparse(href_tag).netloc:
            if href_tag not in external_links:
                print(f"External link: {href_tag}")
                external_links.add(href_tag)
            continue
        if href_tag not in urls:
            print(f"Internal link: {href_tag}")
            urls.append(href_tag)
            internal_links.add(href_tag)
def crawl(url, max_urls=50):
    global total_links_visited, urls
    total_links_visited += 1
    get_all_website_links(url)
    # Loop over the global list of internal links rather than recursing only on
    # the links found on this page: if a page had no internal links (or failed
    # to load), the crawler would otherwise stop early.
    for link in urls:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
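# Minimal usage sketch (assumption: driving the crawler from Python rather than
# the CLI defined in main() below; the URL is a placeholder):
#
#   crawl("https://example.com", max_urls=20)
#   print(len(internal_links), "internal /", len(external_links), "external")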
def main():
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Maximum number of URLs to crawl, default is 30.", default=30, type=int)
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls
    domain_name = urlparse(url).netloc
    # Reload any previously saved links so a restarted crawl can resume where it left off.
    if os.path.exists(f"{domain_name}_internal_links.txt"):
        with open(f"{domain_name}_internal_links.txt", "r") as f:
            for line in f:
                internal_links.add(line.strip())
                urls.append(line.strip())
    if os.path.exists(f"{domain_name}_external_links.txt"):
        with open(f"{domain_name}_external_links.txt", "r") as f:
            for line in f:
                external_links.add(line.strip())
    crawl(url, max_urls=max_urls)
    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total URLs:", len(external_links) + len(internal_links))
    # Persist the results so the next run can pick up where this one stopped.
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_links:
            print(internal_link.strip(), file=f)
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_links:
            print(external_link.strip(), file=f)
if __name__ == "__main__":
    main()
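# Example invocation (assumption: the script is saved as link_extractor.py;
# the URL and limit are placeholders):
#
#   python link_extractor.py https://example.com --max-urls 50
#
# Results are written to <domain>_internal_links.txt and
# <domain>_external_links.txt in the current directory.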