"""Spider: crawl a web site and download the images it references."""

import argparse
import re
import sys
from pathlib import Path
from typing import Optional
from urllib.parse import urldefrag, urljoin

import requests
import tqdm

# Seconds to wait for a server before giving up on a single request, so one
# stalled host cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


class Spider:
    """Collect pages reachable from a target URL and download their images.

    Pages are discovered by following ``href`` attributes, images by scanning
    ``src`` attributes ending in jpg/jpeg/png/gif/bmp. Both are matched with
    regular expressions on the raw response text, not with an HTML parser.
    """

    # Not used by the class itself; kept so ``Spider.parser`` stays available
    # to any existing caller.
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )

    def __init__(self, target: str, recursive: bool, level: Optional[int],
                 path: Optional[str]):
        """Store the crawl settings.

        Args:
            target: URL the crawl starts from; also used as the substring
                that marks an absolute link as belonging to the site.
            recursive: Follow links found on the pages when true.
            level: Maximum crawl depth; a falsy value selects 5.
            path: Output folder; a falsy value selects ``./data``.
        """
        self.target = target
        self.recursive = recursive
        self.level = level if level else 5
        self.path = path if path else "./data"
        self.href_reg = re.compile(r'href=["\'](.*?)["\']', re.IGNORECASE)
        self.img_reg = re.compile(
            r'src=["\'](.*?\.(?:jpg|jpeg|png|gif|bmp))["\']', re.IGNORECASE
        )
        self.visited = set()  # URLs of pages fetched with status 200
        self.imgs = set()  # absolute URLs of images to download

    def run(self):
        """Crawl from the target, collect image URLs and download them."""
        self.get_page([self.target], self.level if self.recursive else 1)
        if not self.visited:
            print("Error: couldn't reach target :", self.target)
            return
        for url in self.visited:
            self.get_img(url)
        for img in self.imgs:
            self.download_image(img)

    def is_local(self, href):
        """Return True when *href* should be treated as part of the site.

        Empty values, ``/`` and ``#`` are rejected, as are stylesheets.
        Absolute, protocol-relative, ``mailto:`` and ``javascript:`` links
        are accepted only if they contain the target URL.
        """
        if not href:
            return False
        href = href.strip()
        if href in ("", "/", "#"):
            return False
        external_prefixes = ("http://", "https://", "//", "mailto:", "javascript:")
        if href.startswith(external_prefixes) and self.target not in href:
            return False
        if href.endswith(".css"):
            return False
        return True

    def _fetch(self, url, **kwargs):
        """GET *url* and return the response, or None if the request failed."""
        try:
            return requests.get(url, timeout=REQUEST_TIMEOUT, **kwargs)
        except (requests.RequestException, ValueError):
            # ValueError also covers the UnicodeError that can surface for a
            # malformed host name; one bad link must not abort the crawl.
            return None

    @staticmethod
    def _resolve(base, link):
        """Turn *link* found on page *base* into an absolute URL, sans fragment."""
        return urldefrag(urljoin(base, link.strip()))[0]

    def get_page(self, targets, depth):
        """Fetch each URL in *targets* and recurse into the links it contains.

        Args:
            targets: URLs to fetch; ones already in ``self.visited`` are skipped.
            depth: Remaining levels to crawl; nothing is fetched at 0 or below.
        """
        if depth <= 0 or not targets:
            return
        for target in targets:
            if target in self.visited:
                continue
            r = self._fetch(target)
            if r is None or r.status_code != 200:
                continue
            self.visited.add(target)
            # Resolve against the page the link was found on so relative
            # paths produce valid URLs.
            next_links = [
                self._resolve(target, link)
                for link in self.href_reg.findall(r.text)
                if self.is_local(link)
            ]
            self.get_page(next_links, depth - 1)

    def get_img(self, url):
        """Add the local image URLs referenced by page *url* to ``self.imgs``."""
        r = self._fetch(url)
        if r is None or r.status_code != 200:
            return None
        for src in self.img_reg.findall(r.text):
            if self.is_local(src):
                self.imgs.add(self._resolve(url, src))
        return None

    def download_image(self, img):
        """Download *img* into the output folder, showing a progress bar.

        The file is named after the URL with ``/`` replaced by ``_``. An
        existing file is never overwritten, and a file left incomplete by a
        failed download is removed so a later run can retry it.
        """
        folder = Path(self.path)
        folder.mkdir(parents=True, exist_ok=True)
        file_path = folder / img.replace("/", "_")
        # Check before fetching so no request is spent on a file we keep.
        if file_path.exists():
            print("File :" + str(file_path) + " already exists, skipping...")
            return

        r = self._fetch(img, stream=True)
        if r is None:
            print("Failed to fetch : " + img)
            return
        with r:  # releases the streamed connection on every exit path
            if r.status_code != 200:
                # Do not save an error page under an image file name.
                print("Failed to fetch : " + img)
                return
            # Content-Length is optional; without it tqdm shows a plain counter.
            length = r.headers.get("Content-Length", "")
            total = int(length) // 1024 if length.isdigit() else None
            try:
                with file_path.open("wb") as handle:
                    for data in tqdm.tqdm(
                        r.iter_content(chunk_size=1024),
                        unit="kB",
                        total=total,
                        desc="Downloading : " + img,
                        ncols=80,
                    ):
                        handle.write(data)
            except (OSError, requests.RequestException):
                print("Couldn't write to : " + str(file_path))
                try:
                    file_path.unlink()
                except OSError:
                    pass  # best-effort cleanup; the file may never have existed


def main():
    """Parse the command line and run the spider."""
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )
    parser.add_argument(
        "-r", "--recursive", action="store_true",
        help="Recursively download the image in an URL received as a parameter",
    )
    parser.add_argument("-l", "--level", type=int,
                        help="Recursion level (default:5)")
    parser.add_argument("-p", "--path", type=str, help="Output folder")
    parser.add_argument("url", help="Target URL")
    args = parser.parse_args()
    # Compare against None so that "-l 0" is validated as well.
    if args.level is not None:
        if not args.recursive:
            parser.error("-l requires -r")
        if args.level < 1:
            parser.error("-l must be a positive integer")
    spider = Spider(args.url, args.recursive, args.level, args.path)
    spider.run()


if __name__ == "__main__":
    main()