initial commit
This commit is contained in:
@@ -0,0 +1,130 @@
|
||||
import argparse
import re
import sys
from pathlib import Path
from urllib.parse import urldefrag, urljoin

import requests
import tqdm
|
||||
|
||||
class Spider:
    """Crawl a website and download the images its pages reference.

    Pages are fetched with ``requests`` and scanned with regular expressions
    for ``href`` attributes (links to follow) and ``src`` attributes ending in
    a known image extension (images to download).

    Args:
        target: URL the crawl starts from; absolute links must contain this
            string to be followed.
        recursive: When true, follow links up to ``level`` pages deep;
            otherwise only the target page is fetched.
        level: Maximum crawl depth; a falsy value selects the default of 5.
        path: Output folder; a falsy value selects ``"./data"``.
    """

    # Not used by the class itself; kept so existing ``Spider.parser``
    # references keep working.
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )

    # Seconds to wait on a server before giving up; without a timeout a
    # single stalled connection would hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # URL schemes that can never be fetched as a page or an image.
    NON_HTTP_SCHEMES = ("mailto:", "javascript:", "tel:", "data:")

    def __init__(self, target: str, recursive: bool, level: int, path: str):
        self.target = target
        self.recursive = recursive
        self.level = level if level else 5
        self.path = path if path else "./data"
        self.href_reg = re.compile(r'href=["\'](.*?)["\']', re.IGNORECASE)
        self.img_reg = re.compile(r'src=["\'](.*?\.(?:jpg|jpeg|png|gif|bmp))["\']', re.IGNORECASE)
        # URLs of pages that were fetched successfully.
        self.visited = set()
        # Absolute URLs of the images found on visited pages.
        self.imgs = set()

    def run(self):
        """Crawl from the target, collect image URLs, then download them."""
        self.get_page([self.target], self.level if self.recursive else 1)

        if not self.visited:
            print("Error: couldn't reach target :", self.target)
            return
        for url in self.visited:
            self.get_img(url)

        for img in self.imgs:
            self.download_image(img)

    def is_local(self, href):
        """Return True when ``href`` is worth following for this crawl.

        Empty values, bare ``/`` or ``#``, stylesheets, non-HTTP schemes and
        absolute URLs that do not contain the target are rejected; every
        other value (including any relative link) is accepted.
        """
        if not href:
            return False
        href = href.strip()
        if href in ("", "/", "#"):
            return False
        # Schemes are case-insensitive, so compare on a lowered copy.
        lowered = href.lower()
        if lowered.startswith(self.NON_HTTP_SCHEMES):
            return False
        if lowered.startswith(("http://", "https://", "//")) and self.target not in href:
            return False
        if lowered.endswith(".css"):
            return False
        return True

    @staticmethod
    def _resolve(base, href):
        """Return ``href`` as an absolute URL relative to ``base``, without fragment.

        Returns None when the link is too malformed to be joined.
        """
        try:
            # Dropping the fragment keeps "page" and "page#top" from being
            # crawled as two different pages.
            return urldefrag(urljoin(base, href.strip()))[0]
        except ValueError:
            return None

    def _fetch_text(self, url):
        """Return the body of ``url`` as text, or None when it cannot be fetched."""
        try:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except (requests.RequestException, ValueError):
            # Links scraped from HTML are frequently malformed or dead;
            # skipping them keeps the crawl best-effort.
            return None
        if r.status_code != 200:
            return None
        return r.text

    def get_page(self, targets, depth):
        """Fetch every URL in ``targets`` and recurse into the links found.

        Successfully fetched URLs are recorded in ``self.visited``. Recursion
        stops when ``depth`` is exhausted; a non-positive depth fetches
        nothing.
        """
        if depth <= 0 or not targets:
            return

        for target in targets:
            if target in self.visited:
                continue
            html = self._fetch_text(target)
            if html is None:
                continue
            self.visited.add(target)

            next_links = []
            for link in self.href_reg.findall(html):
                if not self.is_local(link):
                    continue
                # Relative links are resolved against the page they were
                # found on, not against the crawl root.
                resolved = self._resolve(target, link)
                if resolved:
                    next_links.append(resolved)

            self.get_page(next_links, depth - 1)

    def get_img(self, url):
        """Fetch ``url`` and add the image URLs it references to ``self.imgs``."""
        html = self._fetch_text(url)
        if html is None:
            return None

        for src in self.img_reg.findall(html):
            if not self.is_local(src):
                continue
            resolved = self._resolve(url, src)
            if resolved:
                self.imgs.add(resolved)

    def download_image(self, img):
        """Download ``img`` into the output folder, skipping existing files.

        The file name is the image URL with every ``/`` replaced by ``_``.
        Failures are reported on stdout and never raised.
        """
        folder = Path(self.path)
        file_path = folder / img.replace("/", "_")

        # Check before fetching so already-downloaded images cost no request.
        if file_path.exists():
            print("File :" + str(file_path) + " already exists, skipping...")
            return

        try:
            r = requests.get(img, stream=True, timeout=self.REQUEST_TIMEOUT)
        except (requests.RequestException, ValueError):
            print("Failed to fetch : " + img)
            return

        try:
            if r.status_code != 200:
                # Do not save an error page under an image file name.
                print("Failed to fetch : " + img)
                return

            # Content-Length is optional (e.g. chunked responses); tqdm
            # accepts total=None and then shows a plain counter.
            length = r.headers.get("Content-Length", "")
            total = int(length) // 1024 if length.isdigit() else None

            folder.mkdir(parents=True, exist_ok=True)
            with file_path.open("wb") as handle:
                for data in tqdm.tqdm(
                    r.iter_content(chunk_size=1024),
                    unit="kB",
                    total=total,
                    desc="Downloading : " + img,
                    ncols=80,
                ):
                    handle.write(data)
        except (OSError, requests.RequestException):
            print("Couldn't write to : " + str(file_path))
            # Remove the partial file, otherwise the next run would skip it
            # as "already exists". Best-effort: nothing more can be done if
            # the removal itself fails.
            try:
                file_path.unlink()
            except OSError:
                pass
        finally:
            r.close()
|
||||
|
||||
def main():
    """Parse the command line and run a Spider on the requested URL.

    Exits through ``parser.error`` (status 2) when ``-l`` is given without
    ``-r`` or with a value below 1.
    """
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )
    parser.add_argument("-r", "--recursive", action="store_true", help="Recursively download the image in an URL received as a parameter")
    parser.add_argument("-l", "--level", type=int, help="Recursion level (default:5)")
    parser.add_argument("-p", "--path", type=str, help="Output folder")
    parser.add_argument("url", help="Target URL")
    args = parser.parse_args()

    # Compare against None rather than truthiness so that "-l 0" is
    # validated as well instead of slipping past both checks.
    if args.level is not None:
        if not args.recursive:
            parser.error("-l requires -r")
        if args.level < 1:
            parser.error("-l must be a positive integer")

    spider = Spider(args.url, args.recursive, args.level, args.path)
    spider.run()
|
||||
|
||||
|
||||
# Script entry point: start the CLI on direct execution only, never on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user