initial commit

This commit is contained in:
2026-06-09 17:46:46 +02:00
commit 28b42492e6
18 changed files with 195 additions and 0 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 7.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 498 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 89 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.6 KiB

+2
View File
@@ -0,0 +1,2 @@
requests
argparse
+63
View File
@@ -0,0 +1,63 @@
import sys
import re
class Scorpion:
    """Inspect image files named on the command line.

    Every accepted file is read, identified from its leading magic bytes,
    hex-dumped to stdout and, for PNG files, has the chunks listed in
    ``png_cluster`` decoded.
    """

    def __init__(self, args):
        """Validate the arguments and set up the lookup tables.

        Args:
            args: Full argument vector; ``args[0]`` is skipped.
        """
        self.args = self.args_init(args[1:])
        # Magic-byte prefixes used by get_file_type() for each format.
        self.signatures = {
            "jpg": [b"\xFF\xD8\xFF"],
            "png": [b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"],
            "gif": [b"GIF87a", b"GIF89a"],
            "bmp": [b"BM"],
        }
        # PNG chunk type -> {field name: width in bytes}, in on-disk order.
        self.png_cluster = {
            "IHDR": {
                "Width": 4,
                "Height": 4,
                "Bit Depth": 1,
                "Color Type": 1,
                "Compression": 1,
                "Filter": 1,
                "Interlace": 1,
            },
        }

    def args_init(self, args: list[str]) -> set[str]:
        """Return the set of arguments that carry a supported image extension.

        If any argument has an unsupported extension, an error is printed
        and an empty set is returned so that ``run()`` has nothing to do
        (previously ``None`` was returned, which made ``run()`` raise).
        """
        pattern = re.compile(r'^.*\.(?:jpe?g|png|gif|bmp)$', re.IGNORECASE)
        argset = set()
        for arg in args:
            # Echo each argument as it is examined.
            print(arg)
            if not pattern.match(arg):
                print("Error : Invalid file")
                return set()
            argset.add(arg)
        return argset

    def hexdump(self, data: bytes, width: int = 16):
        """Print ``data`` as offset / hex bytes / printable-ASCII rows.

        Args:
            data: Bytes to dump.
            width: Number of bytes shown per row.
        """
        for i in range(0, len(data), width):
            chunk = data[i:i + width]
            hex_bytes = " ".join(f"{b:02X}" for b in chunk)
            # Bytes outside the printable ASCII range are shown as ".".
            ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
            print(f"{i:08X} {hex_bytes:<{width*3}} {ascii_part}")

    def get_file_type(self, data: bytes):
        """Return the key of the first signature ``data`` starts with, else None."""
        for filetype, sigs in self.signatures.items():
            if any(data.startswith(sig) for sig in sigs):
                return filetype
        return None

    def run(self):
        """Hex-dump every accepted file and decode the PNG ones.

        A file that cannot be read is reported and skipped, and a non-PNG
        file no longer ends the loop, so the remaining files are still
        processed.
        """
        for arg in self.args:
            try:
                with open(arg, "rb") as f:
                    data = f.read()
            except OSError as err:
                print(f"Error : cannot read {arg} ({err})")
                continue
            filetype = self.get_file_type(data)
            self.hexdump(data)
            # Only PNG has a format-specific decoder so far.
            if filetype == "png":
                self.read_png(data)

    def read_png(self, data):
        """Decode and print the PNG chunks described in ``png_cluster``.

        A PNG stream is the 8-byte signature followed by chunks laid out as
        4-byte big-endian length, 4-byte type, payload, 4-byte CRC.

        Args:
            data: Raw bytes of the file.

        Returns:
            ``{chunk type: {field name: int}}`` for every described chunk
            found; empty when ``data`` lacks the PNG signature. Fields that
            fall beyond a truncated payload are omitted.
        """
        signature = self.signatures["png"][0]
        if not data.startswith(signature):
            return {}
        decoded = {}
        offset = len(signature)
        # Stop as soon as every described chunk type has been decoded.
        while offset + 8 <= len(data) and len(decoded) < len(self.png_cluster):
            length = int.from_bytes(data[offset:offset + 4], "big")
            chunk_type = data[offset + 4:offset + 8].decode("ascii", errors="replace")
            payload = data[offset + 8:offset + 8 + length]
            layout = self.png_cluster.get(chunk_type)
            if layout is not None and chunk_type not in decoded:
                fields = {}
                cursor = 0
                for name, size in layout.items():
                    if cursor + size > len(payload):
                        break  # truncated chunk: keep what was decoded
                    fields[name] = int.from_bytes(payload[cursor:cursor + size], "big")
                    cursor += size
                decoded[chunk_type] = fields
                print(f"{chunk_type}:")
                for name, value in fields.items():
                    print(f"  {name}: {value}")
            # Advance past length (4) + type (4) + payload + CRC (4).
            offset += 12 + length
        return decoded
if __name__ == "__main__":
    # Script entry point: inspect the files named on the command line.
    Scorpion(sys.argv).run()
+130
View File
@@ -0,0 +1,130 @@
import argparse
import re
import sys
from pathlib import Path
from urllib.parse import urljoin

import requests
import tqdm
class Spider:
    """Crawl pages under a target URL and download the images they reference."""

    # Not used inside this class; kept so existing references to
    # ``Spider.parser`` keep working.
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )

    # Seconds to wait on each HTTP request; without a timeout a stalled
    # server would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, target: str, recursive: bool, level: int, path: str):
        """Store the crawl settings.

        Args:
            target: URL the crawl starts from; also used to decide whether
                an absolute link belongs to the crawled site.
            recursive: Whether to follow links found on fetched pages.
            level: Maximum crawl depth; a falsy value selects 5.
            path: Output folder; a falsy value selects ``./data``.
        """
        self.target = target
        self.recursive = recursive
        self.level = level if level else 5
        self.path = path if path else "./data"
        self.href_reg = re.compile(r'href=["\'](.*?)["\']', re.IGNORECASE)
        self.img_reg = re.compile(r'src=["\'](.*?\.(?:jpg|jpeg|png|gif|bmp))["\']', re.IGNORECASE)
        self.visited = set()
        self.imgs = set()

    def run(self):
        """Crawl from the target, collect image URLs, then download them."""
        self.get_page([self.target], self.level if self.recursive else 1)
        if not self.visited:
            print("Error: couldn't reach target :", self.target)
            return
        for url in self.visited:
            self.get_img(url)
        for img in self.imgs:
            self.download_image(img)

    def is_local(self, href):
        """Return True when ``href`` should be followed or downloaded.

        Rejected: empty values, ``/`` and ``#``, stylesheet links, and any
        absolute / protocol-relative / ``mailto:`` / ``javascript:`` link
        that does not contain the target URL.
        """
        if not href:
            return False
        href = href.strip()
        if href in ("", "/", "#"):
            return False
        external_prefixes = ("http://", "https://", "//", "mailto:", "javascript:")
        if href.startswith(external_prefixes) and self.target not in href:
            return False
        if href.endswith(".css"):
            return False
        return True

    def _absolute_url(self, page_url, link):
        """Resolve ``link`` against the page it was found on."""
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page, unlike plain concatenation with the target.
        return urljoin(page_url, link.strip())

    def get_page(self, targets, depth):
        """Fetch each URL in ``targets`` and recurse into the links it holds.

        Args:
            targets: URLs to fetch.
            depth: Remaining levels to descend; nothing is fetched when it
                is zero or negative.
        """
        if depth <= 0 or not targets:
            return
        for target in targets:
            if target in self.visited:
                continue
            try:
                r = requests.get(target, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                continue
            if r.status_code != 200:
                continue
            self.visited.add(target)
            next_links = [
                self._absolute_url(target, link)
                for link in self.href_reg.findall(r.text)
                if self.is_local(link)
            ]
            self.get_page(next_links, depth - 1)

    def get_img(self, url):
        """Record in ``self.imgs`` the image URLs referenced by the page at ``url``."""
        try:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        for img in self.img_reg.findall(r.text):
            if self.is_local(img):
                self.imgs.add(self._absolute_url(url, img))

    def download_image(self, img):
        """Download ``img`` into the output folder, skipping existing files.

        The file name is the image URL with every ``/`` replaced by ``_``.
        A download that fails part-way removes the partial file so that a
        later run does not skip it as already present.
        """
        folder = Path(self.path)
        file_path = folder / img.replace("/", "_")
        # Check before fetching so an existing file costs no request.
        if file_path.exists():
            print("File :" + str(file_path) + " already exists, skipping...")
            return
        try:
            r = requests.get(img, stream=True, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            print("Failed to fetch : " + img)
            return
        # The context manager releases the streamed connection.
        with r:
            if r.status_code != 200:
                # Avoid saving an error page under an image name.
                print("Failed to fetch : " + img)
                return
            folder.mkdir(parents=True, exist_ok=True)
            # Content-Length may be absent (e.g. chunked responses); tqdm
            # accepts total=None and then shows no percentage.
            length = r.headers.get("Content-Length", "")
            total = int(length) // 1024 if length.isdigit() else None
            try:
                with file_path.open("wb") as handle:
                    for data in tqdm.tqdm(
                        r.iter_content(chunk_size=1024),
                        unit="kB",
                        total=total,
                        desc="Downloading : " + img,
                        ncols=80,
                    ):
                        handle.write(data)
            except (OSError, requests.RequestException):
                file_path.unlink(missing_ok=True)
                print("Couldn't write to : " + str(file_path))
                return
def main():
    """Parse the command line and run the spider.

    ``-l`` is only accepted together with ``-r`` and must be at least 1.
    The check uses ``is not None`` so that ``-l 0`` is rejected instead of
    being silently treated as "not given".
    """
    parser = argparse.ArgumentParser(
        description="Spider : An image extraction script",
    )
    parser.add_argument("-r", "--recursive", action="store_true", help="Recursively download the image in an URL received as a parameter")
    parser.add_argument("-l", "--level", type=int, help="Recursion level (default:5)")
    parser.add_argument("-p", "--path", type=str, help="Output folder")
    parser.add_argument("url", help="Target URL")
    args = parser.parse_args()
    if args.level is not None:
        if not args.recursive:
            parser.error("-l requires -r")
        if args.level < 1:
            parser.error("-l must be a positive integer")
    spider = Spider(args.url, args.recursive, args.level, args.path)
    spider.run()
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script.
    main()