mirror of
				https://github.com/simon987/scripts.git
				synced 2025-10-25 22:36:53 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			145 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			145 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| import urllib.request, urllib.error, urllib.parse, argparse, logging
 | |
| import os, re, time
 | |
| import http.client 
 | |
| import fileinput
 | |
| from multiprocessing import Process
 | |
| 
 | |
| log = logging.getLogger('inb4404')
 | |
| workpath = os.path.dirname(os.path.realpath(__file__))
 | |
| args = None
 | |
| 
 | |
def main():
    """Parse the command line, configure logging and start downloading.

    The parsed options are stored in the module-level ``args`` so that the
    download functions can read them. The single positional argument is
    treated as a thread URL when it starts with ``http`` (case-insensitive);
    anything else is treated as the name of a queue file.

    Note that ``--use-names`` is declared with ``default=True``, so the
    option is on whether or not the flag is passed.
    """
    global args

    cli = argparse.ArgumentParser(description='inb4404')
    cli.add_argument(
        'thread', nargs=1,
        help='url of the thread (or filename; one url per line)')
    cli.add_argument(
        '-c', '--with-counter', action='store_true',
        help='show a counter next the the image that has been downloaded')
    cli.add_argument(
        '-d', '--date', action='store_true',
        help='show date as well')
    cli.add_argument(
        '-l', '--less', action='store_true',
        help='show less information (surpresses checking messages)')
    cli.add_argument(
        '-n', '--use-names', default=True, action='store_true',
        help='use thread names instead of the thread ids (...4chan.org/board/thread/thread-id/thread-name)')
    cli.add_argument(
        '-r', '--reload', action='store_true',
        help='reload the queue file every 5 minutes')
    args = cli.parse_args()

    # Only the timestamp format depends on --date; everything else is shared.
    stamp_format = '%Y-%m-%d %I:%M:%S %p' if args.date else '%I:%M:%S %p'
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(message)s',
        datefmt=stamp_format,
    )

    target = args.thread[0].strip()
    is_url = target[:4].lower() == 'http'
    handler = download_thread if is_url else download_from_file
    handler(target)
 | |
| 
 | |
def load(url):
    """Fetch ``url`` and return the complete response body as bytes.

    The request is sent with a fixed ``User-Agent`` header. Errors raised by
    ``urllib.request.urlopen`` (for example ``urllib.error.HTTPError`` and
    ``urllib.error.URLError``) propagate to the caller unchanged.
    """
    req = urllib.request.Request(url, headers={'User-Agent': '4chan Browser'})
    # Close the response deterministically: this function is called in an
    # endless polling loop, so relying on garbage collection leaks handles.
    with urllib.request.urlopen(req) as response:
        return response.read()
 | |
| 
 | |
def download_thread(thread_link):
    """Poll one thread until it disappears, saving every image it links to.

    ``thread_link`` is split on ``/``: element 3 is used as the board and
    element 5 as the thread id. When a seventh element (the thread name) is
    present it replaces the id as the directory name if ``args.use_names`` is
    set or a directory with that name already exists.

    Each image is written to ``downloads/<board>/<thread>/`` and copied to
    ``new/<board>/<thread>/`` (both under ``workpath``). Only files missing
    from the ``downloads`` directory are fetched, so deleting a file there
    causes it to be downloaded again while deleting the copy in ``new`` does
    not.

    The loop ends when the thread page still answers with an HTTP error after
    a 10 second grace period; other network errors are treated as transient.
    """
    link_parts = thread_link.split('/')
    board = link_parts[3]
    thread = link_parts[5].split('#')[0]
    if len(link_parts) > 6:
        thread_name = link_parts[6].split('#')[0]
        named_directory = os.path.join(workpath, 'downloads', board, thread_name)
        if args.use_names or os.path.exists(named_directory):
            thread = thread_name

    directory = os.path.join(workpath, 'downloads', board, thread)
    copy_directory = os.path.join(workpath, 'new', board, thread)
    # exist_ok avoids a crash when a sibling worker process creates the same
    # parent directory between an existence check and the makedirs call.
    os.makedirs(directory, exist_ok=True)

    # Raw string: the escapes below are regex escapes, not string escapes.
    # Group 1 is the protocol-relative image URL, group 2 the file name.
    image_re = re.compile(
        r'(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))')
    transient_errors = (
        urllib.error.URLError,
        http.client.BadStatusLine,
        http.client.IncompleteRead,
    )

    while True:
        try:
            page = load(thread_link).decode('utf-8')
            # set() drops repeated links; sort by file name for stable output.
            images = sorted(set(image_re.findall(page)), key=lambda found: found[1])
            total = len(images)

            for count, (link, img) in enumerate(images, start=1):
                img_path = os.path.join(directory, img)
                if os.path.exists(img_path):
                    continue

                data = load('https:' + link)

                output_text = board + '/' + thread + '/' + img
                if args.with_counter:
                    output_text = ('[' + str(count).rjust(len(str(total)))
                                   + '/' + str(total) + '] ' + output_text)
                log.info(output_text)

                with open(img_path, 'wb') as f:
                    f.write(data)

                # Second copy in the 'new' tree; created lazily so threads
                # without new images do not get an empty directory.
                os.makedirs(copy_directory, exist_ok=True)
                with open(os.path.join(copy_directory, img), 'wb') as f:
                    f.write(data)

        except urllib.error.HTTPError:
            # The thread page or one image failed. Wait, then probe the thread
            # page itself to decide whether the thread is gone.
            time.sleep(10)
            try:
                load(thread_link)
            except urllib.error.HTTPError:
                log.info('%s 404\'d', thread_link)
                break
            except transient_errors:
                # A network hiccup during the probe must not kill the worker.
                if not args.less:
                    log.warning('Something went wrong')
            continue
        except transient_errors:
            if not args.less:
                log.warning('Something went wrong')

        if not args.less:
            log.info('Checking %s/%s', board, thread)
        time.sleep(20)
 | |
| 
 | |
def download_from_file(filename):
    """Start one ``download_thread`` worker process per URL listed in a file.

    Only lines whose first four characters are ``http`` are treated as links;
    a URL listed more than once is started only once.

    Without ``args.reload`` the workers are started and the function returns.
    With ``args.reload`` the function loops forever: it sleeps five minutes,
    terminates workers that are still alive, marks the links of workers that
    already exited by prefixing their line in the file with ``-`` (so they
    are skipped from then on), and re-reads the file.
    """
    running_links = []
    while True:
        with open(filename) as queue_file:
            # dict.fromkeys removes duplicates while keeping file order, so a
            # repeated URL cannot spawn two workers for the same thread.
            links = list(dict.fromkeys(
                line.strip() for line in queue_file if line[:4] == 'http'))

        processes = []
        for link in links:
            if link not in running_links:
                running_links.append(link)
                log.info('Added ' + link)

            # Workers are restarted on every pass because the previous pass
            # terminated the ones that were still running.
            process = Process(target=download_thread, args=(link, ))
            process.start()
            processes.append((process, link))

        if not processes:
            log.warning(filename + ' empty')

        if not args.reload:
            break

        time.sleep(60 * 5)  # 5 minutes
        finished_links = []
        for process, link in processes:
            if process.is_alive():
                process.terminate()
            else:
                finished_links.append(link)

        if finished_links:
            finished = set(finished_links)
            # Rewrite the file once. Whole lines are compared so that a link
            # which is merely a prefix of another link leaves that one alone.
            with fileinput.input(filename, inplace=True) as queue_file:
                for line in queue_file:
                    if line[:4] == 'http' and line.strip() in finished:
                        line = '-' + line
                    print(line, end='')

        for link in finished_links:
            running_links.remove(link)
            log.info('Removed ' + link)

        if not args.less:
            # args.thread[0] is the queue file name here; it is re-read on
            # the next loop iteration.
            log.info('Reloading ' + args.thread[0])
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Script entry point. An interrupt (Ctrl-C) raised while main() runs is
    # swallowed so that no traceback is printed for it.
    try:
        main()
    except KeyboardInterrupt:
        pass
 | |
| 
 |