mirror of
https://github.com/simon987/Misc-Download-Scripts.git
synced 2025-04-03 12:42:58 +00:00
Added charlierose.com
This commit is contained in:
parent
8212793dc5
commit
5566add768
19294
charlierose.com/links.csv
Normal file
19294
charlierose.com/links.csv
Normal file
File diff suppressed because it is too large
Load Diff
19292
charlierose.com/links_sorted.csv
Normal file
19292
charlierose.com/links_sorted.csv
Normal file
File diff suppressed because it is too large
Load Diff
49
charlierose.com/run.py
Normal file
49
charlierose.com/run.py
Normal file
@ -0,0 +1,49 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import csv
|
||||
import multiprocessing
|
||||
|
||||
|
||||
# Browser-like request headers so charlierose.com serves the normal HTML pages.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    # BUG FIX: the HTTP header is spelled "Referer" (RFC 7231). The original
    # key "Referrer" is sent as an unrecognized header and ignored by servers.
    "Referer": "https://charlierose.com/videos",
}
|
||||
|
||||
|
||||
def request_timeout(url):
    """GET *url* with a 15 s timeout, retrying indefinitely on network errors.

    Returns the requests.Response once a request completes (any status code).
    Prints "!" for each failed attempt as a crude progress indicator.
    """
    while True:
        try:
            # FIX: attach the browser-like `headers` dict defined above —
            # the original built it but never passed it to requests.get().
            return requests.get(url, headers=headers, timeout=15)
        except requests.exceptions.RequestException:
            # Narrowed from a blanket `except Exception`: only retry
            # network-level failures (timeouts, connection resets, DNS).
            print("!", end="", flush=True)
|
||||
|
||||
|
||||
def get_video_info(url):
    """Fetch *url*, extract the video description, append (url, desc) to links.csv.

    Pages without the expected markup (404s, missing description div) are
    logged to stdout and skipped.
    """
    r = request_timeout(url)
    soup = BeautifulSoup(r.text, "html.parser")

    try:
        # BUG FIX: the original passed the *set* {"class", "description"} as
        # attrs (a comma typo for a dict), so find() never matched by class
        # and every page fell through to the except branch.
        desc = soup.find("div", attrs={"class": "description"}).find("p").text

        # newline='' is required by the csv module to avoid blank/doubled
        # line endings on some platforms.
        # NOTE(review): many pool workers append to this file concurrently;
        # short appends are usually written whole, but a lock (or one writer
        # process) would be safer — confirm acceptable for this scrape.
        with open('links.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow([url, desc])

    except Exception as e:
        print(e)
        print("Invalid " + url)
|
||||
|
||||
|
||||
# FIX: guard the pool setup with __main__ — worker processes re-import this
# module under the spawn start method, and the original module-level Pool
# would be re-created recursively in every worker.
if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=50)

    # Probe every plausible video id; pages that don't exist are detected
    # and skipped inside get_video_info.
    urls = ["https://charlierose.com/videos/" + str(i) for i in range(0, 35000)]

    pool.map(get_video_info, urls)
|
22
charlierose.com/sort_remove_duplicates.py
Normal file
22
charlierose.com/sort_remove_duplicates.py
Normal file
@ -0,0 +1,22 @@
|
||||
import csv
|
||||
import os
|
||||
|
||||
# Read links.csv and drop duplicate rows, preserving first-seen order.
# BUG FIX: Python 3's csv module requires a text-mode file — the original
# opened with 'rb' (a Python 2 idiom), which raises on the first read.
# newline='' per the csv module docs.
# Also replaced the O(n^2) `row not in rows` list scan with a set of tuples.
rows = []
seen = set()
with open("links.csv", newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            rows.append(row)
|
||||
|
||||
|
||||
def get_key(item):
    """Sort key: the numeric video id, i.e. the last path component of the
    row's URL (first CSV column)."""
    url = item[0]
    video_id = os.path.basename(url)
    return int(video_id)
|
||||
|
||||
# Sort the deduplicated rows by numeric video id and write them out.
sorted_list = sorted(rows, key=get_key)

# FIX: newline='' stops the csv module emitting blank/doubled line endings.
# NOTE(review): mode 'a' appends, so rerunning the script duplicates the
# output file's contents; 'w' looks intended — confirm before changing.
with open('links_sorted.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(sorted_list)
|
Loading…
x
Reference in New Issue
Block a user