Added charlierose.com

simon987 2018-04-23 12:44:08 -04:00
parent 8212793dc5
commit 5566add768
4 changed files with 38657 additions and 0 deletions

19294
charlierose.com/links.csv Normal file

File diff suppressed because it is too large

19292
charlierose.com/links_sorted.csv Normal file

File diff suppressed because it is too large

49
charlierose.com/run.py Normal file

@@ -0,0 +1,49 @@
import csv
import multiprocessing

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    # The HTTP header is spelled "Referer", not "Referrer"
    "Referer": "https://charlierose.com/videos"
}


def request_timeout(url):
    # Retry indefinitely on network errors, printing "!" for each failed attempt
    while True:
        try:
            return requests.get(url, headers=headers, timeout=15)
        except Exception:
            print("!", end="", flush=True)
            continue


def get_video_info(url):
    r = request_timeout(url)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        # attrs must be a dict; the original set literal {"class", "description"}
        # broke the lookup
        desc = soup.find("div", attrs={"class": "description"}).find("p").text
        with open("links.csv", "a", newline="") as csv_file:
            csv_writer = csv.writer(csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow([url, desc])
    except Exception as e:
        print(e)
        print("Invalid " + url)


if __name__ == "__main__":
    # Guard keeps the 50 worker processes from re-running the pool setup on import
    urls = ["https://charlierose.com/videos/" + str(i) for i in range(0, 35000)]
    pool = multiprocessing.Pool(processes=50)
    pool.map(get_video_info, urls)
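One caveat in run.py as written: fifty worker processes append to links.csv at once, and nothing stops two rows from interleaving mid-write. A minimal sketch of a safer pattern, reusing run.py's imports and request_timeout, with a hypothetical get_video_desc helper that returns its result so the parent process does all the CSV writing:

# Sketch only: workers return (url, desc) tuples (or None on failure);
# the parent writes every row, so concurrent appends cannot interleave.
def get_video_desc(url):  # hypothetical variant of get_video_info
    soup = BeautifulSoup(request_timeout(url).text, "html.parser")
    try:
        desc = soup.find("div", attrs={"class": "description"}).find("p").text
        return url, desc
    except Exception:
        return None


if __name__ == "__main__":
    urls = ["https://charlierose.com/videos/" + str(i) for i in range(0, 35000)]
    with multiprocessing.Pool(processes=50) as pool:
        results = pool.map(get_video_desc, urls)
    with open("links.csv", "w", newline="") as csv_file:
        writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)
        for row in results:
            if row is not None:
                writer.writerow(row)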


@@ -0,0 +1,22 @@
import csv
import os

# Deduplicate rows while preserving order (set membership instead of an
# O(n^2) "row not in rows" list scan)
rows = []
seen = set()
with open("links.csv", "r", newline="") as f:  # text mode for Python 3's csv module
    for row in csv.reader(f):
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            rows.append(row)


def get_key(item):
    # The last path component of the URL is the numeric video id
    return int(os.path.split(item[0])[1])


# Sort by video id; "w" so reruns overwrite instead of appending duplicates
sorted_list = sorted(rows, key=get_key)
with open("links_sorted.csv", "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in sorted_list:
        csv_writer.writerow(row)
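get_key assumes the last path component of each stored URL is the numeric video id, which holds for the URLs run.py generates; a quick check of that assumption:

import os

url = "https://charlierose.com/videos/12345"  # sample URL in run.py's format
assert os.path.split(url)[1] == "12345"       # os.path.split splits on "/"
assert int(os.path.split(url)[1]) == 12345    # matches get_key's conversion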