2018-02-27 16:31:54 -05:00

112 lines
3.0 KiB
Python

import requests
from bs4 import BeautifulSoup
import os
import pdfkit
from urllib.parse import urljoin
import youtube_dl
# (href, link text) tuples for article links; appended to by get_articles().
articles = []
# (href, link text) tuples for video links; appended to by get_articles().
videos = []
# href values of the "Get Started" kit links; appended to by get_kits().
kits = []
def get_articles(pages=10):
    """Collect article and video links from the AI Academy library listing.

    Fetches listing pages ``0 .. pages - 1`` and inspects every ``<a>`` tag.
    Anchors whose href starts with ``/en-us/articles/`` are appended to the
    module-level ``articles`` list and those starting with ``/en-us/videos/``
    to ``videos``, each as an ``(href, link_text)`` tuple.  Anchors without an
    href, or whose ``.string`` is ``None``, are skipped.  The totals of both
    lists are printed once all pages have been scanned.

    Args:
        pages: Number of listing pages to scan.  The default of 10 matches
            the range that was previously hard-coded.
    """
    listing_url = "https://software.intel.com/en-us/ai-academy/library?page="
    for page in range(pages):
        # Same timeout as download_file so a stalled server cannot hang the run.
        r = requests.get(listing_url + str(page), timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            title = link.string
            if href is None or title is None:
                continue
            # The two prefixes are mutually exclusive, so elif is equivalent
            # to the two independent checks used before.
            if href.startswith("/en-us/articles/"):
                articles.append((href, title))
            elif href.startswith("/en-us/videos/"):
                videos.append((href, title))
    print(str(len(articles)) + " articles")
    print(str(len(videos)) + " videos")
def get_kits():
    """Collect the student-kit links from the AI Academy kits page.

    Every ``<a>`` tag whose text is exactly ``"Get Started"`` contributes its
    href to the module-level ``kits`` list.  Anchors without an href are
    skipped: a ``None`` entry would later be resolved by ``urljoin`` to the
    site root and downloaded as if it were a kit.
    """
    # Same timeout as download_file so a stalled server cannot hang the run.
    r = requests.get("https://software.intel.com/en-us/ai-academy/students/kits", timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        # Comparing None with a string is simply False, so no separate
        # None check on link.string is needed.
        if link.string == "Get Started" and href is not None:
            kits.append(href)
def download_article(article):
    """Render one article to ``articles/<title>.pdf`` unless it already exists.

    Args:
        article: ``(href, title)`` tuple.  ``href`` is resolved against
            ``https://software.intel.com/``; ``title`` becomes the PDF file
            name.

    Path separators in the title are replaced with ``_``.  Without that, a
    title containing ``/`` would point into a sub-directory that does not
    exist.  Titles without separators map to the same file name as before,
    so previously downloaded PDFs are still recognised and skipped.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("articles", exist_ok=True)
    safe_title = article[1].replace("/", "_").replace(os.sep, "_")
    pdf_path = os.path.join("articles", safe_title + ".pdf")
    if not os.path.isfile(pdf_path):
        pdfkit.from_url(urljoin("https://software.intel.com/", article[0]), pdf_path)
def download_video(video):
    """Fetch a single video into the ``videos`` directory using youtube_dl.

    Args:
        video: ``(href, title)`` tuple; only the href (index 0) is used.  It
            is resolved against ``https://software.intel.com/`` and handed to
            youtube_dl, which names the output file from its own metadata
            through the ``outtmpl`` template.
    """
    target_dir = "videos"
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    downloader = youtube_dl.YoutubeDL({"outtmpl": "videos/%(title)s.%(ext)s"})
    video_url = urljoin("https://software.intel.com/", video[0])
    downloader.download([video_url])
def download_file(url, destination, max_attempts=5):
    """Stream ``url`` to ``destination`` unless that path already exists.

    The body is written to ``destination + ".part"`` and moved into place
    only after the whole response has been received.  An interrupted
    transfer therefore never leaves a truncated file that a later call would
    mistake for a finished download.

    Args:
        url: Address of the file to fetch.
        destination: Path of the file to create.  An existing path is left
            untouched and no request is made.
        max_attempts: How many times to try before giving up.  The earlier
            implementation retried without limit, which never terminated on
            permanent failures such as a malformed URL.

    Network and file-system errors are reported by printing ``"!"`` and
    retried; other exceptions, including KeyboardInterrupt, propagate.  A
    non-200 response is reported and not retried.
    """
    if os.path.exists(destination):
        return
    partial = destination + ".part"
    for _ in range(max_attempts):
        try:
            # The context manager releases the connection even on failure.
            with requests.get(url, stream=True, timeout=10) as response:
                if response.status_code != 200:
                    print("HTTP " + str(response.status_code) + " for " + url)
                    return
                with open(partial, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial, destination)
            return
        except (requests.RequestException, OSError):
            print("!")
            # Drop the incomplete data so the next attempt starts clean.
            if os.path.exists(partial):
                os.remove(partial)
def download_kit(kit_url):
    """Download one student kit: the page as a PDF plus every linked .zip.

    The kit page is fetched and its ``<title>`` text names a directory under
    ``kits/``.  The page itself is rendered to ``kit.pdf`` in that directory,
    and every anchor whose href ends in ``.zip`` is fetched next to it via
    ``download_file``, keeping the archive's own file name.

    Args:
        kit_url: Absolute URL or site-relative href of the kit page; it is
            resolved against ``https://software.intel.com/``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("kits", exist_ok=True)
    kit_url = urljoin("https://software.intel.com/", kit_url)
    # Same timeout as download_file so a stalled server cannot hang the run.
    r = requests.get(kit_url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    # A page without a usable <title> falls back to the last URL segment
    # rather than failing on None.
    title_tag = soup.find("title")
    kit_title = title_tag.string if title_tag is not None else None
    if not kit_title:
        kit_title = os.path.basename(kit_url.rstrip("/")) or "kit"
    # Path separators in the title would otherwise address a nested,
    # non-existent directory.
    kit_title = kit_title.replace("/", "_").replace(os.sep, "_")

    kit_dir = os.path.join("kits", kit_title)
    os.makedirs(kit_dir, exist_ok=True)
    pdfkit.from_url(kit_url, os.path.join(kit_dir, "kit.pdf"))

    for link in soup.find_all("a"):
        target = link.get("href")
        if target is not None and target.endswith(".zip"):
            # Resolve against the kit page so page-relative hrefs work;
            # absolute and site-rooted hrefs resolve exactly as before.
            download_file(urljoin(kit_url, target), os.path.join(kit_dir, os.path.split(target)[1]))
# Script body.  Only the student-kit stage is active.  The article and video
# stages are left here commented out; uncomment them to run those as well.
#
# get_articles()
# for a in articles:
#     download_article(a)
# for v in videos:
#     download_video(v)

get_kits()
for k in kits:
    download_kit(k)