Misc-Download-Scripts/lynda.com/crawler-courses.py
2018-02-27 16:31:54 -05:00

73 lines
1.8 KiB
Python

import requests
import bs4
import json
# Site root URL; get_categories() downloads this page to discover categories.
URL = "https://lynda.com"
def request_timeout(url):
    """GET *url* with a 30-second timeout, retrying until a response arrives.

    Every failed attempt prints a "!" progress marker (no newline) and the
    request is retried immediately. There is no retry limit and no back-off,
    so this call blocks for as long as the failures persist.

    Returns the ``requests.Response`` object; the HTTP status code is not
    checked here.
    """
    while True:
        try:
            return requests.get(url, timeout=30)
        except requests.RequestException:
            # Retry only on errors raised by requests itself (timeouts,
            # connection failures, ...). KeyboardInterrupt, SystemExit and
            # genuine programming errors must propagate, otherwise this
            # infinite loop cannot be stopped with Ctrl-C.
            print("!", end="", flush=True)
def get_categories():
    """Scrape the category list from the page at ``URL``.

    A category is recognised as an ``<i>`` element whose second CSS class
    starts with ``cat-``; the remainder of that class is the category id and
    the part of the element's ``title`` before the first " -" is the
    category name. Elements without a ``title`` attribute are skipped.

    Returns a list of ``(category_name, category_id)`` tuples in document
    order.
    """
    prefix = "cat-"
    # Use the shared helper so this request gets the same timeout/retry
    # behaviour as the course-page requests.
    response = request_timeout(URL)
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    categories = []
    for icon in soup.find_all("i"):
        classes = icon.get("class")
        if classes is None or len(classes) < 2:
            continue
        if not classes[1].startswith(prefix):
            continue
        title = icon.get("title")
        if title is None:
            # No title means no usable name; skip rather than crash.
            continue
        category_id = classes[1][len(prefix):]
        # Keep the text before the " -" separator. When the separator is
        # absent the whole title is kept, and hyphens inside the name itself
        # (no preceding space) do not truncate it.
        category_name = title.partition(" -")[0]
        categories.append((category_name, category_id))
    return categories
def get_courses(category):
    """Collect the courses listed under *category*, page by page.

    *category* is a sequence whose element at index 1 is the category id
    used in the request URL. Pages are requested starting at 1, and paging
    stops after the first page that contributes no courses.

    Returns a list of ``(course_name, course_id, course_link)`` tuples.
    """
    found = []
    page = 1
    while True:
        # Snapshot the size so we can tell whether this page added anything.
        count_before = len(found)

        page_url = "https://lynda.com/ajax/category/" + category[1] + "/courses?page=" + str(page)
        response = request_timeout(page_url)
        # The response body is JSON; its "html" field holds the markup.
        payload = json.loads(response.text)
        soup = bs4.BeautifulSoup(payload["html"], "html.parser")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if href is None:
                continue
            heading = anchor.find("h3")
            if heading is None:
                continue
            # The id is the last path segment minus its final 7 characters.
            last_segment = href.split("/")[-1]
            found.append((heading.string, last_segment[:-7], href))

        print("Page ", page, " (", len(found), ")", sep="")
        if len(found) == count_before:
            break
        page += 1
    return found
# Crawl every category and record one line per course in courses.txt.
# The context manager guarantees the file is closed even if a request or
# parse step raises part-way through the crawl.
with open("courses.txt", "w") as file:
    for category in get_categories():
        print(category)
        for course in get_courses(category):
            print(course[0])
            # Record layout: category id, course name, course id and course
            # link, separated by NUL characters and terminated by a newline.
            file.write("\0".join((category[1], course[0], course[1], course[2])) + "\n")
            # Flush per course -- presumably so partial results are on disk
            # if the long-running crawl is interrupted.
            file.flush()