Discord-Channel-scraper/scrape-logs.py

209 lines
8.9 KiB
Python

#!/usr/bin/python
import json
import os
import pprint
import re
import discord
import getpass
import argparse
import logging
import requests
EMOJI_RE = re.compile("<:([^>]+):([0-9]{18})>")
logging.basicConfig(
level="WARNING",
style="{",
format="[{asctime}] [{process}] [{levelname}] {filename}:{lineno} {msg}"
)
log = logging.getLogger(__name__)
parser = argparse.ArgumentParser(description='Scrapes messages from a Discord channel.')
parser.add_argument('--username', '-u', action='store', help='Username to login under. If not specified, '
'username will be prompted for.')
# parser.add_argument('--password','-p', action='store', help='Password to login under. If not specified,
# password will be prompted for.')
parser.add_argument('--flag', '-f', action='store', default="!yank", help='An alternative to specifying the server and'
' channel, specify a piece of regex which'
' when matched against a message sent by the'
' target user, will trigger scraping of the'
' channel the message was posted in. Useful'
' for private messages and private chats.'
' Default value is "!yank", activates by'
' default if no server is specified.')
parser.add_argument('--quiet', '-q', action='store_true', help='Suppress messages in Discord')
parser.add_argument('--server', '--guild', '-s', action='store', help='Discord server name to scrape from '
'(user must be a member of the server and'
' have history privileges). This field is case'
' sensitive. If channel is not specified the '
'entire server will be scraped.')
parser.add_argument('--channel', '-c', action='store', help='Discord channel name to scrape from '
'(user must have history privileges for the particular'
' channel). This field is case sensitive.')
parser.add_argument('--limit', '-l', action='store', default=1000000, type=int, help='Number of messages to save.'
' Default is 1000000')
parser.add_argument('--output', '-o', action='store', help="Outputs all messages into a single file."
" If not specified, messages are saved under the format:"
" <channel name>.txt.")
parser.add_argument('--logging', action='store', choices=[10, 20, 30, 40, 50], default=20, help='Change the logging '
'level. Defaults to 20, info.')
parser.add_argument('--format', '-F', action='store', default="plain", type=str, help='Message format (plain|json)')
parser.add_argument('--dl_attachments', '-a', action='store_true', help='Download attachments')
parser.add_argument('--dl_emoji', '-e', action='store_true', help='Download emoji')
parser.add_argument('--skip_messages', '-S', action='store_true', help='Skip logging messages')
args = parser.parse_args()
log.setLevel(args.logging)
# prompt for username
if not args.username:
args.username = input("Username: ")
password = getpass.getpass("Password for user {0}: ".format(args.username))
client = discord.Client()
def download_emoji(emoji):
if not os.path.exists("./emoji"):
os.mkdir("./emoji")
url = "https://cdn.discordapp.com/emojis/{}.png?v=1".format(emoji[1])
filename = "./emoji/{}_{}.png".format(emoji[1], emoji[0])
if os.path.exists(filename):
return
r = requests.get(url, timeout=30)
if r.status_code == 200:
with open(filename, "wb") as out:
out.write(r.content)
def download_attachment(attachment, channel):
if not os.path.exists("./attachments"):
os.mkdir("./attachments")
if not os.path.exists("./attachments/" + channel):
os.mkdir("./attachments/" + channel)
r = requests.get(attachment["url"], timeout=30, stream=True)
if r.status_code == 200:
filename = "./attachments/{}/{}_{}".format(channel, attachment["id"], attachment["filename"])
with open(filename, "wb") as out:
for chunk in r.iter_content(4096):
out.write(chunk)
def save_line(out, message):
lines = []
if args.format == "plain":
for i in message.attachments:
lines.append('{0}::file:{1}'.format(message.author.name, i['url']))
lines.append('{0}: {1}'.format(message.author.name, message.content))
elif args.format == "json":
msg_obj = dict()
msg_obj["author"] = {
"name": message.author.name,
"id": message.author.id,
}
msg_obj["content"] = message.content
msg_obj["timestamp"] = message.timestamp.timestamp()
msg_obj["attachments"] = [{"url": a["url"], "id": a["id"], "filename": a["filename"]}
for a in message.attachments]
lines.append(json.dumps(msg_obj))
for line in lines:
out.write(line + "\n")
async def get_logs(channel):
try:
if not args.quiet:
await client.send_message(channel, "Getting the logs for channel {0}".format(channel.name))
log.info("Getting the logs for channel {0}".format(channel.name))
with open("{0}.txt".format(channel.name), 'w') as f:
async for line in client.logs_from(channel, limit=args.limit):
if not args.skip_messages:
save_line(f, line)
if args.dl_attachments:
for a in line.attachments:
download_attachment(a, line.channel.name)
if args.dl_emoji:
for e in EMOJI_RE.findall(line.content):
download_emoji(e)
for r in line.reactions:
if not isinstance(r.emoji, str):
download_emoji((r.emoji.name, r.emoji.id))
if not args.quiet:
await client.send_message(channel, 'The messages for this channel have been saved.')
log.info("Messages for channel {0} finished downloading".format(channel.name))
except Exception as e:
if not args.quiet:
await client.send_message(channel, 'Failed saving logs: {}'.format(str(e)))
log.error("Error while downloading channel {0}: {1}".format(channel.name, str(e)))
# Strangely, this will work once we are logged in
@client.async_event
async def on_message(message):
try:
log.debug(str(message.channel.server.name) + " -> " + str(message.channel.name) + ' - ' + str(message.author) +
': ' + str(message.content))
except:
log.debug("Private message - " + str(message.author) + ': ' + str(message.content))
if not args.server and not args.channel:
if args.flag == message.content[:len(args.flag)]:
await get_logs(message.channel)
# if (not args.server) and message.author.id == client.user.id and re.compile(args.flag).match(message.content):
# print("Matched {}".format(args.flag))
# await getLogs(message.channel)
@client.async_event
async def on_ready():
log.info("Logged in as user {0}".format(client.user.name))
if args.server and args.channel:
try:
channel = discord.utils.get(client.get_all_channels(), server__name=args.server, name=args.channel)
except:
channel = ""
if channel:
await get_logs(channel)
else:
log.error("Could not find channel {0} in server {1}".format(args.channel, args.server))
await client.logout()
elif args.server:
log.info("Downloading messages for all channels in server {0}".format(args.server))
channels = [c for c in client.get_all_channels() if c.server.name == args.server]
for channel in channels:
await get_logs(channel)
await client.logout()
else:
log.info('Entering flag mode with flag "{0}"'.format(args.flag))
try:
log.info("Logging in...")
client.run(args.username, password)
except KeyboardInterrupt:
log.info("Logging out...")
except Exception as e:
log.error(str(e))