python爬取Leetcode所有未ac题目

事情起源于寒假刚开始的时候
立下假期要好好学习的flag
决定继续刷leetcode 然而在不知道从什么时候开始刷leetcode到现在
居然从三百多题增长到了四百多题感觉自己刷题速度都要跟不上增长了
按以前的列表刷难免漏题，决定写个“爬虫”

0.前言

在分析需求的时候查了很多博客。。完全无压力直接访问-解析html-提取所需信息一气呵成
然而leetcode改版了。。。我还没有查到改版后的爬取博客orz
随着前端的发展。。很多网页的主要信息已经变成js异步加载的了
大概leetcode改版之后题目列表就是动态加载的了：判断依据就是查看网页源代码的html里面没有table表格
而使用F12审查元素时可以看到table表格以及题目的各种信息

1.方案

为了解决这个问题可以采取以下几个途径：
0.5：爬取量不大时直接审查元素右键另存为嘛2333333
然后直接写个解析就好了嘛（作为一只咸鱼我首先想到的真的是这个QAQ）
1：模拟浏览器登录，等js加载完毕之后再解析。
2：先抓个包试试
以下将会详细分析

2.流程&方案执行

整个流程规划如下：
由于是自己的未完成题目：模拟登录 - 进入题目列表 - 访问题目列表直到没有下一页
由于题目一直在增加，想隔一段时间执行一次脚本
抛弃方案0.5首先选择的是模拟浏览器代码比较简单

[title] [] [url] [link text]

# Load the LeetCode problem list in a spynner browser so that the HTML is
# captured after the page's JavaScript has had a chance to run.
browser = spynner.Browser()
try:
    browser.create_webview()
    # Parse the page with PyQuery.
    browser.set_html_parser(pyquery.PyQuery)
    # Second argument is presumably the load timeout in seconds -- confirm
    # against the spynner version in use.
    browser.load("https://leetcode.com/problemset/algorithms/", 50)

    try:
        browser.wait_load(10)
    except spynner.SpynnerTimeout:
        # Best effort: if no further load finishes in time, fall through and
        # use whatever HTML is available.  Only the timeout is ignored, so
        # Ctrl-C and genuine errors are no longer swallowed.
        pass

    string = browser.html
finally:
    # Always release the browser, even when loading fails.
    browser.close()

然而在实际操作的过程中发现运行太慢
资源占用比太大
家里网络不好常常直接timeout
然后我个智障某天拿手机开了个热点调试。。。。成功耗尽一个月的流量包。。

最后选择抓包尝试
网络请求如下图：

前3个请求为：https://leetcode.com/accounts/login/
https://leetcode.com/problemset/
https://leetcode.com/problemset/algorithms/

查看倒数第六个请求的返回值：

调用的是一个api接口：https://leetcode.com/api/problems/algorithms/

返回的是json数据～还是全部的orz
此文终结

最后提一下模拟登录的坑
需要提交一个csrftoken值
所以网络请求有4次

成果图

主要代码：

[title] [] [url] [link text]

# coding=utf-8
# author:ZhangMengRou
import json
import sys

import MySQLdb
import requests

# Module-level MySQL connection and cursor used by search().  The user-name
# and password arguments are placeholders to be replaced before running.
db = MySQLdb.connect("localhost", "数据库用户名", "数据库密码", "leetcode_search")

cursor = db.cursor()

# One HTTP session shared by every request issued from search().
s = requests.session()

# Header values common to all three request profiles below.
_SHARED_HEADERS = {
    # Only advertise encodings that requests decodes out of the box; the
    # previous values also offered "sdch" and "br", which would leave the
    # body undecodable if the server ever chose one of them.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
    'Connection': 'keep-alive',
    'Host': 'leetcode.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
}
_HTML_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

# Headers for the login page (both the GET and the POST).
headers_base = dict(_SHARED_HEADERS)
headers_base.update({
    'Accept': _HTML_ACCEPT,
    'Referer': 'https://leetcode.com/accounts/login/',
})

# Headers for the HTML problem-list page.
headers1 = dict(_SHARED_HEADERS)
headers1.update({
    'Accept': _HTML_ACCEPT,
    'Referer': 'https://leetcode.com/problemset/algorithms/',
})

# Headers for the JSON API call, shaped like an XMLHttpRequest.
headers2 = dict(_SHARED_HEADERS)
headers2.update({
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    'Referer': 'https://leetcode.com/problemset/algorithms/',
    'X-Requested-With': 'XMLHttpRequest',
})

# Form fields for the login POST; filled in by search().
login_data = {}
login_html = "404"


# Parameterized INSERT for one unfinished problem; built once instead of on
# every loop iteration.
_INSERT_UNFINISHED_SQL = """INSERT INTO UNFINISHED(
				id,
				status,
				total_acs,
				question__title,
				question__article__slug,
				total_submitted,
				question__title_slug ,
				question__article__live,
				question__hide,
				question_id,
				is_favor,
				paid_only,
				difficulty_level,
				url
				)
						VALUES (%s,%s,%s,%s,%s,
					   %s,%s,%s,%s,%s,
					   %s,%s,%s,%s
					   )
						"""


def _login():
    """Log the shared session in and return the CSRF token that was used."""
    url = "https://leetcode.com/accounts/login/"
    # The login page hands out the ``csrftoken`` cookie that the login form
    # must echo back; one GET is enough to obtain it.
    res = s.get(url=url, headers=headers_base)
    token = res.cookies['csrftoken']

    login_data['csrfmiddlewaretoken'] = token
    login_data['login'] = "用户名"
    login_data['password'] = "密码"
    s.post(url, headers=headers_base, data=login_data)
    return token


def _fetch_problem_pairs():
    """Return the ``stat_status_pairs`` list from the problems API."""
    # Visit the HTML list page first, mirroring the browser's request order.
    s.get(url="https://leetcode.com/problemset/algorithms/", headers=headers1)
    res = s.get(url="https://leetcode.com/api/problems/algorithms/", headers=headers2)
    return json.loads(res.text)['stat_status_pairs']


def _unfinished_row(pair, row_id):
    """Build the parameter tuple for ``_INSERT_UNFINISHED_SQL`` from one API entry."""
    stat = pair["stat"]
    title_slug = stat["question__title_slug"]
    return (
        row_id,
        pair["status"],
        stat["total_acs"],
        stat["question__title"],
        stat["question__article__slug"],
        stat["total_submitted"],
        title_slug,
        stat["question__article__live"],
        stat["question__hide"],
        stat["question_id"],
        pair["is_favor"],
        pair["paid_only"],
        pair["difficulty"]["level"],
        "https://leetcode.com/problems/" + title_slug,
    )


def search():
    """Log in to LeetCode and store every problem whose status is not 'ac'.

    Issues four requests on the shared session: GET and POST on the login
    page, GET on the problem-list page, and GET on the problems JSON API.
    Each non-accepted entry is inserted into the UNFINISHED table and
    committed on its own.  A failed insert is reported on stderr and rolled
    back, and the loop continues with the next entry.

    Progress is printed to stdout: the running count and the status for each
    entry, then "yes" after a successful commit.

    Raises:
        KeyError: if the ``csrftoken`` cookie or an expected JSON key is missing.
        ValueError: if the API response body is not valid JSON.

    NOTE(review): ``id`` is the 1-based running count and is the table's
    primary key, so re-running against a non-empty table will presumably
    produce duplicate-key errors for every row -- confirm, and empty the
    table before a re-run if so.
    """
    token = _login()

    # NOTE(review): Django's documented AJAX header is ``X-CSRFToken``; this
    # custom header name is kept as-is since the API call is a plain GET --
    # confirm whether it is needed at all.
    headers2['csrfmiddlewaretoken'] = token

    count = 0
    for pair in _fetch_problem_pairs():
        if pair["status"] == 'ac':
            continue

        count += 1
        row = _unfinished_row(pair, count)
        print(count)
        print(row[1])  # status

        try:
            cursor.execute(_INSERT_UNFINISHED_SQL, row)
            db.commit()
        except Exception as err:
            # Trailing newline keeps consecutive errors on separate lines.
            sys.stderr.write('Exception Error: %s\n' % str(err))
            # Roll back in case there is any error.
            db.rollback()
        else:
            print("yes")

    # Blank line after the run; spelled so that it works on Python 2 and 3.
    print("")


# Run the crawl as soon as this file is executed.  There is no __main__
# guard, so importing the module triggers it as well.
search()

# 数据库SQL：
/*
Navicat MySQL Data Transfer

Source Server         : mysql
Source Server Version : 100119
Source Host           : localhost:3306
Source Database       : leetcode_search

Target Server Type    : MYSQL
Target Server Version : 100119
File Encoding         : 65001

Date: 2017-01-30 15:20:49
*/

-- Disable foreign-key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for unfinished
-- ----------------------------
-- WARNING: drops any existing `unfinished` table together with its rows.
DROP TABLE IF EXISTS `unfinished`;
-- One row per problem stored by the crawler script; `id` is the primary key.
-- Apart from `id` and `question_id`, every column is stored as text.
CREATE TABLE `unfinished` (
  `id` int(11) NOT NULL,
  `total_acs` varchar(255) DEFAULT NULL,
  `question__title` varchar(255) DEFAULT NULL,
  `question__article__slug` varchar(255) DEFAULT NULL,
  `total_submitted` varchar(255) DEFAULT NULL,
  `question__title_slug` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
  `question__article__live` varchar(255) DEFAULT NULL,
  `question__hide` varchar(255) DEFAULT NULL,
  `question_id` int(11) DEFAULT NULL,
  `is_favor` varchar(255) DEFAULT NULL,
  `paid_only` varchar(255) DEFAULT NULL,
  `difficulty_level` varchar(255) DEFAULT NULL,
  `url` varchar(255) DEFAULT NULL,
  `status` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Re-enable foreign-key checks.
SET FOREIGN_KEY_CHECKS=1;

立个flag某狗狗说刷完leetcode带我去玩耍