1.利用casperjs 爬取新浪股市排行数据,生成数据文件
// Fetch the Sina stock rankings.
// NOTE: this script runs under CasperJS/PhantomJS (an ES5 runtime), so it
// deliberately sticks to `var` and function expressions.
var casper = require('casper').create({
    waitTimeout: 10000,
    verbose: true,
    logLevel: "debug",
    // jQuery is injected into every loaded page because the evaluate()
    // callbacks in this script use `$` inside the page context.
    // (This comment previously started with `#`, which is not valid
    // JavaScript and made the whole script fail to parse.)
    clientScripts: ["../jquery-3.1.1.min.js"],
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0'
    }
});

casper.start();

// 2. Open the Sina stock popularity ranking page.
casper.thenOpen('http://touzi.sina.com.cn/public/bhot');
// 3. Write out the market-wide ranking of 1-day change in user attention.
casper.then(function getrank() {
    // `rank` is declared locally; leaving out `var` would create an implicit
    // global shared with every other step of the script.
    var rank = casper.evaluate(function () {
        // Runs inside the page: collect the trimmed text of every <td> of
        // each row of #allday1, skipping the first row (tr:gt(0)).
        var trlist = [];
        $("#allday1").find('tr:gt(0)').each(function () {
            var tdlist = [];
            $(this).find("td").each(function () {
                tdlist.push($(this).text().trim());
            });
            // A newline element is appended to every row so that rows end up
            // on separate lines once the nested array is turned into text.
            tdlist.push('\n');
            trlist.push(tdlist);
        });
        return trlist;
    });

    var filename = 'data/allday1_sina.txt';
    var fs = require('fs');
    // NOTE(review): `rank` is an array of row arrays and is handed to
    // fs.write as-is, so the file layout depends on how it is stringified —
    // confirm that matches what the downstream reader expects.
    fs.write(filename, rank);
});
// 4. Write out the market-wide ranking of 5-day change in user attention.
casper.then(function getrank() {
    // `rank` is declared locally; leaving out `var` would create an implicit
    // global shared with every other step of the script.
    var rank = casper.evaluate(function () {
        // Runs inside the page: collect the trimmed text of every <td> of
        // each row of #allday5, skipping the first row (tr:gt(0)).
        var trlist = [];
        $("#allday5").find('tr:gt(0)').each(function () {
            var tdlist = [];
            $(this).find("td").each(function () {
                tdlist.push($(this).text().trim());
            });
            // A newline element is appended to every row so that rows end up
            // on separate lines once the nested array is turned into text.
            tdlist.push('\n');
            trlist.push(tdlist);
        });
        return trlist;
    });

    var filename = 'data/allday5_sina.txt';
    var fs = require('fs');
    // NOTE(review): `rank` is an array of row arrays and is handed to
    // fs.write as-is, so the file layout depends on how it is stringified —
    // confirm that matches what the downstream reader expects.
    fs.write(filename, rank);
});

// Execute the queued steps.
casper.run();
2.使用 python入库
读取Casperjs生成的数据文件,写入mysql
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import MySQLdb
import os,sys,subprocess
class Spider():
    """Load a whitespace-separated, GB2312-encoded data file into a MySQL table.

    All database access goes through a module-level connection object named
    ``db``. It is not defined in this class and must exist before
    ``get_insert_sql`` or ``save`` is called.
    """

    def __init__(self, filename, js_filename, table):
        """Remember where the data lives and which table receives it.

        :param filename: path of the data file to read.
        :param js_filename: path of the scraping script; only stored.
        :param table: name of the destination table.
        """
        # There is no ``url`` argument: reuse a module-level ``url`` when one
        # is defined, otherwise None, instead of raising NameError.
        self.url = globals().get("url")
        self.js_filename = js_filename
        self.filename = filename
        self.table = table

    def read_file_data(self):
        """Parse the data file into rows of UTF-8 encoded fields.

        Each line is decoded as GB2312, re-encoded as UTF-8 and split on
        whitespace. Lines that yield no fields are skipped, because an empty
        row cannot be bound to the INSERT statement.

        :returns: ``(value_sets, count)`` where ``count == len(value_sets)``;
            ``([], 0)`` when the file cannot be read or decoded.
        """
        value_sets = []
        try:
            with open(self.filename, 'rb') as data:
                for line in data:
                    value = line.decode("gb2312").encode("utf-8").split()
                    if value:
                        value_sets.append(value)
        except Exception as e:
            print("%s : %s" % (Exception, e))
            # Always hand back a tuple so callers can unpack the result.
            return [], 0
        return value_sets, len(value_sets)

    def get_insert_sql(self):
        """Build a parameterised INSERT statement from the table's columns.

        The column names are read from the cursor description of a one-row
        SELECT; the first column is dropped (auto-increment id).

        :returns: the INSERT statement, or None on failure. On failure the
            error is printed and the shared ``db`` connection is closed.
        """
        cursor = None
        insert_sql = None
        try:
            cursor = db.cursor()
            # The table name is interpolated directly (identifiers cannot be
            # bound as parameters), so it must come from trusted code.
            cursor.execute("select * from %s limit 1" % self.table)
            field_name_list = [each[0] for each in cursor.description]
            del field_name_list[0]  # drop the auto-increment id column
            column_list = "(" + ",".join(field_name_list) + ")"
            values_format = "values(" + ",".join(["%s"] * len(field_name_list)) + ")"
            insert_sql = "INSERT INTO %s" % self.table + column_list + values_format
        except Exception as e:
            print("Error: %s" % e)
            insert_sql = None
        # Release the cursor on both paths; it is None if db.cursor() failed.
        if cursor is not None:
            cursor.close()
        if insert_sql is None:
            db.close()
        return insert_sql

    def save(self):
        """Insert every row of the data file into the table.

        The work is committed on success and rolled back on failure. Stops
        early, after printing a message, when no rows were read or the INSERT
        statement could not be built.

        NOTE(review): the shared ``db`` connection is closed before returning,
        so ``save`` can run only once per connection -- confirm this is
        intended if several files are loaded in one run.
        """
        value_sets, count = self.read_file_data()
        if not (value_sets and count):
            print("get data fom file failed")
            db.close()
            return
        insert_sql = self.get_insert_sql()
        if not insert_sql:
            print("get insert_sql failed")
            # get_insert_sql has already closed the connection.
            return
        cursor = None
        try:
            cursor = db.cursor()
            cursor.executemany(insert_sql, value_sets)
            db.commit()
            print(u"成功插入数据%d条" % count)
        except Exception as e:
            db.rollback()
            print("%s : %s" % (Exception, e))
            print(u"插入数据失败,数据回滚")
        finally:
            if cursor is not None:
                cursor.close()
            db.close()