#!/usr/bin/python
#encoding:utf-8
import MySQLdb
import urllib2
from bs4 import BeautifulSoup
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so sys is
# reloaded to get it back. Making UTF-8 the default lets the str() calls on
# scraped non-ASCII text in this file succeed instead of raising
# UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding("utf-8")
def findData(sql, params=None):
    """Run a query against the local ``blog`` database and return one row.

    A new connection is opened for every call and closed before returning.

    Args:
        sql: SQL statement to execute.
        params: Optional values for ``%s`` placeholders in ``sql``. They are
            handed to ``cursor.execute`` so the driver does the quoting.
            Leave it out to run ``sql`` exactly as given.

    Returns:
        Whatever ``cursor.fetchone()`` yields: the first result row, or
        ``None`` when the query produced no rows.
    """
    db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        return cursor.fetchone()
    finally:
        # Release the connection even when the query raises.
        db.close()
def insertData(sql, params=None):
    """Execute one INSERT/UPDATE statement on the ``blog`` database and commit.

    A new connection is opened for every call and closed before returning.

    Args:
        sql: SQL statement to execute.
        params: Optional values for ``%s`` placeholders in ``sql``. They are
            handed to ``cursor.execute`` so the driver does the quoting.
            Leave it out to run ``sql`` exactly as given.
    """
    db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        db.commit()
    finally:
        # Release the connection even when execute/commit raises.
        db.close()
def grabContent(url):
    """Fetch ``url`` with a browser-like User-Agent and return the raw body.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``read()``.

    Raises:
        Whatever ``urllib2.urlopen`` raises for an unreachable or failing URL.
    """
    # Presumably the site rejects urllib2's default agent, hence the desktop
    # Chrome identity -- TODO confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Close the response even if reading the body fails.
        response.close()
def startFromZero():
    """Seed ``blog_movie_tag`` from the Douban tag index, then start crawling.

    Every ``<a class="tag">`` link on the index page is inserted as a tag row.
    Afterwards ``grabMovieInfo`` is called once.
    """
    page = grabContent('http://movie.douban.com/tag/')
    soup = BeautifulSoup(page, 'html.parser')
    for link in soup.find_all('a', {'class': 'tag'}):
        # The text comes from a remote page: escape it so a quote or backslash
        # cannot break (or inject into) the statement.
        tagName = MySQLdb.escape_string(str(link.get_text()))
        insertData("INSERT INTO `blog_movie_tag` (tag) VALUES ('" + tagName + "')")
    # NOTE(review): the original indentation was lost; the crawl is assumed to
    # start once, after all tags are stored -- confirm.
    grabMovieInfo()
def grabMovieInfo():
    """Crawl the movie listing of one pending tag and store unseen movies.

    Picks one tag whose ``status`` is ``'0'``, walks its listing pages, inserts
    every movie not yet present in ``blog_movie`` (matched by name) and calls
    ``grabMovieTag`` for each newly inserted movie. Once every page has been
    processed the tag's ``status`` is set to ``'1'``.

    Returns immediately when no pending tag exists.
    """
    row = findData("SELECT `tag` from blog_movie_tag WHERE status = '0' LIMIT 1 ")
    if row is None:
        # No tag with status '0' is left, so there is nothing to crawl.
        return
    tag = row[0]

    # 34 pages in steps of 15, followed by one last request at offset 500
    # (the same offsets the original loop produced).
    offsets = [page * 15 for page in range(34)] + [500]
    for offset in offsets:
        listUrl = "http://www.douban.com/tag/" + tag + "/movie?start=" + str(offset)
        soup = BeautifulSoup(grabContent(listUrl), 'html.parser')
        for entry in soup.find_all('dl'):
            titleLink = entry.find('a', {'class': 'title'})
            desc = entry.find('div', {'class': 'desc'})
            if titleLink is None or desc is None:
                # Not a movie entry (or unexpected markup): skip, don't crash.
                continue
            # Apostrophes are dropped from the stored name, as before, so
            # lookups keep matching rows written by earlier runs.
            movieName = str(titleLink.get_text()).replace("'", "")
            movieYear = str(desc.get_text())
            # Scraped text is escaped before it is placed inside SQL literals.
            safeName = MySQLdb.escape_string(movieName)
            safeYear = MySQLdb.escape_string(movieYear)
            existing = findData("SELECT `id` FROM `blog_movie` WHERE movie_name = '" + safeName + "'")
            if existing is None:
                insertData("INSERT INTO `blog_movie` (movie_name, movie_year) VALUES ('" + safeName + "','" + safeYear + "')")
                # NOTE(review): the original indentation was lost; the detail
                # page is assumed to be fetched only for new movies -- confirm.
                grabMovieTag(titleLink.get('href'), movieName)

    # Mark the tag as done only after every page was processed, so a failure
    # part-way through leaves it pending for the next run. The value is now
    # quoted; the original statement omitted the quotes and was invalid SQL.
    safeTag = MySQLdb.escape_string(str(tag))
    insertData("UPDATE `blog_movie_tag` SET status = '1' where tag = '" + safeTag + "'")
def grabMovieTag(url, movieName):
    """Store a movie's rating and register the tags found on its detail page.

    Args:
        url: Address of the movie's detail page.
        movieName: Name under which the movie is stored in ``blog_movie``;
            used to locate the row whose ``rate`` is updated.
    """
    soup = BeautifulSoup(grabContent(url), 'html.parser')

    ratingNode = soup.find('strong', {'class': 'rating_num'})
    if ratingNode is not None:
        # Scraped and caller-supplied text is escaped before it is placed
        # inside SQL literals.
        safeRate = MySQLdb.escape_string(str(ratingNode.get_text()))
        safeName = MySQLdb.escape_string(str(movieName))
        insertData("UPDATE `blog_movie` SET rate = '" + safeRate + "' WHERE movie_name = '" + safeName + "'")

    tagsBody = soup.find('div', {'class': 'tags-body'})
    if tagsBody is None:
        # Page has no tag section; nothing more to record.
        return
    # find_all_next walks every link after the container, not only the ones
    # inside it, so the scan is capped at the first 8 links (the original
    # counted them by hand and stopped at the 9th).
    for link in tagsBody.find_all_next('a', limit=8):
        safeTag = MySQLdb.escape_string(str(link.get_text()))
        existing = findData("SELECT `id` FROM `blog_movie_tag` WHERE tag = '" + safeTag + "'")
        if existing is None:
            insertData("INSERT INTO `blog_movie_tag` (tag) VALUES ('" + safeTag + "')")
# Entry point: the crawl starts as soon as this file is executed (or imported).
startFromZero()