IT recording...
[Database] 4. 파이썬을 이용한 Data 크롤링 (BeautifulSoup 사용) 본문
** 한글 인코딩 필요시 table, db 생성시 아래 옵션 추가하기
-- default charset=utf8 collate = utf8_bin
CREATE DATABASE bestproducts DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
CREATE TABLE tablename(field definitions) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
1. BeautifulSoup 사용한 크롤링
- res = request.get('링크')
- soup = BeautifulSoup(res.connect, 'html.parser')
- 객체 = soup.select('태그')
import requests
from bs4 import BeautifulSoup
# 링크 연결, html.parser 붙여주기
res = requests.get('http://corners.gmarket.co.kr/Bestsellers')
soup = BeautifulSoup(res.connect, 'html.parser')
# categories에 해당 태그 값들 받아옴
categories = soup.select('div.gbest-cate ul.by-group li a')
* soup.select를 통해 접근 시 list로 받아와지고, 배열처럼 접근해야 하며
* soup.select_one을 통해 접근 시 .text()와 같은 함수 바로 사용이 가능하다
best_item = soup.select('div.best-list')
#enumerate를 통해 for문 index관리 가능
for index, item in enumerate(best_item[1].select('li')):
data_dict = dict() # data를 dict라는 구조체로 묶어서 관리
title = item.select_one('a.itemname').get_text()
ori_price = item.select_one('div o-price span span')
#...
if ori_price == None or ori_price.get_text() =='':
ori_price = dis_price
else:
ori_price = ori_price.get_Text().replace(',','').replace('원','')
# ...
data_dict['title'] = title
data_dict['ori_price'] = ori_price
# ...
save_data(data_dict)
2. 저장하기
def save_data(item_info):
#신규상품일때만 저장하게
sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" +item_info['item_code'] + """';"""
cursor.execute(sql)
result = cursor.fetchone()
if result[0] == 0:
sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """',
'""" + item_info['title'] + """',
""" + str(item_info['ori_price']) + """,
""" + str(item_info['dis_price']) + """,
""" + str(item_info['discount_percent']) + """,
'""" + item_info['provider'] + """')"""
cursor.execute(sql)
sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """',
'""" + item_info['sub_category_name'] + """',
'""" + str(item_info['ranking']) + """',
'""" + item_info['item_code'] + """')"""
cursor.execute(sql)
** sql구문 작성시 따옴표가 헷갈린다면
sql = """
insert into items values(%s,%s,%s,%s,%s,%s)
"""
cursor.execute(sql, (item_info['item_code'],item_info['title'],str(item_info['ori_price']),
str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
sql = """
insert into items values(%s,%s,%s,%s,%s,%s)
"""
#data 따로 준비
data = ((item_info['item_code'],item_info['title'],str(item_info['ori_price']),
str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
cursor.execute(sql, data)
참고 )
pythonstudy.xyz/python/article/203-MySQL-DML
++ )
- enumerate : for문 index관리 가능
- dict( ) : 구조체로 관리 가능
- BeautifulSoup 사용시 유용한 태그 조건들 m.blog.naver.com/kiddwannabe/221177292446
>전체코드<
더보기
import requests
from bs4 import BeautifulSoup
import pymysql
host_name = 'localhost'
host_port = 3306
username = 'root'
password = 'rjsrnreo11'
database_name = 'bestproducts'
#DB 연결
db = pymysql.connect(host=host_name, port=host_port, user=username, passwd=password, db=database_name, charset='utf8')
cursor = db.cursor()
# 메인 카테고리 받아오기
res = requests.get('http://corners.gmarket.co.kr/Bestsellers')
soup = BeautifulSoup(res.content, 'html.parser')
mainCate = soup.select('div.gbest-cate ul.by-group li a')
for main in mainCate:
getSubCate('http://corners.gmarket.co.kr/'+main['href'], main.get_text())
db.commit()
db.close()
#==========================================================================================
def getSubCate(link, cateName):
res = requests.get(link)
soup = BeautifulSoup(res.content, 'html.parser')
getItem(soup, cateName, "ALL")
sub_categories = soup.select('div.navi ul li a')
for sub in sub_categories:
res = requests.get(sub['href'])
soup = BeautilfulSoup(res.content, 'html.parser')
getItem(soup, cateName, sub.get_text())
# for sub in sub_categories:
# print(cateName, sub.get_text(), 'http://corners.gmarket.co.kr/' + sub['href'])
#==========================================================================================
def getItem(soup, category_name, sub_category_name):
item_list = list()
best_item = soup.select('div.best-list')
for index, item in enumerate(best_item[1].select('li')):
data_dict = dict()
ranking = index + 1
title = item.select_one('a.itemname').get_text()
ori_price = item.select_one('div.o-price span span')
dis_price = item.select_one('div s-price strong span span')
discount_percent = item.select_one('div s-price em')
if ori_price == None or ori_price.get_text() == '':
ori_price = dis_price
if dis_price ==None or dis_price.get_text() =='':
ori_price, dis_price = 0, 0
else:
ori_price = ori_price.get_text().replace(',','').replace('원','')
dis_price = dis_price.get_text().replace(',','').replace('원','')
if discount_percent == None or discount_percent == '':
discount_percent = 0
else:
discount_percent = discount_percent.get_text().replace('%','')
product_link = item.select_one('a.itemname')
item_code = product_link.attrs['href'].split('=')[1]
res = requests.get(product_link.attrs['href'])
soup = BeautifulSoup(res.content, 'html.parser')
provider = soup.select_one('span.text__seller a')
if provider == None or provider =='':
provider = ''
else:
provider = provider.get_text()
data_dict['category_name'] = category_name
data_dict['sub_category_name'] = sub_category_name
data_dict['ranking'] = ranking
data_dict['title'] = title
data_dict['ori_price'] = ori_price
data_dict['dis_price'] = dis_price
data_dict['discount_percent'] = discount_percent
data_dict['item_code'] = item_code
data_dict['provider'] = provider
print(data_dict)
save_data(data_dict)
#==========================================================================================
def save_data(item_info):
#신규 상품일 때만 저장하게
sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" + item_info['item_code'] + """';"""
cursor.execute(sql)
result = cursor.fetchone()
if result[0] == 0:
# sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """',
# '""" + item_info['title'] + """',
# """ + str(item_info['ori_price']) + """,
# """ + str(item_info['dis_price']) + """,
# """ + str(item_info['discount_percent']) + """,
# '""" + item_info['provider'] + """')"""
# cursor.execute(sql)
sql = """
insert into items values(%s,%s,%s,%s,%s,%s)
"""
cursor.execute(sql, (item_info['item_code'],item_info['title'],str(item_info['ori_price']),
str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """',
'""" + item_info['sub_category_name'] + """',
'""" + str(item_info['ranking']) + """',
'""" + item_info['item_code'] + """')"""
cursor.execute(sql)
'Database' 카테고리의 다른 글
[Python] 파이썬 기본 문법 정리 - List, Tuple, Dictionary, Set (0) | 2021.02.10 |
---|---|
[Database] 5. Mysql SELECT 추가 문법 (group by, count, join 등) (0) | 2021.02.08 |
[Database] 3. Foriegn Key 등 (0) | 2021.02.06 |
[Database] 2. pymysql , pandas 사용하기 (0) | 2021.02.06 |
[Database] 1. MYSQL 기본 문법 (0) | 2021.02.05 |
Comments