# -*- coding: utf-8 -*- """ Created on Thu Oct 14 12:59:05 2021 @author: thydzik """ import pickle from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import Select from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.common.exceptions import ElementClickInterceptedException from selenium.common.exceptions import StaleElementReferenceException import time from datetime import datetime import os import requests import shutil import warnings warnings.filterwarnings("ignore", message="Unverified HTTPS request") import re import csv import sqlite3 as sl import random driver = webdriver.Firefox() #https://www.amazon.com.au/b?ie=UTF8&node=8415198051 urls = ['https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_2a1_w?bbn=8415198051&rh=n%3A4851626051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #books 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_2b1_w?ie=UTF8&node=4851799051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #tech 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_2c1_w?ie=UTF8&node=4852150051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #home #'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_2d1_w?ie=UTF8&node=4852264051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #dvds 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_2e1_w?ie=UTF8&node=4851683051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #tech 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_3a1_w?ie=UTF8&node=4852445051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #toys 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_3b1_w?ie=UTF8&node=4851856051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #clothes 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_3c1_w?ie=UTF8&node=4852617051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051', #toys 'https://www.amazon.com.au/s/ref=s9_acss_bw_cg_WAREHOUS_3d1_w?ie=UTF8&node=4852675051&bbn=8415198051&pf_rd_m=AMMK0LS9EDNM8&pf_rd_s=merchandised-search-3&pf_rd_r=CAKMN8CB51WHQX79Y1SV&pf_rd_t=101&pf_rd_p=515f72da-4098-44a3-8e37-80b70a605fe0&pf_rd_i=8415198051' #toys ] random.shuffle(urls) prices = [] con = sl.connect('warehouse.db') maxpagetries = 20 for url in urls: urlstart = url+'&s=price-asc-rank' driver.get(urlstart) html = driver.page_source match = re.search(r's-pagination-item s-pagination-disabled" aria-disabled="true">(\d*)</span>', html) if match: maxpage = int(match.group(1)) else: print("didn't get a page max for: "+url) maxpage = maxpagetries #for page in range(1,2): for page in range(1,min(maxpage,maxpagetries)): driver.get(urlstart+"&page="+str(page)+"&ref=sr_pg_"+str(page)) html = driver.page_source results0 = re.findall(r'<div data-asin="(\w+)" (.*)</div>', html, re.IGNORECASE | re.MULTILINE) for result0 in results0: r_asin = result0[0] if r_asin: results = re.findall(r'a-text-normal" href="(.*?)"><span class="a-size-base.*?">(.*?)</span>\s</a>.*?\$([0-9\.]*)</span><span', result0[1], re.IGNORECASE | re.MULTILINE) for result in results: result = list(result) r_url = "https://www.amazon.com.au"+result[0] r_search = r_url.split('ref=')[0] r_title = result[1] r_title = r_title.replace(" ", " ") r_title = r_title.replace("|", "-") r_sale = result[2] #try and determine if price exists cur = con.cursor() cur.execute("SELECT * FROM items WHERE asin='"+r_asin+"'") items = cur.fetchall() if items: #found the item i_price = items[0][1] i_cat = items[0][2] i_cat= str(i_cat) #if item not found, or price is the same as the sale price if (not items) or (float(r_sale) == float(i_price)): #need to add it #navigate to the url driver.get(r_search) html0 = driver.page_source regex1 = r'(?:<a class="a-link-normal a-color-tertiary" href[\w\W]*?">\s+([\w ]+)\s+</a>[\w\W\s]*?)?"(?:price|base)"><span class="a-offscreen">\$([0-9\.]{2,})</span>[\w\W\s]*?(?:\[asin\]" value="|"currentAsin" : "|"asin": ?")(\w*)"[,>]*' match = re.search(regex1, html0) if match: i_cat = str(match.group(1)) i_price = match.group(2) i_asin = match.group(3) #if r_asin != i_asin: # print("asin different for "+r_asin+" != "+i_asin+": "+r_search) #else: #write the value to database if not items: cur.execute("INSERT INTO items VALUES ('"+r_asin+"',"+i_price+",'"+i_cat+"')") else: #need to updated instead cur.execute("UPDATE items SET price = "+i_price+", cat = '"+i_cat+"' WHERE asin = '"+r_asin+"';") con.commit() else: print("didn't get a match for: "+r_search) i_price = r_sale i_cat = '' r_discount = round(100*(1-(float(r_sale)/float(i_price))),1) price = [] #not prices price.append(r_title) price.append(r_url) price.append(i_cat) price.append(float(r_sale)) price.append(float(i_price)) price.append(r_discount) price.append("["+r_title+"]("+r_url+")|"+i_cat+"|$"+r_sale+"|$"+str(i_price)+"|"+str(r_discount)+"%") prices.append(price) #break #break prices = sorted(prices, key=lambda x:x[5], reverse=True) con.close() # open the file in the write mode f = open('amazonwarehouse.csv', 'w', newline='', encoding='utf-8') # create the csv writer writer = csv.writer(f) writer.writerow(["Product","Url","Cat", "Sale", "Original","Discount","Product|Cat|Sale|Original|Discount"]) writer.writerow(["Product","Url","Cat", "Sale", "Original","Discount","-|-|-|-|-"]) # write a row to the csv file for price in prices: writer.writerow(price) # close the file f.close() os.startfile('amazonwarehouse.csv') driver.close()