"""Scrape news events from thefly.com's news feed with a Selenium-driven Chrome."""

import datetime as dt
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select


class GenericEvent():
    """Minimal event container holding a single text payload in ``event``."""

    def __init__(self, something: str):
        self.event = something


class EventScraperPortable():
    """Drive a Chrome instance to collect news rows from ``URL``.

    The scraper loads the news page, scrolls the "more news" trigger into
    view to load older rows, and turns each matching table row into a
    ``GenericEvent``.
    """

    URL = "https://thefly.com/news.php"

    # XPath of the news table rows; a ``[@data-topic=...]`` predicate may be
    # appended to restrict the rows to specific event types.
    ROWS_XPATH = ("//*[@id='search_news']"
                  "/div[@class='newsFeedWidget feedCerrado']/table/tbody/tr")

    # Element that is scrolled into view to make the page load older news.
    MORE_NEWS_XPATH = "//*[@id='search_news']/div[@class='moreNewsTriggers']"

    # Number of consecutive passes without any new row or older date after
    # which scrapeHistory() gives up instead of scrolling forever.
    MAX_STALLED_PASSES = 3

    def __init__(self, pathToExe: str, headless=True):
        """Start Chrome through the chromedriver executable at ``pathToExe``.

        Args:
            pathToExe: Path to the chromedriver executable.
            headless: When truthy, Chrome is started with ``--headless``.
        """
        # Set first so finished() is safe even if starting Chrome fails.
        self.driver = None

        options = Options()
        options.add_argument('--no-sandbox')
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-blink-features")
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--disable-dev-shm-usage')
        if headless:
            options.add_argument('--headless')

        user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'
        # Alternative kept for reference -- actual user-agent (via google):
        # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        options.add_argument(f'user-agent={user_agent}')

        service = ChromeService(executable_path=pathToExe)
        self.driver = webdriver.Chrome(service=service, options=options)

    def run(self):
        """Scrape continuously, initially looking back to 16:00 of yesterday."""
        yesterday = dt.datetime.now().date() - dt.timedelta(days=1)
        self.scrapeContinuously(
            dt.datetime.combine(yesterday, dt.time(16, 0)),
            eventTypeList=["initiate", "recUpgrade", "recDowngrade", "recomm", "hot_stocks"],
        )

    @staticmethod
    def _buildTopicFilter(eventTypeList):
        """Return an XPath predicate matching any of the given data-topic values.

        Returns an empty string (no filtering) when the list is None or empty.
        """
        if not eventTypeList:
            return ""
        alternatives = " or ".join(
            f"@data-topic='{eventType}'" for eventType in eventTypeList
        )
        return "[" + alternatives + "]"

    def scrapeHistory(self, eventTypeList=None, untilTime=None):
        """Load the news page and collect events newer than ``untilTime``.

        Args:
            eventTypeList: ``data-topic`` values to keep. ``None`` or an empty
                list keeps every row.
            untilTime: Naive ``datetime`` cut-off. Scrolling continues until a
                row dated at or before it is seen. When ``None``, only the
                rows already loaded on the page are collected (one pass).

        Returns:
            A ``(eventList, latestDate)`` tuple: the collected
            ``GenericEvent`` objects and the date of the first collected row,
            or ``None`` as the date when no row was collected.
        """
        self.driver.get(self.URL)
        time.sleep(5)  # give the page time to render the initial feed

        seenIds = set()
        eventList = []
        latestDate = None
        earliestDate = dt.datetime.now()
        rowsXPath = self.ROWS_XPATH + self._buildTopicFilter(eventTypeList)
        stalledPasses = 0

        while True:
            if untilTime is not None and earliestDate > untilTime:
                # Scroll roughly once per hour still to be covered.
                hoursToCover = int((earliestDate - untilTime).total_seconds() / (60 * 60)) + 1
                for _ in range(hoursToCover):
                    self.scrollDown()
                    time.sleep(1)
                time.sleep(5)  # let the newly requested rows load

            earliestBeforePass = earliestDate
            seenBeforePass = len(seenIds)
            reachedEnd = False

            for row in self.getMultByXpath(rowsXPath):
                rawDate = row.get_attribute("data-datenews")
                if not rawDate:
                    # Row without a date attribute: nothing to compare against.
                    continue
                newsDate = dt.datetime.strptime(rawDate, '%Y-%m-%d %H:%M:%S')
                if newsDate < earliestDate:
                    earliestDate = newsDate

                if untilTime and earliestDate <= untilTime:
                    # Cut-off reached; remaining rows are presumably older
                    # still (feed appears to be newest-first -- TODO confirm).
                    reachedEnd = True
                    break

                dataID = row.get_attribute("data-id")
                if dataID in seenIds:
                    continue
                seenIds.add(dataID)

                headline = row.find_element(by=By.XPATH, value=".//td/div/a/span").text.strip()
                tickers = row.find_elements(
                    by=By.XPATH,
                    value=".//td/div/div[@class='simbolos_wrapper']/span[@class='ticker fpo_overlay']",
                )
                ticker = None
                # Only rows tagged with one or two tickers get a ticker.
                if 0 < len(tickers) < 3:
                    ticker = tickers[0].get_attribute("data-ticker")
                if not ticker:
                    ticker = "blah blah"  # placeholder used when no ticker applies

                eventList.append(GenericEvent(ticker + headline))

                # Remember the date of the first collected row only.
                if not latestDate:
                    latestDate = newsDate

            if reachedEnd or untilTime is None:
                break

            # Guard against scrolling forever when the feed has no older rows.
            madeProgress = (len(seenIds) > seenBeforePass
                            or earliestDate < earliestBeforePass)
            stalledPasses = 0 if madeProgress else stalledPasses + 1
            if stalledPasses >= self.MAX_STALLED_PASSES:
                break

        return eventList, latestDate

    def scrollDown(self):
        """Scroll the "more news" trigger into view (scrolling back chronologically).

        Does nothing when the trigger element is not present on the page.
        """
        triggerElement = self.getOneByXpath(self.MORE_NEWS_XPATH)
        if triggerElement is None:
            return
        self.driver.execute_script("arguments[0].scrollIntoView();", triggerElement)

    def scrapeContinuously(self, initiallySearchUntil: dt.datetime, eventTypeList=None):
        """Poll the feed forever, once a minute, for events newer than the last seen.

        Args:
            initiallySearchUntil: Cut-off used for the first scrape.
            eventTypeList: ``data-topic`` values to keep; ``None`` keeps all.

        This method never returns normally.
        """
        print("Starting to scrape continuously, looking back to:",
              initiallySearchUntil.strftime("%Y%m%d-%H-%M-%S"))

        latestDate = initiallySearchUntil
        while True:
            newEvents, newestSeen = self.scrapeHistory(eventTypeList, latestDate)

            # A None date means nothing new was found; keep the prior cut-off.
            if newestSeen is not None:
                latestDate = newestSeen

            # do stuff with the events
            # self.notifyListeners(newEvents)

            time.sleep(60)

    def getMultByXpath(self, path: str):
        """Return all elements matching the XPath ``path`` (possibly empty)."""
        return self.driver.find_elements(by=By.XPATH, value=path)

    def getOneByXpath(self, path: str):
        """Return the first element matching ``path``, or ``None`` if absent."""
        try:
            return self.driver.find_element(by=By.XPATH, value=path)
        except NoSuchElementException:
            return None

    def finished(self):
        """Quit the browser if it is running; safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver:
            self.driver = None
            driver.quit()


def main():
    """Run the scraper and always shut the browser down on exit."""
    pathToExe = r"C:\\Program Files (x86)\\Google\\ChromeDiver\\chromedriver101.0.4951.41.exe"
    scraper = EventScraperPortable(pathToExe, headless=True)
    try:
        scraper.run()
    finally:
        # run() loops forever, so cleanup only happens on an exception
        # (including KeyboardInterrupt); make sure Chrome is closed then.
        scraper.finished()


if __name__ == '__main__':
    main()