# Streamlit app that scrapes job postings from Wuzzuf and LinkedIn, shows the
# scraped table, plots the job locations on a bubble map and draws a few
# summary bar charts.
#
# NOTE: header text is kept in `#` comments on purpose -- Streamlit "magic"
# may render bare expressions / strings found in the main script.

import contextlib
import re
import time
import urllib.parse
import warnings
from urllib.request import urlopen

# Silence warnings before the heavy third-party imports below run.
warnings.filterwarnings("ignore")

import bs4
import hydralit_components as hc
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import requests
import seaborn as sns
import streamlit as st
import streamlit.components.v1 as components
from bs4 import BeautifulSoup
from PIL import Image
from plotly.offline import iplot
from plotly.subplots import make_subplots
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from streamlit_lottie import st_lottie
from streamlit_option_menu import option_menu

# Settings for using the driver without a UI
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
# NOTE: no driver is created at import time any more. The old module-level
# driver was never used (every scraper builds its own) and leaked one Chrome
# process on every Streamlit rerun.

# Seconds to wait for any plain HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Number of postings Wuzzuf lists per result page.
WUZZUF_PAGE_SIZE = 15
# Pause between scroll steps while LinkedIn lazy-loads more cards.
SCROLL_PAUSE_TIME = 0.5
# Extra Chrome argument used for the LinkedIn detail pages.
LINKEDIN_WINDOW_ARG = "window-size=1200x600"

NOMINATIM_SEARCH_URL = 'https://nominatim.openstreetmap.org/search'
NOMINATIM_USER_AGENT = 'job-scraper-streamlit-app'
# Nominatim's usage policy allows at most one request per second.
NOMINATIM_PAUSE = 1

WUZZUF_COLUMNS = [
    'Title', 'Location', 'country', 'URLs', 'Company_Name', 'Career_Level',
    'post_time', 'Experience_Needed', 'Company_Logo', "Job_Categories",
    "Skills_And_Tools", "job_description", "Job_Requirements",
]

WUZZUF_CATEGORIES_XPATH = '//*[@id="app"]/div/main/section[2]/div[5]'
WUZZUF_SKILLS_XPATH = '//*[@id="app"]/div/main/section[2]/div[6]'
WUZZUF_DESCRIPTION_XPATH = '//*[@id="app"]/div/main/section[3]'
WUZZUF_REQUIREMENTS_XPATH = '//*[@id="app"]/div/main/section[4]/div'

LINKEDIN_CARD_XPATH = '//*[@id="main-content"]/section[2]/ul/li[*]/div'
LINKEDIN_CARD_LINK_XPATH = '//*[@id="main-content"]/section[2]/ul/li[*]/div/a'
LINKEDIN_MORE_JOBS_XPATH = '//*[@id="main-content"]/section[2]/button'
LINKEDIN_SHOW_MORE_XPATH = (
    '//*[@id="main-content"]/section[1]/div/div[1]/section[1]'
    '/div/div/section/button[1]'
)
LINKEDIN_LOGO_XPATH = '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img'
LINKEDIN_DESCRIPTION_XPATH = (
    '//*[@id="main-content"]/section[1]/div/div[1]/section[1]'
    '/div/div/section/div'
)


@contextlib.contextmanager
def _chrome_session():
    """Yield a headless Chrome driver and always quit it afterwards."""
    browser = webdriver.Chrome('chromedriver', options=options)
    try:
        yield browser
    finally:
        browser.quit()


# wuzzuf function
def _parse_requirements(text):
    """Split the requirements section text into a dict or a flat list.

    Text made of several blank-line separated blocks becomes
    ``{first line of block: [remaining lines]}``; a single block becomes a
    plain list of its lines. Narrow no-break spaces are replaced by spaces.
    """
    blocks = text.split("\n\n")
    if len(blocks) == 1:
        return blocks[0].replace("\u202f", " ").split('\n')
    parsed = {}
    for block in blocks:
        heading, *items = block.split('\n')
        parsed[heading] = [item.replace("\u202f", " ") for item in items]
    return parsed


def Wuzzuf_scrapping(job_type, job_num):
    """Scrape up to ``job_num`` postings for ``job_type`` from wuzzuf.net.

    The search pages are fetched with ``requests``; every posting page is
    then opened once with ``requests`` (logo) and once with Selenium
    (categories, skills, description, requirements).

    Args:
        job_type: Free-text search query, e.g. ``"Machine Learning"``.
        job_num: Maximum number of postings to return.

    Returns:
        A DataFrame with the columns listed in ``WUZZUF_COLUMNS``. The same
        rows are also written to ``WUZZUF_scrapping.xlsx``.
    """
    # Quote the whole query: the old code sent the first word twice and
    # crashed on single-word queries.
    search_url = ('https://wuzzuf.net/search/jobs/?a=navbl&q='
                  + urllib.parse.quote(job_type))
    records = []
    pages_num = int(np.ceil(job_num / WUZZUF_PAGE_SIZE))

    for page in range(pages_num):
        remaining = job_num - len(records)
        if remaining <= 0:
            break
        data = requests.get(search_url + '&start=' + str(page),
                            timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(data.content)
        # Look every per-card element list up once per page, not once per row.
        headings = soup.find_all('h2', {'class': 'css-m604qf'})
        location_spans = soup.find_all('span', {'class': 'css-5wys0k'})
        company_divs = soup.find_all('div', {'class': 'css-d7j1kk'})
        detail_divs = soup.find_all('div', {'class': 'css-y4udm8'})

        # Rows beyond job_num were discarded anyway, so do not scrape them.
        for x, heading in enumerate(headings[:remaining]):
            anchor = heading.find('a')
            loc = location_spans[x].text.split(',')
            level_parts = (detail_divs[x].find_all('div')[1]
                           .find_all(['a', 'span']))
            years = "".join(re.findall(
                r'[0-9-+]*', level_parts[1].text.replace('·', ' ').strip()))
            # Use this row's own URL; indexing the accumulated link list
            # with the per-page index pointed at page-1 jobs on later pages.
            job_url = 'https://wuzzuf.net' + anchor.attrs['href']

            record = {
                'Title': re.split(r'\(|\-', anchor.text)[0].strip(),
                'Location': ', '.join(
                    part.strip() for part in loc[:-1]).strip(),
                'country': loc[-1].strip(),
                'URLs': job_url,
                'Company_Name': " ".join(re.findall(
                    r"[a-zA-Z\d+]+", company_divs[x].find('a').text)),
                'Career_Level': level_parts[0].text,
                'post_time': company_divs[x].find('div').text,
                'Experience_Needed': years or "Not Specified",
            }

            # to get the logo of the company
            job_page = BeautifulSoup(
                requests.get(job_url, timeout=REQUEST_TIMEOUT).content)
            record['Company_Logo'] = job_page.find_all(
                'meta', {'property': "og:image"})[0]['content']

            # get Job_Categories, Skills_And_Tools, job_description and
            # Job_Requirements from the rendered posting page
            with _chrome_session() as browser:
                browser.get(job_url)
                record["Job_Categories"] = browser.find_element(
                    By.XPATH, WUZZUF_CATEGORIES_XPATH).text.split("\n")[1:]
                record["Skills_And_Tools"] = browser.find_element(
                    By.XPATH, WUZZUF_SKILLS_XPATH).text.split("\n")[1:]
                record["job_description"] = browser.find_element(
                    By.XPATH, WUZZUF_DESCRIPTION_XPATH).text.split("\n")[1:]
                sections = browser.find_elements(
                    By.XPATH, WUZZUF_REQUIREMENTS_XPATH)
                record["Job_Requirements"] = _parse_requirements(
                    sections[0].text)

            records.append(record)

    # create data frame to combine all together
    df = pd.DataFrame(records, columns=WUZZUF_COLUMNS)
    # `encoding` was an unused keyword of to_excel and no longer exists.
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]


# linkedin function
def _load_linkedin_cards(browser, num_jobs):
    """Scroll / click "see more jobs" until ``num_jobs`` cards are loaded.

    Stops early when a full scroll-and-click round loads no new card, so a
    search with fewer results than requested can no longer loop forever.

    Returns:
        ``(cards, anchors)``: the job card elements and their link elements.
    """
    previous_count = -1
    while True:
        cards = browser.find_elements(By.XPATH, LINKEDIN_CARD_XPATH)
        anchors = browser.find_elements(By.XPATH, LINKEDIN_CARD_LINK_XPATH)
        if len(cards) >= num_jobs or len(cards) == previous_count:
            return cards, anchors
        previous_count = len(cards)
        time.sleep(3)

        # Scroll to the bottom until the page height stops growing.
        last_height = browser.execute_script(
            "return document.body.scrollHeight")
        while True:
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = browser.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        try:
            WebDriverWait(browser, 15).until(EC.element_to_be_clickable(
                (By.XPATH, LINKEDIN_MORE_JOBS_XPATH))).click()
        except TimeoutException:
            # No clickable button this round; the no-progress check at the
            # top of the loop decides whether to stop.
            pass
        time.sleep(2)


def _linkedin_descriptions_and_logos(urls):
    """Open every posting URL and collect its description lines and logo.

    Returns:
        DataFrame with columns ``all_about_job`` (list of non-empty
        description lines) and ``company_logo`` (image URL).
    """
    # Add the window-size argument once instead of on every iteration.
    if LINKEDIN_WINDOW_ARG not in options.arguments:
        options.add_argument(LINKEDIN_WINDOW_ARG)

    descriptions = []
    logos = []
    for link in urls:
        # One fresh browser per posting, as before, but always closed.
        with _chrome_session() as browser:
            browser.get(link)
            WebDriverWait(browser, 15).until(EC.element_to_be_clickable(
                (By.XPATH, LINKEDIN_SHOW_MORE_XPATH))).click()
            logo = browser.find_element(By.XPATH, LINKEDIN_LOGO_XPATH)
            logos.append(logo.get_attribute('src'))
            time.sleep(3)
            text = browser.find_element(
                By.XPATH, LINKEDIN_DESCRIPTION_XPATH).text
        # Drop the trailing button caption if it is part of the text.
        if text.endswith("show more"):
            text = text[:-9]
        descriptions.append([line for line in text.split("\n") if line])
    return pd.DataFrame({'all_about_job': descriptions,
                         'company_logo': logos})


def _linkedin_job_criteria(urls):
    """Fetch the job-criteria list of every posting.

    Returns:
        DataFrame with one row per URL and one column per criteria heading
        found; missing values are ``'Not_Found'``.
    """
    rows = []
    for url in urls:
        page = BeautifulSoup(
            requests.get(url, timeout=REQUEST_TIMEOUT).content)
        criteria = page.find('ul', {'class': 'description__job-criteria-list'})
        time.sleep(4)
        if criteria is None:
            # Keep the row so the frames stay aligned with the other data.
            rows.append({})
            continue
        headings = criteria.find_all('h3')
        values = criteria.find_all('span')
        rows.append({
            heading.text.replace('\n', ' ').strip():
                value.text.replace('\n', ' ').strip()
            for heading, value in zip(headings, values)
        })
    # DataFrame.append was removed from pandas; build the frame in one go.
    return pd.DataFrame(rows).fillna('Not_Found')


def LINKEDIN_Scrapping(job_search, num_jobs):
    """Scrape up to ``num_jobs`` postings for ``job_search`` from LinkedIn.

    Args:
        job_search: Free-text search query, e.g. ``"AI Engineer"``.
        num_jobs: Maximum number of postings to return.

    Returns:
        DataFrame combining the listing data (``Title``, ``Location``,
        ``URLs``, ``Company_Name``, ``post_time``), the description and
        logo of each posting and its job-criteria columns.
    """
    search_url = (
        'https://www.linkedin.com/jobs/search?keywords='
        + urllib.parse.quote(job_search)
        + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
          '&position=1&pageNum=0'
    )

    # FIRST get main informations about jobs
    title = []
    location = []
    company_name = []
    post_time = []
    with _chrome_session() as browser:
        browser.get(search_url)
        cards, anchors = _load_linkedin_cards(browser, num_jobs)
        # Read everything from the elements before the browser is closed.
        for card in cards[:num_jobs]:
            fields = card.text.split("\n")
            title.append(fields[1])
            company_name.append(fields[2])
            location.append(fields[3])
            # Cards with 5 text lines carry the post time in the last one;
            # otherwise it is read from the 6th line.
            post_time.append(fields[4] if len(fields) == 5 else fields[5])
        # get links for jobs
        links = [anchor.get_attribute('href')
                 for anchor in anchors[:num_jobs]]

    listing = pd.DataFrame({'Title': title, 'Location': location,
                            'URLs': links, 'Company_Name': company_name,
                            'post_time': post_time})
    details = _linkedin_descriptions_and_logos(links)
    criteria = _linkedin_job_criteria(links)

    # combine all together
    return pd.concat([listing, details, criteria], axis=1)


##################### map_bubble #####################
def _geocode(address):
    """Return ``(lat, lon)`` floats for ``address`` or ``None`` on failure."""
    try:
        response = requests.get(
            NOMINATIM_SEARCH_URL,
            params={'q': address, 'format': 'json'},
            headers={'User-Agent': NOMINATIM_USER_AGENT},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        best = response.json()[0]
        return float(best["lat"]), float(best["lon"])
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network error, non-JSON body, no match or malformed entry.
        return None


#### function to show map for location of the job
def map_bubble(df):
    """Build a folium bubble map of how many jobs each location has.

    The city is the text before the first comma of ``Location``; when the
    frame has a ``country`` column it is appended for geocoding. Locations
    that cannot be geocoded are skipped. ``df`` itself is not modified.

    Args:
        df: DataFrame with a string ``Location`` column and optionally a
            ``country`` column.

    Returns:
        The ``folium.Map`` with one circle per geocoded location.
    """
    import folium

    cities = pd.Series([loc.split(",")[0] for loc in df["Location"]],
                       index=df.index)
    if 'country' in df.columns:
        places = cities + ", " + df["country"]
    else:
        places = cities

    rows = []
    for address, count in places.value_counts().items():
        point = _geocode(address)
        time.sleep(NOMINATIM_PAUSE)
        if point is None:
            continue
        rows.append({'address': address, 'lat': point[0],
                     'lon': point[1], 'value': count})
    bubble_df = pd.DataFrame(rows, columns=['address', 'lat', 'lon', 'value'])

    # Make an empty map
    bubble_map = folium.Map(location=[20, 0], tiles="OpenStreetMap",
                            zoom_start=2)
    # add marker one by one on the map
    for row in bubble_df.itertuples(index=False):
        folium.Circle(
            location=[row.lat, row.lon],
            popup=f"{row.address}: {row.value}",
            radius=float(row.value) * 500,
            color='#69b3a2',
            fill=True,
            fill_color='#69b3a2',
        ).add_to(bubble_map)

    # NOTE(review): this bare expression is kept from the original; it
    # presumably relies on Streamlit "magic" to display the map, since the
    # callers ignore the return value -- confirm before removing.
    bubble_map
    return bubble_map


#########################
def _show_count_bar(counts, title, axis_title, palette, bar_width=None):
    """Render one bar chart of a ``value_counts`` Series with Streamlit."""
    fig = px.bar(x=counts.index, y=counts.values, color=counts.index,
                 color_discrete_sequence=palette, text=counts.values,
                 title=title, template='plotly_dark')
    fig.update_layout(height=500, width=500, xaxis_title=axis_title,
                      yaxis_title="count",
                      font=dict(size=17, family="Franklin Gothic"))
    if bar_width is not None:
        fig.update_traces(width=bar_width)
    st.plotly_chart(fig)


def _show_column_counts(df1, column, title, axis_title, palette,
                        top=None, bar_width=None):
    """Chart the value counts of ``column`` (optionally only the top N)."""
    if column not in df1.columns:
        st.info(f"No '{column}' data available.")
        return
    counts = df1[column].value_counts()
    if top is not None:
        counts = counts.head(top)
    _show_count_bar(counts, title, axis_title, palette, bar_width)


#### wuzzuf analysis
def wuzzuf_exp(df1):
    """Draw the four summary bar charts for a Wuzzuf result frame.

    Args:
        df1: DataFrame as returned by ``Wuzzuf_scrapping``.
    """
    deep = px.colors.sequential.deep
    dense = px.colors.sequential.dense
    _show_column_counts(df1, 'Title', 'Top 10 Job Titles', "Job Titles",
                        deep, top=10)
    _show_column_counts(df1, 'Career_Level', 'Career Level Distribution',
                        "Career Level", dense, bar_width=0.5)
    _show_column_counts(df1, 'Location', 'Top 10 Location of job',
                        "Location of job", deep, top=10)
    _show_column_counts(df1, 'Experience_Needed',
                        ' Experience Level Distribution',
                        " Experience Level (years)", dense, bar_width=0.5)
    return


#########################
### linkedin analysis
def linkedin_exp(df1):
    """Draw the four summary bar charts for a LinkedIn result frame.

    Args:
        df1: DataFrame as returned by ``LINKEDIN_Scrapping``.
    """
    deep = px.colors.sequential.deep
    dense = px.colors.sequential.dense
    _show_column_counts(df1, 'Title', 'Top 10 Job Titles', "Job Titles",
                        deep, top=10)
    _show_column_counts(df1, 'Employment type',
                        'Employment type Distribution', "Employment type",
                        dense, bar_width=0.5)
    _show_column_counts(df1, 'Location', 'Top 10 Location of job',
                        "Location of job", deep, top=10)
    _show_column_counts(df1, 'Seniority level',
                        'Seniority level Distribution', "Seniority level",
                        dense, bar_width=0.5)
    return


########################
def _show_dataframe(tab, frame):
    """Display ``frame`` in ``tab``, falling back to simpler renderers.

    The scraped frames hold list/dict cells, which the interactive table
    may fail to render; a stringified copy and finally a static table are
    tried in turn.
    """
    try:
        tab.dataframe(frame)
    except Exception:
        try:
            tab.write(frame.astype(str).set_index(frame.index.astype(str)))
        except Exception:
            tab.table(frame)


def _run_scraper(scrape, explore, job_query, job_count):
    """Scrape with ``scrape`` and fill the data, map and chart tabs."""
    with st.container():
        st.write("---")
        tab1, tab2, tab3 = st.tabs(
            [" Data", " Bubble Map", "Data Exploration"])
        with tab1:
            with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders,
                             index=[3, 0, 5]):
                time.sleep(5)
                results = scrape(job_query, job_count)
            _show_dataframe(tab1, results)
        with tab2:
            map_bubble(results)
        with tab3:
            explore(results)


####################### stream lit app ################################
st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:",
                   layout="wide")

# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communcation Engineer")
        st.write(
            "In this app we will scrap jobs from LinkedIn and Wuzzuf websites, let's get it started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass

# ---- SIDEBAR INPUTS ----
webs = ["Wuzzuf", "Linkedin"]
jobs = ["Machine Learning", "AI Engineer", "Data Analysis", "Software Testing"]
nums = np.arange(1, 1000)

site = st.sidebar.selectbox("select one website", webs)
job = st.sidebar.selectbox("select one job", jobs)
num_jobs = st.sidebar.selectbox("select num of jobs you want to scrap", nums)

if st.sidebar.button('Start Scrapping'):
    if site == "Wuzzuf":
        _run_scraper(Wuzzuf_scrapping, wuzzuf_exp, job, num_jobs)
    elif site == "Linkedin":
        _run_scraper(LINKEDIN_Scrapping, linkedin_exp, job, num_jobs)

# WILL CHANGE