import re
import time
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

warnings.filterwarnings("ignore")

# Settings for running Chrome headless (no UI), as required on most servers
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--start-maximized')
options.add_argument('--disable-infobars')
options.add_argument('--disable-extensions')
options.add_argument('--window-size=1200,600')

# Selenium >= 4.6 locates a matching chromedriver automatically (Selenium Manager);
# the old webdriver.Chrome('chromedriver', options=...) positional path is gone.
driver = webdriver.Chrome(options=options)
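
# Optional: Streamlit re-runs this whole script on every interaction, so the
# driver above is recreated each time. A minimal sketch of a cached, shared
# driver instead (assumes Streamlit >= 1.18 for st.cache_resource; defined
# here but not wired into the functions below):
@st.cache_resource
def get_driver():
    # One headless Chrome instance, reused across Streamlit reruns.
    return webdriver.Chrome(options=options)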


# wuzzuf function
def Wuzzuf_scrapping(job_type , job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    pages_num = np.ceil(job_num / 15)
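    # Wuzzuf lists about 15 jobs per results page, so fetch ceil(job_num / 15) pages.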


    for page in range(int(pages_num)):
      link_new = link1 + '&start=' + str(page)
      data = requests.get(link_new)
      soup = BeautifulSoup(data.content, 'html.parser')
      Title = soup.find_all('h2', {'class': 'css-m604qf'})

  # to get the info about jobs

      for x in range(len(Title)):
        # Job title: keep only the part before "(" or "-".
        t = re.split(r'\(|-', Title[x].find('a').text)
        title.append(t[0].strip())
        # Location text is "city, region, country"; keep the country separate.
        loc = soup.find_all('span', {'class': 'css-5wys0k'})[x].text.split(',')
        location.append(', '.join(part.strip() for part in loc[:-1]))
        country.append(loc[-1].strip())
        links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
        m = " ".join(re.findall(r"[a-zA-Z\d+]+", soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text))
        company_name.append(m)
        c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
        if len(c) == 1:
          Job_type.append(c[0].text)
        else:
          Job_type.append([span.text for span in c])
        n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
        Career_Level.append(n[0].text)
        # Experience: keep digit/range tokens such as "3-5" or "2+".
        yy = n[1].text.replace('·', ' ').strip()
        y1 = "".join(part for part in re.findall(r'[0-9-+]*', yy) if part)
        Experience_Needed.append(y1 if y1 else "Not Specified")
        # NOTE: do not name this variable `time`; that would shadow the time module.
        posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
        post_time.append(posted.text)
        
        # company logo from the job page's og:image meta tag
        data1 = requests.get(links[x])
        soup1 = BeautifulSoup(data1.content, 'html.parser')
        company_logo.append(soup1.find_all('meta', {'property': 'og:image'})[0]['content'])


        # Job_Categories, Skills_And_Tools, job_description and Job_Requirements
        # are rendered client-side, so reuse the module-level headless driver.
        # NOTE: these position-based XPaths are brittle and will break if
        # Wuzzuf reorders its page sections.
        driver.get(links[x])
        Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
        Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
        job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
        sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
        new = sections[0].text.split("\n\n")

        if len(new) != 1:
          # Each block is "Heading\nitem\nitem..."; map heading -> list of items.
          dict_other = {}
          for block in new:
            lines = block.split('\n')
            dict_other[lines[0]] = [k.replace("\u202f", " ") for k in lines[1:]]
          Job_Requirements.append(dict_other)
        else:
          nn = new[0].replace("\u202f", " ")
          Job_Requirements.append(nn.split('\n'))


  # combine everything into one DataFrame
    df = pd.DataFrame({
        'Title': title,
        'Location': location,
        'country': country,
        'URLs': links,
        'Company_Name': company_name,
        'Career_Level': Career_Level,
        'post_time': post_time,
        'Experience_Needed': Experience_Needed,
        'Company_Logo': company_logo,
        'Job_Categories': Job_Categories,
        'Skills_And_Tools': Skills_And_Tools,
        'job_description': job_description,
        'Job_Requirements': Job_Requirements,
    })

    # NOTE: to_excel() no longer accepts an `encoding` argument (removed in pandas 2.0).
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
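
# Example usage (hypothetical values), handy for a quick check outside Streamlit:
#   df = Wuzzuf_scrapping("Machine Learning", 20)
#   print(df[['Title', 'Company_Name', 'Experience_Needed']].head())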


# linkedin function


def LINKEDIN_Scrapping(job_search , num_jobs):
  job1 = job_search.split(" ")[0]
  job2 = job_search.split(" ")[1]

  link1 = 'https://www.linkedin.com/jobs/search?keywords='+job1 +'%20' +job2 +'&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
  
  # FIRST: get the main information about each job card

  title = []
  location = []
  country = []
  company_name = []
  post_time = []
  links =[]
  # collect at least `num_jobs` job cards
  l1 = []
  ll = []
  driver.get(link1)
  SCROLL_PAUSE_TIME = 0.5
  # Keep scrolling and clicking "See more jobs" until enough cards are loaded.
  while True:
    l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
    ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')

    if len(l1) >= num_jobs:
      break
    time.sleep(3)
    # Scroll to the bottom until the page height stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
      driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
      time.sleep(SCROLL_PAUSE_TIME)
      new_height = driver.execute_script("return document.body.scrollHeight")
      if new_height == last_height:
        break
      last_height = new_height

    # Load the next batch; if the "See more jobs" button never appears,
    # stop with whatever has loaded so far.
    try:
      WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
    except Exception:
      break
    time.sleep(2)


  l2 = l1[:num_jobs]

  for info in l2:
    info_tot = info.text.split("\n")
    title.append(info_tot[1])
    company_name.append(info_tot[2])
    location.append(info_tot[3])
    # Cards with an extra badge line carry the posting date one line later.
    if len(info_tot) == 5:
      post_time.append(info_tot[4])
    else:
      post_time.append(info_tot[5])

  # get links for jobs
  l3 = ll[:num_jobs]
  for i in l3:
    links.append(i.get_attribute('href'))
  
  df_ml = pd.DataFrame({'Title' : title , 'Location' : location ,'URLs':links ,'Company_Name' : company_name ,'post_time':post_time})




  # get the full description and the company logo from each job page
  def all_description_LOGO(urls):
    description = []
    LOGO = []
    for link in urls:
      driver.get(link)
      # Expand the truncated description via the "Show more" button.
      WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
      K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
      LOGO.append(K.get_attribute('src'))
      time.sleep(3)
      t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
      # Drop the trailing "show more" label if it is still part of the text.
      text = t.text
      if text.endswith("show more"):
        text = text[:-9]
      description.append([x for x in text.split("\n") if x])
    df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

    return df_ml

  # apply desc. and logo function
  E = all_description_LOGO(links)

  # criteria info (Seniority level, Employment type, ...) from each job page
  def other(urls):
    frames = []
    for url in urls:
      data1 = requests.get(url)
      soup1 = BeautifulSoup(data1.content, 'html.parser')
      j = soup1.find('ul', {'class': 'description__job-criteria-list'})
      time.sleep(4)
      jj = j.find_all('h3')
      dic = {}
      for i in range(len(jj)):
        dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
      # DataFrame.append() was removed in pandas 2.0; build a one-row frame instead.
      frames.append(pd.DataFrame([dic]))
    result = pd.concat(frames)
    return result

  # apply Other function
  df = other(links)
  df.fillna('Not_Found',inplace= True)
  df.reset_index(inplace=True, drop=True)
 
  # combine all together
  result = pd.concat([df_ml,E, df ], axis=1)

  return result
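
# Example usage (hypothetical values), handy for a quick check outside Streamlit:
#   df = LINKEDIN_Scrapping("AI Engineer", 10)
#   print(df[['Title', 'Company_Name', 'post_time']].head())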


##################### map_bubble #####################

# show a bubble map of job locations
def map_bubble(df):
  import urllib.parse

  # Keep only the city (the text before the first comma); split(",") always
  # returns at least one element, so no fallback branch is needed.
  g = []
  for i in range(len(df.Location)):
    g.append(df.Location.loc[i].split(",")[0])
  df['new_loc'] = g
  if 'country' in df.columns:
    df["full_location"] = df["new_loc"] + ", " + df["country"]
    dict_cities = dict(df.full_location.value_counts())
  else:
    dict_cities = dict(df.new_loc.value_counts())
  lat = []
  lon = []
  bubble_df = pd.DataFrame()
  add = []
  val = []
  # Geocode each distinct location with Nominatim (OpenStreetMap).
  # Its usage policy requires a User-Agent header and at most ~1 request/second;
  # the UA string below is just an identifier for this app.
  for address in dict_cities.keys():
    url = 'https://nominatim.openstreetmap.org/search?q=' + urllib.parse.quote(address) + '&format=json'
    try:
      response = requests.get(url, headers={'User-Agent': 'job-scraper-app'}).json()
      lat.append(response[0]["lat"])
      lon.append(response[0]["lon"])
      add.append(address)
      val.append(dict_cities[address])
    except (IndexError, KeyError, ValueError):
      # skip addresses Nominatim cannot resolve
      continue
    time.sleep(1)

  bubble_df['address'] = add
  bubble_df['lat'] = lat
  bubble_df['lon'] = lon
  bubble_df['value'] = val


  import folium

  # start from an empty world map
  m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
  # add one bubble per location, sized by the number of jobs there
  for i in range(len(bubble_df)):
    folium.Circle(
        location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
        popup=bubble_df.iloc[i][['address', 'value']].values,
        radius=float(bubble_df.iloc[i]['value']) * 500,
        color='#69b3a2',
        fill=True,
        fill_color='#69b3a2'
    ).add_to(m)
  return m
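
# Note: Streamlit does not render a returned folium.Map on its own; the call
# sites below embed it with components.html(map_bubble(df)._repr_html_(), ...).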


##########################





#########################
#### wuzzuf analysis
def wuzzuf_exp(df1):
  top10_job_title = df1['Title'].value_counts()[:10]
  fig1 = px.bar(y=top10_job_title.values, 
              x=top10_job_title.index, 
              color = top10_job_title.index,
              color_discrete_sequence=px.colors.sequential.deep,
              text=top10_job_title.values,
              title= 'Top 10 Job Titles',
              template= 'plotly_dark')
  fig1.update_layout(height=500,width=500, 
      xaxis_title="Job Titles",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  st.plotly_chart(fig1)

  type_grouped = df1['Career_Level'].value_counts()
  fig2 = px.bar(x = type_grouped.index, y = type_grouped.values, 
        color = type_grouped.index, 
        color_discrete_sequence=px.colors.sequential.dense,
        template = 'plotly_dark',
        text = type_grouped.values, title = 'Career Level Distribution')
  fig2.update_layout( height=500, width=500,
      xaxis_title="Career Level",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  fig2.update_traces(width=0.5)
  st.plotly_chart(fig2)
  residence = df1['Location'].value_counts()
  top10_employee_location = residence[:10]
  fig3 = px.bar(y=top10_employee_location.values, 
              x=top10_employee_location.index, 
              color = top10_employee_location.index,
              color_discrete_sequence=px.colors.sequential.deep,
              text=top10_employee_location.values,
              title= 'Top 10 Location of job',
              template= 'plotly_dark')
  fig3.update_layout(height=500,width=500,
      xaxis_title="Location of job",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  st.plotly_chart(fig3)
    
  type_grouped = df1['Experience_Needed'].value_counts()
  fig4 = px.bar(x = type_grouped.index, y = type_grouped.values, 
        color = type_grouped.index, 
        color_discrete_sequence=px.colors.sequential.dense,
        template = 'plotly_dark',
        text = type_grouped.values, title = 'Experience Level Distribution')
  fig4.update_layout(height=500,width=500,
      xaxis_title=" Experience Level (years)",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  fig4.update_traces(width=0.5)
  st.plotly_chart(fig4)
  return 



#########################
### linkedin analysis

def linkedin_exp(df1):
  top10_job_title = df1['Title'].value_counts()[:10]
  fig1 = px.bar(y=top10_job_title.values, 
              x=top10_job_title.index, 
              color = top10_job_title.index,
              color_discrete_sequence=px.colors.sequential.deep,
              text=top10_job_title.values,
              title= 'Top 10 Job Titles',
              template= 'plotly_dark')
  fig1.update_layout(height=500,width=500, 
      xaxis_title="Job Titles",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  st.plotly_chart(fig1)

  type_grouped = df1['Employment type'].value_counts()
  fig2 = px.bar(x = type_grouped.index, y = type_grouped.values, 
        color = type_grouped.index, 
        color_discrete_sequence=px.colors.sequential.dense,
        template = 'plotly_dark',
        text = type_grouped.values, title = 'Employment type Distribution')
  fig2.update_layout( height=500, width=500,
      xaxis_title="Employment type",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  fig2.update_traces(width=0.5)
  st.plotly_chart(fig2)
  residence = df1['Location'].value_counts()
  top10_employee_location = residence[:10]
  fig3 = px.bar(y=top10_employee_location.values, 
              x=top10_employee_location.index, 
              color = top10_employee_location.index,
              color_discrete_sequence=px.colors.sequential.deep,
              text=top10_employee_location.values,
              title= 'Top 10 Location of job',
              template= 'plotly_dark')
  fig3.update_layout(height=500,width=500,
      xaxis_title="Location of job",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  st.plotly_chart(fig3)
    
  type_grouped = df1['Seniority level'].value_counts()
  fig4 = px.bar(x = type_grouped.index, y = type_grouped.values, 
        color = type_grouped.index, 
        color_discrete_sequence=px.colors.sequential.dense,
        template = 'plotly_dark',
        text = type_grouped.values, title = 'Seniority level Distribution')
  fig4.update_layout(height=500,width=500,
      xaxis_title="Seniority level",
      yaxis_title="count",
      font = dict(size=17,family="Franklin Gothic"))
  fig4.update_traces(width=0.5)
  st.plotly_chart(fig4)
  return 


########################

####################### Streamlit app ################################

st.set_page_config(page_title="My Web Scraping Page", page_icon=":tada:", layout="wide")


# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communcation Engineer")
        st.write(
            "In this app we will scrap jobs from LinkedIn and Wuzzuf websites, let's get it started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass



webs =["Wuzzuf","Linkedin"]
jobs =["Machine Learning","AI Engineer","Data Analysis","Software Testing"]
nums = np.arange(1,1000)

site = st.sidebar.selectbox("select one website", webs)
job = st.sidebar.selectbox("select one job", jobs)
num_jobs = st.sidebar.selectbox("select how many jobs you want to scrape", nums)



import streamlit.components.v1 as components
import hydralit_components as hc

if st.sidebar.button('Start Scraping'):
  if site =="Wuzzuf":

    with st.container():
        st.write("---")
        tab1, tab2, tab3 = st.tabs(["Data", "Bubble Map", "Data Exploration"])
        with tab1:
          with hc.HyLoader('✨ Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
            n1 = Wuzzuf_scrapping(job, num_jobs)
            try:
              tab1.dataframe(n1)
            except Exception:
              try:
                # Fall back to strings if Arrow cannot serialize some columns.
                tab1.write(n1.astype(str).set_index(n1.index.astype(str)))
              except Exception:
                tab1.table(n1)
        with tab2:
          # Streamlit cannot render a folium.Map directly; embed its HTML.
          components.html(map_bubble(n1)._repr_html_(), height=500)
        with tab3:
          wuzzuf_exp(n1)


  if site =="Linkedin":
    with st.container():
        st.write("---")
        tab1, tab2, tab3 = st.tabs(["Data", "Bubble Map", "Data Exploration"])
        with tab1:
          with hc.HyLoader('✨ Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
            n1 = LINKEDIN_Scrapping(job, num_jobs)
            try:
              tab1.dataframe(n1)
            except Exception:
              try:
                # Fall back to strings if Arrow cannot serialize some columns.
                tab1.write(n1.astype(str).set_index(n1.index.astype(str)))
              except Exception:
                tab1.table(n1)
        with tab2:
          # Streamlit cannot render a folium.Map directly; embed its HTML.
          components.html(map_bubble(n1)._repr_html_(), height=500)
        with tab3:
          linkedin_exp(n1)