
    ݫGi>                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZmZmZ ddlmZmZm Z m!Z!m"Z" ddlm#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.  G d d      Z/y)z(
Core LinkChecker class for the package
    N)BeautifulSoup)randint   )AparserLinkExtractor)ForeBackStyle)urlparse)
get_config)
get_connectionget_website_settingsget_content_type_settingsget_global_deletion_settingsget_projectget_user_agentget_cookiesget_deleted_keywordsis_cloudflare_websiteget_project_title_score)	get_proxyget_contenttest_contentmerge_settings	parse_url)selenium_contentplaywright_contentprocess_instagramprocess_facebookprocess_tiktokprocess_redditcurl_cffi_content)LinkCheckerErrorConfigurationError)fuzzc                   t    e Zd ZdZddZd Zd Zd Zd Zd Z	dd	Z
dd
Zd Zd Z	 ddZd Zd Zd Zd Zy)LinkCheckerzN
    Main LinkChecker class that orchestrates link checking functionality
    Nc                 B    t        ||      | _        d| _        d| _        y)z
        Initialize LinkChecker
        
        Args:
            server_type (str): Type of server (autocheck, linkverification, etc.)
            config_file (str): Path to custom config file
        N)r   configconcur)selfserver_typeconfig_files      4/var/www/html/utilities/link_checker_package/core.py__init__zLinkChecker.__init__"   s      !k:    c                 &    | j                          | S )zContext manager entry)connectr+   s    r.   	__enter__zLinkChecker.__enter__.   s    r0   c                 $    | j                          y)zContext manager exitN)
disconnect)r+   exc_typeexc_valexc_tbs       r.   __exit__zLinkChecker.__exit__3   s    r0   c                 N    t        | j                  d         \  | _        | _        y)zConnect to databasedatabaseN)r   r(   r)   r*   r3   s    r.   r2   zLinkChecker.connect7   s    +DKK
,CD$(r0   c                     | j                   r| j                   j                          | j                  r| j                  j                          yy)zDisconnect from databaseN)r*   closer)   r3   s    r.   r6   zLinkChecker.disconnect;   s1    88HHNN88HHNN r0   c                    	 | j                   st        d       | j                          y| j                   j                  d       y# t        $ r}t        d|        	 t        | d      r/| j                  r#	 | j                  j                          n#  Y nxY w| j                          t        d       Y d}~y# t        $ r}t        d|        Y d}~Y d}~y	d}~ww xY wd}~ww xY w)
z
        Ensure database connection is active, reconnect if needed
        
        Returns:
            bool: True if connected, False if connection failed
        z3Database cursor not found, attempting to connect...TzSELECT 1z3Database connection lost, attempting to reconnect: connz!Database reconnected successfullyNz!Failed to reconnect to database: F)r*   printr2   execute	Exceptionhasattrr@   r>   )r+   ereconnect_errors      r.   _ensure_database_connectionz'LinkChecker._ensure_database_connectionB   s    	88KL HHZ( 	GsKL4(TYY		) 9: 9/9JKLu	sX   'A A 	C&C!B97BB9B B99	CCC!CC!!C&c                 2	   	 t        |      }| j                         st        d        | j                  ||fi |S | j                         st        d        | j                  ||fi |S t	        | j
                  |      }|d   }|d   }t        |      }i }	|rE| j                         st        d        | j                  ||fi |S t        | j
                  |      }	| j                         st        d        | j                  ||fi |S t        | j
                        }
|j                  d      }|j                  d      }|j                  d	      }|j                  d
      }|j                  d      }|j                  d      }|j                  d      }|j                  d      }t        |dv|dg i fv|dv|dg i fv|dvg      }t        d| d| d|        |r t        d       |d}|d}|d}|d}|d}n|t        d       |
j                  dd      rdj                  |
d         }|
j                  dd      rdj                  |
d         }|
j                  dd      r|
j                  d      du}|d}|d}|d}t        d| d| d| d |        g }g }|rFt        |      j                  d      D cg c]#  }|j                         s|j                         % }}|rFt        |      j                  d      D cg c]#  }|j                         s|j                         % }}	 t        | j                         }	 | j                         st        d#       d$}nt%        | j
                        }|sd$}t        d&| d'| d(| d)| d*| d|        | j'                  |||||||      \  }}t        d+| d,|        | j)                  ||||||||||
|||      }t        d-| d|        |||d.   |||||||||||d/||d0	S c c}w c c}w # t"        $ r}t        d!|        d"}Y d}~d}~ww xY w# t"        $ r}t        d%|        d$}Y d}~d}~ww xY w# t"        $ r'}ddl}|j-                          t/        d1|       d}~ww xY w)2a$  
        Main method to check a link
        
        Args:
            url (str): URL to check
            project_id (int): Project ID (optional)
            **kwargs: Additional parameters
        
        Returns:
            dict: Check result with status, content, and analysis
        z/Database connection failed, using fallback modezKDatabase connection lost during website settings fetch, using fallback modewebsite_settingshosting_emailzPDatabase connection lost during content type settings fetch, using fallback modezJDatabase connection lost during global settings fetch, using fallback modecheck_http_code
http_codescheck_stop_words	stopwordshomepage_redirectcheck_stop_words_on_pageuse_seleniumuse_playwright)Nr   NzWebsite settings for : z, has_settings=<Website has settings configured, using ONLY website settingsF0No website settings found, using global settings	http_code ,stopwordredirectr   Settings: selenium=, playwright=, http_codes=, homepage_redirect=Failed to get proxy: 127.0.0.1:8080zKDatabase connection lost during user agent fetch, using fallback user agentsMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36zFailed to get user agent: Fetching content for  using user agent: z	 : proxy=z, cloudflare=z : selenium=Fetched content for  with status: Analysis result for statusrK   rL   rM   
stop_wordsrO   rP   rQ   rR   	urldisplay_linkrg   contenttest_resultanalysissettingsproxy	useragentzLink check failed: )r   rG   rA   _check_link_without_databaser   r*   r   r   r   getanyjoinstrsplitstripr   r(   rC   r   _fetch_content_analyze_content	traceback	print_excr"   )r+   rk   
project_idkwargsrl   website_datarI   rJ   
cloudflarecontent_type_settingsglobal_settingsrK   rL   rM   ri   rO   rP   rQ   rR   website_has_settingscodes_to_checkstop_words_to_checkcwrq   rE   rr   rm   rn   analysis_resultr|   s                                  r.   
check_linkzLinkChecker.check_linkg   s   [	>$S>L 335GH8t88jSFSS 335cd8t88jSFSS/,GL+,>?(9M.}=J %'!779lm<4<<S*WPVWW(A$((J(W% 335bc8t88jSFSS:488DO /223DEO)--l;J/334FG)--k:J 0 4 45H I'7';';<V'W$+//?L-112BCN $'y04R.0 	14R.0!2( $  ),r:J9K?[oZpqr $TV"*&+O%!%J#+',$%!%J$,(-%HJ"&&{26!$/+*F!GJ"&&z26!$/**E!FJ"&&z!4(7(;(;J(Gt(S% (/+0(#$%!&'~]>BRR_`j_kk  AR  @S  T  U  N"$58_5J5J35O!]STSZSZS\!'')!]!]:=j/:O:OPS:T&bQXYX_X_Xaqwwy&b#&b)!$++.

R779gh !VI .txx 8I$ %Z	 ),7J9+U^_d^eers}r~  K  LX  KY  Yf  gu  fv  w  x#'#6#6\9eZn$ G[
 (n[MRS #33o~ "57H(,UXZnO
 (b8IJK  ,)(3"*+'6",(8",):0H$0&4	 &% Q "^&b
  )-aS12()  R21#67 R	RT  	>!"%8#<==	>s   9Q& .Q& +AQ& 	AQ& F4Q& PP* Q& 
P P2Q& 5P 7Q  BQ& 
Q& 	P=#P83Q& 8P==Q&  	Q#	QQ& Q##Q& &	R/"RRc                    	 t        d|        t        |      }i }i }g dg g g d}|j                  d      }|j                  d      }	|j                  d      }
|j                  d      }|j                  d      }|j                  d	      }|j                  d
      }|j                  d      }t        |du|	du|
du|du|dug      }|r t        d       |d}|	d}	|
d}
|d}|d}nt        d       ||j                  d      du}|	%|j                  d      rdj	                  |d         }	|
|j                  d      du}
|%|j                  d      rdj	                  |d         }||j                  d      du}|d}|d}|d}t        d| d| d|	 d|        g }g }|	rFt        |	      j                  d      D cg c]#  }|j                         s|j                         % }}|rFt        |      j                  d      D cg c]#  }|j                         s|j                         % }}	 t        | j                        }d}t        d| d|        | j                  ||||      \  }}t        d| d|        | j                  |||||
|||||d|d      }t        d| d |        |||d!   |||||	|
|||||d"||d#	S c c}w c c}w # t        $ r}t        d|        d}Y d}~d}~ww xY w# t        $ rG}t        d$|        d%dl}|j                          |d&d'd(d)d(ggd&d*| dd(d)d(gd+i dd,d#	cY d}~S d}~ww xY w)-a&  
        Check a link without database connection (fallback method)
        
        Args:
            url (str): URL to check
            project_id (int): Project ID (optional)
            **kwargs: Additional parameters
        
        Returns:
            dict: Basic check result
        z Checking link without database: )404410451)rV   rY   rZ   	skip_coderK   rL   rM   rN   rO   rP   rQ   rR   NrT   FrU   rV   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rS   rg   rh   rj   z'Error in _check_link_without_database: r   errorrW   500nullzDatabase connection failed: rg   reason
is_deletedrV   redirect_url	all_codesz<Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)rA   r   rt   ru   rv   rw   rx   ry   r   r(   rC   _fetch_content_simpler{   r|   r}   )r+   rk   r~   r   rl   rI   r   r   rK   rL   rM   ri   rO   rP   rQ   rR   r   r   r   r   r   rq   rE   rr   rm   rn   r   r|   s                               r.   rs   z(LinkChecker._check_link_without_database  sz   T	4SE:; %S>L  "$&!2	O /223DEO)--l;J/334FG)--k:J 0 4 45H I'7';';<V'W$+//?L-112BCN $'t+$& ,$&!-( $  $TV"*&+O%!%J#+',$%!%J$,(-%HJ"*&5&9&9+&Fd&RO%/*=*=k*J!$/+*F!GJ#+'6':'::'Fd'R$%/*=*=j*I!$/**E!FJ$,(7(;(;J(Gt(S% (/+0(#$%!&'~]>BRR_`j_kk  AR  @S  T  U  N"$58_5J5J35O!]STSZSZS\!'')!]!]:=j/:O:OPS:T&bQXYX_X_Xaqwwy&b#&b)!$++.
 NI),7J9+VW $(#=#=c<QZ\a#b G[(n[MRS #33o~ "57H(,PSUZO
 (b8IJK  ,)(3"*+'6",(8",):0H$0&4	 &% 9 "^&b
  )-aS12()T  	;A3?@!  ,! %vw7% <QC@"'!&$*"' )[! 	ss   GL 	KK1 L K'K9L <K B L 
L 	K?%K:5L :K??L 	M<MMMc           	         	 t        d       d|z   }t        t        t        j                         dz              dz   }| j                  d   d    d| d| d| d| 	}t        j                         }t        ||d	
      }	t        j                         |z
  }
|	dk(  rPt        d       	 t        |||      }t        d|
dd       |d   r|d   d|d   dggfS |d   |d   |d   |d   ggfS t        |	      }t        d|
dd       |d   dk(  rt        d       	 t        |||      }|d   dv r2t        d       |d   r|d   d|d   dggfS |d   |d   |d   |d   ggfS t        d        t        |||d!| j                        }|d   |d   dd"ggfS |d   d'v rot        d(|d    d)       	 t        |||      }|d   dv r2t        d       |d   r|d   d|d   dggfS |d   |d   |d   |d   ggfS t        d*|d    d+       |	|fS |	|fS # t        $ r }t        d|        ddddggfcY d}~S d}~ww xY w# t        $ ro}t        d#| d$       	 t        |||d!| j                        }|d   |d   dd"ggfcY d}~S # t        $ r"}t        d%| d&       |	|fcY d}~cY d}~S d}~ww xY wd}~ww xY w# t        $ r}t        d,| d+       |	|fcY d}~S d}~ww xY w# t        $ r}t        d-|        	 t        d.       t        |||      }|d   r|d   d|d   dggfcY d}~S |d   |d   |d   |d   ggfcY d}~S # t        $ r%}t        d/|        ddddggfcY d}~cY d}~S d}~ww xY wd}~ww xY w)0aH  
        Simple content fetching without database dependencies
        
        Args:
            url (str): URL to fetch
            display_link (str): Display link
            useragent (str): User agent string
            proxy (str): Proxy string
        
        Returns:
            tuple: (content, test_result)
        zAUsing simple scraper with curl_cffi fallback for content fetchinghttp://  .txtpathsscraper_path ""    timeoutTIMEOUT4Scraper timed out after 30 seconds, trying curl_cfficurl_cffi completed in .2f seconds after scraper timeout   r   300   r   -curl_cffi also failed after scraper timeout: rW   r   r   NScraper completed in  seconds206z1Scraper returned code 206, trying curl_cffi first200r   (curl_cffi succeeded where scraper failedz&curl_cffi also failed, trying SeleniumFr   curl_cffi failed: z, trying SeleniumzSelenium retry also failed: #, returning original scraper result)r   403429503Scraper returned , trying curl_cffizcurl_cffi also returned , using original scraper resultcurl_cffi fallback failed: z"Error in simple content fetching: zTrying curl_cffi as last resortz#curl_cffi last resort also failed: )
rA   rw   inttimer(   r   r!   rC   r   r   )r+   rk   rl   rr   rq   proxy_to_uselink_idcommand
start_timerm   elapsed_timecurl_resultrE   rn   content_partse2s                   r.   r   z!LinkChecker._fetch_content_simple  s   W	4UV$u,L#diikD012V;GW-n=>bR~UWXaWbbdeldmnGJ!'7B?G99;3L )#LN
8"3CE"JK3L3EEcde"1~*1~{1~w/OOO*1~AAQ\]^Q_P`/aaa
 'w/K),s);8DE 1~&IK4"3CE"JK"1~7 HJ&q>#.q>E;q>E73S#SS#.q>KNKPQNU`abUcTd3e#ee FH(8CPUW[WbWb(c,Q/-2BFUG1TTT 1~!==)+a.)99KLM0"3CE"JK"1~7 HJ&q>#.q>E;q>E73S#SS#.q>KNKPQNU`abUcTd3e#ee 8Q8HHghi&33
 K''g ! 8I!MNvw7778. ! 4.qc1BCD4(8CPUW[WbWb(c,Q/-2BFUG1TTT$ 4 <RD@cde&333440 ! 07s:YZ["K//0
  	46qc:;
478/YFq>&q>E;q>E7+KKK&q>KNKN[YZ^L\+]]] 4;B4@AE6E733334	4s>  B%K (/H H ,.K 1H9 H9 !1H9 K -1J4 J4 3J4 	K 	H6H1+H6,K 1H66K 9	J1J,&I>8J19K >	J)J$J)J,J1K $J))J,,J11K 4	K=KKK KK 	M4&M/5*L>M4%L>8M4>	M,M'M,M/!M4'M,,M//M4c           
      8   t        d| d| d|        d|v rmt        d       t        | j                  d      }|r7t        |d   d         d	k(  r#t	        |||d   | j
                        }	|	d
dd
ggfS t        d       dd
dd
ggfS d|v rxt        d       t        | j                  d      }
|
rBt        |
d   d         d	k(  r.t        || j                  ||
d   | j
                        }	|	d
dd
ggfS t        d       dd
dd
ggfS d|v rmt        d       t        | j                  d      }|r7t        |d   d         d	k(  r#t        |||d   | j
                        }	|	d
dd
ggfS t        d       dd
dd
ggfS d|v rmt        d       t        | j                  d      }|r7t        |d   d         d	k(  r#t        |||d   | j
                        }	|	d
dd
ggfS t        d       dd
dd
ggfS d|v sd|v rKt        d       t        ||| j                  | j                  ||| j
                        }|d	   |d   |d   d
ggfS d|v r/t        d       t        ||||| j
                        }|d	   d
dd
ggfS |rKt        d       t        ||| j                  | j                  ||| j
                        }|d	   |d   |d   d
ggfS |r2t        d        t        ||||| j
                        }|d	   |d   dd
ggfS t        d!       d"|z   }t        t        t        j                         d#z              d$z   }| j
                  d%   d&    d'| d(| d'| d(| 	}t        j                         }t        ||d)*      }	t        j                         |z
  }|	d+k(  rPt        d,       	 t        |||      }t        d-|d.d/       |d0   r|d	   d1|d   d1ggfS |d	   |d   |d   |d   ggfS t#        |	      }t        d6|d.d7       |d   d8k(  r3t        d9       	 t        ||||| j
                        }|d	   |d   dd
ggfS |d   d<v rt        d=|d    d>       	 t        |||      }|d   d<v rht%        |d	         t%        |	      kD  rNt        d?t%        |d	          d@t%        |	       dA       |d0   r|d	   d1|d   d1ggfS |d	   |d   |d   |d   ggfS t        dB       |	|fS dE|d   v sd4|d   v sd1|d   v s|d   d4k(  rt        dF|d    d;       	 t        |||      }|d   d<v r2t        dG       |d0   r|d	   d1|d   d1ggfS |d	   |d   |d   |d   ggfS t        dH       t        ||| j                  | j                  ||| j
                        }|d	   |d   |d   d
ggfS |d   dMv rm	 t        d=|d    dN       t        |||      }|d   d<v ret%        |d	         dOkD  rTt        t&        j(                  dPz   t*        j,                  z          |d0   r|d	   d1|d   d1ggfS |d	   |d   |d   |d   ggfS |rt        t&        j.                  dQz   t*        j,                  z          t1        |      }|rCt%        |      dOkD  r5t        t&        j(                  dRz   t*        j,                  z          |d
dd
ggfS t        t&        j2                  dSz   t*        j,                  z          |	|fS t        t&        j2                  dTz   t*        j,                  z          |	|fS |	|fS # t         $ r }t        d2|        d3d4dd4ggfcY d5}~S d5}~ww xY w# t         $ rR}t        d:| d;       t        |||      }|d0   r|d	   d1|d   d1ggfcY d5}~S |d	   |d   |d   |d   ggfcY d5}~S d5}~ww xY w# t         $ r}t        dC| dD       |	|fcY d5}~S d5}~ww xY w# t         $ r}t        dI| dJ       	 t        ||| j                  | j                  ||| j
                        }|d	   |d   |d   d
ggfcY d5}~S # t         $ r"}t        dK| dL       |	|fcY d5}~cY d5}~S d5}~ww xY wd5}~ww xY w# t         $ r?}t        t&        j2                  dC| dLz   t*        j,                  z          |	|fcY d5}~S d5}~ww xY w)Ua  
        Fetch content from URL using appropriate method
        
        Args:
            url (str): URL to fetch
            display_link (str): Display link
            useragent (str): User agent string
            proxy (str): Proxy string
            cloudflare (bool): Whether to handle Cloudflare
            use_selenium (bool): Whether to use Selenium
            use_playwright (bool): Whether to use Playwright
        
        Returns:
            tuple: (content, test_result)
        rb   z with settings: selenium=r\   	instagramzUsing Instagram handler
insta_userr   rg   r   r   r   zIInstagram user not available or disabled, falling through to next handlerfacebookzUsing Facebook handlerfacebook_userzHFacebook user not available or disabled, falling through to next handlertiktokzUsing TikTok handlertiktok_userzFTikTok user not available or disabled, falling through to next handlerredditzUsing Reddit handlerreddit_userzFReddit user not available or disabled, falling through to next handlerzkinogoby.zonefilmsz(Using Playwright for kinogoby.zone/filmsr   zturbobit.netzUsing Selenium for turbobit.netzUsing Playwright (configured)zUsing Selenium (configured)z-Using default scraper with curl_cffi fallbackr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   r   Nr   r   r   z1Scraper returned code 206, retrying with SeleniumzSelenium retry failed: r   r   r   z&, also trying curl_cffi for comparisonz#curl_cffi returned better content (z vs z chars)zUsing original scraper resultr   r   302z/Detected redirect/500 error in scraper result: r   z(curl_cffi also failed, trying Playwrightr   z, trying PlaywrightzPlaywright also failed: r   )r   r   r   z, trying curl_cffi first2   z*curl_cffi successfully bypassed protectionz7curl_cffi failed for Cloudflare website, trying Aparserz<Aparser successfully extracted content from scraper fallbackz>Aparser failed for scraper fallback, returning original resultzFcurl_cffi failed for non-Cloudflare website, returning original result)rA   r   r*   r   r   r(   r   r   r    r   r)   r   rw   r   r   r!   rC   r   lenr   GREENr	   	RESET_ALLYELLOWr   RED)r+   rk   rl   rr   rq   r   rQ   rR   r   rm   fb_userr   r   r   r   r   r   r   r   r   rE   rn   r   aparser_contents                           r.   rz   zLinkChecker._fetch_content  s
     	%l^3L\NZghvgwxy ,&+,$TXX|<Jc*Q-"9:a?+IsJqM4;;W 888abbejlruzt{d|||<'*+!$((O<G3wqz(349*9dhhWQZQUQ\Q\] 888`aadikqtyszc{{{%()%dhh>Ks;q>(#;<A(CQU 888^__bgiorwqxayyy_()%dhh>Ks;q>(#;<A(CQU 888^__bgiorwqxayyy #w#~<=.y#txxSXZdfjfqfqrM #mA&6a8H5'%RRRs"34,YUJPTP[P[\M #eVeW%=== 12.y#txxSXZdfjfqfqrM #mA&6a8H5'%RRR/0,YUJPTP[P[\M #mA&6%HHH AB$u,L#diikD012V;GW-n=>bR~UWXaWbbdeldmnGJ!'7B?G99;3L )#LN
8"3CE"JK3L3EEcde"1~*1~{1~w/OOO*1~AAQ\]^Q_P`/aaa
 'w/K),s);8DE 1~&IKb$4YUJX\XcXc$dM(+mA.>-PPP 1~/)+a.)99_`a0"3CE"JK #1~7CA<ORUV]R^<^ CCTUDWCXX\]`ah]i\jjqrs&q>#.q>E;q>E73S#SS#.q>KNKPQNU`abUcTd3e#ee =?&33 A&%;q>*AUkZ[nE\`klm`nrw`wGTUGWWijk4"3CE"JK"1~7 HJ&q>#.q>E;q>E73S#SS#.q>KNKPQNU`abUcTd3e#ee HJ(:9c488UYU]U]_dfprvr}r}(~,Q/-2BMRSDTW\V]1^^^ 1~!660-k!n-==UVW"3CE"JK"1~7CA<ORT<Tdjj+WWZ_ZiZiij&q>#.q>E;q>E73S#SS#.q>KNKPQNU`abUcTd3e#ee#dkk.efinixixxy*>s*C*s?/Cb/H!$**/m"mpupp"  A#2UFUG4L#LL!$((-m"mpupp"  A#*K#77dhh)qqty  uD  uD  D  E&33
 K''I ! 8I!MNvw7778 ! b3A36HIJ"3CE"JK"1~*1~{1~w/OOO*1~AAQ\]^Q_P`/aaab8 ! 07s:YZ["K//0* ! 4.qc1DEF4(:9c488UYU]U]_dfprvr}r}(~,Q/-2BMRSDTW\V]1^^^$ 4 8<_`a&33344D ! 0$(('B1#Eh%iilql{l{{|"K//0s  /\
 \
 &\6 A'^ -^ ^  1^= 2^= A
^= A6a a %A>a $0a 0a 
	\3\.(\3.\36	^?.^-^3^^^	^:^5/^:5^:=	aa	?`a	a$a6a7a	;aaa		a	b4bbbc                 R	   |d   }|d   }|d   }ddd|||d}|r"|r |dk(  rd	}||v rHd
|d<   d| d|d<   d|d<   |S |s1|
r/|
j                  d      r|
d   }||v rd
|d<   d| d|d<   d|d<   |S |
r]|
j                  d      rL|
d   }|D ]B  }t        |t              s|d   }|d   dk(  }||k(  s'|s*|s-d|d<   d| d|d<   d|d<   |c S  |rd}|dk(  rd}n|r|r|dk7  r	 t        |      }t        t	        |            }t        |j                        dkD  }|j                  dk(  xs t        |j                        dk  }|r9|r7d}|j                  |j                  k7  rdnd}t        d| d| d| d | d!	       |rd
|d<   d$|d<   d|d<   |S |d%v rd&|d<   |d'k(  rd(nd)|d<   |S t        |      d*k  rd+|d<   d,|d<   |S 	 t        |d-      }|r|j                  }n|}|r{|ry|D ]s  } t        j                  | j                               }!t        j                  |!|t        j                   t        j"                  z  /      s^d
|d<   d0|  d1|d<   d|d<   |c S  n|s|
r|
j                  d2      r}|
d2   }"|"D ]s  } t        j                  | j                               }!t        j                  |!|t        j                   t        j"                  z  /      s^d
|d<   d0|  d3|d<   d|d<   |c S  	 | j%                         st        d4       g }#nt'        | j(                        }#|#D ]r  }$t        j                  |$j                               }!t        j                  |!|t        j                   t        j"                  z  /      s^d
|d<   d6|$ |d<   d|d<   |c S  	 | j%                         st        d7       nBd8}%| j(                  j+                  |%|	g       | j(                  j,                  r|dxx   d9z  cc<   d;|v sd<|v rd=|d<   d>|d<   |S |dxx   d?z  cc<   |S # t        $ r}t        d"|        Y d#}~d#}~ww xY w# t        $ r d+|d<   d.|d<   |cY S w xY w# t        $ r}t        d5|        g }#Y d#}~Xd#}~ww xY w# t        $ r}t        d:|        Y d#}~d#}~ww xY w)@a  
        Analyze fetched content
        
        Args:
            content (str): Fetched content
            test_result (list): Test result from content fetching
            check_http_code (bool): Whether to check HTTP codes
            codes_to_check (list): HTTP codes to check
            check_stop_words (bool): Whether to check stop words
            stop_words_to_check (list): Stop words to check
            homepage_redirect (bool): Whether to check homepage redirect
            check_stop_words_on_page (bool): Whether to check stop words on page
            display_link (str): Display link
            global_settings (dict): Global settings
            is_cloudflare (bool): Whether website is cloudflare
        
        Returns:
            dict: Analysis result
        r   r   r   activerW   Fr   401r   deletedrg   z
HTTP code z found (website configured)r   Tr   rV   z found (global)r   coder   skippedz
Skip code r   r   /zcross-domainzsame-domainzHomepage redirect detected (z): z -> z (code: )z'Error parsing URLs for redirect check: Nz*Redirect to homepage (enabled in settings))205r   maintenancer   zSite maintenancezOnion redirect
   r   zVery little contentzhtml.parserzBS4 parsing error)flagszStop word found: z (website configured)rY   z	 (global)z@Database connection lost during deleted keywords check, skippingz Failed to get deleted keywords: zDeleted keyword found: z<Database connection lost during upload sites check, skippingz0SELECT * FROM upload_sites WHERE hostname = (%s)zUpload hosts link
zFailed to check upload sites: zToo many requests from your IPzToo many requestsrate_limitedzToo many requests from IPzNo stop words found
)rt   
isinstancedictr
   rw   r   pathnetlocrA   rC   r   textreescapery   search
IGNORECASEDOTALLrG   r   r*   rB   rowcount)&r+   rm   rn   rK   r   rM   r   rO   rP   rl   r   is_cloudflareoriginal_urlr   r   res_urlcodesro   global_http_codes
skip_codes	skip_itemr   cloudflare_onlyis_homepage_redirectoriginal_parsedfinal_parsedoriginal_has_pathfinal_is_homepagedomain_inforE   soupcontent_for_stop_words	stop_wordescaped_patternglobal_stop_wordsdeleted_keywordskeywordsqls&                                         r.   r{   zLinkChecker._analyze_content  s   , 1~a.A #
 ~u}~%%."'1$7R%S")-&%/o>Q>QR]>^ / <((%."'1$%G")-& 22;?(5J' ,	i. )& 1I&/&=&BO y(*}1:HX.3=dV?1SHX.5:H\2#+O,F #( u}'+$'g.?I&.|&<O#+CL#9L ),O,@,@(AA(E%)5):):c)A)_SIZIZE[^_E_% )->/3,8G8N8NR^ReRe8enkx <[M\NZ^_f^ggoptouuvwx $%."%Q")-& >!!.HX7;u}!3JZHXO
 w<"!(HX!6HXO	 -8D $%)YY"%,"  30 $	"$))IOO,=">99_.DBMM\^\e\eLef)2HX&+<YKG\)]HX&-1H\*#O$ &/o>Q>QR\>] /
 ;. $	"$))IOO,=">99_.DBMM\^\e\eLef)2HX&+<YKy)QHX&-1H\*#O$	"335XY#% #7#A 
 ( 	 G ii8Oyy*@XZXaXaHab%."'>wi%H")-&	 		8335TUH  |n588$$X&*??&
 ,w6:MQX:X!/HX!<HXO55M ! ICA3GHHI2  	!(HX!4HXO	X  	"4QC89!	".  	821#677	8s\   %BP Q 23Q AR 	Q 'P;;Q QQ	R'Q==R	R&R!!R&c                     	 | j                         st        d       yt        | j                  |      S # t        $ r}t        d|        Y d}~yd}~ww xY w)z
        Get project information
        
        Args:
            project_id (int): Project ID
        
        Returns:
            dict: Project information
        z2Database connection lost during project info fetchNzFailed to get project info: )rG   rA   r   r*   rC   )r+   r~   rE   s      r.   get_project_infozLinkChecker.get_project_info  sS    	335JKtxx44 	045	s   4 4 	AAAc                    d}d}d}d|j                         v rd}d}| j                  j                  ||g       | j                  j                         }|r|d   	 t	        |d         S |dk(  r|S |S # t
        $ r Y w xY w)Nr   project_title_scoreauthorproject_author_scorezASELECT value FROM tse_configs WHERE component='FWS' AND config=%svalue)lowerr*   rB   fetchonefloatrC   )r+   search_phrasedefault_titledefault_authorr(   r  rows          r.   get_score_thresholdzLinkChecker.get_score_threshold  s    &}**,,+FQvh'hh!3w<+S\** "(+A!A~T}T  s   A3 3	A?>A?c                 
   d}d}t        t        j                  dz   |z   dz   |z          t        t        j                         	 |dd}t        j                  d|      }t        d|j                         |j                  d	k7  r6t        t        j                  d
z          t        t        j                         y|j                         }	t        |	       d|	v rT|	d   }t        t        j                         t        dt        |      z   dz   |z          t        t        j                         | j                  |      }
t        |      t        |
      kD  ryy # t        $ r!}t        dt        |      z          Y d }~yd }~ww xY w)Nr   z	Checking z ..z$aecaa0fe-4673-436d-a798-1601cf593f64)indexapi_keyzhttp://127.0.0.1:5000/search)jsonzSearch:    zBad status code TscorezScore:  zError: )rA   r   r   r	   r   requestspoststatus_coder   r  r   rw   r  r  BaseException)r+   linkr  search_termsearch_index	max_score
total_hits	json_datardata	thresholdrE   s               r.   r   zLinkChecker.search  s;   	
djj;&6>EFeoo	%AI
 <9MA*amm,}}#dhh!334eoo&668D$K$ M	dkk"i#i.036=>eoo&00?IY%	"22 3 	)c!f$%	s    A6E ;BE 	F!E==Fc                 d   	 | j                         st        d       yt        | j                  |      }|d   }t	        |d         }t        | j                        }	 t        j                  ||      }t        |      |kD  }	|	r?t        t        j                  dz   |z   d	z   |z          t        t        j                         y|r}t        t	        |            d
kD  rf	 t        j                  ||      }t        |      |kD  }|r?t        t        j                  dz   |z   d	z   |z          t        t        j                         yy# t        $ r}
t        d|
        d}	Y d}
~
d}
~
ww xY w# t        $ r}
t        d|
        d}Y d}
~
d}
~
ww xY w# t        $ r}
t        d|
        Y d}
~
yd}
~
ww xY w)a  
        Check if project title is found on page
        
        Args:
            url (str): URL to check
            project_id (int): Project ID
            content (str): Page content
        
        Returns:
            bool: True if title found, False otherwise
        z+Database connection lost during title checkFproject_titleproject_title_engzError calculating similarity: TNz******* Found title z on    z******* Found translate zFailed to check title on page: )rG   rA   r   r*   rw   r   r$   partial_ratior   rC   r   r   r	   r   r   )r+   rk   r~   rm   projecttitle	translater
  r  resultsrE   ress               r.   check_title_on_pagezLinkChecker.check_title_on_page  s   +	335CD!$((J7GO,EG$789I"9$(("C**5':e*'::
 dkk$::UBVKcQReoo&SY014 ..y'BEe*'::C $++(BBYNQWWZ]]^%//*-  6qc:; ! :1#>?C  	3A378	sx   F >F $E A F F $E(  A F 	E%E F  E%%F (	F1FF FF 	F/F**F/)NN)N)NF)__name__
__module____qualname____doc__r/   r4   r:   r2   r6   rG   r   rs   r   rz   r{   r  r  r   r4   r0   r.   r&   r&      sg    

E#Jg>R`Dd4LM(b INXt&U @7r0   r&   )0r8  r   r   bs4r   randomr   content_fetchersr   coloramar   r   r	   r  urllib.parser
   r(   r   r<   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   r   r   r   r   r    r!   
exceptionsr"   r#   	rapidfuzzr$   r&   r9  r0   r.   <module>rB     sf     	   2 & &  !   
    = ^ ^r0   