Skip to content

Commit

Permalink
Revert "Better handling of redirects (#1493)"
Browse files (browse the repository at this point in the history)
This reverts commit 5fee29d.
  • Loading branch information
hydrusnetwork committed Dec 18, 2023
1 parent 5fee29d commit a825fce
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 48 deletions.
29 changes: 13 additions & 16 deletions hydrus/client/importing/ClientImportFileSeeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from hydrus.client.media import ClientMediaManagers
from hydrus.client.metadata import ClientTags
from hydrus.client.networking import ClientNetworkingFunctions
from hydrus.client.networking import ClientNetworkingJobs

FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1
Expand Down Expand Up @@ -642,7 +641,7 @@ def DownloadAndImportRawFile( self, file_url: str, file_import_options, loud_or_

url_to_fetch = HG.client_controller.network_engine.domain_manager.GetURLToFetch( file_url )

network_job: ClientNetworkingJobs.NetworkJob = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )
network_job = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )

for ( key, value ) in self._request_headers.items():

Expand All @@ -668,34 +667,32 @@ def DownloadAndImportRawFile( self, file_url: str, file_import_options, loud_or_
self._AddPrimaryURLs( ( url_to_fetch, ) )


#actual_fetched_url = network_job.GetActualFetchedURL()
actual_fetched_url = network_job.GetActualFetchedURL()

redirected_url = network_job.GetRedirectedUrl()

if redirected_url is not None:
if actual_fetched_url not in ( file_url, url_to_fetch ):

self._AddPrimaryURLs( ( redirected_url, ) )
self._AddPrimaryURLs( ( actual_fetched_url, ) )

( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )

if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:

# we just had a 3XX redirect to a Post URL!

if file_seed_cache is None:

raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, redirected_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, actual_fetched_url ) )

else:

( original_url_type, original_match_name, original_can_parse, original_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )

if original_url_type == actual_url_type and original_match_name == actual_match_name:

raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, redirected_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, actual_fetched_url ) )


file_seed = FileSeed( FILE_SEED_TYPE_URL, redirected_url )
file_seed = FileSeed( FILE_SEED_TYPE_URL, actual_fetched_url )

file_seed.SetReferralURL( file_url )

Expand Down Expand Up @@ -1383,19 +1380,19 @@ def WorkOnURL( self, file_seed_cache: "FileSeedCache", status_hook, network_job_

parsing_text = network_job.GetContentText()

redirected_url = network_job.GetRedirectedUrl()
actual_fetched_url = network_job.GetActualFetchedURL()

if redirected_url is not None:
if actual_fetched_url != url_to_check:

# we have redirected, a 3XX response

( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )

if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:

self._AddPrimaryURLs( ( redirected_url, ) )
self._AddPrimaryURLs( ( actual_fetched_url, ) )

post_url = redirected_url
post_url = actual_fetched_url

url_for_child_referral = post_url

Expand Down
10 changes: 5 additions & 5 deletions hydrus/client/importing/ClientImportGallerySeeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,19 +447,19 @@ def WorkOnURL( self, gallery_token_name, gallery_seed_log: "GallerySeedLog", fil

parsing_text = network_job.GetContentText()

redirected_url = network_job.GetRedirectedUrl()
actual_fetched_url = network_job.GetActualFetchedURL()

do_parse = True

if redirected_url is not None:
if actual_fetched_url != url_to_check:

( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )

if url_type == HC.URL_TYPE_GALLERY:

if can_parse:

gallery_url = redirected_url
gallery_url = actual_fetched_url

url_for_child_referral = gallery_url

Expand All @@ -480,7 +480,7 @@ def WorkOnURL( self, gallery_token_name, gallery_seed_log: "GallerySeedLog", fil

from hydrus.client.importing import ClientImportFileSeeds

file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, redirected_url )
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )

file_seed.SetReferralURL( url_for_child_referral )

Expand Down
41 changes: 14 additions & 27 deletions hydrus/client/networking/ClientNetworkingJobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,6 @@ def __init__( self, method: str, url: str, body = None, referral_url = None, tem
self._actual_fetched_url = self._url
self._temp_path = temp_path

self._redirected_url = None

self._response_server_header = None
self._response_last_modified = None

Expand Down Expand Up @@ -787,11 +785,11 @@ def _SendRequestAndGetResponse( self ) -> requests.Response:
snc = self._session_network_context


session: requests.Session = self.engine.session_manager.GetSession( snc )
session = self.engine.session_manager.GetSession( snc )

( connect_timeout, read_timeout ) = self._GetTimeouts()

response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ), allow_redirects = False )
response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ) )

with self._lock:

Expand Down Expand Up @@ -1223,14 +1221,6 @@ def GetActualFetchedURL( self ):



def GetRedirectedUrl( self ):

with self._lock:

return self._redirected_url



def GetContentBytes( self ):

with self._lock:
Expand Down Expand Up @@ -1322,7 +1312,7 @@ def GetSecondLevelDomain( self ):



def GetSession( self ) -> requests.Session:
def GetSession( self ):

with self._lock:

Expand Down Expand Up @@ -1564,21 +1554,20 @@ def Start( self ):

response = self._SendRequestAndGetResponse()

if response.is_redirect:

session = self.GetSession()

self._redirected_url = session.get_redirect_target( response )
# I think tbh I would rather tell requests not to do 3XX, which is possible with allow_redirects = False on request, and then just raise various 3XX exceptions with url info, so I can requeue easier and keep a record
# figuring out correct new url seems a laugh, requests has slight helpers, but lots of exceptions
# SessionRedirectMixin here https://requests.readthedocs.io/en/latest/_modules/requests/sessions/
# but this will do as a patch for now
self._actual_fetched_url = response.url

if self._actual_fetched_url != self._url and HG.network_report_mode:

if HG.network_report_mode:

HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._redirected_url ) )

HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._actual_fetched_url ) )


elif response.ok:
self._ParseFirstResponseHeaders( response )
self._ParseFirstResponseHeaders( response )

if response.ok:

with self._lock:

Expand Down Expand Up @@ -1643,8 +1632,6 @@ def Start( self ):

else:

self._ParseFirstResponseHeaders( response )

with self._lock:

self._status_text = str( response.status_code ) + ' - ' + str( response.reason )
Expand Down

0 comments on commit a825fce

Please sign in to comment.