'''
Python3 multi-threaded download tool, modelled on the Axel multi-threaded
download utility.
'''
import sys
import os
import time
import getopt
import urllib.request
import urllib.parse
from threading import Thread
# =============================================================================
# def download(url, output=os.getcwd(), blocks=6, proxies=local_proxies)
#   output : the output directory; defaults to the current working directory
#   blocks : the number of download threads
#   proxies: a proxy address, e.g. 'user/passwd@http://127.0.0.1:8087'
# =============================================================================
local_proxies = {} # default proxy setting; falsy means "no proxy" -- NOTE(review): GetUrlOpener splits a *string* when truthy, confirm callers pass a string
class Maple(Thread):
    """Worker thread that downloads one byte range of a URL into a temp file."""

    version = "Mozilla/5.0"

    def __init__(self, threadname, url, filename, ranges=0, proxies=None):
        """
        threadname: name for this worker thread.
        url:        resource URL to download.
        filename:   temporary file this worker appends its bytes to.
        ranges:     (start, end) inclusive byte range for this worker, or a
                    falsy value when the total size is unknown (single-thread).
        proxies:    proxy spec forwarded to GetUrlOpener (falsy = no proxy).
                    Default changed from a mutable {} to None (both falsy).
        """
        Thread.__init__(self, name=threadname)
        self.name = threadname
        self.url = url
        self.proxies = proxies
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        # Resume support: bytes already present in the temp file count as done.
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            # Never downloaded before.
            self.downloaded = 0
        opener = GetUrlOpener(self.proxies)  # opener honouring the proxy settings
        if self.ranges:
            # Rebuild the start point: skip what a previous run already fetched.
            self.startpoint = self.ranges[0] + self.downloaded
            if self.startpoint >= self.ranges[1]:
                # This part is already complete.
                self.downloaded = self.ranges[1] - self.ranges[0]
                print('Part %s has been downloaded over.' % self.filename)
                return
            # Request only this worker's byte range, posing as a browser.
            opener.addheaders = [('Range', 'bytes={}-{}'.format(self.startpoint, self.ranges[1])),
                                 ('User-agent', 'Mozilla/5.0')]
            print('task %s will download from %d to %d'
                  % (self.name, self.startpoint + 1, self.ranges[1] + 1))
        else:
            # Size unknown: download everything from the resume point onward.
            self.startpoint = self.downloaded
            opener.addheaders = [('Range', 'bytes={}-'.format(self.startpoint)),
                                 ('User-agent', 'Mozilla/5.0')]
        self.fetchsize = 16384  # bytes per read
        self.urlhandle = opener.open(self.url)
        data = self.urlhandle.read(self.fetchsize)
        while data:
            # Append each chunk to the temp file and update the progress counter.
            with open(self.filename, 'ab+') as filehandle:
                filehandle.write(data)
            self.downloaded += len(data)
            data = self.urlhandle.read(self.fetchsize)
def Sec2Time(second):
    """Convert a number of seconds into a human-readable D/H/M/S string.

    second: duration in seconds (int or float).
    Returns e.g. '5.00S', '01M:5.00S', '01H:01M:5.00S', '001D:01H:01M:1.00S'.
    """
    day = second // (3600 * 24)
    second -= day * 3600 * 24
    hour = second // 3600
    second -= hour * 3600
    minute = second // 60
    second -= minute * 60
    # Only emit the units that are actually non-zero, largest first.
    if day == 0:
        if hour == 0:
            if minute == 0:
                return '{:0.2f}S'.format(second)
            else:
                return '{:02}M:{:0.2f}S'.format(minute, second)
        else:
            return '{:02}H:{:02}M:{:0.2f}S'.format(hour, minute, second)
    else:
        return '{:03}D:{:02}H:{:02}M:{:0.2f}S'.format(day, hour, minute, second)
def GetUrlOpener(proxies=''):
    """Build a urllib opener from a proxy specification string.

    proxies: full format 'user/passwd@http://127.0.0.1:8087'; the auth and
             scheme parts are optional. A falsy value means "no proxy".
             (Default changed from mutable {} to '' -- both are falsy, and a
             truthy value must be a string because it is split below.)
    Returns a urllib.request.OpenerDirector; on any parse failure it falls
    back to an opener without (full) proxy support and prints the error.
    """
    if not proxies:
        return urllib.request.build_opener()
    try:
        # Split optional 'user/passwd@' auth prefix from the proxy address.
        ap = proxies.split('@')
        if len(ap) > 1:
            auth = ap[0]
            addr = ap[1]
        else:
            addr = ap[0]
            auth = ''
        # Split optional 'scheme://' prefix; default to http.
        if '://' in addr:
            ptype = addr[:addr.find('://')]
            phost = addr[addr.find('://') + 3:]
        else:
            ptype = 'http'
            phost = addr
        proxy = {ptype: ptype + '://' + phost}
        proxy_handler = urllib.request.ProxyHandler(proxy)
    except Exception as ex:
        print(ex)
        return urllib.request.build_opener()
    try:
        # 'user/passwd' -> basic proxy authentication.
        authlist = auth.split('/')
        if len(authlist) > 1:
            user = authlist[0]
            passwd = authlist[1]
            proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
            proxy_auth_handler.add_password('realm', phost, user, passwd)
            opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
        else:
            opener = urllib.request.build_opener(proxy_handler)
        return opener
    except Exception as ex:
        print(ex)
        return urllib.request.build_opener(proxy_handler)
def GetUrlFileInfo(url, proxies=''):
    """Fetch metadata for the file behind url.

    Returns [(name, ext), (maintype, subtype), length] where length is -1
    when the server does not report Content-Length (forces single-threaded
    download). Raises whatever opener.open() raises on network failure.
    """
    # Fallback file name from the URL path (decode percent-encoded characters).
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    filename = urllib.parse.unquote(path)
    filename = filename.split('/')[-1]
    opener = GetUrlOpener(proxies)
    urlHandler = opener.open(url)
    headers = urlHandler.info()
    # Prefer the server-supplied name when present; it may still be garbled
    # for some encodings (no general fix known).
    if 'Content-Disposition' in headers:
        disposition = headers.get('Content-Disposition')
        if 'filename=' in disposition:
            filename = disposition.split('filename=')[1]
            if filename[0] == '"' or filename[0] == "'":
                filename = filename[1:-1]  # strip surrounding quotes
            filename = urllib.parse.unquote(filename)
    if filename:
        (name, ext) = os.path.splitext(filename)
    else:
        (name, ext) = ('Unknown', '')
    # File length; on failure only single-threaded download is possible.
    if 'Content-Length' in headers:
        length = int(headers.get('Content-Length'))
    else:
        length = -1
    # Content type, kept as backup info. Tolerate a missing header and
    # parameters such as '; charset=utf-8' (a bare split('/') crashed on both).
    ctype = headers.get('Content-Type') or 'application/octet-stream'
    maintype, kind = ctype.split(';')[0].strip().split('/', 1)
    infos = [(name, ext), (maintype, kind), length]
    return infos
def SpliteBlocks(totalsize, blocknumber):
    """Divide totalsize bytes into blocknumber inclusive (start, end) ranges.

    The first blocknumber-1 ranges are blocksize bytes each; the last range
    absorbs the remainder so the ranges exactly cover [0, totalsize-1].
    """
    blocksize = totalsize // blocknumber
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges
def islive(tasks):
    """Return True while at least one download thread is still running.

    Fixed: Thread.isAlive() was removed in Python 3.9; use is_alive().
    """
    return any(task.is_alive() for task in tasks)
def download(url, target=None, blocks=6, proxies=local_proxies):
    """Download url into directory target using up to `blocks` threads.

    url:     resource URL (non-ASCII characters are percent-encoded here).
    target:  output directory; None means the current working directory at
             call time (fixed: os.getcwd() as a default was frozen at import).
    blocks:  number of download threads; forced to 1 when the file size is
             unknown.
    proxies: proxy spec forwarded to GetUrlOpener.
    Exits the process if resource information cannot be retrieved.
    """
    if target is None:
        target = os.getcwd()
    flag = True
    print('Retrieving resource information ...')
    # Encode non-URL-safe characters to a standard form.
    url = urllib.parse.quote(url, safe='/%&@=+$;,:?')
    try:
        infos = GetUrlFileInfo(url, proxies)  # [(name, ext), (type, kind), length]
    except Exception as ex:
        print(ex)
        flag = False
    if flag:
        if not os.path.exists(target):
            os.makedirs(target)
        size = infos[2]  # file size; -1 when unknown
        # Full output path from the retrieved file name and target directory.
        output = os.path.join(target, ''.join(infos[0]))
        starttime = time.time()
        print('Infomation:')
        print('FileName: {0} FileType: {1} FileLength: {2}'.format(
            ''.join(infos[0]), '/'.join(infos[1]),
            infos[2] if int(infos[2]) > 0 else 'Unknown'))
        if size > 0:
            # Known length: split into per-thread byte ranges.
            print('Starting multithread download ...')
            ranges = SpliteBlocks(size, blocks)
        else:
            # Unknown length: cannot split, fall back to one thread.
            print('Starting single thread download ...')
            ranges = ()
            blocks = 1
        threadname = [infos[0][0] + "_thread_%d" % i for i in range(0, blocks)]
        filename = [infos[0][0] + "_tmpfile_%d" % i for i in range(0, blocks)]
        tasks = []
        for i in range(0, blocks):
            # Daemon workers: don't keep the process alive if the main thread dies.
            task = Maple(threadname[i], url, filename[i],
                         ranges[i] if ranges else ranges, proxies)
            task.daemon = True  # setDaemon() is deprecated/removed
            task.start()
            tasks.append(task)
            time.sleep(1)  # stagger thread start-up
        downloaded = 0
        # Poll the workers and print a progress line until all finish.
        while islive(tasks):
            downloaded = sum(task.downloaded for task in tasks)
            if size > 0:
                process = downloaded / float(size) * 100
                show = '\rFilesize:%d Downloaded:%d Completed:%.2f%%' % (size, downloaded, process)
            else:
                show = '\rDownloaded:%d' % downloaded
            sys.stdout.write(show)
            sys.stdout.flush()
            time.sleep(0.2)
        # Final recount: the loop may exit before the last chunk was summed.
        downloaded = sum(task.downloaded for task in tasks)
        endtime = time.time()
        consuming = Sec2Time(endtime - starttime)
        if size > 0:
            # Multithreaded: verify the temp files add up to the expected size.
            downloadsize = 0
            for i in filename:
                downloadsize += os.path.getsize(i)
            if downloadsize == size:
                show = '\rFilesize:%d Downloaded:%d Completed:%.2f%%\n' % (size, downloadsize, 100)
            else:
                show = '\nSize is not mathed!\n'
                flag = False
        else:
            show = '\nTotal Size:%d\n' % downloaded
        sys.stdout.write(show)
        sys.stdout.flush()
        if flag:
            # All parts confirmed: merge temp files into the final target file.
            print('Integrating files ...')
            num = 1
            # Avoid clobbering an existing local file: append/advance '(n)'.
            while os.path.exists(output):
                fname, fext = os.path.splitext(output)
                if '(' + str(num - 1) + ')' + fext in output:
                    output = output.replace('(' + str(num - 1) + ')' + fext,
                                            '(' + str(num) + ')' + fext)
                else:
                    fname += '(' + str(num) + ')'
                    output = fname + fext
                num += 1
            if len(filename) == 1:
                # Single-threaded: just rename the only temp file.
                os.rename(filename[0], output)
            else:
                # Concatenate temp files in order, deleting each on success.
                filehandle = open(output, 'wb+')
                for i in filename:
                    try:
                        with open(i, 'rb') as f:
                            filehandle.write(f.read())
                        os.remove(i)
                    except Exception as ex:
                        print(ex)
                filehandle.close()
            if os.path.exists(output):
                print('Download Complete!')
            else:
                print('Failed to generate target file!')
        else:
            # Failed download: best-effort cleanup of temp files.
            for i in filename:
                try:
                    os.remove(i)
                except OSError:
                    pass
            print('Download Failed!')
        print('Consuming: {}\n'.format(consuming))
    else:
        print('Failed to retrieve resource information!')
        sys.exit()
def main(argv):
    """Parse command-line arguments with getopt and start the download.

    Options: -h/--help, -u/--url, -t/--target, -n/--num, -p/--proxy.
    Fixed: the optstring declared 'f:' while the loop checked '-t', so the
    short target option could never be parsed; it is now 't:'.
    """
    try:
        options, args = getopt.getopt(
            argv, 'hu:t:n:p:', ['help', 'url=', 'target=', 'num=', 'proxy='])
    except Exception as ex:
        print(ex)
        sys.exit()
    # Defaults used when the corresponding option is not given.
    num = 2
    url, target, proxies = '', '', ''
    url = 'http://www.pygtk.org/dist/pygtk2-tut.pdf'
    target = '/home/maple/Desktop'
    for name, value in options:
        if name in ('-h', '--help'):
            print('No Help ^^')
            sys.exit()
        if name in ('-u', '--url'):
            url = value
        if name in ('-t', '--target'):
            target = value
        if name in ('-n', '--num'):
            num = int(value)
        if name in ('-p', '--proxy'):
            proxies = value
    download(url, target, num, proxies)
# Script entry point: forward the CLI arguments (minus argv[0]) to main().
if __name__ == '__main__':
    main(sys.argv[1:])
# NOTE: Exception handling in this code is somewhat chaotic; it was not a
# primary concern and can be tightened up later if needed.
#
# Also, when a proxy is used with multi-threaded downloading, the size of the
# downloaded file may not match the size reported by the server, so the
# download fails. With GoAgent, for example, the proxy server downloads the
# target file with its own threads and ignores the byte range requested by
# this program: the first thread receives the complete file while the other
# threads download redundant content. No standard fix was found. Two
# workarounds exist:
#   1. Force single-threaded downloading whenever a proxy is in use.
#   2. Skip the file-size check, rename thread 0's temporary file to the
#      target file, and delete the other temporary files.
# Both are simple but damage the overall logic of the code, so neither has
# been added.
|
|
|