# Short alias for the numeric conversion applied to selected columns below.
f = float

# Every input is a set of gzip files containing ';'-separated text lines.
# Each pipeline:
#   1. drops lines that have too few separators for the columns it reads,
#   2. splits on ';' and strips whitespace from each field,
#   3. projects the fields into a 6-tuple whose second element tags the source.
# `sc` and `strpt` are defined outside this block.
# NOTE(review): strpt presumably parses a date/time string -- confirm.

# NEG reads up to x[17], so a line needs at least 17 separators (18 fields).
# The guard used to be `> 14`, which let 16- and 17-field lines through and
# made the projection below fail with IndexError on x[16] / x[17].
negs = (
    sc.textFile('s3n://prognoos-pyspark/NEG/*.gz')
    .filter(lambda x: x.count(';') > 16)
    .map(lambda x: [s.strip() for s in x.split(';')])
    .map(lambda x: (strpt(x[5]), 'NEG', x[1], f(x[3]), f(x[16]), x[17]))
)

# CPA reads up to x[15]; `> 14` separators guarantees that index exists.
# NOTE(review): the fifth slot is the raw string x[15] here, whereas NEG
# stores a float in that slot -- confirm this is intended.
buys = (
    sc.textFile('s3n://prognoos-pyspark/CPA/*.gz')
    .filter(lambda x: x.count(';') > 14)
    .map(lambda x: [s.strip() for s in x.split(';')])
    .map(lambda x: (strpt(x[6]), 'CPA', x[1], f(x[8]), x[15], None))
)

# VDA mirrors CPA, but puts x[15] in the sixth slot and None in the fifth.
sell = (
    sc.textFile('s3n://prognoos-pyspark/VDA/*.gz')
    .filter(lambda x: x.count(';') > 14)
    .map(lambda x: [s.strip() for s in x.split(';')])
    .map(lambda x: (strpt(x[6]), 'VDA', x[1], f(x[8]), None, x[15]))
)

# Concatenate the three record sets (union does not deduplicate) and count.
all_operations = negs.union(buys).union(sell)
total = all_operations.count()
# total = 52980676