import os
import pandas as pd
import argparse
import sys

# Command-line interface definition. RawDescriptionHelpFormatter is used so
# that the usage examples in the epilog keep their line breaks in --help.
parser = argparse.ArgumentParser(
    description="This script creates a manifest file for use with QIIME2. "
                "It can take single-end or paired-end reads, and should detect "
                "whether you have single or paired end reads in your files "
                "(it bases this on the presence of _1 / _R1 and _2 / _R2 in "
                "your file names, unless you have given it a metadata file). "
                "The script is expecting that your file names follow the "
                "format samplename_R1.fastq.gz or samplename_1.fq (if these are not "
                "in a metadata file). It is also assumed that all files have the same "
                "extension.",
    epilog='''
            Usage example:
            make_manifest.py -f raw_data -o Illumina_manifest.txt
            make_manifest.py -f raw_data -o PacBio_manifest.txt -m metadata_file.txt
                   ''',
    formatter_class=argparse.RawDescriptionHelpFormatter,
)

# Required: where the reads live and where the manifest is written.
parser.add_argument('-f', '--folder', metavar='PATH', type=str, required=True,
                    help='The folder that contains the raw reads.')
parser.add_argument('-o', '--outfile', metavar='PATH', type=str, required=True,
                    help='Output file to save the new manifest file to.')
# Optional switches controlling how sample names and read types are found.
parser.add_argument('--pacbio', default=False, action="store_true",
                    help='Whether samples are PacBio or not. Setting this option will help '
                         'when the file naming format does not follow that used by Illumina. '
                         'If you set this, only --folder and --outfile are needed.')
parser.add_argument('-m', '--metadata', metavar='PATH', type=str, default=None,
                    help='Metadata file containing sample names in the first column, '
                         'expectation is tab-delimited.')
parser.add_argument('--continue_without_all_files', default=False, action="store_true",
                    help='Continue with making the file even if not all samples from the '
                         'metadata file can be found.')
parser.add_argument('--readtype', type=str, default=None, choices=['single', 'paired'],
                    help='Whether samples are single- or paired-end. Only necessary if '
                         '--continue_without_all_files is set.')


args = parser.parse_args()

# With --continue_without_all_files the read type is taken from --readtype
# rather than being worked out from the files, so it must be given.
if args.continue_without_all_files and args.readtype is None:
    sys.exit("You need to set --readtype to run with the --continue_without_all_files option.")

# Resolve the reads folder to an absolute path so the manifest always holds
# absolute file paths. os.path.abspath leaves paths that are already absolute
# alone (wherever they live), prefixes relative ones with the current working
# directory, and drops any trailing slash.
folder = os.path.abspath(str(args.folder))
if not os.path.isdir(folder):
    sys.exit("The folder given with --folder does not exist or is not a folder: "+folder)

# Names of everything inside the reads folder (not recursive).
files = os.listdir(folder)

# PacBio mode: one file per sample, the sample ID is everything before the
# first '.' in the file name. No metadata or read-type handling is done.
if args.pacbio:
    print("You've set pacbio so this means that only the folder and outfile arguments are needed (and these are the only arguments that will be used).")
    if not files:
        sys.exit("No files were found in this folder: "+folder)
    # All files are assumed to share the extension of the first one listed;
    # everything after the first '.' counts as the extension.
    first_name_parts = files[0].split('.', 1)
    if len(first_name_parts) < 2:
        sys.exit("Couldn't work out the file extension from this file name: "+files[0])
    ext = '.'+first_name_parts[1]
    # De-duplicate the base names; sorting keeps the manifest order stable.
    files = sorted({f.split('.')[0] for f in files})
    columns = ['sample-id', 'absolute-filepath']
    all_files = [[f, os.path.join(folder, f+ext)] for f in files]
    manifest = pd.DataFrame(all_files, columns=columns).set_index('sample-id')
    manifest.to_csv(args.outfile, sep='\t')
    # sys.exit() with a string reports failure (status 1), so print the
    # summary and exit with status 0 to signal success.
    print('Finished making pacbio manifest file with '+str(len(all_files))+' samples')
    sys.exit(0)

# All files are assumed to share the extension of the first one listed;
# everything after the first '.' counts as the extension (e.g. '.fastq.gz').
if not files:
    sys.exit("No files were found in this folder: "+folder)
first_name_parts = files[0].split('.', 1)
if len(first_name_parts) < 2:
    sys.exit("Couldn't work out the file extension from this file name: "+files[0])
ext = '.'+first_name_parts[1]
# Unique base names with the extension removed (e.g. 'sample_R1'). Sorted so
# that later uses of files[0] do not depend on arbitrary set ordering.
files = sorted({f.split('.')[0] for f in files})
# Unique base names cut at the first underscore (e.g. 'sample').
files_no_underscore = sorted({f.split('_')[0] for f in files})

# Decide which samples go into the manifest (samples), whether they are
# single-end (single) and, optionally, how to rename them (sample_rename).
sample_rename = None
if args.metadata is not None:
    md = pd.read_csv(args.metadata, sep='\t', index_col=0, header=0)
    # Sample names are compared with (and concatenated to) file names, so make
    # sure they are strings even if pandas parsed the first column as numbers.
    md.index = md.index.astype(str)
    samples = list(md.index.values)

    # A sample counts as found if it matches a base name either with or
    # without the part after the first underscore.
    known_names = set(files) | set(files_no_underscore)
    keeping_samples = []
    for sample in samples:
        if sample in known_names:
            keeping_samples.append(sample)
        else:
            # NOTE(review): without --continue_without_all_files a missing
            # sample is only reported here and still ends up in the manifest;
            # confirm whether the script should stop instead.
            print("Couldn't find a file with this sample name: "+sample)

    # Optional metadata column mapping the original sample name to a new ID.
    if 'sample_rename' in md.columns:
        sample_rename = md['sample_rename'].to_dict()

    if args.continue_without_all_files:
        # Only keep samples that have files; the read type comes from the user.
        samples = keeping_samples
        single = args.readtype == 'single'
    else:
        # Single-end when there is exactly one base name per sample, both with
        # and without the underscore suffix.
        single = len(files) == len(samples) and len(files_no_underscore) == len(samples)
else:
    # No metadata: the sample name is the part of the file name before the
    # first underscore (and before any '.').
    samples = sorted({f.split('_')[0].split('.')[0] for f in files})
    # One file per sample name means single-end reads.
    single = len(files) == len(samples)
    read_tags = ('_R1', '_1', '_R2', '_2')
    # Warn once (not once per file) if single-end looking data carries
    # paired-end style suffixes.
    if single and any(tag in f for f in files for tag in read_tags):
        print("All of the file names are unique but it looks like there are some that have _R1 or _1 in them. "
              "Continuing with running, but please look carefully at the output that you get to check that it makes sense.")

# Build the manifest rows: one path per sample for single-end reads, a
# forward and a reverse path per sample for paired-end reads.
all_files = []
if single:
    columns = ['sample-id', 'absolute-filepath']
    for f in samples:
        # Use the renamed ID when the metadata supplied one.
        sample_id = f if sample_rename is None else sample_rename[f]
        all_files.append([sample_id, os.path.join(folder, f+ext)])
else:
    # Work out whether read numbers are written as _R1/_r1 or plain _1 by
    # looking at the first file name that has something after an underscore.
    # Names without an underscore are skipped instead of raising IndexError.
    r = ''
    for name in files:
        tokens = name.split('_')
        if len(tokens) > 1 and tokens[1]:
            if tokens[1][0] in ('R', 'r'):
                r = tokens[1][0]
            break
    columns = ['sample-id', 'forward-absolute-filepath', 'reverse-absolute-filepath']
    for f in samples:
        sample_id = f if sample_rename is None else sample_rename[f]
        forward = os.path.join(folder, f+'_'+r+'1'+ext)
        reverse = os.path.join(folder, f+'_'+r+'2'+ext)
        all_files.append([sample_id, forward, reverse])

# Write the tab-separated manifest with sample-id as the first column.
manifest = pd.DataFrame(all_files, columns=columns).set_index('sample-id')
manifest.to_csv(args.outfile, sep='\t')
