# Build a small shape-classification dataset from the Pillbox metadata CSV:
# `images` holds square RGB arrays and `labels` holds the matching one-hot
# vectors over the distinct `splshape_text` values of rows that have an image.

path = "/content/drive/My Drive/Colab Notebooks/pillbox/Pillbox.csv"
images_path = "/content/drive/My Drive/Colab Notebooks/pillbox/pillbox_production_images_full_201812"

# Side length, in pixels, of the square images fed to the model.
desired_size = 32
# Upper bound on the number of rows scanned (the original note mentions 120
# as an alternative value).
max_rows = 1300

_csv_columns = [
    'has_image',
    'splshape_text',
    'medicine_name',
    'splimprint',
    'splimage',
    'splcolor_text',
    'source',
]


def _load_padded_image(image_path, size):
    """Return the image at *image_path* scaled to fit a size x size RGB canvas.

    The aspect ratio is kept: the longer side is scaled to *size* and the
    result is centred on a black square canvas.

    Raises:
        OSError: the file is missing, unreadable, or not a valid image.
    """
    # The context manager closes the underlying file once the pixel data has
    # been read by resize().
    with Image.open(image_path) as source_image:
        ratio = float(size) / max(source_image.size)  # size is (width, height)
        # Clamp to 1 so an extreme aspect ratio cannot produce a zero-sized
        # dimension, which resize() rejects.
        new_size = tuple(max(1, int(side * ratio)) for side in source_image.size)
        # LANCZOS is the filter formerly exposed as Image.ANTIALIAS, which
        # was removed in Pillow 10.
        resized = source_image.resize(new_size, Image.LANCZOS)

    canvas = Image.new("RGB", (size, size))
    canvas.paste(
        resized,
        ((size - new_size[0]) // 2, (size - new_size[1]) // 2),
    )
    return canvas


# Every column is read as text; missing cells become empty strings so the
# comparisons below never see NaN.
data = pd.read_csv(
    path,
    usecols=_csv_columns,
    dtype=dict.fromkeys(_csv_columns, 'str'),
)
data = data.fillna('')

data_has_image = data.loc[data['has_image'] == 'True']

# The label space is fixed from all rows with an image, before truncation, so
# the one-hot width does not depend on max_rows.
unique_shapes = data_has_image.splshape_text.unique()
print(unique_shapes)
unique_shape_count = len(unique_shapes)
shape_to_index = {shape: index for index, shape in enumerate(unique_shapes)}
print("unique shape: ", unique_shape_count)

data_has_image = data_has_image[:max_rows]
print("has image len: ", len(data_has_image))

images = []
labels = []
skipped_images = 0

for row_i, (splimage, splshape) in enumerate(
    zip(data_has_image['splimage'], data_has_image['splshape_text'])
):
    if splimage == "":
        continue

    image_path = images_path + "/" + splimage + ".jpg"
    try:
        image = _load_padded_image(image_path, desired_size)
    except (OSError, ValueError):
        # Best effort: a missing or corrupt file drops this sample only.
        skipped_images += 1
        continue

    # One-hot encode the shape label.
    splshape_vector = np.zeros(unique_shape_count)
    splshape_vector[shape_to_index[splshape]] = 1

    images.append(np.array(image))
    labels.append(splshape_vector)

    # Progress is reported only for rows that produced a sample.
    if row_i % 200 == 0:
        print("loading images: ", row_i, "/", data_has_image.shape[0])

if skipped_images:
    print("skipped unreadable images: ", skipped_images)