Split into train, test, and validation sets on Colab (01/02/2024)

388 in train

120 in test

91 in validation

For Colab

import os
import random
import shutil
from google.colab import drive

# Mount Google Drive at /content/drive so the '/content/drive/...' paths
# used in the example usage resolve inside the Colab runtime
drive.mount('/content/drive')

def split_data(source_dir, dest_dir, split_ratio=(0.8, 0.1, 0.1), seed=42):
    """Copy the files of ``source_dir`` into train/test/validation folders.

    The folders ``train``, ``test`` and ``validation`` are created under
    ``dest_dir`` (if missing) and every regular file found directly in
    ``source_dir`` is copied into exactly one of them. Sub-directories of
    ``source_dir`` are ignored.

    Args:
        source_dir: Directory whose files are split.
        dest_dir: Directory that receives the three split folders.
        split_ratio: Fractions for (train, test, validation). Only the first
            two entries are used for the computation; validation receives
            whatever is left after train and test are filled.
        seed: Seed passed to ``random.seed`` before shuffling.

    Returns:
        None. A summary line with the three counts is printed.
    """
    # Set a seed for reproducibility
    random.seed(seed)

    # Create destination directories in Google Drive
    train_dir = os.path.join(dest_dir, 'train')
    test_dir = os.path.join(dest_dir, 'test')
    validation_dir = os.path.join(dest_dir, 'validation')

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    os.makedirs(validation_dir, exist_ok=True)

    # Keep regular files only: shutil.copy cannot copy a directory, and a
    # directory must not be counted as a sample. Sort the names because
    # os.listdir returns them in arbitrary order, which would otherwise make
    # the seeded shuffle differ between machines/runs.
    all_files = sorted(
        name
        for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    total_files = len(all_files)

    # Calculate the number of files for each split (validation = remainder)
    num_train = int(split_ratio[0] * total_files)
    num_test = int(split_ratio[1] * total_files)
    num_validation = total_files - num_train - num_test

    # Randomly shuffle the list of files
    random.shuffle(all_files)

    # Copy files to the destination directories based on the split ratio
    for i, filename in enumerate(all_files):
        src_path = os.path.join(source_dir, filename)

        if i < num_train:
            dest_path = os.path.join(train_dir, filename)
        elif i < num_train + num_test:
            dest_path = os.path.join(test_dir, filename)
        else:
            dest_path = os.path.join(validation_dir, filename)

        shutil.copy(src_path, dest_path)

    print(f"Splitting complete. {num_train} files in train, {num_test} files in test, and {num_validation} files in validation.")

# Example usage: both paths are placeholders that live on the mounted Drive
source_directory = '/content/drive/MyDrive/path/to/source_directory'  # Replace with the path to your source directory in Google Drive
destination_directory = '/content/drive/MyDrive/path/to/destination_directory'  # Replace with the desired destination directory in Google Drive

# Runs with the defaults: split_ratio=(0.8, 0.1, 0.1) and seed=42
split_data(source_directory, destination_directory)

For local system

import os
import random
import shutil

def split_data(source_dir, dest_dir, split_ratio=(0.8, 0.1, 0.1), seed=42):
    """Copy files and their ``.txt`` annotations into train/test/validation.

    The folders ``train``, ``test`` and ``validation`` are created under
    ``dest_dir`` (if missing). Every regular file found directly in
    ``source_dir`` is assigned to exactly one split and copied there. If a
    file ``<stem>.txt`` exists next to a sample ``<stem>.<ext>``, it is
    treated as that sample's annotation: it is copied into the same split as
    the sample and is not counted or split on its own. ``.txt`` files without
    a matching sample are split like any other file. Sub-directories of
    ``source_dir`` are ignored.

    Args:
        source_dir: Directory holding the samples and their annotations.
        dest_dir: Directory that receives the three split folders.
        split_ratio: Fractions for (train, test, validation). Only the first
            two entries are used for the computation; validation receives
            whatever is left after train and test are filled.
        seed: Seed passed to ``random.seed`` before shuffling.

    Returns:
        None. A summary line with the three sample counts is printed.
    """
    # Set a seed for reproducibility
    random.seed(seed)

    # Create destination directories
    train_dir = os.path.join(dest_dir, 'train')
    test_dir = os.path.join(dest_dir, 'test')
    validation_dir = os.path.join(dest_dir, 'validation')

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    os.makedirs(validation_dir, exist_ok=True)

    # Regular files only (shutil.copy cannot copy a directory), sorted because
    # os.listdir order is arbitrary and would defeat the seeded shuffle.
    candidates = sorted(
        name
        for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )

    # Stems that belong to a non-.txt sample; "<stem>.txt" is its annotation.
    sample_stems = {
        os.path.splitext(name)[0]
        for name in candidates
        if os.path.splitext(name)[1] != '.txt'
    }

    # Annotations travel with their sample, so they must not be split (or
    # counted) as independent entries.
    all_files = [
        name
        for name in candidates
        if not (
            os.path.splitext(name)[1] == '.txt'
            and os.path.splitext(name)[0] in sample_stems
        )
    ]
    total_files = len(all_files)

    # Calculate the number of files for each split (validation = remainder)
    num_train = int(split_ratio[0] * total_files)
    num_test = int(split_ratio[1] * total_files)
    num_validation = total_files - num_train - num_test

    # Randomly shuffle the list of files
    random.shuffle(all_files)

    # Copy files to the destination directories based on the split ratio
    for i, filename in enumerate(all_files):
        if i < num_train:
            split_dir = train_dir
        elif i < num_train + num_test:
            split_dir = test_dir
        else:
            split_dir = validation_dir

        shutil.copy(
            os.path.join(source_dir, filename),
            os.path.join(split_dir, filename),
        )

        # Copy the annotation next to its sample. Skip the case where the
        # sample itself is the .txt file, which was already copied above.
        annotation_name = f"{os.path.splitext(filename)[0]}.txt"
        src_annotation_path = os.path.join(source_dir, annotation_name)
        if annotation_name != filename and os.path.isfile(src_annotation_path):
            shutil.copy(
                src_annotation_path,
                os.path.join(split_dir, annotation_name),
            )

    print(f"Splitting complete. {num_train} files in train, {num_test} files in test, and {num_validation} files in validation.")

# Example usage: the train/test/validation folders are created directly
# under destination_directory
source_directory = 'E:\\DataScienceWithPython-main\\OpenCV\\model\\yolo\\Data\\img\\img_annotated'  # Replace with the path to your source directory
destination_directory = 'E:\\DataScienceWithPython-main\\OpenCV\\model\\yolo\\Data'  # Replace with the desired destination directory

# Runs with the defaults: split_ratio=(0.8, 0.1, 0.1) and seed=42
split_data(source_directory, destination_directory)

For moving .txt and .npz files

import os
import shutil

def move_files(source_folder, txt_folder, npz_folder):
    """Move the ``.txt`` and ``.npz`` entries of ``source_folder`` elsewhere.

    Entries whose name ends with ``.txt`` go to ``txt_folder``, entries whose
    name ends with ``.npz`` go to ``npz_folder``; everything else stays where
    it is. Both target folders are created when missing, and one line is
    printed per moved entry.

    Args:
        source_folder: Folder whose direct entries are inspected.
        txt_folder: Target folder for names ending in ``.txt``.
        npz_folder: Target folder for names ending in ``.npz``.
    """
    # Suffix -> target folder, checked in this order
    routes = (('.txt', txt_folder), ('.npz', npz_folder))

    # Make sure both targets exist before anything is moved
    for _, folder in routes:
        os.makedirs(folder, exist_ok=True)

    for entry in os.listdir(source_folder):
        # First route whose suffix matches, or None for unrelated entries
        target = next(
            (folder for suffix, folder in routes if entry.endswith(suffix)),
            None,
        )
        if target is None:
            continue

        shutil.move(
            os.path.join(source_folder, entry),
            os.path.join(target, entry),
        )
        print(f"Moved {entry} to {target}")

# Example usage
# Raw strings take backslashes literally, so each separator is written once
# (r'C:\\Users' would contain two backslash characters per separator).
source_folder = r'C:\Users\lenovo\Downloads\Data88\tags\validation_tag_new'
# Both targets point at the same folder, so .txt and .npz files end up together
txt_folder = r'C:\Users\lenovo\Downloads\New folder\val_labels'
npz_folder = r'C:\Users\lenovo\Downloads\New folder\val_labels'

move_files(source_folder, txt_folder, npz_folder)