Commit d00fb92c authored by Wohlgemuth, Jason's avatar Wohlgemuth, Jason
Browse files

feat: Add large file read support to read_file

parent 0587b647
Loading
Loading
Loading
Loading
Loading
+46 −5
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ use crate::fail;
use crate::prelude::{canonicalize, create_dir_all, io, var, BufReader, Cursor, Error, File, PathBuf, Read, Write};
#[cfg(any(unix, target_os = "wasi", target_os = "redox"))]
use crate::prelude::{set_permissions, Permissions, PermissionsExt};
use crate::util::constants::{APPLICATION, ORGANIZATION, QUALIFIER};
use crate::util::constants::{APPLICATION, LARGE_FILE_THRESHOLD_BYTES, ORGANIZATION, QUALIFIER};
#[cfg(windows)]
use crate::util::file_extension;
use crate::util::{generate_guid, suffix, Label, MimeType, SemanticVersion, ToAbsoluteString, ToStrings};
@@ -876,6 +876,43 @@ where
///     .collect();
/// ```
pub fn read_file<P>(path: P) -> Result<String, io::Error>
where
    P: Into<PathBuf> + Clone + Send,
{
    let path_buf = path.into();
    // Only used as a structured field on the log events emitted below.
    let filename = path_buf.file_name().unwrap_or_default().to_string_lossy().to_string();
    // A failed metadata lookup is treated as "not large"; the regular open below
    // then reports whatever is wrong with the path.
    let exceeds_threshold = path_buf
        .metadata()
        .map(|metadata| metadata.len() >= LARGE_FILE_THRESHOLD_BYTES)
        .unwrap_or(false);
    if exceeds_threshold {
        trace!(filename, "=> {} Read file with large-file strategy", Label::using());
        return read_large_file(path_buf);
    }
    // Regular strategy: default-sized buffered reader, no pre-allocation.
    let handle = match File::open(&path_buf) {
        | Ok(handle) => handle,
        | Err(why) => {
            error!(filename, "=> {} Read file", Label::fail());
            return Err(why);
        }
    };
    let mut content = String::new();
    if let Err(why) = BufReader::new(handle).read_to_string(&mut content) {
        error!(filename, "=> {} Read file content", Label::fail());
        return Err(why);
    }
    Ok(content)
}
/// Reads large files and returns the contents as a string.
///
/// This function uses a larger buffered reader and pre-allocates the output string
/// using file metadata when available.
pub fn read_large_file<P>(path: P) -> Result<String, io::Error>
where
    P: Into<PathBuf> + Clone + Send,
{
@@ -884,16 +921,20 @@ where
    let file = match File::open(&path_buf) {
        | Ok(file) => file,
        | Err(why) => {
            error!(filename, "=> {} Read file", Label::fail());
            error!(filename, "=> {} Read large file", Label::fail());
            return Err(why);
        }
    };
    let mut reader = BufReader::new(file);
    let mut content = String::new();
    let capacity = match file.metadata() {
        | Ok(metadata) => usize::try_from(metadata.len()).unwrap_or(0),
        | Err(_) => 0,
    };
    let mut reader = BufReader::with_capacity(1024 * 1024, file);
    let mut content = if capacity > 0 { String::with_capacity(capacity) } else { String::new() };
    match reader.read_to_string(&mut content) {
        | Ok(_) => Ok(content),
        | Err(why) => {
            error!(filename, "=> {} Read file content", Label::fail());
            error!(filename, "=> {} Read large file content", Label::fail());
            Err(why)
        }
    }
+22 −2
Original line number Diff line number Diff line
@@ -4,10 +4,10 @@ use crate::io::bagit::{Bag, BagInfo, Save};
use crate::io::config::{ApplicationConfiguration, Bucket};
use crate::io::{
    archive, file_checksum, files_all, files_from_git_branch, files_from_git_commit, files_from_gitlab_merge_request, filter_git_command_result,
    filter_ignored, image_paths, InputOutput,
    filter_ignored, image_paths, read_file, read_large_file, InputOutput,
};
use crate::{Location, Repository, Scheme};
use std::fs::{create_dir_all, read_to_string, remove_dir_all};
use std::fs::{create_dir_all, read_to_string, remove_dir_all, remove_file, write};
use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};

@@ -423,3 +423,23 @@ fn test_image_paths() {
    let files = image_paths(path);
    assert_eq!(files.len(), 0);
}
/// Writes a 3,000,000-byte fixture ("ACORN " repeated 500,000 times) to the
/// temporary directory and checks that `read_large_file` returns it unchanged.
#[test]
fn test_read_large_file() {
    // The nanosecond timestamp is embedded in the file name, presumably to avoid
    // collisions between runs.
    let stamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos();
    let path = std::env::temp_dir().join(format!("acorn-read-large-file-{stamp}.txt"));
    let expected = "ACORN ".repeat(500_000);
    write(&path, &expected).unwrap();
    let outcome = read_large_file(path.clone());
    // Remove the fixture before asserting so that a failed read or comparison
    // does not leave the file behind in the temporary directory.
    let cleanup = remove_file(&path);
    assert_eq!(outcome.unwrap(), expected);
    cleanup.unwrap();
}
/// Writes a 9,600,000-byte fixture ("ACORN-LARGE " repeated 800,000 times) to
/// the temporary directory and checks that `read_file` returns it unchanged.
#[test]
fn test_read_file_large_content() {
    // NOTE(review): 9,600,000 bytes is below `LARGE_FILE_THRESHOLD_BYTES`
    // (100 * 1024 * 1024 = 104,857,600), so this exercises the regular branch of
    // `read_file`, not the delegation to `read_large_file`.
    let stamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos();
    let path = std::env::temp_dir().join(format!("acorn-read-file-large-{stamp}.txt"));
    let expected = "ACORN-LARGE ".repeat(800_000);
    write(&path, &expected).unwrap();
    let outcome = read_file(path.clone());
    // Remove the fixture before asserting so that a failed read or comparison
    // does not leave the file behind in the temporary directory.
    let cleanup = remove_file(&path);
    assert_eq!(outcome.unwrap(), expected);
    cleanup.unwrap();
}
+2 −0
Original line number Diff line number Diff line
@@ -62,6 +62,8 @@ pub const APPLICATION: &str = "acorn";
pub const ORGANIZATION: &str = "ornl";
/// Organization qualifier
pub const QUALIFIER: &str = "org";
/// File size threshold in bytes for switching to large-file read strategy
/// (100 MiB, i.e. 104,857,600 bytes).
pub const LARGE_FILE_THRESHOLD_BYTES: u64 = 100 * 1024 * 1024;
// Schema defaults
/// Default affiliation
pub const DEFAULT_AFFILIATION: &str = "Oak Ridge National Laboratory";
+2527 −2557

File changed.

Preview size limit exceeded, changes collapsed.