Commit b623e51e authored by Wohlgemuth, Jason's avatar Wohlgemuth, Jason
Browse files

feat: Initial check capability for DOCX files

parent 61a07828
Loading
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -113,6 +113,9 @@ pub enum Standard {
    /// Dublin Core Metadata Initiative (DCMI)
    #[display("dcmi")]
    Dcmi,
    /// DOCX-derived text
    #[display("docx")]
    Docx,
    /// InvenioRDM
    #[display("invenio")]
    Invenio,
+5 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ use acorn::io::unique_file_extensions;
use acorn::prelude::{exit, PathBuf};
use acorn::schema::research_activity::ResearchActivity;
use acorn::schema::standard::cff::Cff;
use acorn::schema::standard::text::Text;
use acorn::schema::standard::text::{Docx, Text};
use acorn::util::constants::ENV_READABILITY_METRIC;
use acorn::util::{print_values_as_table, regex_join, Label};
use acorn::{fail, skip};
@@ -92,6 +92,7 @@ fn apply_early_exit_policy(results: Vec<Check>, category: &CheckCategory, option
async fn collect(paths: &[PathBuf], check_options: &CheckOptions) -> Vec<Check> {
    match infer_standard(paths) {
        | Standard::Cff => collect_checks_for::<Cff>(paths, check_options).await,
        | Standard::Docx => collect_checks_for::<Docx>(paths, check_options).await,
        | Standard::Rads => collect_checks_for::<ResearchActivity>(paths, check_options).await,
        | Standard::Text => collect_checks_for::<Text>(paths, check_options).await,
        | _ => unimplemented!(),
@@ -158,6 +159,9 @@ fn infer_standard(paths: &[PathBuf]) -> Standard {
    if extensions.len() == 1 && extensions.contains(&"cff".to_string()) {
        warn!("=> {} Inferred standard (CFF)", Label::using());
        Standard::Cff
    } else if extensions.len() == 1 && extensions.contains(&"docx".to_string()) {
        warn!("=> {} Inferred standard (DOCX)", Label::using());
        Standard::Docx
    } else if extensions.len() == 1 && extensions.contains(&"txt".to_string()) {
        warn!("=> {} Inferred standard (Text)", Label::using());
        Standard::Text
+3 −0
Original line number Diff line number Diff line
@@ -94,6 +94,9 @@ pub enum Standard {
    /// InvenioRDM
    #[display("invenio")]
    Invenio,
    /// DOCX-derived text
    #[display("docx")]
    Docx,
    /// Plain text
    #[display("text")]
    Text,
+36 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ use crate::prelude::{self, create_dir_all, remove_file, write, Command, CommandO
use crate::schema::pid::{PersistentIdentifier, PersistentIdentifierParse, DOI};
use crate::schema::research_activity::ResearchActivity;
use crate::schema::standard::cff::{Cff, Identifier, IdentifierType, Reference};
use crate::schema::standard::text::Text;
use crate::schema::standard::text::{Docx, Text};
use crate::schema::{Organization, ProgrammingLanguage, Website};
use crate::util::constants::{APPLICATION, CUSTOM_VALE_PACKAGE_NAME, DEFAULT_VALE_PACKAGE_URL, DEFAULT_VALE_ROOT, VALE_RELEASES_URL, VALE_VERSION};
use crate::util::{is_uri_or_path, Constant, Label, SemanticVersion, StringConversion};
@@ -233,6 +233,41 @@ impl Analysis for Cff {
    }
}
#[async_trait]
impl Analysis for Docx {
    fn standard() -> Standard {
        Standard::Docx
    }
    async fn check_prose(paths: &[PathBuf], options: Option<&CheckOptions>) -> Vec<Check> {
        check_prose_for::<Self>(paths, options).await
    }
    async fn check_quality(paths: &[PathBuf], _options: Option<&CheckOptions>) -> Vec<Check> {
        paths
            .par_iter()
            .map(|path| match Self::read(path) {
                | Ok(_) => check_ok!(CheckCategory::Quality),
                | Err(why) => check_err!(
                    CheckCategory::Quality,
                    message: "Cannot read DOCX data",
                    context: why.to_string()
                ),
            })
            .collect()
    }
    async fn check_readability(paths: &[PathBuf], options: Option<&CheckOptions>) -> Vec<Check> {
        check_readability_for::<Self>(paths, options)
    }
    async fn check_schema(_paths: &[PathBuf], _options: Option<&CheckOptions>) -> Vec<Check> {
        vec![]
    }
    #[cfg(feature = "std")]
    async fn check_websites(_paths: &[PathBuf], _options: Option<&CheckOptions>) -> Vec<Check> {
        vec![]
    }
    fn output_path(path: &Path, _data: &Self) -> PathBuf {
        standard_project_folder("check", None).join(path.to_path_buf().file_name_with_parent())
    }
}
#[async_trait]
impl Analysis for ResearchActivity {
    fn standard() -> Standard {
        Standard::ResearchActivityData
+42 −3
Original line number Diff line number Diff line
#[cfg(feature = "std")]
use crate::io::InputOutput;
use crate::schema::standard::text::Text;
use crate::schema::standard::text::{Docx, Text};
#[cfg(feature = "std")]
use crate::test_utils::unique_path;
use crate::util::{ToMarkdown, ToProse};
use crate::test_utils::{fixture_path, unique_path};
use crate::util::{Unstructured, ToMarkdown, ToProse};
use validator::Validate;

#[test]
@@ -13,6 +13,7 @@ fn test_text_to_prose_and_markdown_passthrough() {
    };
    assert_eq!(data.to_prose(), "Line one\n\nLine two");
    assert_eq!(data.to_markdown(), "Line one\n\nLine two");
    assert_eq!(data.content(), "Line one\n\nLine two");
}
#[test]
fn test_text_validate_is_noop() {
@@ -43,3 +44,41 @@ fn test_text_input_output_rejects_unsupported_extension() {
    let result = Text::read(output);
    assert!(result.is_err());
}

#[test]
fn test_docx_to_prose_and_markdown_passthrough() {
    // Every textual view of the document should hand back the stored content unchanged.
    let expected = "Line one\n\nLine two";
    let document = Docx {
        content: String::from(expected),
    };
    assert_eq!(document.to_prose(), expected);
    assert_eq!(document.to_markdown(), expected);
    assert_eq!(document.content(), expected);
}
#[test]
fn test_docx_validate_is_noop() {
    // Validation is expected to succeed for arbitrary content.
    let document = Docx {
        content: String::from("any content should validate"),
    };
    let outcome = document.validate();
    assert!(outcome.is_ok());
}
#[cfg(feature = "std")]
#[test]
fn test_docx_input_output_read_docx_fixture() {
    // Reading the bundled fixture must succeed and yield text that mentions "ACORN".
    let fixture = fixture_path("acorn.docx");
    let document = Docx::read(fixture).expect("failed to read docx file");
    let extracted_text = &document.content;
    assert!(extracted_text.contains("ACORN"));
}
#[cfg(feature = "std")]
#[test]
fn test_docx_input_output_write_rejects_docx_extension() {
    // Writing a `Docx` value to a path with a `.docx` extension is expected to fail.
    let output = unique_path("docx-io", "docx");
    if let Some(parent) = output.parent() {
        std::fs::create_dir_all(parent).expect("failed to create test_artifacts directory");
    }
    let source = Docx {
        content: "plain text content".to_string(),
    };
    let result = source.write(output.clone());
    // Clean up BEFORE asserting: if the write unexpectedly succeeded, the assertion below
    // panics, and a cleanup placed after it would never run, leaving the artifact on disk.
    // The removal result is deliberately ignored because on the expected path no file exists.
    let _cleanup = std::fs::remove_file(&output);
    assert!(result.is_err());
}
Loading