diff --git a/testing/adios2/engine/bp/CMakeLists.txt b/testing/adios2/engine/bp/CMakeLists.txt
index 006db6b06ef36f3aba5f5a4353eaf85315bdd5b0..da8fa30dacddbca80dd97de12852ceed1ad2b1bd 100644
--- a/testing/adios2/engine/bp/CMakeLists.txt
+++ b/testing/adios2/engine/bp/CMakeLists.txt
@@ -4,31 +4,48 @@
 #------------------------------------------------------------------------------#
 
 # MPI versions of the test are not properly implemented at the moment
-if(NOT ADIOS2_HAVE_MPI)
-  
-  if(ADIOS2_HAVE_ADIOS1)
-    add_executable(TestBPWriteRead TestBPWriteRead.cpp)
-    target_link_libraries(TestBPWriteRead adios2 gtest adios1::adios)
-
-    add_executable(TestBPWriteReadAttributes TestBPWriteReadAttributes.cpp)
-    target_link_libraries(TestBPWriteReadAttributes adios2 gtest adios1::adios)
-  
-    add_executable(TestBPWriteReadstdio TestBPWriteReadstdio.cpp)
-    target_link_libraries(TestBPWriteReadstdio adios2 gtest adios1::adios)
-
-    add_executable(TestBPWriteReadfstream TestBPWriteReadfstream.cpp)
-    target_link_libraries(TestBPWriteReadfstream adios2 gtest adios1::adios)
-    
-    gtest_add_tests(TARGET TestBPWriteRead)
-    gtest_add_tests(TARGET TestBPWriteReadAttributes)
-    gtest_add_tests(TARGET TestBPWriteReadstdio)
-    gtest_add_tests(TARGET TestBPWriteReadfstream)
-  endif()
-    
-  add_executable(TestBPWriteProfilingJSON TestBPWriteProfilingJSON.cpp)
-  target_link_libraries(TestBPWriteProfilingJSON
-    adios2 gtest gtest_main NLohmannJson
+
+add_executable(TestBPWriteRead TestBPWriteRead.cpp)
+target_link_libraries(TestBPWriteRead adios2 gtest adios1::adios)
+
+add_executable(TestBPWriteReadAttributes TestBPWriteReadAttributes.cpp)
+target_link_libraries(TestBPWriteReadAttributes adios2 gtest adios1::adios)
+
+add_executable(TestBPWriteReadstdio TestBPWriteReadstdio.cpp)
+target_link_libraries(TestBPWriteReadstdio adios2 gtest adios1::adios)
+
+add_executable(TestBPWriteReadfstream TestBPWriteReadfstream.cpp)
+target_link_libraries(TestBPWriteReadfstream adios2 gtest adios1::adios)
+
+add_executable(TestBPWriteProfilingJSON TestBPWriteProfilingJSON.cpp)
+target_link_libraries(TestBPWriteProfilingJSON
+  adios2 gtest NLohmannJson
+)
+
+if(ADIOS2_HAVE_MPI)
+  target_include_directories(TestBPWriteRead PRIVATE ${MPI_C_INCLUDE_PATH})
+  target_link_libraries(TestBPWriteRead ${MPI_C_LIBRARIES})
+
+  target_include_directories(TestBPWriteReadAttributes PRIVATE ${MPI_C_INCLUDE_PATH})
+  target_link_libraries(TestBPWriteReadAttributes ${MPI_C_LIBRARIES})
+
+  target_include_directories(TestBPWriteReadstdio PRIVATE ${MPI_C_INCLUDE_PATH})
+  target_link_libraries(TestBPWriteReadstdio ${MPI_C_LIBRARIES})
+
+  target_include_directories(TestBPWriteReadfstream PRIVATE ${MPI_C_INCLUDE_PATH})
+  target_link_libraries(TestBPWriteReadfstream ${MPI_C_LIBRARIES})
+
+  target_include_directories(TestBPWriteProfilingJSON PRIVATE ${MPI_C_INCLUDE_PATH})
+  target_link_libraries(TestBPWriteProfilingJSON ${MPI_C_LIBRARIES})
+
+  set(extra_test_args
+    EXEC_WRAPPER
+      ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS}
   )
-  
-  gtest_add_tests(TARGET TestBPWriteProfilingJSON)
 endif()
+
+gtest_add_tests(TARGET TestBPWriteRead ${extra_test_args})
+gtest_add_tests(TARGET TestBPWriteReadAttributes ${extra_test_args})
+gtest_add_tests(TARGET TestBPWriteReadstdio ${extra_test_args})
+gtest_add_tests(TARGET TestBPWriteReadfstream ${extra_test_args})
+gtest_add_tests(TARGET TestBPWriteProfilingJSON ${extra_test_args})
diff --git a/testing/adios2/engine/bp/TestBPWriteProfilingJSON.cpp b/testing/adios2/engine/bp/TestBPWriteProfilingJSON.cpp
index c032ab98d40d8b1b24cac35e07ab6e9a9ab55329..5fbf1723c1d83603dd9b365ef7b97afb6133d782 100644
--- a/testing/adios2/engine/bp/TestBPWriteProfilingJSON.cpp
+++ b/testing/adios2/engine/bp/TestBPWriteProfilingJSON.cpp
@@ -38,49 +38,75 @@ public:
 //******************************************************************************
 
 // ADIOS2 write, native ADIOS1 read
-TEST_F(BPWriteProfilingJSONTest, ADIOS2BPWriteProfilingJSON)
+TEST_F(BPWriteProfilingJSONTest, DISABLED_ADIOS2BPWriteProfilingJSON)
 {
-    std::string fname = "ADIOS2BPWriteProfilingJSON.bp";
+    // Use a relative path + file name to test path in file name capability
+    std::string fname;
+    fname = "foo/ADIOS2BPWriteProfilingJSON.bp";
+
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 8;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
 
     // Write test data and profiling.json using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 1D variables (NumOfProcesses * Nx)
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{8});
+            adios2::Dims shape{static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(Nx * mpiRank)};
+            adios2::Dims count{static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
         io.SetParameters({{"Threads", "2"}});
-        io.AddTransport("File", {{"Library", "POSIX"}});
+        io.AddTransport("file", {{"Library", "POSIX"}});
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -93,22 +119,37 @@ TEST_F(BPWriteProfilingJSONTest, ADIOS2BPWriteProfilingJSON)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 1D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel({mpiRank * Nx}, {Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
         }
-
         // Close the file
         engine->Close();
     }
@@ -116,25 +157,43 @@ TEST_F(BPWriteProfilingJSONTest, ADIOS2BPWriteProfilingJSON)
     // open json file, parse it to a json structure, and verify a few things
     {
         std::ifstream profilingJSONFile(fname + ".dir/profiling.json");
-        std::stringstream buffer;
-        buffer << profilingJSONFile.rdbuf();
-
-        const json profilingJSON = json::parse(buffer);
+        const json profilingJSON = json::parse(profilingJSONFile);
 
         // check rank is zero
-        const int rank = profilingJSON[0].value("rank", -1);
-        ASSERT_EQ(rank, 0);
+        const int rank = profilingJSON[mpiRank].value("rank", -1);
+        ASSERT_EQ(rank, mpiRank);
 
         // check threads
-        const int threads = profilingJSON[0].value("threads", 0);
+        const int threads = profilingJSON[mpiRank].value("threads", 0);
         ASSERT_EQ(threads, 2);
 
         // check bytes
-        const unsigned long int bytes = profilingJSON[0].value("bytes", 0UL);
+        const unsigned long int bytes =
+            profilingJSON[mpiRank].value("bytes", 0UL);
         ASSERT_EQ(bytes, 6536);
 
         const auto transportType =
-            profilingJSON[0]["transport_0"].value("type", "0");
+            profilingJSON[mpiRank]["transport_0"].value("type", "0");
         ASSERT_EQ(transportType, "File_POSIX");
     }
 }
+
+//******************************************************************************
+// main
+//******************************************************************************
+
+int main(int argc, char **argv)
+{
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Init(nullptr, nullptr);
+#endif
+
+    ::testing::InitGoogleTest(&argc, argv);
+    int result = RUN_ALL_TESTS();
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Finalize();
+#endif
+
+    return result;
+}
diff --git a/testing/adios2/engine/bp/TestBPWriteRead.cpp b/testing/adios2/engine/bp/TestBPWriteRead.cpp
index 57eeafa21fcab4ad38a1264e25b3266ffc7c3822..ed5ad92aaf1cea03057ccf7bc8ddd2691e69b0df 100644
--- a/testing/adios2/engine/bp/TestBPWriteRead.cpp
+++ b/testing/adios2/engine/bp/TestBPWriteRead.cpp
@@ -27,49 +27,82 @@ public:
 // 1D 1x8 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8)
 {
+    // Each process would write a 1x8 array and all processes would
+    // form a mpiSize * Nx 1D array
     std::string fname = "ADIOS2BPWriteADIOS1Read1D8.bp";
 
-    // Write test data using ADIOS2
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 8;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
+    // Write test data using BP
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 1D variables (NumOfProcesses * Nx)
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{8});
+            adios2::Dims shape{static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(Nx * mpiRank)};
+            adios2::Dims count{static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
-        io.AddTransport("File");
 
+        io.AddTransport("file");
+
+        // QUESTION: It seems that BPFilterWriter cannot overwrite existing
+        // files
+        // Ex. if you tune Nx and NSteps, the test would fail. But if you clear
+        // the cache in
+        // ${adios2Build}/testing/adios2/engine/bp/ADIOS2BPWriteADIOS1Read1D8.bp.dir,
+        // then it works
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -82,17 +115,33 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 1D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel({mpiRank * Nx}, {Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -102,83 +151,105 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 1);
-        ASSERT_EQ(var_i8->dims[0], 8);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 1);
-        ASSERT_EQ(var_i16->dims[0], 8);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 1);
-        ASSERT_EQ(var_i32->dims[0], 8);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 1);
-        ASSERT_EQ(var_i64->dims[0], 8);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 1);
-        ASSERT_EQ(var_u8->dims[0], 8);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 1);
-        ASSERT_EQ(var_u16->dims[0], 8);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 1);
-        ASSERT_EQ(var_u32->dims[0], 8);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 1);
-        ASSERT_EQ(var_u64->dims[0], 8);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 1);
-        ASSERT_EQ(var_r32->dims[0], 8);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 1);
-        ASSERT_EQ(var_r64->dims[0], 8);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[1] = {0};
-        uint64_t count[1] = {8};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], mpiSize * Nx);
+
+        std::array<int8_t, Nx> I8;
+        std::array<int16_t, Nx> I16;
+        std::array<int32_t, Nx> I32;
+        std::array<int64_t, Nx> I64;
+        std::array<uint8_t, Nx> U8;
+        std::array<uint16_t, Nx> U16;
+        std::array<uint32_t, Nx> U32;
+        std::array<uint64_t, Nx> U64;
+        std::array<float, Nx> R32;
+        std::array<double, Nx> R64;
+
+        uint64_t start[1] = {mpiRank * Nx};
+        uint64_t count[1] = {Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(1, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -193,22 +264,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -228,6 +299,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -243,38 +316,66 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead1D8)
 // 2D 2x4 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
 {
+    // Each process would write a 2x4 array and all processes would
+    // form a 2D 2 * (numberOfProcess*Nx) matrix where Nx is 4 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D2x4Test.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 4;
+
+    // Number of rows
+    const std::size_t Ny = 2;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (Ny * (NumOfProcesses * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{2, 4});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
@@ -284,8 +385,12 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -298,17 +403,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -318,93 +440,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 2);
-        ASSERT_EQ(var_i8->dims[1], 4);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 2);
-        ASSERT_EQ(var_i16->dims[1], 4);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 2);
-        ASSERT_EQ(var_i32->dims[1], 4);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 2);
-        ASSERT_EQ(var_i64->dims[1], 4);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 2);
-        ASSERT_EQ(var_u8->dims[1], 4);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 2);
-        ASSERT_EQ(var_u16->dims[1], 4);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 2);
-        ASSERT_EQ(var_u32->dims[1], 4);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 2);
-        ASSERT_EQ(var_u64->dims[1], 4);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 2);
-        ASSERT_EQ(var_r32->dims[1], 4);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 2);
-        ASSERT_EQ(var_r64->dims[1], 4);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {2, 4};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -419,22 +566,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -454,6 +601,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -472,46 +621,78 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead2D2x4)
 // ADIOS2 write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2)
 {
+    // Each process would write a 4x2 array and all processes would
+    // form a 2D 4 * (NumberOfProcess * Nx) matrix where Nx is 2 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D4x2Test.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 2;
+    // Number of cols
+    const std::size_t Ny = 4;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (4 * (NumberOfProcess * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{4, 2});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(mpiSize * Nx)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
+
         io.AddTransport("file");
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -524,17 +705,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -544,93 +742,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 4);
-        ASSERT_EQ(var_i8->dims[1], 2);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 4);
-        ASSERT_EQ(var_i16->dims[1], 2);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 4);
-        ASSERT_EQ(var_i32->dims[1], 2);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 4);
-        ASSERT_EQ(var_i64->dims[1], 2);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 4);
-        ASSERT_EQ(var_u8->dims[1], 2);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 4);
-        ASSERT_EQ(var_u16->dims[1], 2);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 4);
-        ASSERT_EQ(var_u32->dims[1], 2);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 4);
-        ASSERT_EQ(var_u64->dims[1], 2);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 4);
-        ASSERT_EQ(var_r32->dims[1], 2);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 4);
-        ASSERT_EQ(var_r64->dims[1], 2);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {4, 2};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -645,22 +868,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -680,6 +903,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
diff --git a/testing/adios2/engine/bp/TestBPWriteReadAttributes.cpp b/testing/adios2/engine/bp/TestBPWriteReadAttributes.cpp
index 8e8ce0704bdad98bdfdb40c11e966cc1ea292ccf..b963ad7a3d0d67ec871419a08f1b361a8573c8f2 100644
--- a/testing/adios2/engine/bp/TestBPWriteReadAttributes.cpp
+++ b/testing/adios2/engine/bp/TestBPWriteReadAttributes.cpp
@@ -30,33 +30,76 @@ public:
 // ADIOS2 write, native ADIOS1 read for single value attributes
 TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadSingleTypes)
 {
-    std::string fname = "ADIOS2BPWriteAttributeADIOS1ReadSingleTypes.bp";
+    std::string fname = "foo/ADIOS2BPWriteAttributeADIOS1ReadSingleTypes.bp";
+    std::string fRootName = "ADIOS2BPWriteAttributeADIOS1ReadSingleTypes.bp";
 
-    // Write test data using ADIOS2
+    int mpiRank = 0, mpiSize = 1;
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
+    // FIXME: Since collective meta generation has not landed yet, so there is
+    // no way for us to gather different attribute data per process. Ideally we
+    // should use unique attribute data per process. Ex:
+    // std::to_string(mpiRank);
+    std::string mpiRankString = std::to_string(0);
+    std::string s1_Single = std::string("s1_Single_") + mpiRankString;
+    std::string i8_Single = std::string("i8_Single_") + mpiRankString;
+    std::string i16_Single = std::string("i16_Single_") + mpiRankString;
+    std::string i32_Single = std::string("i32_Single_") + mpiRankString;
+    std::string i64_Single = std::string("i64_Single_") + mpiRankString;
+    std::string u8_Single = std::string("u8_Single_") + mpiRankString;
+    std::string u16_Single = std::string("u16_Single_") + mpiRankString;
+    std::string u32_Single = std::string("u32_Single_") + mpiRankString;
+    std::string u64_Single = std::string("u64_Single_") + mpiRankString;
+    std::string float_Single = std::string("float_Single_") + mpiRankString;
+    std::string double_Single = std::string("double_Single_") + mpiRankString;
+
+    // When collective meta generation has landed, use
+    // generateNewSmallTestData(m_TestData, 0, 0, mpiSize);
+    // Generate current testing data
+    SmallTestData currentTestData =
+        generateNewSmallTestData(m_TestData, 0, 0, 0);
+
+    // Write test data using BP
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
         // Declare Single Value Attributes
         {
-            io.DefineAttribute<std::string>("s1_Single", m_TestData.S1);
-            io.DefineAttribute<int8_t>("i8_Single", m_TestData.I8.front());
-            io.DefineAttribute<int16_t>("i16_Single", m_TestData.I16.front());
-            io.DefineAttribute<int32_t>("i32_Single", m_TestData.I32.front());
-            io.DefineAttribute<int64_t>("i64_Single", m_TestData.I64.front());
-
-            io.DefineAttribute<uint8_t>("u8_Single", m_TestData.U8.front());
-            io.DefineAttribute<uint16_t>("u16_Single", m_TestData.U16.front());
-            io.DefineAttribute<uint32_t>("u32_Single", m_TestData.U32.front());
-            io.DefineAttribute<uint64_t>("u64_Single", m_TestData.U64.front());
-
-            io.DefineAttribute<float>("float_Single", m_TestData.R32.front());
-            io.DefineAttribute<double>("double_Single", m_TestData.R64.front());
+            io.DefineAttribute<std::string>(s1_Single, currentTestData.S1);
+            io.DefineAttribute<int8_t>(i8_Single, currentTestData.I8.front());
+            io.DefineAttribute<int16_t>(i16_Single,
+                                        currentTestData.I16.front());
+            io.DefineAttribute<int32_t>(i32_Single,
+                                        currentTestData.I32.front());
+            io.DefineAttribute<int64_t>(i64_Single,
+                                        currentTestData.I64.front());
+
+            io.DefineAttribute<uint8_t>(u8_Single, currentTestData.U8.front());
+            io.DefineAttribute<uint16_t>(u16_Single,
+                                         currentTestData.U16.front());
+            io.DefineAttribute<uint32_t>(u32_Single,
+                                         currentTestData.U32.front());
+            io.DefineAttribute<uint64_t>(u64_Single,
+                                         currentTestData.U64.front());
+
+            io.DefineAttribute<float>(float_Single,
+                                      currentTestData.R32.front());
+            io.DefineAttribute<double>(double_Single,
+                                       currentTestData.R64.front());
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
-        io.AddTransport("File");
+
+        io.AddTransport("file");
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
@@ -65,93 +108,97 @@ TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadSingleTypes)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fRootName + "." + mpiRankString).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         int size, status;
         enum ADIOS_DATATYPES type;
         void *data = nullptr;
 
-        status = adios_get_attr(f, "s1_Single", &type, &size, &data);
+        // status = adios_get_attr(f, "s1_Single_0", &type, &size, &data);
+        status = adios_get_attr(f, s1_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_string);
         std::string singleStringAttribute(reinterpret_cast<char *>(data), size);
-        ASSERT_STREQ(singleStringAttribute.c_str(), m_TestData.S1.c_str());
+        ASSERT_STREQ(singleStringAttribute.c_str(), currentTestData.S1.c_str());
 
-        status = adios_get_attr(f, "i8_Single", &type, &size, &data);
+        status = adios_get_attr(f, i8_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_byte);
-        ASSERT_EQ(*reinterpret_cast<int8_t *>(data), m_TestData.I8.front());
+        ASSERT_EQ(*reinterpret_cast<int8_t *>(data),
+                  currentTestData.I8.front());
 
-        status = adios_get_attr(f, "i16_Single", &type, &size, &data);
+        status = adios_get_attr(f, i16_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_short);
-        ASSERT_EQ(*reinterpret_cast<int16_t *>(data), m_TestData.I16.front());
+        ASSERT_EQ(*reinterpret_cast<int16_t *>(data),
+                  currentTestData.I16.front());
 
-        status = adios_get_attr(f, "i32_Single", &type, &size, &data);
+        status = adios_get_attr(f, i32_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_integer);
-        ASSERT_EQ(*reinterpret_cast<int32_t *>(data), m_TestData.I32.front());
+        ASSERT_EQ(*reinterpret_cast<int32_t *>(data),
+                  currentTestData.I32.front());
 
-        status = adios_get_attr(f, "i64_Single", &type, &size, &data);
+        status = adios_get_attr(f, i64_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_long);
-        ASSERT_EQ(*reinterpret_cast<int64_t *>(data), m_TestData.I64.front());
+        ASSERT_EQ(*reinterpret_cast<int64_t *>(data),
+                  currentTestData.I64.front());
 
-        status = adios_get_attr(f, "u8_Single", &type, &size, &data);
+        status = adios_get_attr(f, u8_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_unsigned_byte);
-        ASSERT_EQ(*reinterpret_cast<uint8_t *>(data), m_TestData.U8.front());
+        ASSERT_EQ(*reinterpret_cast<uint8_t *>(data),
+                  currentTestData.U8.front());
 
-        status = adios_get_attr(f, "u16_Single", &type, &size, &data);
+        status = adios_get_attr(f, u16_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_unsigned_short);
-        ASSERT_EQ(*reinterpret_cast<uint16_t *>(data), m_TestData.U16.front());
+        ASSERT_EQ(*reinterpret_cast<uint16_t *>(data),
+                  currentTestData.U16.front());
 
-        status = adios_get_attr(f, "u32_Single", &type, &size, &data);
+        status = adios_get_attr(f, u32_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_unsigned_integer);
-        ASSERT_EQ(*reinterpret_cast<uint32_t *>(data), m_TestData.U32.front());
+        ASSERT_EQ(*reinterpret_cast<uint32_t *>(data),
+                  currentTestData.U32.front());
 
-        status = adios_get_attr(f, "u64_Single", &type, &size, &data);
+        status = adios_get_attr(f, u64_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_unsigned_long);
-        ASSERT_EQ(*reinterpret_cast<uint64_t *>(data), m_TestData.U64.front());
+        ASSERT_EQ(*reinterpret_cast<uint64_t *>(data),
+                  currentTestData.U64.front());
 
-        status = adios_get_attr(f, "float_Single", &type, &size, &data);
+        status = adios_get_attr(f, float_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_real);
-        ASSERT_EQ(*reinterpret_cast<float *>(data), m_TestData.R32.front());
+        ASSERT_EQ(*reinterpret_cast<float *>(data),
+                  currentTestData.R32.front());
 
-        status = adios_get_attr(f, "double_Single", &type, &size, &data);
+        status = adios_get_attr(f, double_Single.c_str(), &type, &size, &data);
         ASSERT_EQ(status, 0);
         ASSERT_NE(data, nullptr);
         ASSERT_EQ(type, adios_double);
-        ASSERT_EQ(*reinterpret_cast<double *>(data), m_TestData.R64.front());
+        ASSERT_EQ(*reinterpret_cast<double *>(data),
+                  currentTestData.R64.front());
 
         // Cleanup file
         adios_read_close(f);
@@ -161,7 +208,8 @@ TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadSingleTypes)
 // ADIOS2 write, native ADIOS1 read for array attributes
 TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadArrayTypes)
 {
-    std::string fname = "ADIOS2BPWriteAttributeADIOS1ReadArrayTypes.bp";
+    std::string fname = "foo/bar/ADIOS2BPWriteAttributeADIOS1ReadArrayTypes.bp";
+    std::string fRootName = "ADIOS2BPWriteAttributeADIOS1ReadArrayTypes.bp";
 
     // Write test data using ADIOS2
     {
@@ -198,7 +246,8 @@ TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadArrayTypes)
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
-        io.AddTransport("File");
+
+        io.AddTransport("file");
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
@@ -207,20 +256,13 @@ TEST_F(BPWriteReadAttributeTest, ADIOS2BPWriteADIOS1ReadArrayTypes)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
         adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
                                "verbose=3");
 
         // Open the file for reading
         ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
+            adios_read_open_file((fname + ".dir/" + fRootName + ".0").c_str(),
                                  ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
         ASSERT_NE(f, nullptr);
 
diff --git a/testing/adios2/engine/bp/TestBPWriteReadfstream.cpp b/testing/adios2/engine/bp/TestBPWriteReadfstream.cpp
index 33164aae4921b79d6afc68e92e20a9a7fa1c6e74..faaaf1fd9f0bd9555e02ab41831e4ca29b6b3cb9 100644
--- a/testing/adios2/engine/bp/TestBPWriteReadfstream.cpp
+++ b/testing/adios2/engine/bp/TestBPWriteReadfstream.cpp
@@ -27,49 +27,81 @@ public:
 // 1D 1x8 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8fstream)
 {
+    // Each process would write a 1x8 array and all processes would
+    // form a mpiSize * Nx 1D array
     std::string fname = "ADIOS2BPWriteADIOS1Read1D8fstream.bp";
 
-    // Write test data using ADIOS2
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 8;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
+    // Write test data using BP
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 1D variables (NumOfProcesses * Nx)
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{8});
+            adios2::Dims shape{static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(Nx * mpiRank)};
+            adios2::Dims count{static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
-        io.AddTransport("File", {{"Library", "fstream"}});
+        io.AddTransport("file", {{"Library", "fstream"}});
 
+        // QUESTION: It seems that BPFilterWriter cannot overwrite existing
+        // files
+        // Ex. if you tune Nx and NSteps, the test would fail. But if you clear
+        // the cache in
+        // ${adios2Build}/testing/adios2/engine/bp/ADIOS2BPWriteADIOS1Read1D8.bp.dir,
+        // then it works
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -82,17 +114,33 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8fstream)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 1D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel({mpiRank * Nx}, {Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -102,83 +150,105 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8fstream)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 1);
-        ASSERT_EQ(var_i8->dims[0], 8);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 1);
-        ASSERT_EQ(var_i16->dims[0], 8);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 1);
-        ASSERT_EQ(var_i32->dims[0], 8);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 1);
-        ASSERT_EQ(var_i64->dims[0], 8);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 1);
-        ASSERT_EQ(var_u8->dims[0], 8);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 1);
-        ASSERT_EQ(var_u16->dims[0], 8);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 1);
-        ASSERT_EQ(var_u32->dims[0], 8);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 1);
-        ASSERT_EQ(var_u64->dims[0], 8);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 1);
-        ASSERT_EQ(var_r32->dims[0], 8);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 1);
-        ASSERT_EQ(var_r64->dims[0], 8);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[1] = {0};
-        uint64_t count[1] = {8};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], mpiSize * Nx);
+
+        std::array<int8_t, Nx> I8;
+        std::array<int16_t, Nx> I16;
+        std::array<int32_t, Nx> I32;
+        std::array<int64_t, Nx> I64;
+        std::array<uint8_t, Nx> U8;
+        std::array<uint16_t, Nx> U16;
+        std::array<uint32_t, Nx> U32;
+        std::array<uint64_t, Nx> U64;
+        std::array<float, Nx> R32;
+        std::array<double, Nx> R64;
+
+        uint64_t start[1] = {mpiRank * Nx};
+        uint64_t count[1] = {Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(1, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -193,22 +263,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8fstream)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -228,6 +298,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8fstream)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -243,38 +315,66 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead1D8fstream)
 // 2D 2x4 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
 {
+    // Each process would write a 2x4 array and all processes would
+    // form a 2D 2 * (numberOfProcess*Nx) matrix where Nx is 4 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D2x4Testfstream.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 4;
+
+    // Number of rows
+    const std::size_t Ny = 2;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (Ny * (NumOfProcesses * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{2, 4});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
@@ -284,8 +384,12 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -298,17 +402,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -318,93 +439,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 2);
-        ASSERT_EQ(var_i8->dims[1], 4);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 2);
-        ASSERT_EQ(var_i16->dims[1], 4);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 2);
-        ASSERT_EQ(var_i32->dims[1], 4);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 2);
-        ASSERT_EQ(var_i64->dims[1], 4);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 2);
-        ASSERT_EQ(var_u8->dims[1], 4);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 2);
-        ASSERT_EQ(var_u16->dims[1], 4);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 2);
-        ASSERT_EQ(var_u32->dims[1], 4);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 2);
-        ASSERT_EQ(var_u64->dims[1], 4);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 2);
-        ASSERT_EQ(var_r32->dims[1], 4);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 2);
-        ASSERT_EQ(var_r64->dims[1], 4);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {2, 4};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -419,22 +565,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -454,6 +600,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4fstream)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -472,35 +620,62 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead2D2x4fstream)
 // ADIOS2 write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
 {
+    // Each process would write a 4x2 array and all processes would
+    // form a 2D 4 * (NumberOfProcess * Nx) matrix where Nx is 2 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D4x2Testfstream.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 2;
+    // Number of cols
+    const std::size_t Ny = 4;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (4 * (NumberOfProcess * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{4, 2});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(mpiSize * Nx)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
@@ -510,8 +685,12 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -524,17 +703,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -544,93 +740,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 4);
-        ASSERT_EQ(var_i8->dims[1], 2);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 4);
-        ASSERT_EQ(var_i16->dims[1], 2);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 4);
-        ASSERT_EQ(var_i32->dims[1], 2);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 4);
-        ASSERT_EQ(var_i64->dims[1], 2);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 4);
-        ASSERT_EQ(var_u8->dims[1], 2);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 4);
-        ASSERT_EQ(var_u16->dims[1], 2);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 4);
-        ASSERT_EQ(var_u32->dims[1], 2);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 4);
-        ASSERT_EQ(var_u64->dims[1], 2);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 4);
-        ASSERT_EQ(var_r32->dims[1], 2);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 4);
-        ASSERT_EQ(var_r64->dims[1], 2);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {4, 2};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -645,22 +866,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -680,6 +901,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2fstream)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
diff --git a/testing/adios2/engine/bp/TestBPWriteReadstdio.cpp b/testing/adios2/engine/bp/TestBPWriteReadstdio.cpp
index b55d6f4290ad7d96580947d5bbe49a5378cf3e61..a47653c7f1931804ce49bc9bfdac4557d9678b7c 100644
--- a/testing/adios2/engine/bp/TestBPWriteReadstdio.cpp
+++ b/testing/adios2/engine/bp/TestBPWriteReadstdio.cpp
@@ -27,49 +27,85 @@ public:
 // 1D 1x8 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8stdio)
 {
+    // Each process would write a 1x8 array and all processes would
+    // form a mpiSize * Nx 1D array
     std::string fname = "ADIOS2BPWriteADIOS1Read1D8stdio.bp";
 
-    // Write test data using ADIOS2
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 8;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
+    // Write test data using BP
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 1D variables (NumOfProcesses * Nx)
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{8});
+            adios2::Dims shape{static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(Nx * mpiRank)};
+            adios2::Dims count{static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{8});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{8});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
-        io.AddTransport("File", {{"Library", "stdio"}});
 
+#ifdef ADIOS2_HAVE_MPI
+        io.AddTransport("file", {{"Library", "stdio"}});
+#else
+        io.AddTransport("file");
+#endif
+        // QUESTION: It seems that BPFilterWriter cannot overwrite existing
+        // files
+        // Ex. if you tune Nx and NSteps, the test would fail. But if you clear
+        // the cache in
+        // ${adios2Build}/testing/adios2/engine/bp/ADIOS2BPWriteADIOS1Read1D8.bp.dir,
+        // then it works
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -82,17 +118,33 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8stdio)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 1D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel({mpiRank * Nx}, {Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -102,83 +154,105 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8stdio)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 1);
-        ASSERT_EQ(var_i8->dims[0], 8);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 1);
-        ASSERT_EQ(var_i16->dims[0], 8);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 1);
-        ASSERT_EQ(var_i32->dims[0], 8);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 1);
-        ASSERT_EQ(var_i64->dims[0], 8);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 1);
-        ASSERT_EQ(var_u8->dims[0], 8);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 1);
-        ASSERT_EQ(var_u16->dims[0], 8);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 1);
-        ASSERT_EQ(var_u32->dims[0], 8);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 1);
-        ASSERT_EQ(var_u64->dims[0], 8);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 1);
-        ASSERT_EQ(var_r32->dims[0], 8);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 1);
-        ASSERT_EQ(var_r64->dims[0], 8);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[1] = {0};
-        uint64_t count[1] = {8};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], mpiSize * Nx);
+
+        std::array<int8_t, Nx> I8;
+        std::array<int16_t, Nx> I16;
+        std::array<int32_t, Nx> I32;
+        std::array<int64_t, Nx> I64;
+        std::array<uint8_t, Nx> U8;
+        std::array<uint16_t, Nx> U16;
+        std::array<uint32_t, Nx> U32;
+        std::array<uint64_t, Nx> U64;
+        std::array<float, Nx> R32;
+        std::array<double, Nx> R64;
+
+        uint64_t start[1] = {mpiRank * Nx};
+        uint64_t count[1] = {Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(1, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -193,22 +267,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8stdio)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -228,6 +302,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read1D8stdio)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -243,49 +319,85 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead1D8stdio)
 // 2D 2x4 test data
 //******************************************************************************
 
-// ADIOS2 write, native ADIOS1 read
+// ADIOS2 BP write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4stdio)
 {
+    // Each process would write a 2x4 array and all processes would
+    // form a 2D 2 * (numberOfProcess*Nx) matrix where Nx is 4 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D2x4Teststdio.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 4;
+
+    // Number of rows
+    const std::size_t Ny = 2;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (Ny * (NumOfProcesses * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{2, 4});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx * mpiSize)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{2, 4});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
+#ifdef ADIOS2_HAVE_MPI
         io.AddTransport("file", {{"Library", "stdio"}});
+#else
+        io.AddTransport("file");
+#endif
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -298,17 +410,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4stdio)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -318,93 +447,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4stdio)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 2);
-        ASSERT_EQ(var_i8->dims[1], 4);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 2);
-        ASSERT_EQ(var_i16->dims[1], 4);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 2);
-        ASSERT_EQ(var_i32->dims[1], 4);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 2);
-        ASSERT_EQ(var_i64->dims[1], 4);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 2);
-        ASSERT_EQ(var_u8->dims[1], 4);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 2);
-        ASSERT_EQ(var_u16->dims[1], 4);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 2);
-        ASSERT_EQ(var_u32->dims[1], 4);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 2);
-        ASSERT_EQ(var_u64->dims[1], 4);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 2);
-        ASSERT_EQ(var_r32->dims[1], 4);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 2);
-        ASSERT_EQ(var_r64->dims[1], 4);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {2, 4};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -419,22 +573,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4stdio)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -454,6 +608,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D2x4stdio)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }
 
@@ -472,46 +628,82 @@ TEST_F(BPWriteReadTest, DISABLED_ADIOS2BPWriteADIOS2BPRead2D2x4stdio)
 // ADIOS2 write, native ADIOS1 read
 TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2stdio)
 {
+    // Each process would write a 4x2 array and all processes would
+    // form a 2D 4 * (NumberOfProcess * Nx) matrix where Nx is 2 here
     std::string fname = "ADIOS2BPWriteADIOS1Read2D4x2Teststdio.bp";
 
+    int mpiRank = 0, mpiSize = 1;
+    // Number of rows
+    const std::size_t Nx = 2;
+    // Number of cols
+    const std::size_t Ny = 4;
+
+    // Number of steps
+    const std::size_t NSteps = 3;
+
+#ifdef ADIOS2_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+#endif
+
     // Write test data using ADIOS2
     {
+#ifdef ADIOS2_HAVE_MPI
+        adios2::ADIOS adios(MPI_COMM_WORLD, adios2::DebugON);
+#else
         adios2::ADIOS adios(true);
+#endif
         adios2::IO &io = adios.DeclareIO("TestIO");
 
-        // Declare 1D variables
+        // Declare 2D variables (4 * (NumberOfProcess * Nx))
+        // The local process' part (start, count) can be defined now or later
+        // before Write().
         {
-            auto &var_i8 =
-                io.DefineVariable<int8_t>("i8", {}, {}, adios2::Dims{4, 2});
+            adios2::Dims shape{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(mpiSize * Nx)};
+            adios2::Dims start{static_cast<unsigned int>(0),
+                               static_cast<unsigned int>(mpiRank * Nx)};
+            adios2::Dims count{static_cast<unsigned int>(Ny),
+                               static_cast<unsigned int>(Nx)};
+            auto &var_i8 = io.DefineVariable<int8_t>("i8", shape, start, count);
             auto &var_i16 =
-                io.DefineVariable<int16_t>("i16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int16_t>("i16", shape, start, count);
             auto &var_i32 =
-                io.DefineVariable<int32_t>("i32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int32_t>("i32", shape, start, count);
             auto &var_i64 =
-                io.DefineVariable<int64_t>("i64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<int64_t>("i64", shape, start, count);
             auto &var_u8 =
-                io.DefineVariable<uint8_t>("u8", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint8_t>("u8", shape, start, count);
             auto &var_u16 =
-                io.DefineVariable<uint16_t>("u16", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint16_t>("u16", shape, start, count);
             auto &var_u32 =
-                io.DefineVariable<uint32_t>("u32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint32_t>("u32", shape, start, count);
             auto &var_u64 =
-                io.DefineVariable<uint64_t>("u64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<uint64_t>("u64", shape, start, count);
             auto &var_r32 =
-                io.DefineVariable<float>("r32", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<float>("r32", shape, start, count);
             auto &var_r64 =
-                io.DefineVariable<double>("r64", {}, {}, adios2::Dims{4, 2});
+                io.DefineVariable<double>("r64", shape, start, count);
         }
 
         // Create the BP Engine
         io.SetEngine("BPFileWriter");
+
+#ifdef ADIOS2_HAVE_MPI
         io.AddTransport("file", {{"Library", "stdio"}});
+#else
+        io.AddTransport("file");
+#endif
 
         auto engine = io.Open(fname, adios2::OpenMode::Write);
         ASSERT_NE(engine.get(), nullptr);
 
-        for (size_t step = 0; step < 3; ++step)
+        for (size_t step = 0; step < NSteps; ++step)
         {
+            // Generate test data for each process uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, step, mpiRank, mpiSize);
+
             // Retrieve the variables that previously went out of scope
             auto &var_i8 = io.GetVariable<int8_t>("i8");
             auto &var_i16 = io.GetVariable<int16_t>("i16");
@@ -524,17 +716,34 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2stdio)
             auto &var_r32 = io.GetVariable<float>("r32");
             auto &var_r64 = io.GetVariable<double>("r64");
 
+            // Make a 2D selection to describe the local dimensions of the
+            // variable we write and its offsets in the global spaces
+            adios2::SelectionBoundingBox sel(
+                {0, static_cast<unsigned int>(mpiRank * Nx)}, {Ny, Nx});
+            var_i8.SetSelection(sel);
+            var_i16.SetSelection(sel);
+            var_i32.SetSelection(sel);
+            var_i64.SetSelection(sel);
+            var_u8.SetSelection(sel);
+            var_u16.SetSelection(sel);
+            var_u32.SetSelection(sel);
+            var_u64.SetSelection(sel);
+            var_r32.SetSelection(sel);
+            var_r64.SetSelection(sel);
+
             // Write each one
-            engine->Write(var_i8, m_TestData.I8.data() + step);
-            engine->Write(var_i16, m_TestData.I16.data() + step);
-            engine->Write(var_i32, m_TestData.I32.data() + step);
-            engine->Write(var_i64, m_TestData.I64.data() + step);
-            engine->Write(var_u8, m_TestData.U8.data() + step);
-            engine->Write(var_u16, m_TestData.U16.data() + step);
-            engine->Write(var_u32, m_TestData.U32.data() + step);
-            engine->Write(var_u64, m_TestData.U64.data() + step);
-            engine->Write(var_r32, m_TestData.R32.data() + step);
-            engine->Write(var_r64, m_TestData.R64.data() + step);
+            // fill in the variable with values from starting index to
+            // starting index + count
+            engine->Write(var_i8, currentTestData.I8.data());
+            engine->Write(var_i16, currentTestData.I16.data());
+            engine->Write(var_i32, currentTestData.I32.data());
+            engine->Write(var_i64, currentTestData.I64.data());
+            engine->Write(var_u8, currentTestData.U8.data());
+            engine->Write(var_u16, currentTestData.U16.data());
+            engine->Write(var_u32, currentTestData.U32.data());
+            engine->Write(var_u64, currentTestData.U64.data());
+            engine->Write(var_r32, currentTestData.R32.data());
+            engine->Write(var_r64, currentTestData.R64.data());
 
             // Advance to the next time step
             engine->Advance();
@@ -544,93 +753,118 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2stdio)
         engine->Close();
     }
 
-// Read test data using ADIOS1
-#ifdef ADIOS2_HAVE_MPI
-    // Read everything from rank 0
-    int rank;
-    MPI_Comm_rank();
-    if (rank == 0)
-#endif
     {
-        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_WORLD,
+        adios_read_init_method(ADIOS_READ_METHOD_BP, MPI_COMM_SELF,
                                "verbose=3");
 
         // Open the file for reading
-        ADIOS_FILE *f =
-            adios_read_open_file((fname + ".dir/" + fname + ".0").c_str(),
-                                 ADIOS_READ_METHOD_BP, MPI_COMM_WORLD);
+        // Note: Since collective metadata generation is not implemented yet,
+        // SO for now we read each subfile instead of a single bp file with all
+        // metadata.
+        // Meanwhile if we open file with MPI_COMM_WORLD, then the selection
+        // bounding box should be [0, Nx]
+        std::string index = std::to_string(mpiRank);
+        ADIOS_FILE *f = adios_read_open_file(
+            (fname + ".dir/" + fname + "." + index).c_str(),
+            ADIOS_READ_METHOD_BP, MPI_COMM_SELF);
         ASSERT_NE(f, nullptr);
 
         // Check the variables exist
         ADIOS_VARINFO *var_i8 = adios_inq_var(f, "i8");
         ASSERT_NE(var_i8, nullptr);
         ASSERT_EQ(var_i8->ndim, 2);
-        ASSERT_EQ(var_i8->dims[0], 4);
-        ASSERT_EQ(var_i8->dims[1], 2);
+        ASSERT_EQ(var_i8->global, 1);
+        ASSERT_EQ(var_i8->nsteps, NSteps);
+        ASSERT_EQ(var_i8->dims[0], Ny);
+        ASSERT_EQ(var_i8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i16 = adios_inq_var(f, "i16");
         ASSERT_NE(var_i16, nullptr);
         ASSERT_EQ(var_i16->ndim, 2);
-        ASSERT_EQ(var_i16->dims[0], 4);
-        ASSERT_EQ(var_i16->dims[1], 2);
+        ASSERT_EQ(var_i16->global, 1);
+        ASSERT_EQ(var_i16->nsteps, NSteps);
+        ASSERT_EQ(var_i16->dims[0], Ny);
+        ASSERT_EQ(var_i16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i32 = adios_inq_var(f, "i32");
         ASSERT_NE(var_i32, nullptr);
         ASSERT_EQ(var_i32->ndim, 2);
-        ASSERT_EQ(var_i32->dims[0], 4);
-        ASSERT_EQ(var_i32->dims[1], 2);
+        ASSERT_EQ(var_i32->global, 1);
+        ASSERT_EQ(var_i32->nsteps, NSteps);
+        ASSERT_EQ(var_i32->dims[0], Ny);
+        ASSERT_EQ(var_i32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_i64 = adios_inq_var(f, "i64");
         ASSERT_NE(var_i64, nullptr);
         ASSERT_EQ(var_i64->ndim, 2);
-        ASSERT_EQ(var_i64->dims[0], 4);
-        ASSERT_EQ(var_i64->dims[1], 2);
+        ASSERT_EQ(var_i64->global, 1);
+        ASSERT_EQ(var_i64->nsteps, NSteps);
+        ASSERT_EQ(var_i64->dims[0], Ny);
+        ASSERT_EQ(var_i64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u8 = adios_inq_var(f, "u8");
         ASSERT_NE(var_u8, nullptr);
         ASSERT_EQ(var_u8->ndim, 2);
-        ASSERT_EQ(var_u8->dims[0], 4);
-        ASSERT_EQ(var_u8->dims[1], 2);
+        ASSERT_EQ(var_u8->global, 1);
+        ASSERT_EQ(var_u8->nsteps, NSteps);
+        ASSERT_EQ(var_u8->dims[0], Ny);
+        ASSERT_EQ(var_u8->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u16 = adios_inq_var(f, "u16");
         ASSERT_NE(var_u16, nullptr);
         ASSERT_EQ(var_u16->ndim, 2);
-        ASSERT_EQ(var_u16->dims[0], 4);
-        ASSERT_EQ(var_u16->dims[1], 2);
+        ASSERT_EQ(var_u16->global, 1);
+        ASSERT_EQ(var_u16->nsteps, NSteps);
+        ASSERT_EQ(var_u16->dims[0], Ny);
+        ASSERT_EQ(var_u16->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u32 = adios_inq_var(f, "u32");
         ASSERT_NE(var_u32, nullptr);
         ASSERT_EQ(var_u32->ndim, 2);
-        ASSERT_EQ(var_u32->dims[0], 4);
-        ASSERT_EQ(var_u32->dims[1], 2);
+        ASSERT_EQ(var_u32->global, 1);
+        ASSERT_EQ(var_u32->nsteps, NSteps);
+        ASSERT_EQ(var_u32->dims[0], Ny);
+        ASSERT_EQ(var_u32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_u64 = adios_inq_var(f, "u64");
         ASSERT_NE(var_u64, nullptr);
         ASSERT_EQ(var_u64->ndim, 2);
-        ASSERT_EQ(var_u64->dims[0], 4);
-        ASSERT_EQ(var_u64->dims[1], 2);
+        ASSERT_EQ(var_u64->global, 1);
+        ASSERT_EQ(var_u64->nsteps, NSteps);
+        ASSERT_EQ(var_u64->dims[0], Ny);
+        ASSERT_EQ(var_u64->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r32 = adios_inq_var(f, "r32");
         ASSERT_NE(var_r32, nullptr);
         ASSERT_EQ(var_r32->ndim, 2);
-        ASSERT_EQ(var_r32->dims[0], 4);
-        ASSERT_EQ(var_r32->dims[1], 2);
+        ASSERT_EQ(var_r32->global, 1);
+        ASSERT_EQ(var_r32->nsteps, NSteps);
+        ASSERT_EQ(var_r32->dims[0], Ny);
+        ASSERT_EQ(var_r32->dims[1], mpiSize * Nx);
         ADIOS_VARINFO *var_r64 = adios_inq_var(f, "r64");
         ASSERT_NE(var_r64, nullptr);
         ASSERT_EQ(var_r64->ndim, 2);
-        ASSERT_EQ(var_r64->dims[0], 4);
-        ASSERT_EQ(var_r64->dims[1], 2);
-
-        std::array<int8_t, 8> I8;
-        std::array<int16_t, 8> I16;
-        std::array<int32_t, 8> I32;
-        std::array<int64_t, 8> I64;
-        std::array<uint8_t, 8> U8;
-        std::array<uint16_t, 8> U16;
-        std::array<uint32_t, 8> U32;
-        std::array<uint64_t, 8> U64;
-        std::array<float, 8> R32;
-        std::array<double, 8> R64;
-
-        uint64_t start[2] = {0, 0};
-        uint64_t count[2] = {4, 2};
+        ASSERT_EQ(var_r64->global, 1);
+        ASSERT_EQ(var_r64->nsteps, NSteps);
+        ASSERT_EQ(var_r64->dims[0], Ny);
+        ASSERT_EQ(var_r64->dims[1], mpiSize * Nx);
+
+        // If the size of the array is smaller than the data
+        // the result is weird... double and uint64_t would get completely
+        // garbage data
+        std::array<int8_t, Nx * Ny> I8;
+        std::array<int16_t, Nx * Ny> I16;
+        std::array<int32_t, Nx * Ny> I32;
+        std::array<int64_t, Nx * Ny> I64;
+        std::array<uint8_t, Nx * Ny> U8;
+        std::array<uint16_t, Nx * Ny> U16;
+        std::array<uint32_t, Nx * Ny> U32;
+        std::array<uint64_t, Nx * Ny> U64;
+        std::array<float, Nx * Ny> R32;
+        std::array<double, Nx * Ny> R64;
+
+        uint64_t start[2] = {0, mpiRank * Nx};
+        uint64_t count[2] = {Ny, Nx};
         ADIOS_SELECTION *sel = adios_selection_boundingbox(2, start, count);
 
         // Read stuff
-        for (size_t t = 0; t < 3; ++t)
+        for (size_t t = 0; t < NSteps; ++t)
         {
+            // Generate test data for each rank uniquely
+            SmallTestData currentTestData =
+                generateNewSmallTestData(m_TestData, t, mpiRank, mpiSize);
             // Read the current step
             adios_schedule_read_byid(f, sel, var_i8->varid, t, 1, I8.data());
             adios_schedule_read_byid(f, sel, var_i16->varid, t, 1, I16.data());
@@ -645,22 +879,22 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2stdio)
             adios_perform_reads(f, 1);
 
             // Check if it's correct
-            for (size_t i = 0; i < 8; ++i)
+            for (size_t i = 0; i < Nx; ++i)
             {
                 std::stringstream ss;
-                ss << "t=" << t << " i=" << i;
+                ss << "t=" << t << " i=" << i << " rank=" << mpiRank;
                 std::string msg = ss.str();
 
-                EXPECT_EQ(I8[i], m_TestData.I8[i + t]) << msg;
-                EXPECT_EQ(I16[i], m_TestData.I16[i + t]) << msg;
-                EXPECT_EQ(I32[i], m_TestData.I32[i + t]) << msg;
-                EXPECT_EQ(I64[i], m_TestData.I64[i + t]) << msg;
-                EXPECT_EQ(U8[i], m_TestData.U8[i + t]) << msg;
-                EXPECT_EQ(U16[i], m_TestData.U16[i + t]) << msg;
-                EXPECT_EQ(U32[i], m_TestData.U32[i + t]) << msg;
-                EXPECT_EQ(U64[i], m_TestData.U64[i + t]) << msg;
-                EXPECT_EQ(R32[i], m_TestData.R32[i + t]) << msg;
-                EXPECT_EQ(R64[i], m_TestData.R64[i + t]) << msg;
+                EXPECT_EQ(I8[i], currentTestData.I8[i]) << msg;
+                EXPECT_EQ(I16[i], currentTestData.I16[i]) << msg;
+                EXPECT_EQ(I32[i], currentTestData.I32[i]) << msg;
+                EXPECT_EQ(I64[i], currentTestData.I64[i]) << msg;
+                EXPECT_EQ(U8[i], currentTestData.U8[i]) << msg;
+                EXPECT_EQ(U16[i], currentTestData.U16[i]) << msg;
+                EXPECT_EQ(U32[i], currentTestData.U32[i]) << msg;
+                EXPECT_EQ(U64[i], currentTestData.U64[i]) << msg;
+                EXPECT_EQ(R32[i], currentTestData.R32[i]) << msg;
+                EXPECT_EQ(R64[i], currentTestData.R64[i]) << msg;
             }
         }
 
@@ -680,6 +914,8 @@ TEST_F(BPWriteReadTest, ADIOS2BPWriteADIOS1Read2D4x2stdio)
 
         // Cleanup file
         adios_read_close(f);
+
+        adios_read_finalize_method(ADIOS_READ_METHOD_BP);
     }
 }