Loading graph_korc/xkorc.cpp +118 −131 Original line number Diff line number Diff line //------------------------------------------------------------------------------ // xkorc.cpp – Full-orbit Boris integrator (SI units) // xkorc.cpp — Full-orbit Boris integrator (SI units) // // Reads from init_particles.nc // • x, y, z , ux, uy, uz (m and m/s) Loading Loading @@ -40,7 +40,7 @@ void run_korc() { const timeing::measure_diagnostic t_total("Total Time"); // ─────────────────────────────────── 1. READ INITIAL DATA ──────────────── // ─────────────────────────────────── 1. READ INITIAL DATA ───────────────── int ncid; check_nc( nc_open("init_particles.nc", NC_NOWRITE, &ncid) ); Loading Loading @@ -78,13 +78,29 @@ void run_korc() if (nc_inq_varid(ncid,"particle_charge",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&q_val) ); double R0_tok = 3.0; double B0_tok = 2.0; double q0_tok = 1.0; double lambda_tok = 1.0; if (nc_inq_varid(ncid,"R0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&R0_tok) ); if (nc_inq_varid(ncid,"B0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&B0_tok) ); if (nc_inq_varid(ncid,"q0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&q0_tok) ); if (nc_inq_varid(ncid,"lambda",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&lambda_tok) ); check_nc( nc_close(ncid) ); const std::size_t NSTEPS = static_cast<std::size_t>(h_NSTEPS); const std::size_t DUMP_EVERY = static_cast<std::size_t>(h_DUMP ); const double q_over_m_val = q_val / m_val; // ─────────────────────────────────── 2. CREATE GRAPH VARS ──────────────── std::cout << "NSTEPS: " << NSTEPS << ", DUMP_EVERY: " << DUMP_EVERY << '\n'; std::cout << "dt: " << dt_val << " s, q: " << q_val << " C, m: " << m_val << " kg\n"; // ─────────────────────────────────── 2. CREATE GRAPH VARS ───────────────── auto x = graph::variable<T,false>(Np,"x"); auto y = graph::variable<T,false>(Np,"y"); auto z = graph::variable<T,false>(Np,"z"); Loading @@ -97,19 +113,12 @@ void run_korc() ux->set(i,h_ux[i]); uy->set(i,h_uy[i]); uz->set(i,h_uz[i]); } // ─────────────────────────────────── 3. THREAD LAYOUT ──────────────────── unsigned n_thr = std::max(1u, std::min<unsigned>(jit::context<T>::max_concurrency(), static_cast<unsigned>(Np))); std::vector<std::thread> workers(n_thr); // ─────────────────────────────────── 3. SINGLE THREAD MODE ─────────────── // For single output file, run single-threaded to avoid synchronization overhead // GPU parallelism happens within the kernel, not across CPU threads const size_t batch = Np / n_thr, extra = Np % n_thr; for(unsigned tid=0; tid<n_thr; ++tid) { workers[tid] = std::thread([=]() mutable { const size_t Nloc = batch + (extra>tid ? 1 : 0); const timeing::measure_diagnostic t_setup("Setup T"+std::to_string(tid)); const timeing::measure_diagnostic t_setup("Setup"); /* constants */ auto dt = graph::constant<T,false>(static_cast<T>(dt_val)); Loading @@ -121,7 +130,11 @@ void run_korc() auto coeff = dt_half * qom; // (dt/2)*(q/m) /* geometry */ auto eq = equilibrium::make_efit<T>(EFIT_FILE); auto eq = equilibrium::make_tokamak_field<T>( static_cast<T>(R0_tok), static_cast<T>(B0_tok), static_cast<T>(q0_tok), static_cast<T>(lambda_tok)); /* vector wrappers */ auto pos = graph::vector(x ,y ,z ); Loading @@ -133,7 +146,7 @@ void run_korc() auto e_vec = eq->get_electric_field( pos->get_x(), pos->get_y(), pos->get_z()); /* ── Boris pusher with E ─────────────────────────────────────── */ /* ── Boris pusher with E ──────────────────────────────────────────── */ auto u_minus = u_vec + coeff * e_vec; // first E-kick auto t_vec = coeff * b_vec; // t = (qB/m)*(dt/2) auto t_sq = t_vec->dot(t_vec); Loading @@ -144,8 +157,8 @@ void run_korc() auto pos_next = pos + dt * u_next; // advance position /* workflow manager */ workflow::manager<T,false> w(tid); /* workflow manager - single instance */ workflow::manager<T,false> w(0); w.add_item({graph::variable_cast(x ), graph::variable_cast(y ), graph::variable_cast(z ), Loading @@ -165,8 +178,7 @@ void run_korc() w.compile(); /* output file */ std::ostringstream fn; fn<<"korc_"<<tid<<".nc"; output::result_file of(fn.str(), Nloc); output::result_file of("korc_0.nc", Np); output::data_set<T> ds(of); ds.template create_variable<false>(of,"x", x, w.get_context()); ds.template create_variable<false>(of,"y", y, w.get_context()); Loading @@ -176,60 +188,35 @@ void run_korc() ds.template create_variable<false>(of,"uz", uz, w.get_context()); of.end_define_mode(); /* writer thread */ std::queue<size_t> q; std::mutex mtx; std::condition_variable cv; bool finish=false; std::thread writer([&](){ std::unique_lock<std::mutex> lk(mtx); while(true){ cv.wait(lk,[&](){ return finish || !q.empty(); }); while(!q.empty()){ q.pop(); lk.unlock(); ds.write(of); // I/O outside lock lk.lock(); } if(finish) break; } }); t_setup.print(); const timeing::measure_diagnostic t_run("Run T"+std::to_string(tid)); const timeing::measure_diagnostic t_run("Run"); w.pre_run(); // ── MAIN LOOP ────────────────────────────────────────────────── for(size_t s=0; s<NSTEPS; ++s) { // Write initial condition w.wait(); ds.write(of); if(s % DUMP_EVERY == 0){ { std::lock_guard<std::mutex> lg(mtx); q.push(s); } cv.notify_one(); } // ── MAIN LOOP ─────────────────────────────────────────────────────── for(size_t s=1; s<=NSTEPS; ++s) { w.run(); } w.wait(); // final dump + shutdown { std::lock_guard<std::mutex> lg(mtx); q.push(NSTEPS); finish=true; if(s % DUMP_EVERY == 0 || s == NSTEPS){ ds.write(of); } cv.notify_one(); writer.join(); }); //if(s % 10000 == 0) { // std::cout << "Step " << s << "/" << NSTEPS << '\n' << std::flush; //} } for(auto& th: workers) th.join(); t_run.print(); std::cout << "\nTiming:\n"; t_total.print(); } // ───────────────────────────────────────────────────────────────────────────── // ────────────────────────────────────────────────────────────────────────────── int main(int argc, const char* argv[]) { START_GPU Loading Loading
graph_korc/xkorc.cpp +118 −131 Original line number Diff line number Diff line //------------------------------------------------------------------------------ // xkorc.cpp – Full-orbit Boris integrator (SI units) // xkorc.cpp — Full-orbit Boris integrator (SI units) // // Reads from init_particles.nc // • x, y, z , ux, uy, uz (m and m/s) Loading Loading @@ -40,7 +40,7 @@ void run_korc() { const timeing::measure_diagnostic t_total("Total Time"); // ─────────────────────────────────── 1. READ INITIAL DATA ──────────────── // ─────────────────────────────────── 1. READ INITIAL DATA ───────────────── int ncid; check_nc( nc_open("init_particles.nc", NC_NOWRITE, &ncid) ); Loading Loading @@ -78,13 +78,29 @@ void run_korc() if (nc_inq_varid(ncid,"particle_charge",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&q_val) ); double R0_tok = 3.0; double B0_tok = 2.0; double q0_tok = 1.0; double lambda_tok = 1.0; if (nc_inq_varid(ncid,"R0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&R0_tok) ); if (nc_inq_varid(ncid,"B0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&B0_tok) ); if (nc_inq_varid(ncid,"q0",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&q0_tok) ); if (nc_inq_varid(ncid,"lambda",&vid)==NC_NOERR) check_nc( nc_get_var_double(ncid,vid,&lambda_tok) ); check_nc( nc_close(ncid) ); const std::size_t NSTEPS = static_cast<std::size_t>(h_NSTEPS); const std::size_t DUMP_EVERY = static_cast<std::size_t>(h_DUMP ); const double q_over_m_val = q_val / m_val; // ─────────────────────────────────── 2. CREATE GRAPH VARS ──────────────── std::cout << "NSTEPS: " << NSTEPS << ", DUMP_EVERY: " << DUMP_EVERY << '\n'; std::cout << "dt: " << dt_val << " s, q: " << q_val << " C, m: " << m_val << " kg\n"; // ─────────────────────────────────── 2. CREATE GRAPH VARS ───────────────── auto x = graph::variable<T,false>(Np,"x"); auto y = graph::variable<T,false>(Np,"y"); auto z = graph::variable<T,false>(Np,"z"); Loading @@ -97,19 +113,12 @@ void run_korc() ux->set(i,h_ux[i]); uy->set(i,h_uy[i]); uz->set(i,h_uz[i]); } // ─────────────────────────────────── 3. THREAD LAYOUT ──────────────────── unsigned n_thr = std::max(1u, std::min<unsigned>(jit::context<T>::max_concurrency(), static_cast<unsigned>(Np))); std::vector<std::thread> workers(n_thr); // ─────────────────────────────────── 3. SINGLE THREAD MODE ─────────────── // For single output file, run single-threaded to avoid synchronization overhead // GPU parallelism happens within the kernel, not across CPU threads const size_t batch = Np / n_thr, extra = Np % n_thr; for(unsigned tid=0; tid<n_thr; ++tid) { workers[tid] = std::thread([=]() mutable { const size_t Nloc = batch + (extra>tid ? 1 : 0); const timeing::measure_diagnostic t_setup("Setup T"+std::to_string(tid)); const timeing::measure_diagnostic t_setup("Setup"); /* constants */ auto dt = graph::constant<T,false>(static_cast<T>(dt_val)); Loading @@ -121,7 +130,11 @@ void run_korc() auto coeff = dt_half * qom; // (dt/2)*(q/m) /* geometry */ auto eq = equilibrium::make_efit<T>(EFIT_FILE); auto eq = equilibrium::make_tokamak_field<T>( static_cast<T>(R0_tok), static_cast<T>(B0_tok), static_cast<T>(q0_tok), static_cast<T>(lambda_tok)); /* vector wrappers */ auto pos = graph::vector(x ,y ,z ); Loading @@ -133,7 +146,7 @@ void run_korc() auto e_vec = eq->get_electric_field( pos->get_x(), pos->get_y(), pos->get_z()); /* ── Boris pusher with E ─────────────────────────────────────── */ /* ── Boris pusher with E ──────────────────────────────────────────── */ auto u_minus = u_vec + coeff * e_vec; // first E-kick auto t_vec = coeff * b_vec; // t = (qB/m)*(dt/2) auto t_sq = t_vec->dot(t_vec); Loading @@ -144,8 +157,8 @@ void run_korc() auto pos_next = pos + dt * u_next; // advance position /* workflow manager */ workflow::manager<T,false> w(tid); /* workflow manager - single instance */ workflow::manager<T,false> w(0); w.add_item({graph::variable_cast(x ), graph::variable_cast(y ), graph::variable_cast(z ), Loading @@ -165,8 +178,7 @@ void run_korc() w.compile(); /* output file */ std::ostringstream fn; fn<<"korc_"<<tid<<".nc"; output::result_file of(fn.str(), Nloc); output::result_file of("korc_0.nc", Np); output::data_set<T> ds(of); ds.template create_variable<false>(of,"x", x, w.get_context()); ds.template create_variable<false>(of,"y", y, w.get_context()); Loading @@ -176,60 +188,35 @@ void run_korc() ds.template create_variable<false>(of,"uz", uz, w.get_context()); of.end_define_mode(); /* writer thread */ std::queue<size_t> q; std::mutex mtx; std::condition_variable cv; bool finish=false; std::thread writer([&](){ std::unique_lock<std::mutex> lk(mtx); while(true){ cv.wait(lk,[&](){ return finish || !q.empty(); }); while(!q.empty()){ q.pop(); lk.unlock(); ds.write(of); // I/O outside lock lk.lock(); } if(finish) break; } }); t_setup.print(); const timeing::measure_diagnostic t_run("Run T"+std::to_string(tid)); const timeing::measure_diagnostic t_run("Run"); w.pre_run(); // ── MAIN LOOP ────────────────────────────────────────────────── for(size_t s=0; s<NSTEPS; ++s) { // Write initial condition w.wait(); ds.write(of); if(s % DUMP_EVERY == 0){ { std::lock_guard<std::mutex> lg(mtx); q.push(s); } cv.notify_one(); } // ── MAIN LOOP ─────────────────────────────────────────────────────── for(size_t s=1; s<=NSTEPS; ++s) { w.run(); } w.wait(); // final dump + shutdown { std::lock_guard<std::mutex> lg(mtx); q.push(NSTEPS); finish=true; if(s % DUMP_EVERY == 0 || s == NSTEPS){ ds.write(of); } cv.notify_one(); writer.join(); }); //if(s % 10000 == 0) { // std::cout << "Step " << s << "/" << NSTEPS << '\n' << std::flush; //} } for(auto& th: workers) th.join(); t_run.print(); std::cout << "\nTiming:\n"; t_total.print(); } // ───────────────────────────────────────────────────────────────────────────── // ────────────────────────────────────────────────────────────────────────────── int main(int argc, const char* argv[]) { START_GPU Loading