babyromu | Je vous mets ici une partie du code. Le seul kernel que j'utilise pour le moment dans la grosse boucle est advection (car déjà elle seule ralentit fortement), les autres sont commentés et n'agissent pas pour le moment. Par exemple, pour 1000 itérations elle met 13 ms, pour 1500 elle en met 280 ms.
Code :
- /*
-  * Host driver for the GPU simulation loop.
-  *
-  * Flattens the molecules of every cell of molecule_set->table into contiguous
-  * host arrays, uploads them, runs the kernel loop (only AdvectionGPU is
-  * enabled; the sort / collision / statistics launches are kept commented
-  * out), times the loop, then copies the results back and stores the per-cell
-  * statistics and molecule counts in the table.
-  *
-  * molecule_set: molecule set whose cell table is read and updated in place.
-  */
- extern "C" void molecule_set_CollisionsGPU(molecule_set_t molecule_set)//, histogram_t *histograms)
- {
-     unsigned int timer = 0;
-     cutilCheckError( cutCreateTimer( &timer));
-
-     /* Capacity of the per-molecule arrays: current molecule count plus 10%. */
-     const int TAILLE = molecule_set->number*11/10;
-     const int n_cells = molecule_set->table->size;
-     printf("On fixe la taille des tableaux à : %d\n",TAILLE);
-
-     int i,k,j,m;
-     v3 *tmp,*tmp1;
-     mlcl *h_mol;
-     mlcl *d_mol;
-     float *collisions,*d_collisions, *samples,*d_samples,*statsize,*d_statsize;
-     int *appcell,*d_appcell,*nbrecumu, *d_nbrecumu,*nbrecell,*d_nbrecell;
-     molecule_t mol;
-     cells_table_item_t cell;
-
-     /* Host allocations. h_mol is zero-filled so that the spare slots uploaded
-        to the device hold defined values. */
-     h_mol = (mlcl*)calloc(TAILLE, sizeof(mlcl));
-     appcell = (int*)malloc(sizeof(int)*TAILLE);
-     nbrecumu = (int*)malloc(sizeof(int)*(n_cells+1));
-     nbrecumu[0]=0;
-     collisions = (float*)malloc(sizeof(float)*n_cells);
-     samples = (float*)malloc(sizeof(float)*n_cells);
-     statsize = (float*)malloc(sizeof(float)*n_cells);
-     nbrecell = (int*)malloc(sizeof(int)*n_cells);
-
-     static float f_max_1, f_max_2;
-     static float normalization_1, normalization_2;
-     cuda_init_inflow_distribution(&normalization_1,&normalization_2,&f_max_1,&f_max_2);
-
-     /* Fill the flat arrays. Every cell is processed, including those holding
-        zero or one molecule, so that nbrecumu (exclusive prefix sum of the cell
-        populations) and the per-cell arrays are fully initialised. */
-     for (k=0;k<n_cells;k++) {
-         cell = &(molecule_set->table->array[k]);
-         for (i=0; i<cell->number; i++) {
-             mol = (molecule_t)cells_table_item_Get_Data(cell,i);
-             tmp = molecule_GetVelocity(mol);
-             tmp1= molecule_GetPosition(mol);
-             for (j=0;j<3;j++)
-             {
-                 h_mol[i+nbrecumu[k]].vit[j]=(*tmp)[j];
-                 h_mol[i+nbrecumu[k]].pos[j]=(*tmp1)[j];
-             }
-             appcell[i+nbrecumu[k]] =k;
-         }
-         nbrecumu[k+1]=cell->number + nbrecumu[k];
-         nbrecell[k]=0;
-         samples[k]=cell->statistics.samples;
-         collisions[k]=cell->collision_delay;
-         statsize[k]=cell->statistics.size;
-     }
-     /* Spare slots are tagged with a cell index that matches no real cell. */
-     for(k=nbrecumu[n_cells];k<TAILLE;k++)
-     {
-         appcell[k]=10000;
-     }
-
-     /* Output buffers: per-cell statistics (histograms are disabled). */
-     statistic Stat[n_cells];
-     for (i=0;i<n_cells;i++) Stat[i]=((molecule_set->table->array)[i]).statistics;
-     statistic *d_Stat;
-     HANDLE_ERROR( cudaMalloc((void **)&d_Stat, sizeof(statistic)*n_cells));
-     HANDLE_ERROR( cudaMemcpy(d_Stat, Stat, n_cells* sizeof(statistic), cudaMemcpyHostToDevice));
-     /*
-     histoGPU Histo[n_histos];
-     for (i=0;i<n_histos;i++)
-     {
-     for (jj=0;jj<3;jj++) for (j=0;j<2;j++) (Histo[i].range[j])[jj]=(histograms[i]->range[j])[jj];
-     Histo[i].sample_size=histograms[i]->sample_size;
-     for (j=0;j<3;j++) Histo[i].n_bins[j]=histograms[i]->n_bins[j];
-     Histo[i].cell[0]=histo_cells[i];
-     }
-     */
-
-     /* Initial state of the random number generators. */
-     mt_struct_stripped* ds_MTConfigs;
-     unsigned int* ds_MTStates;
-     int* ds_MTiStates;
-     mt_struct_stripped h_MTConfigs[N_GPU_RNG_THREADS_TOTAL];
-     const char* fname = "./MersenneTwister.dat";
-     FILE* fd = fopen(fname, "rb" );
-     if(!fd){
-         printf("initMTGPU(): failed to open %s\n", fname);
-         printf("TEST FAILED\n" );
-         exit(0);
-     }
-     if( !fread(h_MTConfigs, sizeof(h_MTConfigs), 1, fd) ){
-         printf("initMTGPU(): failed to load %s\n", fname);
-         printf("TEST FAILED\n" );
-         exit(0);
-     }
-     fclose(fd);
-     HANDLE_ERROR( cudaMalloc((void **)&ds_MTConfigs, sizeof(h_MTConfigs)));
-     HANDLE_ERROR( cudaMemcpy(ds_MTConfigs, h_MTConfigs, sizeof(h_MTConfigs), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMalloc((void **)&ds_MTStates, N_GPU_RNG_THREADS_TOTAL*MT_NN*sizeof(unsigned int)));
-     HANDLE_ERROR( cudaMalloc((void **)&ds_MTiStates, N_GPU_RNG_THREADS_TOTAL*sizeof(int)));
-     initialize_CUDA_UniformMT(ds_MTConfigs, ds_MTStates, ds_MTiStates);
-
-     dim3 block;
-     dim3 grid;
-
-     /* Device allocations and uploads. */
-     HANDLE_ERROR( cudaMalloc((void **)&d_mol, sizeof(mlcl)*TAILLE));
-     HANDLE_ERROR( cudaMalloc((void **)&d_appcell, sizeof(int)*TAILLE));
-     HANDLE_ERROR( cudaMalloc((void **)&d_nbrecumu, sizeof(int)*(n_cells+1)));
-     HANDLE_ERROR( cudaMalloc((void **)&d_nbrecell, sizeof(int)*n_cells));
-     HANDLE_ERROR( cudaMalloc((void **)&d_collisions, sizeof(float)*n_cells));
-     HANDLE_ERROR( cudaMalloc((void **)&d_samples, sizeof(float)*n_cells));
-     HANDLE_ERROR( cudaMalloc((void **)&d_statsize, sizeof(float)*n_cells));
-     HANDLE_ERROR( cudaMemcpy(d_mol, h_mol, TAILLE* sizeof(mlcl), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_nbrecumu, nbrecumu, (n_cells+1)* sizeof(int), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_nbrecell, nbrecell, n_cells* sizeof(int), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_samples, samples, n_cells* sizeof(float), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_collisions, collisions, n_cells* sizeof(float), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_statsize, statsize, n_cells* sizeof(float), cudaMemcpyHostToDevice));
-     HANDLE_ERROR( cudaMemcpy(d_appcell, appcell, TAILLE *sizeof(int), cudaMemcpyHostToDevice));
-
-     /* Scalars and per-cell counters used by the (currently disabled) kernels. */
-     float* dtsum;
-     dtsum = (float*)malloc(sizeof(float));
-     dtsum[0]=0;
-     float* d_dtsum;
-     HANDLE_ERROR( cudaMalloc((void **)&d_dtsum, sizeof(float)));
-     HANDLE_ERROR( cudaMemcpy(d_dtsum, dtsum,sizeof(float), cudaMemcpyHostToDevice));
-     int* increm;
-     increm = (int*)malloc(sizeof(int));
-     increm[0]=0;   /* was a second "dtsum[0]=0", leaving increm uninitialised */
-     int* d_increm;
-     HANDLE_ERROR( cudaMalloc((void **)&d_increm, sizeof(int)));
-     HANDLE_ERROR( cudaMemcpy(d_increm, increm,sizeof(int), cudaMemcpyHostToDevice));
-     int* col;
-     col = (int*)calloc(n_cells, sizeof(int));   /* zeroed before upload */
-     int* d_col;
-     HANDLE_ERROR( cudaMalloc((void **)&d_col, n_cells*sizeof(int)));
-     HANDLE_ERROR( cudaMemcpy(d_col, col,n_cells*sizeof(int), cudaMemcpyHostToDevice));
-
-     /* ---------------------------- Work loop ---------------------------- */
-     cutilCheckError( cutStartTimer( timer));
-     for(m=1;m<1500;m++)
-     {
-         /* ADVECTION. The grid is rounded down because AdvectionGPU carries
-            its own fixed upper bound on the thread index and knows nothing
-            about TAILLE; rounding up could start threads past the arrays. */
-         block.x = 32;
-         grid.x = TAILLE/32;
-         if (grid.x > 0)
-         {
-             AdvectionGPU<<<grid,block>>>(d_mol,d_appcell,dt,x_max,x_min,y_min,y_max,z_min,z_max,molecule_set->table->dx);
-             /* A launch does not return a status: fetch configuration errors. */
-             HANDLE_ERROR( cudaGetLastError());
-         }
-         /* SORT */
-         //grid.x = molecule_set->table->size;
-         //block.x = 1;
-         //tri<<<block,grid>>>(d_mol,d_nbrecumu,d_nbrecell,d_appcell);
-         //cutilCheckMsg("Kernel execution failed" );
-         /* COLLISIONS */
-         // grid.x = 1;
-         // block.x = molecule_set->table->size;
-         // CollGPU<<<grid,block>>>(d_mol,d_nbrecumu,d_nbrecell,d_collisions,d_statsize,d_samples,c_r_max,dt,n_per_cell_ref, ds_MTStates, ds_MTConfigs, ds_MTiStates,d_appcell,d_col);
-         // cutilCheckMsg("Kernel execution failed" );
-         /* STATISTICS */
-         //if (m%25==0) UpdateStat<<<grid,block>>>(d_Stat,d_mol,d_nbrecumu);
-         if (m%500==0)
-         {
-             /*HANDLE_ERROR( cudaMemcpy(Stat, d_Stat, molecule_set->table->size* sizeof(statistic), cudaMemcpyDeviceToHost));
-             for (i=0;i<molecule_set->table->size;i++) ((molecule_set->table->array)[i]).statistics=Stat[i];
-             molecule_set_Output_Cell_Stats_File(molecule_set);*/
-         }
-     }
-     /* Kernel launches are asynchronous. Without this wait the timer only
-        measures how long the host needs to enqueue the launches: that is short
-        while the launch queue has room and jumps once it is full, which is why
-        the measured time does not scale with the iteration count.
-        (cudaThreadSynchronize is the spelling of the cutil-era toolkits;
-        newer toolkits name it cudaDeviceSynchronize.) */
-     HANDLE_ERROR( cudaThreadSynchronize());
-     cutilCheckError( cutStopTimer( timer));
-     float gpu_time = cutGetTimerValue( timer);
-     printf( "GPU Processing time: %f (ms)\n", gpu_time);
-     cutilCheckError( cutDeleteTimer( timer));
-
-     /* Copy the results back; sizes follow the table instead of fixed 200/201. */
-     HANDLE_ERROR( cudaMemcpy(Stat, d_Stat, n_cells* sizeof(statistic), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(nbrecumu, d_nbrecumu, (n_cells+1)* sizeof(int), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(nbrecell, d_nbrecell, n_cells* sizeof(int), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(h_mol, d_mol, TAILLE* sizeof(mlcl), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(samples, d_samples, n_cells* sizeof(float), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(collisions, d_collisions, n_cells* sizeof(float), cudaMemcpyDeviceToHost));
-     HANDLE_ERROR( cudaMemcpy(statsize, d_statsize, n_cells* sizeof(float), cudaMemcpyDeviceToHost));
-
-     /* Store statistics and per-cell populations back into the table. */
-     cells_table_item_t myptr;
-     cells_table_t mytable;
-     mytable = molecule_set->table;
-     for (i=0;i<n_cells;i++) {
-         myptr = &(mytable->array[i]);
-         myptr->statistics = Stat[i];
-         myptr->number = nbrecumu[i+1]-nbrecumu[i];
-     }
-
-     /* Release every device and host buffer allocated above. */
-     cudaFree(d_mol);
-     cudaFree(d_nbrecumu);
-     cudaFree(d_collisions);
-     cudaFree(d_statsize);
-     cudaFree(d_samples);
-     cudaFree(d_nbrecell);
-     cudaFree(d_appcell);
-     cudaFree(d_Stat);
-     cudaFree(ds_MTConfigs);
-     cudaFree(ds_MTStates);
-     cudaFree(ds_MTiStates);
-     cudaFree(d_dtsum);
-     cudaFree(d_increm);
-     cudaFree(d_col);
-     free(h_mol);
-     free(nbrecumu);
-     free(collisions);
-     free(statsize);
-     free(samples);
-     free(appcell);
-     free(nbrecell);
-     free(dtsum);
-     free(increm);
-     free(col);
- }
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- /*
-  * Host-side entry point for setting up the Mersenne Twister generators.
-  * Forwards the three device buffers (generator configurations, state words
-  * and state indices) unchanged to c_initialize_CUDA_UniformMT.
-  */
- void initialize_CUDA_UniformMT(mt_struct_stripped* ds_MTConfigs, unsigned int* ds_MTStates, int* ds_MTiStates)
- {
-     c_initialize_CUDA_UniformMT(
-         ds_MTConfigs,
-         ds_MTStates,
-         ds_MTiStates);
- }
- /*
-  * Draws one uniform random number in (0, 1] from a Mersenne Twister stream.
-  *
-  * m_mtstate: the MT_NN state words of this stream (one word is updated).
-  * m_config : parameters of this stream (matrix_a, mask_b, mask_c).
-  * m_istate : current position in the state array; advanced by one, modulo
-  *            MT_NN.
-  * Returns (x + 1) / 2^32 where x is the tempered 32-bit output.
-  */
- __device__ Real generateUniform(unsigned int *m_mtstate, mt_struct_stripped *m_config, int *m_istate)
- {
-     const int iState = *m_istate;
-     int iState1 = iState + 1;
-     int iStateM = iState + MT_MM;
-     if(iState1 >= MT_NN) iState1 -= MT_NN;
-     if(iStateM >= MT_NN) iStateM -= MT_NN;
-     const unsigned int mti  = m_mtstate[iState];
-     const unsigned int mti1 = m_mtstate[iState1];
-     const unsigned int mtiM = m_mtstate[iStateM];
-     /* Recurrence: combine the upper bits of the current word with the lower
-        bits of the next one, then twist. */
-     unsigned int x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
-     x = mtiM ^ (x >> 1) ^ ((x & 1) ? m_config->matrix_a : 0);
-     *m_istate = iState1;
-     m_mtstate[iState] = x;
-     /* Tempering transformation */
-     x ^= (x >> MT_SHIFT0);
-     x ^= (x << MT_SHIFTB) & m_config->mask_b;
-     x ^= (x << MT_SHIFTC) & m_config->mask_c;
-     x ^= (x >> MT_SHIFT1);
-     /* Convert to (0, 1]. The divisor is kept in Real so that no
-        double-precision division is generated when Real is float; dividing by
-        a power of two is exact, so the value is the same as with a double
-        divisor. */
-     return ((Real)x + 1.0f) / (Real)4294967296.0;
- }
- /*
-  * Free-flight step: one thread moves one molecule by dt along its velocity,
-  * wraps the position back into [min, max] on each axis, and records the
-  * index of the x-cell (width dx) the molecule now lies in.
-  *
-  * mol     : molecules, position and velocity updated in place.
-  * cellule : per-molecule cell index, written by this kernel.
-  * Threads whose global index is 19283 or more do nothing.
-  * NOTE(review): that bound is a literal; it presumably is the molecule count
-  * of one particular run - confirm before changing the problem size.
-  */
- __global__ void AdvectionGPU(mlcl *mol,int* cellule, float dt, int x_max, int x_min,int y_min,int y_max,int z_min,int z_max,float dx)
- {
-     const int idx = blockIdx.x*blockDim.x + threadIdx.x;
-     if (idx >= 19283) return;
-
-     mlcl *m = &mol[idx];
-
-     /* x += v * dt on the three axes */
-     for (int axe = 0; axe < 3; ++axe)
-     {
-         m->pos[axe] += dt*m->vit[axe];
-     }
-
-     /* Wrap-around: shift by the domain length when a bound is crossed. */
-     if (m->pos[0]<x_min) m->pos[0] += (x_max-x_min);
-     if (m->pos[0]>x_max) m->pos[0] += (x_min-x_max);
-     if (m->pos[1]<y_min) m->pos[1] += (y_max-y_min);
-     if (m->pos[1]>y_max) m->pos[1] += (y_min-y_max);
-     if (m->pos[2]<z_min) m->pos[2] += (z_max-z_min);
-     if (m->pos[2]>z_max) m->pos[2] += (z_min-z_max);
-
-     /* Cell index along x; the conversion to int truncates. */
-     cellule[idx] = (m->pos[0]-x_min)/dx;
- }
- /*
-  * Re-sorts the molecules so that each cell owns a contiguous range again.
-  * One thread handles one cell (thread index == cell index).
-  *
-  * Phase 1: each thread scans the ranges of its own cell and of its direct
-  *          neighbours (cell 0 additionally scans the 100 slots that follow
-  *          nbrecumule[200]) and gathers the molecules tagged with its cell
-  *          index into a private buffer.
-  * Phase 2: the new cumulative counts are rebuilt from the per-cell counts.
-  * Phase 3: each thread writes its molecules back at its new offset.
-  *
-  * mol        : molecules, reordered in place.
-  * nbrecumule : cumulative molecule count per cell, rewritten from index 1 on.
-  * nbrecell   : receives the new molecule count of each cell.
-  * appcell    : cell index of each molecule, rewritten for the moved entries.
-  *
-  * Preconditions visible in the code: the cell indices 199 and 200 are
-  * literals, at most 250 molecules may land in one cell (size of the private
-  * buffer), and all cells must be handled by a single thread block because
-  * __syncthreads() only synchronises the threads of one block.
-  */
- __global__ void tri(mlcl *mol, int* nbrecumule,int* nbrecell,int *appcell)
- {
-     const int tid = blockIdx.x*blockDim.x + threadIdx.x;
-     int tic = 0;
-     int i;
-     int debut, fin;
-     mlcl retrouve[250];
-
-     /* Scan range, read once up front: the bounds live in nbrecumule, which
-        is overwritten further down. */
-     if (tid == 0)
-     {
-         debut = nbrecumule[0];
-         fin   = nbrecumule[2];
-     }
-     else if (tid == 199)
-     {
-         debut = nbrecumule[tid-1];
-         fin   = nbrecumule[tid+1];
-     }
-     else
-     {
-         debut = nbrecumule[tid-1];
-         fin   = nbrecumule[tid+2];
-     }
-
-     /* Phase 1: gather the molecules that now belong to this cell. */
-     for (i = debut; i < fin; i++)
-     {
-         if (appcell[i] == tid)
-         {
-             retrouve[tic] = mol[i];
-             tic++;
-         }
-     }
-     if (tid == 0)   /* left boundary: also look at the slots after the last cell */
-     {
-         const int queue = nbrecumule[200];
-         for (i = queue; i < queue+100; i++)
-         {
-             if (appcell[i] == 0)
-             {
-                 retrouve[tic] = mol[i];
-                 tic++;
-             }
-         }
-     }
-     nbrecell[tid] = tic;
-
-     /* Every thread must have finished reading nbrecumule, appcell and mol
-        before any thread overwrites them. Previously nbrecumule[tid+1] was
-        reset without this barrier while neighbouring threads were still
-        using it as a loop bound. */
-     __syncthreads();
-
-     /* Phase 2: inclusive sum of the per-cell counts up to this cell. */
-     int cumul = 0;
-     for (i = 0; i < tid+1; i++) cumul += nbrecell[i];
-     nbrecumule[tid+1] = cumul;
-     __syncthreads();
-
-     /* Phase 3: scatter back at the new offset of this cell. */
-     const int base = nbrecumule[tid];
-     for (i = 0; i < tic; i++)
-     {
-         mol[base+i] = retrouve[i];
-         appcell[base+i] = tid;
-     }
- }
- /*
-  * Collision step: one thread processes one cell (thread index == cell index).
-  * While the collision delay of the cell is not above 1/100, a pair of
-  * molecules of the cell is drawn by acceptance-rejection on the relative
-  * speed, their velocities are replaced by the post-collision velocities, and
-  * the delay is advanced by the time increment of that collision. Finally
-  * 1/100 is subtracted from the delay and nbrecell[tid] is reset to 0.
-  *
-  * mol            : molecules; velocities of colliding pairs are updated.
-  * nbrecumule     : cumulative molecule count per cell (cell tid owns
-  *                  [nbrecumule[tid], nbrecumule[tid+1])).
-  * nbrecell       : per-cell count, used as the "more than one molecule" test.
-  * delai          : per-cell collision delay, updated in place.
-  * taille, sample : per-cell statistics; taille/sample replaces the current
-  *                  population in the time increment when taille != 0.
-  * c_r_max        : upper bound of the relative speed used for the rejection.
-  * dt, appcell    : not used by this kernel.
-  * m_mtstate, m_config, m_istate : per-thread random generator data.
-  * col            : receives the number of collisions done in each cell.
-  */
- __global__ void CollGPU(mlcl *mol, int* nbrecumule,int* nbrecell, float* delai,float* taille, float* sample, float c_r_max,int dt, int n_per_cell_ref,unsigned int *m_mtstate, mt_struct_stripped *m_config, int *m_istate, int *appcell, int* col)
- {
-     const int tid = blockIdx.x*blockDim.x + threadIdx.x;
-     const float arret = float(1.0)/100;
-     const int debut = nbrecumule[tid];
-     const int n_cell = nbrecumule[tid+1] - debut;
-     /* generateUniform indexes MT_NN consecutive state words, so each thread
-        needs its own block of MT_NN words; with an offset of tid the blocks of
-        neighbouring threads overlapped.
-        NOTE(review): assumes c_initialize_CUDA_UniformMT stores the state of
-        thread t at word t*MT_NN - confirm against its implementation. */
-     unsigned int *mt_state = &m_mtstate[tid*MT_NN];
-     mt_struct_stripped *mt_config = &m_config[tid];
-     int *mt_istate = &m_istate[tid];
-     int i,j,index1,index2;
-     float epsilon,chi,seps,schi,ceps,cchi,norm_c_r;
-     float B;
-     float n_aver_for_collision_interval, delta_t_coll;
-     v3 c_m,c_r,c_r_star;
-     col[tid]=0;
-     if(nbrecell[tid]>1 && n_cell>1)
-     {
-         while (delai[tid]<=arret) {
-             do {
-                 /* First partner: uniform among the n_cell molecules. The
-                    generator can return exactly 1, hence the clamp. */
-                 index1 = (int)(n_cell*generateUniform(mt_state,mt_config,mt_istate));
-                 if (index1 > n_cell-1) index1 = n_cell-1;
-                 /* Second partner: uniform among the n_cell-1 others; indices
-                    at or above the first partner are shifted up by one. */
-                 index2 = (int)((n_cell-1)*generateUniform(mt_state,mt_config,mt_istate));
-                 if (index2 > n_cell-2) index2 = n_cell-2;
-                 if (index2 >= index1) index2++;
-                 index1 += debut;
-                 index2 += debut;
-
-                 for (i=0;i<3;i++) {
-                     c_m[i] = (mol[index1].vit[i] + mol[index2].vit[i])/2;
-                     c_r[i] = mol[index1].vit[i] - mol[index2].vit[i];
-                 }
-                 norm_c_r = sqrt(c_r[0]*c_r[0] + c_r[1]*c_r[1] + c_r[2]*c_r[2]);
-             } while(norm_c_r<c_r_max*generateUniform(mt_state,mt_config,mt_istate));
-             col[tid]++;
-             /* Random scattering angles. */
-             epsilon = 2*generateUniform(mt_state,mt_config,mt_istate)*float(M_PI);
-             chi = float(M_PI)*generateUniform(mt_state,mt_config,mt_istate);
-             seps = sin(epsilon);
-             schi=sin(chi);
-             cchi=cos(chi);
-             ceps=cos(epsilon);
-             /* Rotate the relative velocity. B is the norm of its y/z part. */
-             B = sqrt(c_r[1]*c_r[1] + c_r[2]*c_r[2]);
-             if (B > 0) {
-                 c_r_star[0] = (cchi * c_r[0] + schi * seps * B);
-                 c_r_star[1] = cchi * c_r[1] + schi * (norm_c_r * c_r[2] * ceps - c_r[0] * c_r[1] * seps)/B;
-                 c_r_star[2] = cchi * c_r[2] - schi * (norm_c_r * c_r[1] * ceps + c_r[0] * c_r[2] * seps)/B;
-             } else {
-                 /* Relative velocity along x only: the general formula would
-                    divide by zero; use the degenerate form of the rotation. */
-                 c_r_star[0] = cchi * c_r[0];
-                 c_r_star[1] = schi * ceps * c_r[0];
-                 c_r_star[2] = schi * seps * c_r[0];
-             }
-             for (j=0;j<3;++j) {
-                 mol[index1].vit[j] = (c_r_star[j] + 2*c_m[j])/2;
-                 mol[index2].vit[j] = (-c_r_star[j] + 2*c_m[j])/2;
-             }
-             n_aver_for_collision_interval = ((taille[tid] == 0) ? (n_cell) : ((taille[tid]) / (sample[tid])));
-             delta_t_coll = (2*float(M_SQRT2)/(n_cell)) * (n_per_cell_ref)/(n_aver_for_collision_interval * norm_c_r);
-             delai[tid] += delta_t_coll;
-         }
-         delai[tid] = delai[tid] - arret;
-     }
-     nbrecell[tid]=0;
- }
- /*
-  * Initializes the values used repeatedly in inflow_distribution_left and
-  * inflow_distribution_right.
-  *
-  * For each of the two mean velocities (mean_vel_1, mean_vel_2), with
-  * b = beta2_o_beta1 and m the mean velocity, it stores:
-  *   normalization = 0.5*exp(-b^2 m^2)/b^2 + m*0.5*sqrt(pi)*(1 + erf(b m))/b
-  *   f_max         = u*exp(-b^2 (u - m)^2)/normalization
-  *                   evaluated at u = 0.5*(m + sqrt(m^2 + 2/b^2)),
-  *                   the point where u*exp(-b^2 (u - m)^2) is largest.
-  *
-  * normalization_1, normalization_2, f_max_1, f_max_2: output locations.
-  */
- void cuda_init_inflow_distribution(float *normalization_1,float *normalization_2,float *f_max_1,float *f_max_2)
- {
-     float u_pic_1;   /* velocity of the maximum, first distribution */
-     float u_pic_2;   /* velocity of the maximum, second distribution */
-
-     /* Normalization constants */
-     *normalization_1 =
-         0.5 * exp(- beta2_o_beta1 * beta2_o_beta1 * mean_vel_1 * mean_vel_1) / (beta2_o_beta1 * beta2_o_beta1)
-         + mean_vel_1 * 0.5 * sqrt(M_PI) * (1.0 + erf( beta2_o_beta1 * mean_vel_1)) / beta2_o_beta1;
-     *normalization_2 =
-         0.5 * exp(- beta2_o_beta1 * beta2_o_beta1 * mean_vel_2 * mean_vel_2) / (beta2_o_beta1 * beta2_o_beta1)
-         + mean_vel_2 * 0.5 * sqrt(M_PI) * (1.0 + erf( beta2_o_beta1 * mean_vel_2)) / beta2_o_beta1;
-
-     /* Maximum of the first distribution */
-     u_pic_1 = 0.5 * (mean_vel_1 + sqrt(mean_vel_1 * mean_vel_1 + 2.0 / (beta2_o_beta1 * beta2_o_beta1)));
-     *f_max_1 = u_pic_1 * exp(- beta2_o_beta1 * beta2_o_beta1 * pow((u_pic_1 - mean_vel_1),2)) / *normalization_1;
-
-     /* Maximum of the second distribution */
-     u_pic_2 = 0.5 * (mean_vel_2 + sqrt(mean_vel_2 * mean_vel_2 + 2.0 / (beta2_o_beta1 * beta2_o_beta1)));
-     *f_max_2 = u_pic_2 * exp(- beta2_o_beta1 * beta2_o_beta1 * pow((u_pic_2 - mean_vel_2),2)) / *normalization_2;
- }
- /*
-  * Accumulates one statistics sample per cell; one thread per cell
-  * (thread index == cell index).
-  *
-  * stat     : per-cell accumulators; size grows by the cell population,
-  *            samples by one, sum_v / sum_v2 by the velocity components and
-  *            their squares of every molecule of the cell.
-  * mol      : molecules, read only.
-  * nbrecumu : cumulative molecule count per cell; cell c owns the molecules
-  *            [nbrecumu[c], nbrecumu[c+1]).
-  * There is no bound check: the launch must not start more threads than
-  * there are cells.
-  */
- __global__ void UpdateStat(statistic *stat,mlcl *mol,int *nbrecumu)
- {
-     const int cellule = blockIdx.x*blockDim.x + threadIdx.x;
-     statistic *s = &stat[cellule];
-     const int debut = nbrecumu[cellule];
-     const int fin   = nbrecumu[cellule+1];
-
-     s->size = s->size + fin - debut;
-     ++s->samples;
-
-     for (int p = debut; p < fin; ++p) {
-         for (int axe = 0; axe < 3; ++axe) {
-             s->sum_v[axe]  += mol[p].vit[axe];
-             s->sum_v2[axe] += mol[p].vit[axe]*mol[p].vit[axe];
-         }
-     }
- }
|
Message édité par gilou le 03-05-2011 à 11:11:10
|