I spend much time trying do it in O(nlogn) 's time, but failed.
Now I can do O(n2) :
#include <bits/stdc++.h>
#define i64 long long
#define rep(i,l,r) for(int i=(l);i<=(r);i++)
#define fdn(i,r,l) for(int i=(r);i>=(l);i--)
#define pii pair<int,int>
using namespace std;
typedef long long ll;
typedef double db;
typedef __int128 i128;
std::mt19937 rnd(std::chrono::steady_clock::now().time_since_epoch().count());
std::mt19937_64 rnd64(std::chrono::steady_clock::now().time_since_epoch().count());
int main()
{
#ifndef ONLINE_JUDGE
freopen("in.in","r",stdin);
freopen("out.out","w",stdout);
#endif
ios::sync_with_stdio(false);cin.tie(nullptr);cout.tie(nullptr);
int n,m;
cin>>n>>m;
vector<ll> a(n+1),sum(n+1,0);
rep(i,1,n) cin>>a[i],sum[i]=a[i];
rep(i,1,n) sum[i]+=sum[i-1];
vector<ll> dp(n+1,0),f(n+1,0);
rep(i,1,n)
{
dp[i]=dp[i-1];
rep(j,1,i) dp[i]+=(sum[i]-sum[j-1])%m;
}
cout<<dp[n];
}