rt,here。
#include<bits/stdc++.h>
using namespace std;
int n,m;
int s[4005][4005];
int pre[4005][4005];
int dp[805][4005];
int c[805][4005];
int main(){
ios::sync_with_stdio(0);
cin>>n>>m;
for(int i=1;i<=n;++i){
for(int j=1;j<=n;++j){
cin>>s[i][j];
pre[i][j]=pre[i-1][j]+pre[i][j-1]-pre[i-1][j-1]+s[i][j];
}
}
for(int j=1;j<=n;++j) dp[1][j]=pre[j][j]/2,c[1][j]=0;
for(int i=2;i<=m;++i) dp[i][i]=0,c[i][i]=i-1;
for(int i=2;i<=m;++i){
for(int j=n;j>=i+1;--j){
dp[i][j]=1e9;
int lower=c[i-1][j],upper=(j==n?j-1:c[i][j+1]);
for(int k=lower;k<=upper;++k){
int cost=dp[i-1][k];
cost+=(pre[j][j]-pre[k][j]-pre[j][k]+pre[k][k])/2;
if(cost<dp[i][j]){
dp[i][j]=cost;
c[i][j]=k;
}
}
//clog<<"("<<i<<","<<j<<") "<<lower<<" "<<c[i][j]<<" "<<dp[i][j]<<endl;
//assert(c[i][j]>=lower);
}
}
cout<<dp[m][n]<<endl;
return 0;
}