1【数学建模-某肿瘤疾病诊疗的经济学分析】数据分析 2 【数学建模-某肿瘤疾病诊疗的经济学分析】数据清洗和特征工程 3 【数学建模-某肿瘤疾病诊疗的经济学分析】第一问模型分析 4 【代码下载】 5【30页的论文下载】
题目 江西省数学建模-某肿瘤疾病诊疗的经济学分析基于病人的基本数据,疾病类型(主诉和并发,是否手术),住院天数和费用等,数据清洗并建立数学模型做如下分析: 1、建立根据不同疾病的分类模型。建立诊疗费用与疾病类型的数学关系,并进行预测和检验。 2、建立数学模型分析诊疗费用与各类疾病的亚群的特征,比如,高费用人群的年龄,性别,住院日期和相关数据的相关性,尝试对特定的亚群建立预测模型并进行验证。 3、如果该疾病纳入医保,尝试给出根据疾病类型、建议年龄段和国家承担的经济费用的方案并对相关方案合理性和经济性作出评估。
提供的数据集,包含患者序号、患者性别、出生日期、患者入院日期、患者出院日期、主要诊断编码名称、其他诊断、其他手术、住院总费用、住院天数、DRGS分组编码、DRGS分组名称、ADRG名称、费用异常标识。
2 数据集主要特征分析
import numpy as np
import pandas as pd
# Location of the CSV file analysed by this script.
train_data_file = './cdata.csv'

if __name__ == "__main__":
    # Read the CSV, then overwrite its header with the short ASCII column
    # names that the analysis helpers index by.
    t_data = pd.read_csv(train_data_file)
    t_data.columns = [
        'id', 'sex', 'born', 'intime', 'outtime', 'maindiag', 'elsediag',
        'surgery', 'fee', 'days', 'drgsid', 'drgs', 'adrgid', 'adrg',
        'highfee',
    ]
    # describe() only returns the summary table; it must be printed
    # explicitly to be visible when this runs as a script.
    print(t_data.describe())
(1)数据长度:17739 (2)主要诊断类别:183种
def maindiag_extract(data):
    """Print and return the distinct main-diagnosis codes found in ``data``.

    For every non-null cell of the ``maindiag`` column, the cell is stripped
    and the text before the first ``|`` is taken as the code.

    Args:
        data: DataFrame with a ``maindiag`` column.

    Returns:
        Sorted list of the distinct codes. The list and its length are also
        printed.
    """
    # Iterate over the values themselves rather than data['maindiag'][i]:
    # label-based lookup fails when the index is not 0..n-1 (for example
    # after rows were filtered out).
    codes = {
        str(cell).strip().split("|")[0]
        for cell in data['maindiag']
        if not pd.isnull(cell)  # missing diagnoses carry no code
    }
    all_category = sorted(codes)
    print(all_category)
    print(len(all_category))
    return all_category
(3)次要诊断类别:803
def elsediag_extract(data):
    """Print and return the distinct secondary-diagnosis codes in ``data``.

    Each non-null ``elsediag`` cell is split on ``,`` into entries; for each
    entry the text before the first ``|`` is taken as the code.

    Args:
        data: DataFrame with an ``elsediag`` column. Null cells are skipped.

    Returns:
        Sorted list of the distinct codes. The list and its length are also
        printed.
    """
    codes = set()
    # Iterate over values, not data['elsediag'][i], so a non-default index
    # (e.g. after filtering rows) does not raise KeyError.
    for cell in data['elsediag']:
        if pd.isnull(cell):
            continue
        for entry in str(cell).strip().split(","):
            code = entry.strip().split("|")[0]
            # A trailing or doubled comma produces an empty entry; it is not
            # a diagnosis and must not be counted as a category.
            if code:
                codes.add(code)
    all_category = sorted(codes)
    print(all_category)
    print(len(all_category))
    return all_category
(4)DRGs类别数:72类
def drgs_extract(data):
    """Print and return the distinct values of the ``drgsid`` column.

    Args:
        data: DataFrame with a ``drgsid`` column.

    Returns:
        List of the distinct non-null ``drgsid`` values, ordered by their
        string form. The list and its length are also printed.
    """
    # dropna() first: NaN cells are not equal to each other, so collecting
    # them in a set would count every missing cell as its own category.
    distinct = data['drgsid'].dropna().unique().tolist()
    all_category = sorted(distinct, key=str)
    print(all_category)
    print(len(all_category))
    return all_category
(5)DRGS分组平均费用分布分析
import numpy as np
import pandas as pd
# import tensorflow as tf
from category_encoders.target_encoder import TargetEncoder
import matplotlib.pyplot as plt
import statsmodels.api as sm
def fee_range(data):
    """Plot and print the mean fee of every ``drgsid`` group.

    Args:
        data: DataFrame with a ``drgsid`` column (group key) and a numeric
            ``fee`` column.

    Returns:
        dict mapping each ``drgsid`` to the mean ``fee`` of its rows, ordered
        from the highest to the lowest mean. The dict is also printed and
        shown as a scatter plot.
    """
    # Average each group over its own rows only. Accumulating fees in one
    # list shared by all groups mixes every group's fees together, and
    # de-duplicating them with set() biases the mean.
    mean_fee = data.groupby('drgsid', sort=False)['fee'].mean()
    a_cate = {
        code: float(value)
        for code, value in sorted(
            mean_fee.items(), key=lambda item: item[1], reverse=True
        )
    }
    x = list(a_cate.keys())
    y = list(a_cate.values())
    plt.scatter(x, y, alpha=0.9)  # slightly transparent markers
    plt.show()
    print(a_cate)
    return a_cate
if __name__ == "__main__":
    # Load the CSV and swap its header for the short ASCII column names in a
    # single expression, then run the DRGs mean-fee analysis.
    t_data = pd.read_csv(train_data_file).set_axis(
        [
            'id', 'sex', 'born', 'intime', 'outtime', 'maindiag', 'elsediag',
            'surgery', 'fee', 'days', 'drgsid', 'drgs', 'adrgid', 'adrg',
            'highfee',
        ],
        axis=1,
    )
    fee_range(t_data)
    print()
可以看出,DRGS分组大体上已经按费用高低对病例做了划分。
(6)DRGS分组类别分布
def box_line(data):
    """Print and plot how many rows fall into each ``drgsid`` group.

    Args:
        data: DataFrame with a ``drgsid`` column.

    Returns:
        dict mapping each ``drgsid`` to its row count, ordered from the
        smallest to the largest group. Each pair is also printed and the
        counts are shown as a scatter plot.
    """
    # Count rows directly. Collecting fees per group and de-duplicating them
    # with set() would report the number of distinct fee values, not the
    # number of samples.
    sizes = data.groupby('drgsid', sort=False).size()
    resultxy = {
        code: int(count)
        for code, count in sorted(sizes.items(), key=lambda item: item[1])
    }
    for code, count in resultxy.items():
        print(code, count)
    x = list(resultxy.keys())
    y = list(resultxy.values())
    plt.xlabel('DRGs')
    plt.title('Distribution of the number of grouping categories ')
    plt.ylabel('The amount of DRGS')
    plt.xticks([])  # hide the x tick labels
    plt.scatter(x, y, alpha=0.9)  # slightly transparent markers
    plt.show()
    return resultxy
DA13 1 DE11 1 DK13 1 DR15 1 DR11 1 GK35 1 IJ13 1 IU35 1 IU31 1 JB23 1 KR13 1 LT13 1 LZ13 1 QR15 1 QS31 1 QT11 1 RA21 1 RA31 1 RA35 1 RD15 1 RU15 1 KR11 2 RA23 2 RT15 2 RV15 2 EJ15 3 ET13 3 RD13 3 RD11 3 RS15 3 RS13 3 RT11 3 XT19 3 BU11 4 EJ13 4 QS43 4 RA33 4 XJ19 6 RA41 12 JR15 13 ED13 14 HR13 16 QR13 17 JR13 19 QT13 19 ER11 25 RT13 35 RU23 35 IU33 40 RU11 43 BU13 46 DR13 51 RA45 54 QS33 55 GR11 59 RV11 68 RA43 96 RE15 170 GR15 214 GR13 217 RC15 223 RE11 241 RC11 243 ER15 288 XS29 378 ER13 412 XT39 420 RW19 469 RV13 2465 RU13 2829 RC13 3910 RE13 4272
(7)DRGS分组中费用范围箱线图
def box_line(data):
    """Draw one fee box plot per ``drgsid`` group, smallest group first.

    Args:
        data: DataFrame with a ``drgsid`` column (group key) and a numeric
            ``fee`` column.

    Returns:
        The DataFrame that was plotted: one column per ``drgsid`` holding
        that group's fees, NaN-padded to the length of the largest group.
    """
    # Keep every fee of every group; de-duplicating with set() would distort
    # the quartiles shown by the box plot.
    fees_by_group = {
        code: fees.reset_index(drop=True)
        for code, fees in data.groupby('drgsid', sort=False)['fee']
    }
    ordered = sorted(fees_by_group, key=lambda code: len(fees_by_group[code]))
    # Building the frame from Series lets pandas pad the shorter columns
    # with NaN itself, so no hard-coded "largest group size" is needed.
    cate_box = pd.DataFrame({code: fees_by_group[code] for code in ordered})
    cate_box.plot.box(title="Fee-categroy")
    plt.grid(linestyle="--", alpha=0.3)
    plt.show()
    return cate_box
(8)ADRG的类别分布,39种类别
def drgs_box_line(data):
    """Print and return how many rows fall into each ``adrgid`` group.

    Args:
        data: DataFrame with an ``adrgid`` column.

    Returns:
        dict mapping each ``adrgid`` to its row count, ordered from the
        smallest to the largest group (ties keep first-appearance order).
        Each pair is also printed.
    """
    # Count rows directly. Collecting fees per group and de-duplicating them
    # with set() would report distinct fee values rather than samples.
    sizes = data.groupby('adrgid', sort=False).size()
    sordict = {
        code: int(count)
        for code, count in sorted(sizes.items(), key=lambda item: item[1])
    }
    for code, count in sordict.items():
        print(code, count)
    return sordict
DA1 1 DE1 1 DK1 1 GK3 1 IJ1 1 JB2 1 LT1 1 LZ1 1 ET1 3 KR1 3 RA2 3 XT1 3 QS4 4 RA3 6 RS1 6 XJ1 6 EJ1 7 RD1 7 ED1 14 HR1 16 QR1 18 QT1 20 JR1 32 RU2 35 RT1 40 IU3 42 BU1 50 DR1 53 QS3 56 RA4 162 XS2 378 XT3 420 RW1 469 GR1 490 ER1 725 RV1 2535 RU1 2872 RC1 4376 RE1 4682
(9)ADRG与费用的箱线图
def drgs_box_line(data):
    """Draw one fee box plot per ``adrgid`` group, ordered by mean fee.

    Args:
        data: DataFrame with an ``adrgid`` column (group key) and a numeric
            ``fee`` column.

    Returns:
        The DataFrame that was plotted: one column per ``adrgid`` holding
        that group's fees, NaN-padded to the length of the largest group.
        Columns run from the lowest to the highest mean fee.
    """
    # Keep every fee of every group; de-duplicating with set() would distort
    # both the ordering means and the plotted quartiles.
    fees_by_group = {
        code: fees.reset_index(drop=True)
        for code, fees in data.groupby('adrgid', sort=False)['fee']
    }
    ordered = sorted(fees_by_group, key=lambda code: fees_by_group[code].mean())
    # pandas pads the shorter columns with NaN, so the size of the largest
    # group does not have to be hard-coded.
    cate_box = pd.DataFrame({code: fees_by_group[code] for code in ordered})
    # No title= here: plt.title() below sets the title that is displayed.
    cate_box.plot.box()
    plt.grid(linestyle="--", alpha=0.3)
    plt.title('Relationship between ADRG and medical fee')
    plt.xlabel('ADRG')
    plt.ylabel('medical fee')
    plt.show()
    return cate_box
(10)ADRG中ER1、GR1、QS3等每个类别中的样本数据分布,都呈现相似曲线上升。
总结: (1)数据长度:17739行 (2)主要诊断类别:183种 (3)DRGs类别数:72种 (4)次要诊断类别:803种 (5)ADRG的类别:39种 (6)ADRG编码和DRGS编码无缺失值,但是分布很不均匀,有的类别只有1个样本,有的类别有4682个样本,对训练模型来说很不友好。 (7)最后一列属性是费用异常标识,可以看到有高费用异常和低费用异常,暂且不知道这些属性有何意义。 (8)ADRG中每个类别中的样本数据分布,都呈现相似曲线上升。
3 数据集亚群特征分析
参考类似的病例分析案例,需要分析年龄、性别、有无并发症、住院时长等特征:https://www.cn-healthcare.com/articlewm/20181214/content-1042985.html
(1)年龄与平均费用关系折线图
def age_static(data):
    """Plot and print the mean fee for each patient age.

    Age is computed per row as the third ``/``-separated field of ``intime``
    minus ``born``, both converted with ``int``.

    Args:
        data: DataFrame with ``born``, ``intime`` and ``fee`` columns.
            Rows whose ``born`` is null or equal to ``'0 AM'`` are skipped.

    Returns:
        dict mapping each age to the mean ``fee`` of the rows with that age,
        ordered by ascending age. The dict is also printed and drawn as a
        dashed line plot.
    """
    age_fee = {}
    # Iterate over the column values in lockstep; positional access via
    # data[col][i] fails when the index is not 0..n-1.
    for born_year, intime, data_fee in zip(
        data['born'], data['intime'], data['fee']
    ):
        # '0 AM' is skipped exactly as before (presumably a placeholder for
        # an unknown birth year; verify against the CSV). Null values are
        # skipped too, since int() cannot convert them.
        if pd.isnull(born_year) or born_year == '0 AM':
            continue
        in_year = str(intime).strip().split("/")
        age = int(in_year[2]) - int(born_year)
        # Append in place instead of copying the whole list for every row.
        age_fee.setdefault(age, []).append(data_fee)

    # Mean fee per age, ordered by age for the line plot.
    sort_avg_fee = {
        age: float(np.mean(fees)) for age, fees in sorted(age_fee.items())
    }
    print(sort_avg_fee)
    x = list(sort_avg_fee.keys())
    y = list(sort_avg_fee.values())
    plt.plot(x, y, 'b--', label='age-fee')
    plt.title('Relationship between age and cost')
    plt.xlabel('age')
    plt.ylabel('medical-fee')
    plt.show()
    return sort_avg_fee
(2)阶段年龄分布柱状图
{30: 25658.83080291971, 40: 25232.891867549668, 50: 26072.089125503106, 60: 27377.498989296368, 70: 32492.331597490345, 90: 36317.296185236126}
def age_static(data):
age_fee ={}
datalen = len(data)
for i in range(0,datalen):
born_year = data['born'][i]
if born_year=='0 AM':
continue
else:
intime = ''.join(data['intime'][i])
in_year = intime.strip().split("/")
age = int(in_year[2])-int(born_year)
data_fee = data['fee'][i]
if age in age_fee.keys():
templist = list(age_fee[age])
templist.append(data_fee)
age_fee[age] =list(templist)
else:
age_fee[age] = [data_fee]
# 计算平均费用
avg_age_fee ={}
for k in age_fee.keys():
avg = np.mean(list(age_fee[k]))
avg_age_fee[k] = avg
sort_avg_fee = dict(sorted(avg_age_fee.items(), key=lambda x: x[0]))
#绘制直方图,阶段年龄与平均费用的
li30 =[]
li40 =[]
li50 =[]
li60 =[]
li70 =[]
limax =[]
n_age_fee = {}
for k in age_fee.keys():
age = int(k)
if age
关注
打赏
最近更新
- 深拷贝和浅拷贝的区别(重点)
- 【Vue】走进Vue框架世界
- 【云服务器】项目部署—搭建网站—vue电商后台管理系统
- 【React介绍】 一文带你深入React
- 【React】React组件实例的三大属性之state,props,refs(你学废了吗)
- 【脚手架VueCLI】从零开始,创建一个VUE项目
- 【React】深入理解React组件生命周期----图文详解(含代码)
- 【React】DOM的Diffing算法是什么?以及DOM中key的作用----经典面试题
- 【React】1_使用React脚手架创建项目步骤--------详解(含项目结构说明)
- 【React】2_如何使用react脚手架写一个简单的页面?