작성일자 : 2023-09-29
Ver 0.1.1
- Data frame은 자료분석을 위해 만들어진 객체형이다.
- Data frame은 여러 면에서 matrix 및 list와 유사한 형태를 지닌다.
- Data frame의 행(row)은 샘플(observations)에 대응되며 열(column)은 변수(variables)에 대응된다.
Data frame 만들기: read.table()
# Read the file as-is: with no header argument the header line
# ("no name age sex") is treated as an ordinary data row and the columns get
# the default names V1..V4.
data1 <- read.table("files/data1.txt")
data1
## V1 V2 V3 V4
## 1 no name age sex
## 2 1 LEE 55 M
## 3 2 PARK 47 F
## 4 3 SO 35 M
## 5 4 KIM 26 F
## 6 5 YOON 29 M
# Use the first line as column names and the 'no' column as row names.
# stringsAsFactors = TRUE is spelled out because read.table() only converts
# strings to factors by default before R 4.0.0; the factor output shown
# below depends on that conversion.
data2 <- read.table(
  "files/data1.txt",
  header = TRUE,
  row.names = "no",
  stringsAsFactors = TRUE
)
data2
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
str(data2)
## 'data.frame': 5 obs. of 3 variables:
## $ name: Factor w/ 5 levels "KIM","LEE","PARK",..: 2 3 4 1 5
## $ age : int 55 47 35 26 29
## $ sex : Factor w/ 2 levels "F","M": 2 1 2 1 2
# Extract each column as a vector to inspect its class.
col1 <- data2[, 1]
col2 <- data2[, 2]
col3 <- data2[, 3]
col1
## [1] LEE PARK SO KIM YOON
## Levels: KIM LEE PARK SO YOON
class(col1) # the first column is a factor
## [1] "factor"
class(col2) # the second column is numeric (integer)
## [1] "integer"
class(col3) # the third column is a factor
## [1] "factor"
- header 옵션을 TRUE로 설정하면 파일의 첫 번째 행을 변수명(열이름)으로 사용한다.
- row.names='no' 옵션은 변수 no를 행이름으로 설정한다. 이 옵션을 사용하지 않거나 row.names = NULL로 설정하면 디폴트로 일련번호가 행이름으로 정해진다.
Data frame 만들기: data.frame() 함수 이용
# Read the file into a named list; the template passed as 'what' fixes the
# type of each field (0 -> numeric, "" -> character). The header line is
# skipped.
data3 <- scan(
  "files/data1.txt",
  what = list(no = 0, name = "", age = 0, sex = ""),
  skip = 1
)
data3
## $no
## [1] 1 2 3 4 5
##
## $name
## [1] "LEE" "PARK" "SO" "KIM" "YOON"
##
## $age
## [1] 55 47 35 26 29
##
## $sex
## [1] "M" "F" "M" "F" "M"
# Pull the list components out into separate vectors ...
no <- data3$no
name <- data3$name
age <- data3$age
sex <- data3$sex
# ... and combine them column-wise into a data frame.
data4 <- data.frame(no, name, age, sex)
data4
## no name age sex
## 1 1 LEE 55 M
## 2 2 PARK 47 F
## 3 3 SO 35 M
## 4 4 KIM 26 F
## 5 5 YOON 29 M
- 위의 scan() 함수의 옵션 list는 제 1, 3열을 numeric으로, 제 2, 4열을 character로 받아들인다는 의미이다. 즉, 0은 numeric을, ''는 character를 뜻한다.
- 옵션 skip = 1은 파일의 첫 번째 행(변수명이 적힌 행)을 건너뜀을 의미한다.
# Build the same data frame directly from literal column vectors.
data5 <- data.frame(
  no = 1:5,
  name = c("LEE", "PARK", "SO", "KIM", "YOON"),
  age = c(55, 47, 35, 26, 29),
  sex = c("M", "F", "M", "F", "M")
)
data5
## no name age sex
## 1 1 LEE 55 M
## 2 2 PARK 47 F
## 3 3 SO 35 M
## 4 4 KIM 26 F
## 5 5 YOON 29 M
Data frame의 특성
# Inspect the structural attributes of the data frame.
data2
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
mode(data2) # a data frame is stored as a list
## [1] "list"
length(data2) # number of components (columns)
## [1] 3
names(data2)
## [1] "name" "age" "sex"
row.names(data2)
## [1] "1" "2" "3" "4" "5"
dim(data2)
## [1] 5 3
dimnames(data2)
## [[1]]
## [1] "1" "2" "3" "4" "5"
##
## [[2]]
## [1] "name" "age" "sex"
colnames(data2)
## [1] "name" "age" "sex"
rownames(data2)
## [1] "1" "2" "3" "4" "5"
ncol(data2)
## [1] 3
nrow(data2)
## [1] 5
class(data2)
## [1] "data.frame"
Data frame의 성분 조작
# Matrix-style and list-style ways of reaching into the data frame.
data2
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
data2[2, 1] # second observation, first variable (matrix-style index)
## [1] PARK
## Levels: KIM LEE PARK SO YOON
data2[[1]][2] # same element: second entry of the first component
## [1] PARK
## Levels: KIM LEE PARK SO YOON
data2[data2$name == "LEE", ] # observation(s) with name == 'LEE'
## name age sex
## 1 LEE 55 M
data2[, 2] # the second column (variable)
## [1] 55 47 35 26 29
data2$age # the component named 'age' -- list-style access
## [1] 55 47 35 26 29
data2[[2]] # the second component -- list-style access
## [1] 55 47 35 26 29
is.list(data2)
## [1] TRUE
is.data.frame(data2)
## [1] TRUE
샘플과 변수의 추가
# Register 'RYU' as a factor level first so the new row can use it.
levels(data2$name) <- c(levels(data2$name), "RYU") # add new level to factor
data2 <- rbind(data2, c("RYU", 36, "M")) # add new row (observation)
data2[7, ] <- c("CHOI", 41, "F") # 'CHOI' is not in the set of levels
## Warning in `[<-.factor`(`*tmp*`, iseq, value = "CHOI"): invalid factor
## level, NA generated
data2
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
## 6 RYU 36 M
## 7 <NA> 41 F
# The appended rows were given as character vectors, so 'age' is now character.
class(data2$age)
## [1] "character"
data2$age
## [1] "55" "47" "35" "26" "29" "36" "41"
data2$age <- as.numeric(data2$age) # convert age back to numeric
data2 <- cbind(
  data2,
  married = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE)
) # add new column (variable)
data2
## name age sex married
## 1 LEE 55 M TRUE
## 2 PARK 47 F TRUE
## 3 SO 35 M TRUE
## 4 KIM 26 F FALSE
## 5 YOON 29 M FALSE
## 6 RYU 36 M TRUE
## 7 <NA> 41 F TRUE
str(data2)
## 'data.frame': 7 obs. of 4 variables:
## $ name : Factor w/ 6 levels "KIM","LEE","PARK",..: 2 3 4 1 5 6 NA
## $ age : num 55 47 35 26 29 36 41
## $ sex : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 1
## $ married: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
Data frame들 간의 병합
# Two data frames that share the column 'name'; 'LEE' appears twice in df2
# and 'KIM' does not appear in it at all.
df1 <- data.frame(
  name = c("LEE", "PARK", "SO", "KIM", "YOON"),
  age = c(55, 47, 35, 26, 29),
  sex = c("M", "F", "M", "F", "M")
)
df2 <- data.frame(
  name = c("LEE", "SO", "LEE", "YOON", "PARK"),
  married = c(FALSE, TRUE, TRUE, FALSE, TRUE)
)
df1
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
df2
## name married
## 1 LEE FALSE
## 2 SO TRUE
## 3 LEE TRUE
## 4 YOON FALSE
## 5 PARK TRUE
merge(df1, df2) # merge two data frames on the shared key 'name'
## name age sex married
## 1 LEE 55 M FALSE
## 2 LEE 55 M TRUE
## 3 PARK 47 F TRUE
## 4 SO 35 M TRUE
## 5 YOON 29 M FALSE
# Here the key column has a different name in each data frame.
df3 <- data.frame(
  surname = c("KIM", "SO", "LEE", "YOON", "PARK"),
  married = c(FALSE, TRUE, TRUE, FALSE, TRUE)
)
merge(df1, df3, by.x = "name", by.y = "surname")
## name age sex married
## 1 KIM 26 F FALSE
## 2 LEE 55 M TRUE
## 3 PARK 47 F TRUE
## 4 SO 35 M TRUE
## 5 YOON 29 M FALSE
- 두 data frame에서 기준변수(key variable)의 이름이 서로 다를 때에는 옵션 by.x와 by.y로 각각 설정한다. 이름이 같으면 옵션 by 하나로 지정할 수 있다.
# df4 shares two columns (name and sex) with df1, so both act as merge keys.
df4 <- data.frame(
  name = c("LEE", "PARK", "SO", "KIM", "YOON"),
  sex = c("M", "M", "F", "F", "M"),
  married = c(TRUE, TRUE, FALSE, TRUE, FALSE)
)
df1
## name age sex
## 1 LEE 55 M
## 2 PARK 47 F
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
df4
## name sex married
## 1 LEE M TRUE
## 2 PARK M TRUE
## 3 SO F FALSE
## 4 KIM F TRUE
## 5 YOON M FALSE
merge(df1, df4) # 2 keys (name and sex): intersection
## name sex age married
## 1 KIM F 26 TRUE
## 2 LEE M 55 TRUE
## 3 YOON M 29 FALSE
merge(df1, df4, all = TRUE) # 2 keys (name and sex): union
## name sex age married
## 1 KIM F 26 TRUE
## 2 LEE M 55 TRUE
## 3 PARK F 47 NA
## 4 PARK M NA TRUE
## 5 SO F NA FALSE
## 6 SO M 35 NA
## 7 YOON M 29 FALSE
행과 열의 제거
# Negative indices drop rows/columns; wrapping the assignment in parentheses
# prints the result.
data2
## name age sex married
## 1 LEE 55 M TRUE
## 2 PARK 47 F TRUE
## 3 SO 35 M TRUE
## 4 KIM 26 F FALSE
## 5 YOON 29 M FALSE
## 6 RYU 36 M TRUE
## 7 <NA> 41 F TRUE
(data2 <- data2[-2, ]) # delete the second row
## name age sex married
## 1 LEE 55 M TRUE
## 3 SO 35 M TRUE
## 4 KIM 26 F FALSE
## 5 YOON 29 M FALSE
## 6 RYU 36 M TRUE
## 7 <NA> 41 F TRUE
(data2 <- data2[, -4]) # delete the fourth column
## name age sex
## 1 LEE 55 M
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
## 6 RYU 36 M
## 7 <NA> 41 F
원소의 수정
# Overwrite a single cell. The index is positional: the 4th remaining row is
# the one with row name "5" (YOON), as the output below shows.
data2
## name age sex
## 1 LEE 55 M
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 29 M
## 6 RYU 36 M
## 7 <NA> 41 F
data2[4, 2] <- 28
data2
## name age sex
## 1 LEE 55 M
## 3 SO 35 M
## 4 KIM 26 F
## 5 YOON 28 M
## 6 RYU 36 M
## 7 <NA> 41 F
성분 접근하기
# A component is reached with `$` and can be passed straight to a function.
data2$age
## [1] 55 35 26 28 36 41
mean(data2$age)
## [1] 36.83333
attach() 함수
# List the objects currently in the workspace; a global 'age' is among them.
ls()
## [1] "A1" "Ashley" "Jake" "Jason"
## [5] "age" "age1" "age2" "ages"
## [9] "col1" "col2" "col3" "data1"
## [13] "data2" "data3" "data4" "data5"
## [17] "df1" "df2" "df3" "df4"
## [21] "drink" "fac1" "fives" "list1"
## [25] "logical.mat1" "mat1" "mat2" "mat3"
## [29] "mat4" "month.kor" "my.class" "name"
## [33] "no" "ord1" "ord2" "pencil"
## [37] "ramen" "row1" "row2" "row3"
## [41] "school.store" "sex" "stationery" "std1"
## [45] "student" "surname" "tbl" "vec"
## [49] "vec1" "vec2" "year"
# Remove the global 'age' so that the bare name no longer resolves.
rm(age)
age
## Error in eval(expr, envir, enclos): object 'age' not found
# After attach() the columns of data2 can be referred to by bare name.
# 'name' and 'sex' still exist as global objects, so those globals mask the
# attached columns (see the message below).
attach(data2)
## The following objects are masked _by_ .GlobalEnv:
##
## name, sex
age
## [1] 55 35 26 28 36 41
# detach() undoes the attach; the bare name is unavailable again, while
# data2$age keeps working.
detach(data2)
age
## Error in eval(expr, envir, enclos): object 'age' not found
data2$age
## [1] 55 35 26 28 36 41
- attach()는 data frame의 각 성분을 변수로 사용가능하게 한다. Data frame의 변수가 매우 많을때 유용하게 쓰인다.
- attach()를 사용한 후, 반드시 detach()를 사용하여 예상치 않은 오류를 줄이는 습관이 중요하다.